From 5d4f9f7e118eb22f9a62d7a2db3b33453ed994fa Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Mon, 29 Jul 2013 18:43:01 +0200 Subject: [PATCH 1/8] Add notification options to configuration. --- config/config.go | 33 ++++++- config/config.proto | 33 +++++++ config/config_test.go | 14 ++- config/fixtures/duplicate_nc_name.conf.input | 28 ++++++ .../missing_filter_name_re.conf.input | 6 ++ config/fixtures/missing_nc_name.conf.input | 27 ++++++ .../fixtures/nonexistent_nc_name.conf.input | 18 ++++ config/fixtures/sample.conf.input | 16 ++-- config/generated/config.pb.go | 90 +++++++++++++++++-- manager/aggregator.go | 6 +- 10 files changed, 252 insertions(+), 19 deletions(-) create mode 100644 config/fixtures/duplicate_nc_name.conf.input create mode 100644 config/fixtures/missing_filter_name_re.conf.input create mode 100644 config/fixtures/missing_nc_name.conf.input create mode 100644 config/fixtures/nonexistent_nc_name.conf.input diff --git a/config/config.go b/config/config.go index c20ae47486..641ce9fb3d 100644 --- a/config/config.go +++ b/config/config.go @@ -39,6 +39,29 @@ func (c Config) String() string { // Validate checks an entire parsed Config for the validity of its fields. func (c Config) Validate() error { + ncNames := map[string]bool{} + for _, nc := range c.NotificationConfig { + if nc.Name == nil { + return fmt.Errorf("Missing name in notification config: %s", proto.MarshalTextString(nc)) + } + for _, pdc := range nc.PagerdutyConfig { + if pdc.ServiceKey == nil { + return fmt.Errorf("Missing service key in PagerDuty notification config: %s", proto.MarshalTextString(pdc)) + } + } + for _, ec := range nc.EmailConfig { + if ec.Email == nil { + return fmt.Errorf("Missing email address in email notification config: %s", proto.MarshalTextString(ec)) + } + } + + if _, ok := ncNames[nc.GetName()]; ok { + return fmt.Errorf("Notification config name not unique: %s", nc.GetName()) + } + + ncNames[nc.GetName()] = true + } + for _, a := range c.AggregationRule { for _, f := range a.Filter { if f.NameRe == nil { @@ -48,7 +71,12 @@ func (c Config) Validate() error { return fmt.Errorf("Missing value pattern (value_re) in filter definition: %s", proto.MarshalTextString(f)) } } + + if _, ok := ncNames[a.GetNotificationConfig()]; !ok { + return fmt.Errorf("No such notification config: %s", a.GetNotificationConfig()) + } } + return nil } @@ -61,8 +89,9 @@ func (c Config) AggregationRules() manager.AggregationRules { filters = append(filters, manager.NewFilter(filter.GetNameRe(), filter.GetValueRe())) } rules = append(rules, &manager.AggregationRule{ - Filters: filters, - RepeatRate: time.Duration(r.GetRepeatRateSeconds()) * time.Second, + Filters: filters, + RepeatRate: time.Duration(r.GetRepeatRateSeconds()) * time.Second, + NotificationConfig: r.GetNotificationConfig(), }) } return rules diff --git a/config/config.proto b/config/config.proto index 87aa84f5d5..35e43dbd5b 100644 --- a/config/config.proto +++ b/config/config.proto @@ -13,6 +13,29 @@ package io.prometheus.alert_manager; +// Configuration for notification via PagerDuty. +message PagerDutyConfig { + // PagerDuty service key, see: + // http://developer.pagerduty.com/documentation/integration/events + optional string service_key = 1; +} + +// Configuration for notification via mail. +message EmailConfig { + // Email address to notify. + optional string email = 1; +} + +// Notification configuration definition. +message NotificationConfig { + // Name of this NotificationConfig. Referenced from AggregationRule. + optional string name = 1; + // Zero or more PagerDuty notification configurations. + repeated PagerDutyConfig pagerduty_config = 2; + // Zero or more email notification configurations. + repeated EmailConfig email_config = 3; +} + // A regex-based label filter used in aggregations. message Filter { // The regex matching the label name. @@ -21,11 +44,21 @@ message Filter { optional string value_re = 2; } +// Grouping and notification setting definitions for alerts. message AggregationRule { + // Filters that define which alerts are matched by this AggregationRule. repeated Filter filter = 1; + // How many seconds to wait before resending a notification for a specific alert. optional int32 repeat_rate_seconds = 2 [default = 7200]; + // Notification configuration to use for this AggregationRule, referenced by + // their name. + optional string notification_config = 3; } +// Global alert manager configuration. message AlertManagerConfig { + // Aggregation rule definitions. repeated AggregationRule aggregation_rule = 1; + // Notification configuration definitions. + repeated NotificationConfig notification_config = 2; } diff --git a/config/config_test.go b/config/config_test.go index 803d7e118e..01497d04b1 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -48,13 +48,25 @@ func TestConfigs(t *testing.T) { }, { inputFile: "sample.conf.input", }, { - inputFile: "missing_name_re.conf.input", + inputFile: "missing_filter_name_re.conf.input", shouldFail: true, errContains: "Missing name pattern", }, { inputFile: "invalid_proto_format.conf.input", shouldFail: true, errContains: "unknown field name", + }, { + inputFile: "duplicate_nc_name.conf.input", + shouldFail: true, + errContains: "not unique", + }, { + inputFile: "nonexistent_nc_name.conf.input", + shouldFail: true, + errContains: "No such notification config", + }, { + inputFile: "missing_nc_name.conf.input", + shouldFail: true, + errContains: "Missing name", }, } diff --git a/config/fixtures/duplicate_nc_name.conf.input b/config/fixtures/duplicate_nc_name.conf.input new file mode 100644 index 0000000000..c656972d2b --- /dev/null +++ b/config/fixtures/duplicate_nc_name.conf.input @@ -0,0 +1,28 @@ +notification_config { + name: "alertmanager_test" + pagerduty_config { + service_key: "supersecretapikey" + } + email_config { + email: "test@testservice.org" + } +} + +notification_config { + name: "alertmanager_test" + pagerduty_config { + service_key: "supersecretapikey" + } + email_config { + email: "test@testservice.org" + } +} + +aggregation_rule { + filter { + name_re: "service" + value_re: "test" + } + repeat_rate_seconds: 3600 + notification_config: "alertmanager_test" +} diff --git a/config/fixtures/missing_filter_name_re.conf.input b/config/fixtures/missing_filter_name_re.conf.input new file mode 100644 index 0000000000..2926f9e4a0 --- /dev/null +++ b/config/fixtures/missing_filter_name_re.conf.input @@ -0,0 +1,6 @@ +aggregation_rule { + filter { + value_re: "test" + } + repeat_rate_seconds: 3600 +} diff --git a/config/fixtures/missing_nc_name.conf.input b/config/fixtures/missing_nc_name.conf.input new file mode 100644 index 0000000000..6afb982356 --- /dev/null +++ b/config/fixtures/missing_nc_name.conf.input @@ -0,0 +1,27 @@ +notification_config { + pagerduty_config { + service_key: "supersecretapikey" + } + email_config { + email: "test@testservice.org" + } +} + +notification_config { + name: "alertmanager_test" + pagerduty_config { + service_key: "supersecretapikey" + } + email_config { + email: "test@testservice.org" + } +} + +aggregation_rule { + filter { + name_re: "service" + value_re: "test" + } + repeat_rate_seconds: 3600 + notification_config: "alertmanager_test" +} diff --git a/config/fixtures/nonexistent_nc_name.conf.input b/config/fixtures/nonexistent_nc_name.conf.input new file mode 100644 index 0000000000..954b0ba91e --- /dev/null +++ b/config/fixtures/nonexistent_nc_name.conf.input @@ -0,0 +1,18 @@ +notification_config { + name: "alertmanager_test" + pagerduty_config { + service_key: "supersecretapikey" + } + email_config { + email: "test@testservice.org" + } +} + +aggregation_rule { + filter { + name_re: "service" + value_re: "test" + } + repeat_rate_seconds: 3600 + notification_config: "alertmanager_test2" +} diff --git a/config/fixtures/sample.conf.input b/config/fixtures/sample.conf.input index 4922f502a8..d451d0d25f 100644 --- a/config/fixtures/sample.conf.input +++ b/config/fixtures/sample.conf.input @@ -1,13 +1,11 @@ -aggregation_rule { - filter { - name_re: "service" - value_re: "discovery" +notification_config { + name: "alertmanager_test" + pagerduty_config { + service_key: "supersecretapikey" } - filter { - name_re: "zone" - value_re: "aa" + email_config { + email: "test@testservice.org" } - repeat_rate_seconds: 3600 } aggregation_rule { @@ -15,4 +13,6 @@ aggregation_rule { name_re: "service" value_re: "test" } + repeat_rate_seconds: 3600 + notification_config: "alertmanager_test" } diff --git a/config/generated/config.pb.go b/config/generated/config.pb.go index ea62880093..a29870219d 100644 --- a/config/generated/config.pb.go +++ b/config/generated/config.pb.go @@ -13,6 +13,70 @@ var _ = proto.Marshal var _ = &json.SyntaxError{} var _ = math.Inf +type PagerDutyConfig struct { + ServiceKey *string `protobuf:"bytes,1,opt,name=service_key" json:"service_key,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *PagerDutyConfig) Reset() { *m = PagerDutyConfig{} } +func (m *PagerDutyConfig) String() string { return proto.CompactTextString(m) } +func (*PagerDutyConfig) ProtoMessage() {} + +func (m *PagerDutyConfig) GetServiceKey() string { + if m != nil && m.ServiceKey != nil { + return *m.ServiceKey + } + return "" +} + +type EmailConfig struct { + Email *string `protobuf:"bytes,1,opt,name=email" json:"email,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *EmailConfig) Reset() { *m = EmailConfig{} } +func (m *EmailConfig) String() string { return proto.CompactTextString(m) } +func (*EmailConfig) ProtoMessage() {} + +func (m *EmailConfig) GetEmail() string { + if m != nil && m.Email != nil { + return *m.Email + } + return "" +} + +type NotificationConfig struct { + Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"` + PagerdutyConfig []*PagerDutyConfig `protobuf:"bytes,2,rep,name=pagerduty_config" json:"pagerduty_config,omitempty"` + EmailConfig []*EmailConfig `protobuf:"bytes,3,rep,name=email_config" json:"email_config,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *NotificationConfig) Reset() { *m = NotificationConfig{} } +func (m *NotificationConfig) String() string { return proto.CompactTextString(m) } +func (*NotificationConfig) ProtoMessage() {} + +func (m *NotificationConfig) GetName() string { + if m != nil && m.Name != nil { + return *m.Name + } + return "" +} + +func (m *NotificationConfig) GetPagerdutyConfig() []*PagerDutyConfig { + if m != nil { + return m.PagerdutyConfig + } + return nil +} + +func (m *NotificationConfig) GetEmailConfig() []*EmailConfig { + if m != nil { + return m.EmailConfig + } + return nil +} + type Filter struct { NameRe *string `protobuf:"bytes,1,opt,name=name_re" json:"name_re,omitempty"` ValueRe *string `protobuf:"bytes,2,opt,name=value_re" json:"value_re,omitempty"` @@ -38,9 +102,10 @@ func (m *Filter) GetValueRe() string { } type AggregationRule struct { - Filter []*Filter `protobuf:"bytes,1,rep,name=filter" json:"filter,omitempty"` - RepeatRateSeconds *int32 `protobuf:"varint,2,opt,name=repeat_rate_seconds,def=7200" json:"repeat_rate_seconds,omitempty"` - XXX_unrecognized []byte `json:"-"` + Filter []*Filter `protobuf:"bytes,1,rep,name=filter" json:"filter,omitempty"` + RepeatRateSeconds *int32 `protobuf:"varint,2,opt,name=repeat_rate_seconds,def=7200" json:"repeat_rate_seconds,omitempty"` + NotificationConfig *string `protobuf:"bytes,3,opt,name=notification_config" json:"notification_config,omitempty"` + XXX_unrecognized []byte `json:"-"` } func (m *AggregationRule) Reset() { *m = AggregationRule{} } @@ -63,9 +128,17 @@ func (m *AggregationRule) GetRepeatRateSeconds() int32 { return Default_AggregationRule_RepeatRateSeconds } +func (m *AggregationRule) GetNotificationConfig() string { + if m != nil && m.NotificationConfig != nil { + return *m.NotificationConfig + } + return "" +} + type AlertManagerConfig struct { - AggregationRule []*AggregationRule `protobuf:"bytes,1,rep,name=aggregation_rule" json:"aggregation_rule,omitempty"` - XXX_unrecognized []byte `json:"-"` + AggregationRule []*AggregationRule `protobuf:"bytes,1,rep,name=aggregation_rule" json:"aggregation_rule,omitempty"` + NotificationConfig []*NotificationConfig `protobuf:"bytes,2,rep,name=notification_config" json:"notification_config,omitempty"` + XXX_unrecognized []byte `json:"-"` } func (m *AlertManagerConfig) Reset() { *m = AlertManagerConfig{} } @@ -79,5 +152,12 @@ func (m *AlertManagerConfig) GetAggregationRule() []*AggregationRule { return nil } +func (m *AlertManagerConfig) GetNotificationConfig() []*NotificationConfig { + if m != nil { + return m.NotificationConfig + } + return nil +} + func init() { } diff --git a/manager/aggregator.go b/manager/aggregator.go index a15428dbbb..936ab0652d 100644 --- a/manager/aggregator.go +++ b/manager/aggregator.go @@ -28,9 +28,9 @@ const ( // AggregationRule creates and manages the scope for received events. type AggregationRule struct { - Filters Filters - - RepeatRate time.Duration + Filters Filters + RepeatRate time.Duration + NotificationConfig string } type AggregationInstances []*AggregationInstance From 669f5ef916ead2309b2ed43f7e3d81abbf8d0104 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Mon, 29 Jul 2013 18:44:06 +0200 Subject: [PATCH 2/8] Fix /metrics endpoint name (was /metrics.json). --- web/web.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/web.go b/web/web.go index f2f919de9f..79abc7dc9f 100644 --- a/web/web.go +++ b/web/web.go @@ -63,7 +63,7 @@ func (w WebService) ServeForever() error { exp.Handle("/status", w.StatusHandler) exp.Handle("/api/", compressionHandler{handler: gorest.Handle()}) - exp.Handle("/metrics.json", prometheus.DefaultHandler) + exp.Handle("/metrics", prometheus.DefaultHandler) if *useLocalAssets { exp.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("web/static")))) } else { From dbdd7aef16b29d130211df0818506ce1850fa789 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Mon, 29 Jul 2013 18:45:33 +0200 Subject: [PATCH 3/8] Indicate silence status for each alert. --- main.go | 3 ++- web/alerts.go | 10 +++++++++- web/static/js/alerts.js | 4 ++++ web/templates/alerts.html | 12 ++++++++++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/main.go b/main.go index f7c5263335..ec38e3fcef 100644 --- a/main.go +++ b/main.go @@ -52,7 +52,8 @@ func main() { // Template-based page handlers. AlertsHandler: &web.AlertsHandler{ - Aggregator: aggregator, + Aggregator: aggregator, + IsInhibitedInterrogator: suppressor, }, SilencesHandler: &web.SilencesHandler{ Suppressor: suppressor, diff --git a/web/alerts.go b/web/alerts.go index 123e291c7d..6e9c173258 100644 --- a/web/alerts.go +++ b/web/alerts.go @@ -21,15 +21,23 @@ import ( type AlertStatus struct { AlertAggregates []*manager.AggregationInstance + SilenceForEvent func(*manager.Event) *manager.Suppression } type AlertsHandler struct { - Aggregator *manager.Aggregator + Aggregator *manager.Aggregator + IsInhibitedInterrogator manager.IsInhibitedInterrogator +} + +func (h *AlertsHandler) silenceForEvent(e *manager.Event) *manager.Suppression { + _, silence := h.IsInhibitedInterrogator.IsInhibited(e) + return silence } func (h *AlertsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { alertStatus := &AlertStatus{ AlertAggregates: h.Aggregator.AlertAggregates(), + SilenceForEvent: h.silenceForEvent, } executeTemplate(w, "alerts", alertStatus) } diff --git a/web/static/js/alerts.js b/web/static/js/alerts.js index 925aa1104b..006fb62bb0 100644 --- a/web/static/js/alerts.js +++ b/web/static/js/alerts.js @@ -195,6 +195,10 @@ function init() { $(".del_silence_btn").click(function() { deleteSilence(silenceId, silenceRow); }); + + $(".silence_link").click(function() { + alert("Linking to silence not yet supported"); + }); } $(init); diff --git a/web/templates/alerts.html b/web/templates/alerts.html index 315afcb331..e85c66ce18 100644 --- a/web/templates/alerts.html +++ b/web/templates/alerts.html @@ -17,10 +17,11 @@

Alerts

Labels Active Since Last Refreshed - Surpressed + Silenced + {{$silenceForEvent := .SilenceForEvent}} {{range .AlertAggregates}} @@ -45,7 +46,14 @@

Alerts

{{timeSince .Created}} ago {{timeSince .LastRefreshed}} ago - No + + {{$silence := call $silenceForEvent .Event}} + {{if $silence}} + by silence {{$silence.Id}} + {{else}} + not silenced + {{end}} + {{end}} From a64c37bb03ddb4e5404ffdf77e67f29b0cf09930 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 30 Jul 2013 13:12:24 +0200 Subject: [PATCH 4/8] Add missing format string and separators in event fingerprinting. --- manager/event.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/manager/event.go b/manager/event.go index 8d309c463a..f0ef689ad0 100644 --- a/manager/event.go +++ b/manager/event.go @@ -51,8 +51,9 @@ func (e Event) Fingerprint() EventFingerprint { summer := fnv.New64a() + separator := string([]byte{0}) for _, k := range keys { - fmt.Fprintf(summer, k, e.Labels[k]) + fmt.Fprintf(summer, "%s%s%s%s", k, separator, e.Labels[k], separator) } return EventFingerprint(summer.Sum64()) From f431335c69f3e9afbcfbcd3e0ba963904754606d Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 30 Jul 2013 13:18:11 +0200 Subject: [PATCH 5/8] Add more required fields to Event. This adds mandatory Summary and Description fields to Event. As for the alert name, there were two options: keep it a separate field and treat it separately everywhere (including in silence Filter matching), or make it a required field in the event's labels. The latter was causing far less trouble, so I went with that. The alertname label still doesn't have a special meaning to most parts of the code, except that the API checks its presence and the web UI displays it differently. --- manager/event.go | 11 +++++++---- web/api/event.go | 17 +++++++++++++++++ web/helpers.go | 26 ++++++++++++++++++++++++++ web/templates/alerts.html | 8 +++++--- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/manager/event.go b/manager/event.go index f0ef689ad0..8dc73e482d 100644 --- a/manager/event.go +++ b/manager/event.go @@ -19,7 +19,7 @@ import ( "sort" ) -const eventNameLabel = "name" +const EventNameLabel = "alertname" type EventFingerprint uint64 @@ -28,16 +28,19 @@ type EventPayload map[string]string // Event models an action triggered by Prometheus. type Event struct { + // Short summary of event. + Summary string + // Long description of event. + Description string // Label value pairs for purpose of aggregation, matching, and disposition - // dispatching. This must minimally include a "name" label. + // dispatching. This must minimally include an "alertname" label. Labels EventLabels // Extra key/value information which is not used for aggregation. Payload EventPayload } func (e Event) Name() string { - // BUG: ensure in a proper place that all events have a name? - return e.Labels[eventNameLabel] + return e.Labels[EventNameLabel] } func (e Event) Fingerprint() EventFingerprint { diff --git a/web/api/event.go b/web/api/event.go index 509f085d0c..90fa94884e 100644 --- a/web/api/event.go +++ b/web/api/event.go @@ -14,14 +14,31 @@ package api import ( + "log" "net/http" "github.com/prometheus/alert_manager/manager" ) func (s AlertManagerService) AddEvents(es manager.Events) { + for i, ev := range es { + if ev.Summary == "" || ev.Description == "" { + log.Printf("Missing field in event %d: %s", i, ev) + rb := s.ResponseBuilder() + rb.SetResponseCode(http.StatusBadRequest) + return + } + if _, ok := ev.Labels[manager.EventNameLabel]; !ok { + log.Printf("Missing alert name label in event %d: %s", i, ev) + rb := s.ResponseBuilder() + rb.SetResponseCode(http.StatusBadRequest) + return + } + } + err := s.Aggregator.Receive(es) if err != nil { + log.Println("Error during aggregation:", err) rb := s.ResponseBuilder() rb.SetResponseCode(http.StatusServiceUnavailable) } diff --git a/web/helpers.go b/web/helpers.go index ac5b6659ab..00631e9a44 100644 --- a/web/helpers.go +++ b/web/helpers.go @@ -15,6 +15,7 @@ package web import ( "html/template" + "reflect" "time" ) @@ -22,6 +23,31 @@ func timeSince(t time.Time) string { return time.Now().Round(time.Second / 10).Sub(t.Round(time.Second / 10)).String() } +// By Russ Cox, https://groups.google.com/d/msg/golang-nuts/OEdSDgEC7js/iyhU9DW_IKcJ. +func eq(args ...interface{}) bool { + if len(args) == 0 { + return false + } + x := args[0] + switch x := x.(type) { + case string, int, int64, byte, float32, float64: + for _, y := range args[1:] { + if x == y { + return true + } + } + return false + } + + for _, y := range args[1:] { + if reflect.DeepEqual(x, y) { + return true + } + } + return false +} + var webHelpers = template.FuncMap{ "timeSince": timeSince, + "eq": eq, } diff --git a/web/templates/alerts.html b/web/templates/alerts.html index e85c66ce18..9200b27cf3 100644 --- a/web/templates/alerts.html +++ b/web/templates/alerts.html @@ -27,14 +27,16 @@

Alerts

{{index .Event.Name}}
- - + + Silence Alert
{{range $label, $value := .Event.Labels}} - {{$label}}="{{$value}}" + {{if not (eq $label "alertname")}} + {{$label}}="{{$value}}" + {{end}} {{end}}
{{range $label, $value := .Event.Labels}} From 70e67b920ce590355eb311b151bf4e7dd3c13f99 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 30 Jul 2013 13:19:18 +0200 Subject: [PATCH 6/8] Implement PagerDuty notifications. --- main.go | 9 +- manager/aggregator.go | 30 +++--- manager/aggregator_test.go | 12 ++- manager/dispatcher.go | 138 --------------------------- manager/notifier.go | 187 +++++++++++++++++++++++++++++++++++++ 5 files changed, 215 insertions(+), 161 deletions(-) delete mode 100644 manager/dispatcher.go create mode 100644 manager/notifier.go diff --git a/main.go b/main.go index ec38e3fcef..c37a603c5c 100644 --- a/main.go +++ b/main.go @@ -38,9 +38,10 @@ func main() { suppressor := manager.NewSuppressor() defer suppressor.Close() - summarizer := manager.NewSummaryDispatcher() + notifier := manager.NewNotifier(conf.NotificationConfig) + defer notifier.Close() - aggregator := manager.NewAggregator(summarizer) + aggregator := manager.NewAggregator(notifier) defer aggregator.Close() webService := &web.WebService{ @@ -61,8 +62,8 @@ func main() { } go webService.ServeForever() - aggregator.SetRules(conf.AggregationRules()) + aggregator.SetRules(conf.AggregationRules()) log.Println("Running summary dispatcher...") - summarizer.Dispatch(suppressor) + notifier.Dispatch(suppressor) } diff --git a/manager/aggregator.go b/manager/aggregator.go index 936ab0652d..107eea2188 100644 --- a/manager/aggregator.go +++ b/manager/aggregator.go @@ -65,28 +65,26 @@ func (r *AggregationInstance) Ingest(e *Event) { r.expiryTimer.Reset(minimumRefreshPeriod) } -func (r *AggregationInstance) SendNotification(s SummaryReceiver) { +func (r *AggregationInstance) SendNotification(n Notifier) { if time.Since(r.lastNotificationSent) < r.Rule.RepeatRate { return } - err := s.Receive(&EventSummary{ - Rule: r.Rule, - Event: r.Event, - }) + err := n.QueueNotification(r.Event, r.Rule.NotificationConfig) if err != nil { + // BUG: Limit the number of retries. log.Printf("Error while sending notification: %s, retrying in %v", err, notificationRetryPeriod) - r.resendNotificationAfter(notificationRetryPeriod, s) + r.resendNotificationAfter(notificationRetryPeriod, n) return } - r.resendNotificationAfter(r.Rule.RepeatRate, s) + r.resendNotificationAfter(r.Rule.RepeatRate, n) r.lastNotificationSent = time.Now() } -func (r *AggregationInstance) resendNotificationAfter(d time.Duration, s SummaryReceiver) { +func (r *AggregationInstance) resendNotificationAfter(d time.Duration, n Notifier) { r.notificationResendTimer = time.AfterFunc(d, func() { - r.SendNotification(s) + r.SendNotification(n) }) } @@ -102,18 +100,18 @@ func (r *AggregationInstance) Close() { type AggregationRules []*AggregationRule type Aggregator struct { - Rules AggregationRules - Aggregates map[EventFingerprint]*AggregationInstance - SummaryReceiver SummaryReceiver + Rules AggregationRules + Aggregates map[EventFingerprint]*AggregationInstance + Notifier Notifier // Mutex to protect the above. mu sync.Mutex } -func NewAggregator(s SummaryReceiver) *Aggregator { +func NewAggregator(n Notifier) *Aggregator { return &Aggregator{ - Aggregates: make(map[EventFingerprint]*AggregationInstance), - SummaryReceiver: s, + Aggregates: make(map[EventFingerprint]*AggregationInstance), + Notifier: n, } } @@ -153,7 +151,7 @@ func (a *Aggregator) Receive(events Events) error { } aggregation.Ingest(e) - aggregation.SendNotification(a.SummaryReceiver) + aggregation.SendNotification(a.Notifier) break } } diff --git a/manager/aggregator_test.go b/manager/aggregator_test.go index b13b9582f9..2784f9ec39 100644 --- a/manager/aggregator_test.go +++ b/manager/aggregator_test.go @@ -15,14 +15,20 @@ package manager import ( "testing" + + pb "github.com/prometheus/alert_manager/config/generated" ) -type dummyReceiver struct{} +type dummyNotifier struct{} -func (d *dummyReceiver) Receive(*EventSummary) RemoteError { +func (d *dummyNotifier) QueueNotification(*Event, string) error { return nil } +func (d *dummyNotifier) SetNotificationConfigs([]pb.NotificationConfig) {} + +func (d *dummyNotifier) Dispatch(IsInhibitedInterrogator) {} + type testAggregatorScenario struct { rules AggregationRules inMatch Events @@ -30,7 +36,7 @@ type testAggregatorScenario struct { } func (s *testAggregatorScenario) test(i int, t *testing.T) { - a := NewAggregator(&dummyReceiver{}) + a := NewAggregator(&dummyNotifier{}) a.SetRules(s.rules) if len(s.inMatch) > 0 { diff --git a/manager/dispatcher.go b/manager/dispatcher.go deleted file mode 100644 index 5028ada494..0000000000 --- a/manager/dispatcher.go +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2013 Prometheus Team -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package manager - -import ( - "strings" -) - -type DestinationDispatcher interface { - Send(*EventSummary) error -} - -func DispatcherFor(destination string) DestinationDispatcher { - switch { - case strings.HasPrefix(destination, "IRC"): - case strings.HasPrefix(destination, "TRELLO"): - case strings.HasPrefix(destination, "MAIL"): - case strings.HasPrefix(destination, "PAGERDUTY"): - } - return nil -} - -type EventSummary struct { - Rule *AggregationRule - - Event *Event - - Destination string -} - -type SummaryDispatcher struct { - summaryReqs chan *summaryDispatchRequest - - closed chan bool -} - -type summaryDispatchRequest struct { - Summary *EventSummary - - Response chan *summaryDispatchResponse -} - -type Disposition int - -const ( - UNHANDLED Disposition = iota - DISPATCHED - SUPPRESSED -) - -type summaryDispatchResponse struct { - Disposition Disposition - Err RemoteError -} - -func (s *SummaryDispatcher) Close() { - close(s.summaryReqs) - <-s.closed -} - -func NewSummaryDispatcher() *SummaryDispatcher { - return &SummaryDispatcher{ - summaryReqs: make(chan *summaryDispatchRequest), - closed: make(chan bool), - } -} - -type RemoteError interface { - error - - Retryable() bool -} - -type remoteError struct { - error - - retryable bool -} - -func (e *remoteError) Retryable() bool { - return e.retryable -} - -func NewRemoteError(err error, retryable bool) RemoteError { - return &remoteError{ - err, - retryable, - } -} - -type SummaryReceiver interface { - Receive(*EventSummary) RemoteError -} - -func (d *SummaryDispatcher) Receive(s *EventSummary) RemoteError { - req := &summaryDispatchRequest{ - Summary: s, - Response: make(chan *summaryDispatchResponse), - } - - d.summaryReqs <- req - resp := <-req.Response - - return resp.Err -} - -func (d *SummaryDispatcher) dispatchSummary(r *summaryDispatchRequest, i IsInhibitedInterrogator) { - if inhibited, _ := i.IsInhibited(r.Summary.Event); inhibited { - r.Response <- &summaryDispatchResponse{ - Disposition: SUPPRESSED, - } - return - } - - // BUG: Perform sending of summaries. - r.Response <- &summaryDispatchResponse{ - Disposition: DISPATCHED, - } -} - -func (d *SummaryDispatcher) Dispatch(i IsInhibitedInterrogator) { - for req := range d.summaryReqs { - d.dispatchSummary(req, i) - } - - d.closed <- true -} diff --git a/manager/notifier.go b/manager/notifier.go new file mode 100644 index 0000000000..b2958afbf3 --- /dev/null +++ b/manager/notifier.go @@ -0,0 +1,187 @@ +// Copyright 2013 Prometheus Team +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package manager + +import ( + "bytes" + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "log" + "net/http" + "sync" + + pb "github.com/prometheus/alert_manager/config/generated" +) + +const contentTypeJson = "application/json" + +var ( + notificationBufferSize = flag.Int("notificationBufferSize", 1000, "Size of buffer for pending notifications.") + pagerdutyApiUrl = flag.String("pagerdutyApiUrl", "https://events.pagerduty.com/generic/2010-04-15/create_event.json", "PagerDuty API URL.") +) + +// A Notifier is responsible for sending notifications for events according to +// a provided notification configuration. +type Notifier interface { + // Queue a notification for asynchronous dispatching. + QueueNotification(e *Event, configName string) error + // Replace current notification configs. Already enqueued messages will remain + // unaffected. + SetNotificationConfigs([]*pb.NotificationConfig) + // Start event notification dispatch loop. + Dispatch(IsInhibitedInterrogator) + // Stop the event notification dispatch loop. + Close() +} + +// Request for sending a notification. +type notificationReq struct { + event *Event + notificationConfig *pb.NotificationConfig +} + +// Alert notification multiplexer and dispatcher. +type notifier struct { + // Notifications that are queued to be sent. + pendingNotifications chan *notificationReq + // Channel for stopping the dispatch loop. + stop chan bool + + // Mutex to protect the fields below. + mu sync.Mutex + // Map of notification configs by name. + notificationConfigs map[string]*pb.NotificationConfig +} + +// Construct a new notifier. +func NewNotifier(configs []*pb.NotificationConfig) *notifier { + notifier := ¬ifier{ + pendingNotifications: make(chan *notificationReq, *notificationBufferSize), + stop: make(chan bool), + } + notifier.SetNotificationConfigs(configs) + return notifier +} + +func (n *notifier) SetNotificationConfigs(configs []*pb.NotificationConfig) { + n.mu.Lock() + defer n.mu.Unlock() + + n.notificationConfigs = map[string]*pb.NotificationConfig{} + for _, c := range configs { + n.notificationConfigs[c.GetName()] = c + } +} + +func (n *notifier) QueueNotification(event *Event, configName string) error { + n.mu.Lock() + nc, ok := n.notificationConfigs[configName] + n.mu.Unlock() + + if !ok { + return fmt.Errorf("No such notification configuration %s", configName) + } + + // We need to save a reference to the notification config in the + // notificationReq since the config might be replaced or gone at the time the + // message gets dispatched. + n.pendingNotifications <- ¬ificationReq{ + event: event, + notificationConfig: nc, + } + return nil +} + +func (n *notifier) sendPagerDutyNotification(serviceKey string, event *Event) error { + // http://developer.pagerduty.com/documentation/integration/events/trigger + incidentKey := event.Fingerprint() + buf, err := json.Marshal(map[string]interface{}{ + "service_key": serviceKey, + "event_type": "trigger", + "description": event.Description, + "incident_key": incidentKey, + "details": map[string]interface{}{ + "grouping_labels": event.Labels, + "extra_labels": event.Payload, + }, + }) + if err != nil { + return err + } + + resp, err := http.Post( + *pagerdutyApiUrl, + contentTypeJson, + bytes.NewBuffer(buf), + ) + if err != nil { + return err + } + defer resp.Body.Close() + + respBuf, err := ioutil.ReadAll(resp.Body) + if err != nil { + return err + } + + log.Printf("Sent PagerDuty notification: %s: HTTP %d: %s", incidentKey, resp.StatusCode, respBuf) + // BUG: Check response for result of operation. + return nil +} + +func (n *notifier) sendEmailNotification(email string, event *Event) error { + // BUG: Implement email notifications. + log.Printf("Would send email notification for event %s to %s\n", event, email) + return nil +} + +func (n *notifier) handleNotification(event *Event, config *pb.NotificationConfig, i IsInhibitedInterrogator) { + if inhibited, _ := i.IsInhibited(event); !inhibited { + return + } + + for _, pdConfig := range config.PagerdutyConfig { + if err := n.sendPagerDutyNotification(pdConfig.GetServiceKey(), event); err != nil { + log.Printf("Error sending PagerDuty notification: %s", err) + } + } + for _, emailConfig := range config.EmailConfig { + if err := n.sendEmailNotification(emailConfig.GetEmail(), event); err != nil { + log.Printf("Error sending email notification: %s", err) + } + } +} + +func (n *notifier) Dispatch(i IsInhibitedInterrogator) { + for { + select { + case req := <-n.pendingNotifications: + n.handleNotification(req.event, req.notificationConfig, i) + case <-n.stop: + // We require that Close() is only called after nobody sends new + // notification requests anymore, so we only need to drain existing ones. + // BUG: We might want to add state validation for this. + close(n.pendingNotifications) + for req := range n.pendingNotifications { + n.handleNotification(req.event, req.notificationConfig, i) + } + } + } +} + +func (n *notifier) Close() { + n.stop <- true +} From d3f08b760badaf20147a4f944dcecfd5da6678d2 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 30 Jul 2013 13:19:32 +0200 Subject: [PATCH 7/8] Minor UI cleanup. --- web/static/js/alerts.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/static/js/alerts.js b/web/static/js/alerts.js index 006fb62bb0..f61e3f619b 100644 --- a/web/static/js/alerts.js +++ b/web/static/js/alerts.js @@ -14,8 +14,8 @@ function addSilenceLabel(label, value) { } $("#silence_filters_table").append( '' + - ' ' + - ' ' + + ' ' + + ' ' + ' ' + ''); bindDelLabel(); From 02ab1f904a84da0d4b9b785c841a16686307ad23 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 30 Jul 2013 14:49:16 +0200 Subject: [PATCH 8/8] PR comment fixups. --- config/config.go | 10 ++++---- config/config.proto | 2 +- config/fixtures/duplicate_nc_name.conf.input | 2 +- config/fixtures/missing_nc_name.conf.input | 2 +- .../fixtures/nonexistent_nc_name.conf.input | 2 +- config/fixtures/sample.conf.input | 2 +- config/generated/config.pb.go | 14 +++++------ manager/aggregator.go | 8 +++---- manager/aggregator_test.go | 4 +++- manager/notifier.go | 24 ++++--------------- 10 files changed, 29 insertions(+), 41 deletions(-) diff --git a/config/config.go b/config/config.go index 641ce9fb3d..3fa3e9e8e0 100644 --- a/config/config.go +++ b/config/config.go @@ -72,8 +72,8 @@ func (c Config) Validate() error { } } - if _, ok := ncNames[a.GetNotificationConfig()]; !ok { - return fmt.Errorf("No such notification config: %s", a.GetNotificationConfig()) + if _, ok := ncNames[a.GetNotificationConfigName()]; !ok { + return fmt.Errorf("No such notification config: %s", a.GetNotificationConfigName()) } } @@ -89,9 +89,9 @@ func (c Config) AggregationRules() manager.AggregationRules { filters = append(filters, manager.NewFilter(filter.GetNameRe(), filter.GetValueRe())) } rules = append(rules, &manager.AggregationRule{ - Filters: filters, - RepeatRate: time.Duration(r.GetRepeatRateSeconds()) * time.Second, - NotificationConfig: r.GetNotificationConfig(), + Filters: filters, + RepeatRate: time.Duration(r.GetRepeatRateSeconds()) * time.Second, + NotificationConfigName: r.GetNotificationConfigName(), }) } return rules diff --git a/config/config.proto b/config/config.proto index 35e43dbd5b..1efb986d8a 100644 --- a/config/config.proto +++ b/config/config.proto @@ -52,7 +52,7 @@ message AggregationRule { optional int32 repeat_rate_seconds = 2 [default = 7200]; // Notification configuration to use for this AggregationRule, referenced by // their name. - optional string notification_config = 3; + optional string notification_config_name = 3; } // Global alert manager configuration. diff --git a/config/fixtures/duplicate_nc_name.conf.input b/config/fixtures/duplicate_nc_name.conf.input index c656972d2b..3ba8c9e964 100644 --- a/config/fixtures/duplicate_nc_name.conf.input +++ b/config/fixtures/duplicate_nc_name.conf.input @@ -24,5 +24,5 @@ aggregation_rule { value_re: "test" } repeat_rate_seconds: 3600 - notification_config: "alertmanager_test" + notification_config_name: "alertmanager_test" } diff --git a/config/fixtures/missing_nc_name.conf.input b/config/fixtures/missing_nc_name.conf.input index 6afb982356..e80bb89bbd 100644 --- a/config/fixtures/missing_nc_name.conf.input +++ b/config/fixtures/missing_nc_name.conf.input @@ -23,5 +23,5 @@ aggregation_rule { value_re: "test" } repeat_rate_seconds: 3600 - notification_config: "alertmanager_test" + notification_config_name: "alertmanager_test" } diff --git a/config/fixtures/nonexistent_nc_name.conf.input b/config/fixtures/nonexistent_nc_name.conf.input index 954b0ba91e..09b0b7d75a 100644 --- a/config/fixtures/nonexistent_nc_name.conf.input +++ b/config/fixtures/nonexistent_nc_name.conf.input @@ -14,5 +14,5 @@ aggregation_rule { value_re: "test" } repeat_rate_seconds: 3600 - notification_config: "alertmanager_test2" + notification_config_name: "alertmanager_test2" } diff --git a/config/fixtures/sample.conf.input b/config/fixtures/sample.conf.input index d451d0d25f..fcf5119d5f 100644 --- a/config/fixtures/sample.conf.input +++ b/config/fixtures/sample.conf.input @@ -14,5 +14,5 @@ aggregation_rule { value_re: "test" } repeat_rate_seconds: 3600 - notification_config: "alertmanager_test" + notification_config_name: "alertmanager_test" } diff --git a/config/generated/config.pb.go b/config/generated/config.pb.go index a29870219d..267ccd3a24 100644 --- a/config/generated/config.pb.go +++ b/config/generated/config.pb.go @@ -102,10 +102,10 @@ func (m *Filter) GetValueRe() string { } type AggregationRule struct { - Filter []*Filter `protobuf:"bytes,1,rep,name=filter" json:"filter,omitempty"` - RepeatRateSeconds *int32 `protobuf:"varint,2,opt,name=repeat_rate_seconds,def=7200" json:"repeat_rate_seconds,omitempty"` - NotificationConfig *string `protobuf:"bytes,3,opt,name=notification_config" json:"notification_config,omitempty"` - XXX_unrecognized []byte `json:"-"` + Filter []*Filter `protobuf:"bytes,1,rep,name=filter" json:"filter,omitempty"` + RepeatRateSeconds *int32 `protobuf:"varint,2,opt,name=repeat_rate_seconds,def=7200" json:"repeat_rate_seconds,omitempty"` + NotificationConfigName *string `protobuf:"bytes,3,opt,name=notification_config_name" json:"notification_config_name,omitempty"` + XXX_unrecognized []byte `json:"-"` } func (m *AggregationRule) Reset() { *m = AggregationRule{} } @@ -128,9 +128,9 @@ func (m *AggregationRule) GetRepeatRateSeconds() int32 { return Default_AggregationRule_RepeatRateSeconds } -func (m *AggregationRule) GetNotificationConfig() string { - if m != nil && m.NotificationConfig != nil { - return *m.NotificationConfig +func (m *AggregationRule) GetNotificationConfigName() string { + if m != nil && m.NotificationConfigName != nil { + return *m.NotificationConfigName } return "" } diff --git a/manager/aggregator.go b/manager/aggregator.go index 107eea2188..b44e00b442 100644 --- a/manager/aggregator.go +++ b/manager/aggregator.go @@ -28,9 +28,9 @@ const ( // AggregationRule creates and manages the scope for received events. type AggregationRule struct { - Filters Filters - RepeatRate time.Duration - NotificationConfig string + Filters Filters + RepeatRate time.Duration + NotificationConfigName string } type AggregationInstances []*AggregationInstance @@ -70,7 +70,7 @@ func (r *AggregationInstance) SendNotification(n Notifier) { return } - err := n.QueueNotification(r.Event, r.Rule.NotificationConfig) + err := n.QueueNotification(r.Event, r.Rule.NotificationConfigName) if err != nil { // BUG: Limit the number of retries. log.Printf("Error while sending notification: %s, retrying in %v", err, notificationRetryPeriod) diff --git a/manager/aggregator_test.go b/manager/aggregator_test.go index 2784f9ec39..a878943b3b 100644 --- a/manager/aggregator_test.go +++ b/manager/aggregator_test.go @@ -25,10 +25,12 @@ func (d *dummyNotifier) QueueNotification(*Event, string) error { return nil } -func (d *dummyNotifier) SetNotificationConfigs([]pb.NotificationConfig) {} +func (d *dummyNotifier) SetNotificationConfigs([]*pb.NotificationConfig) {} func (d *dummyNotifier) Dispatch(IsInhibitedInterrogator) {} +func (d *dummyNotifier) Close() {} + type testAggregatorScenario struct { rules AggregationRules inMatch Events diff --git a/manager/notifier.go b/manager/notifier.go index b2958afbf3..bc8359267c 100644 --- a/manager/notifier.go +++ b/manager/notifier.go @@ -57,8 +57,6 @@ type notificationReq struct { type notifier struct { // Notifications that are queued to be sent. pendingNotifications chan *notificationReq - // Channel for stopping the dispatch loop. - stop chan bool // Mutex to protect the fields below. mu sync.Mutex @@ -70,7 +68,6 @@ type notifier struct { func NewNotifier(configs []*pb.NotificationConfig) *notifier { notifier := ¬ifier{ pendingNotifications: make(chan *notificationReq, *notificationBufferSize), - stop: make(chan bool), } notifier.SetNotificationConfigs(configs) return notifier @@ -137,7 +134,7 @@ func (n *notifier) sendPagerDutyNotification(serviceKey string, event *Event) er return err } - log.Printf("Sent PagerDuty notification: %s: HTTP %d: %s", incidentKey, resp.StatusCode, respBuf) + log.Printf("Sent PagerDuty notification: %v: HTTP %d: %s", incidentKey, resp.StatusCode, respBuf) // BUG: Check response for result of operation. return nil } @@ -149,7 +146,7 @@ func (n *notifier) sendEmailNotification(email string, event *Event) error { } func (n *notifier) handleNotification(event *Event, config *pb.NotificationConfig, i IsInhibitedInterrogator) { - if inhibited, _ := i.IsInhibited(event); !inhibited { + if inhibited, _ := i.IsInhibited(event); inhibited { return } @@ -166,22 +163,11 @@ func (n *notifier) handleNotification(event *Event, config *pb.NotificationConfi } func (n *notifier) Dispatch(i IsInhibitedInterrogator) { - for { - select { - case req := <-n.pendingNotifications: - n.handleNotification(req.event, req.notificationConfig, i) - case <-n.stop: - // We require that Close() is only called after nobody sends new - // notification requests anymore, so we only need to drain existing ones. - // BUG: We might want to add state validation for this. - close(n.pendingNotifications) - for req := range n.pendingNotifications { - n.handleNotification(req.event, req.notificationConfig, i) - } - } + for req := range n.pendingNotifications { + n.handleNotification(req.event, req.notificationConfig, i) } } func (n *notifier) Close() { - n.stop <- true + close(n.pendingNotifications) }