diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f44817f779..aa3eaed7c78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458 * [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 * [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590 +* [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605 * [ENHANCEMENT] Add `compactor.auto-forget-delay` for compactor to auto forget compactors after X minutes without heartbeat. #6533 * [ENHANCEMENT] StoreGateway: Emit more histogram buckets on the `cortex_querier_storegateway_refetches_per_query` metric. #6570 * [ENHANCEMENT] Querier: Apply bytes limiter to LabelNames and LabelValuesForLabelNames. #6568 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 5a806a33f14..b1bfd4509e8 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -3751,6 +3751,15 @@ query_rejection: # CLI flag: -alertmanager.max-alerts-size-bytes [alertmanager_max_alerts_size_bytes: <int> | default = 0] +# Maximum number of silences that a single user can have, including expired +# silences. 0 = no limit. +# CLI flag: -alertmanager.max-silences-count +[alertmanager_max_silences_count: <int> | default = 0] + +# Maximum size of individual silences that a single user can have. 0 = no limit. 
+# CLI flag: -alertmanager.max-silences-size-bytes +[alertmanager_max_silences_size_bytes: <int> | default = 0] + # list of rule groups to disable [disabled_rule_groups: <list of DisabledRuleGroup> | default = []] ``` diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 51d37bed62e..1cdc3a80df3 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -228,11 +228,16 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.groupMarker = memMarker silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot) + am.silences, err = silence.New(silence.Options{ SnapshotFile: silencesFile, Retention: cfg.Retention, - Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")), - Metrics: am.registry, + Limits: silence.Limits{ + MaxSilences: func() int { return cfg.Limits.AlertmanagerMaxSilencesCount(cfg.UserID) }, + MaxSilenceSizeBytes: func() int { return cfg.Limits.AlertmanagerMaxSilenceSizeBytes(cfg.UserID) }, + }, + Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")), + Metrics: am.registry, }) if err != nil { return nil, fmt.Errorf("failed to create silences: %v", err) diff --git a/pkg/alertmanager/alertmanager_test.go b/pkg/alertmanager/alertmanager_test.go index 6859fb50867..c4ed3064fa2 100644 --- a/pkg/alertmanager/alertmanager_test.go +++ b/pkg/alertmanager/alertmanager_test.go @@ -9,6 +9,7 @@ import ( "github.com/go-kit/log" "github.com/prometheus/alertmanager/config" + "github.com/prometheus/alertmanager/silence/silencepb" "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" @@ -19,6 +20,75 @@ import ( "github.com/cortexproject/cortex/pkg/util/test" ) +func TestSilencesLimits(t *testing.T) { + user := "test" + + reg := prometheus.NewPedanticRegistry() + maxSilencesCount := 3 + maxSilencesSizeBytes := 500 + am, err := New(&Config{ + UserID: user, + Logger: 
log.NewNopLogger(), + Limits: &mockAlertManagerLimits{maxSilencesCount: maxSilencesCount, maxSilencesSizeBytes: maxSilencesSizeBytes}, + TenantDataDir: t.TempDir(), + ExternalURL: &url.URL{Path: "/am"}, + ShardingEnabled: false, + GCInterval: 30 * time.Minute, + }, reg) + require.NoError(t, err) + defer am.StopAndWait() + + t.Run("Test maxSilencesCount", func(t *testing.T) { + createSilences := func() *silencepb.Silence { + return &silencepb.Silence{ + Matchers: []*silencepb.Matcher{{Name: "name", Pattern: "pattern"}}, + StartsAt: time.Now(), + EndsAt: time.Now().Add(time.Minute * 30), + } + } + + // create silences up to maxSilencesCount + for i := 0; i < maxSilencesCount; i++ { + err := am.silences.Set(createSilences()) + require.NoError(t, err) + } + + // exceeds limit + err = am.silences.Set(createSilences()) + require.Error(t, err) + require.Equal(t, fmt.Sprintf("exceeded maximum number of silences: %d (limit: %d)", maxSilencesCount, maxSilencesCount), err.Error()) + + // expire whole silences + silences, _, err := am.silences.Query() + require.NoError(t, err) + for _, s := range silences { + err := am.silences.Expire(s.Id) + require.NoError(t, err) + } + + // check maxSilencesCount includes expired silences + err = am.silences.Set(createSilences()) + require.Error(t, err) + require.Equal(t, fmt.Sprintf("exceeded maximum number of silences: %d (limit: %d)", maxSilencesCount, maxSilencesCount), err.Error()) + + // GC + n, err := am.silences.GC() + require.NoError(t, err) + require.Equal(t, maxSilencesCount, n) + }) + t.Run("Test maxSilencesSizeBytes", func(t *testing.T) { + bigSilences := &silencepb.Silence{ + Matchers: []*silencepb.Matcher{{Name: strings.Repeat("a", maxSilencesSizeBytes/2+1), Pattern: strings.Repeat("b", maxSilencesSizeBytes/2+1)}}, + StartsAt: time.Now(), + EndsAt: time.Now().Add(time.Minute * 30), + } + + err = am.silences.Set(bigSilences) + require.Error(t, err) + require.True(t, strings.Contains(err.Error(), "silence exceeded maximum 
size")) + }) +} + func TestDispatcherGroupLimits(t *testing.T) { for name, tc := range map[string]struct { groups int diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index abf421335e8..0081e9ab78d 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -228,6 +228,12 @@ type Limits interface { // AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit. // Size of the alert is computed from alert labels, annotations and generator URL. AlertmanagerMaxAlertsSizeBytes(tenant string) int + + // AlertmanagerMaxSilencesCount returns max number of silences that tenant can have, including expired silences. 0 = no limit. + AlertmanagerMaxSilencesCount(tenant string) int + + // AlertmanagerMaxSilenceSizeBytes returns the maximum size of an individual silence. 0 = no limit. + AlertmanagerMaxSilenceSizeBytes(tenant string) int } // A MultitenantAlertmanager manages Alertmanager instances for multiple diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 6c9801ed616..4fc55df02bb 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -1777,8 +1777,14 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) { amConfig.ShardingEnabled = true } + var limits validation.Limits + flagext.DefaultValues(&limits) + + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(t, err) + reg := prometheus.NewPedanticRegistry() - am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg) + am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg) require.NoError(t, err) defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck @@ -1969,8 +1975,14 @@ func TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers(t *testi 
amConfig.ShardingEnabled = true + var limits validation.Limits + flagext.DefaultValues(&limits) + + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(t, err) + reg := prometheus.NewPedanticRegistry() - am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg) + am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg) require.NoError(t, err) clientPool.setServer(amConfig.ShardingRing.InstanceAddr+":0", am) @@ -2285,6 +2297,8 @@ type mockAlertManagerLimits struct { maxDispatcherAggregationGroups int maxAlertsCount int maxAlertsSizeBytes int + maxSilencesCount int + maxSilencesSizeBytes int } func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(tenant string) int { @@ -2326,3 +2340,11 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsCount(_ string) int { func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsSizeBytes(_ string) int { return m.maxAlertsSizeBytes } + +func (m *mockAlertManagerLimits) AlertmanagerMaxSilencesCount(_ string) int { + return m.maxSilencesCount +} + +func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(_ string) int { + return m.maxSilencesSizeBytes +} diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 729f62566e2..b39a71c2b0a 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -218,6 +218,8 @@ type Limits struct { AlertmanagerMaxDispatcherAggregationGroups int `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"` AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"` AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"` + AlertmanagerMaxSilencesCount int `yaml:"alertmanager_max_silences_count" json:"alertmanager_max_silences_count"` + 
AlertmanagerMaxSilencesSizeBytes int `yaml:"alertmanager_max_silences_size_bytes" json:"alertmanager_max_silences_size_bytes"` DisabledRuleGroups DisabledRuleGroups `yaml:"disabled_rule_groups" json:"disabled_rule_groups" doc:"nocli|description=list of rule groups to disable"` } @@ -310,6 +312,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.AlertmanagerMaxDispatcherAggregationGroups, "alertmanager.max-dispatcher-aggregation-groups", 0, "Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxAlertsCount, "alertmanager.max-alerts-count", 0, "Maximum number of alerts that a single user can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxAlertsSizeBytes, "alertmanager.max-alerts-size-bytes", 0, "Maximum total size of alerts that a single user can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxSilencesCount, "alertmanager.max-silences-count", 0, "Maximum number of silences that a single user can have, including expired silences. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxSilencesSizeBytes, "alertmanager.max-silences-size-bytes", 0, "Maximum size of individual silences that a single user can have. 
0 = no limit.") } // Validate the limits config and returns an error if the validation @@ -971,6 +975,14 @@ func (o *Overrides) AlertmanagerMaxAlertsSizeBytes(userID string) int { return o.GetOverridesForUser(userID).AlertmanagerMaxAlertsSizeBytes } +func (o *Overrides) AlertmanagerMaxSilencesCount(userID string) int { + return o.GetOverridesForUser(userID).AlertmanagerMaxSilencesCount +} + +func (o *Overrides) AlertmanagerMaxSilenceSizeBytes(userID string) int { + return o.GetOverridesForUser(userID).AlertmanagerMaxSilencesSizeBytes +} + func (o *Overrides) DisabledRuleGroups(userID string) DisabledRuleGroups { if o.tenantLimits != nil { l := o.tenantLimits.ByUserID(userID)