Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions notify/discord/discord.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,8 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error)
if err != nil {
return true, notify.RedactURL(err)
}
defer notify.Drain(resp)

shouldRetry, err := n.retrier.Check(resp.StatusCode, resp.Body)
if err != nil {
return shouldRetry, err
}
return false, nil
shouldRetry, errWithReason := n.retrier.Check(resp)
return shouldRetry, errWithReason
}
2 changes: 1 addition & 1 deletion notify/discord/discord_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func TestDiscordRetry(t *testing.T) {
require.NoError(t, err)

for statusCode, expected := range test.RetryTests(test.DefaultRetryCodes()) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retry - error on status %d", statusCode)
}
}
Expand Down
6 changes: 1 addition & 5 deletions notify/incidentio/incidentio.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,7 @@ func (n *Notifier) Notify(ctx context.Context, alerts ...*types.Alert) (bool, er
}
defer notify.Drain(resp)

shouldRetry, err := n.retrier.Check(resp.StatusCode, resp.Body)
if err != nil {
return shouldRetry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
}
return shouldRetry, err
return n.retrier.Check(resp)
}

// errDetails extracts error details from the response for better error messages.
Expand Down
4 changes: 2 additions & 2 deletions notify/incidentio/incidentio_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func TestIncidentIORetry(t *testing.T) {

retryCodes := append(test.DefaultRetryCodes(), http.StatusTooManyRequests)
for statusCode, expected := range test.RetryTests(retryCodes) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retry - error on status %d", statusCode)
}
}
Expand Down Expand Up @@ -187,7 +187,7 @@ func TestIncidentIORetryScenarios(t *testing.T) {
statusCode: http.StatusTooManyRequests,
responseBody: []byte(`{"error":"rate limit exceeded","message":"Too many requests"}`),
expectRetry: true,
expectErrorMsgContains: "rate limit exceeded",
expectErrorMsgContains: "unexpected status code 429: Too many requests",
},
{
name: "server error response",
Expand Down
14 changes: 11 additions & 3 deletions notify/jira/jira.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,9 +401,17 @@ func (n *Notifier) doAPIRequestFullPath(ctx context.Context, method, path string
return nil, false, err
}

shouldRetry, err := n.retrier.Check(resp.StatusCode, bytes.NewReader(responseBody))
if err != nil {
return nil, shouldRetry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
// Pass a temporary response to the retrier so that resp.Body remains
// the original network body; defer notify.Drain(resp) will then close
// the real connection rather than the in-memory reader.
checkResp := &http.Response{
StatusCode: resp.StatusCode,
Header: resp.Header,
Body: io.NopCloser(bytes.NewReader(responseBody)),
}
shouldRetry, errWithReason := n.retrier.Check(checkResp)
if errWithReason != nil {
return nil, shouldRetry, errWithReason
}

return responseBody, false, nil
Expand Down
2 changes: 1 addition & 1 deletion notify/jira/jira_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func TestJiraRetry(t *testing.T) {
retryCodes := append(test.DefaultRetryCodes(), http.StatusTooManyRequests)

for statusCode, expected := range test.RetryTests(retryCodes) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retry - error on status %d", statusCode)
}
}
Expand Down
10 changes: 7 additions & 3 deletions notify/mattermost/mattermost.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,10 +141,14 @@ func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, err

// Use a retrier to generate an error message for non-200 responses and
// classify them as retriable or not.
retry, err := n.retrier.Check(resp.StatusCode, resp.Body)
retry, err := n.retrier.Check(resp)
if err != nil {
err = fmt.Errorf("channel %q: %w", req.Channel, err)
return retry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
var ewr *notify.ErrorWithReason
if errors.As(err, &ewr) {
ewr.Err = fmt.Errorf("channel %q: %w", req.Channel, ewr.Err)
return retry, ewr
}
return retry, fmt.Errorf("channel %q: %w", req.Channel, err)
}
n.logger.Debug("Message sent to Mattermost successfully",
"status", resp.StatusCode)
Expand Down
2 changes: 1 addition & 1 deletion notify/mattermost/mattermost_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func TestMattermostRetry(t *testing.T) {
require.NoError(t, err)

for statusCode, expected := range test.RetryTests(test.DefaultRetryCodes()) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retry - error on status %d", statusCode)
}
}
Expand Down
7 changes: 2 additions & 5 deletions notify/msteams/msteams.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,6 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error)
defer notify.Drain(resp)

// https://learn.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/connectors-using?tabs=cURL#rate-limiting-for-connectors
shouldRetry, err := n.retrier.Check(resp.StatusCode, resp.Body)
if err != nil {
return shouldRetry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
}
return shouldRetry, err
shouldRetry, errWithReason := n.retrier.Check(resp)
return shouldRetry, errWithReason
}
2 changes: 1 addition & 1 deletion notify/msteams/msteams_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func TestMSTeamsRetry(t *testing.T) {
require.NoError(t, err)

for statusCode, expected := range test.RetryTests(test.DefaultRetryCodes()) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retry - error on status %d", statusCode)
}
}
Expand Down
7 changes: 2 additions & 5 deletions notify/msteamsv2/msteamsv2.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,6 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error)
defer notify.Drain(resp)

// https://learn.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/connectors-using?tabs=cURL#rate-limiting-for-connectors
shouldRetry, err := n.retrier.Check(resp.StatusCode, resp.Body)
if err != nil {
return shouldRetry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
}
return shouldRetry, err
shouldRetry, errWithReason := n.retrier.Check(resp)
return shouldRetry, errWithReason
}
2 changes: 1 addition & 1 deletion notify/msteamsv2/msteamsv2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func TestMSTeamsV2Retry(t *testing.T) {
require.NoError(t, err)

for statusCode, expected := range test.RetryTests(test.DefaultRetryCodes()) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retry - error on status %d", statusCode)
}
}
Expand Down
34 changes: 30 additions & 4 deletions notify/notify.go
Original file line number Diff line number Diff line change
Expand Up @@ -908,8 +908,20 @@ func (r RetryStage) exec(ctx context.Context, l *slog.Logger, alerts ...*types.A
b := backoff.NewExponentialBackOff()
b.MaxElapsedTime = 0 // Always retry.

tick := backoff.NewTicker(b)
defer tick.Stop()
stopTimer := func(timer *time.Timer) {
if timer == nil {
return
}
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
}

attemptTimer := time.NewTimer(0)
defer stopTimer(attemptTimer)

var (
i = 0
Expand Down Expand Up @@ -943,7 +955,7 @@ func (r RetryStage) exec(ctx context.Context, l *slog.Logger, alerts ...*types.A
}

select {
case <-tick.C:
case <-attemptTimer.C:
now := time.Now()
retry, err := r.integration.Notify(ctx, sent...)
i++
Expand All @@ -956,10 +968,24 @@ func (r RetryStage) exec(ctx context.Context, l *slog.Logger, alerts ...*types.A
return ctx, alerts, fmt.Errorf("%s/%s: notify retry canceled due to unrecoverable error after %d attempts: %w", r.groupName, r.integration.String(), i, err)
}
if ctx.Err() == nil {
if iErr == nil || err.Error() != iErr.Error() {
nextDelay := b.NextBackOff()

var e *ErrorWithReason
if errors.As(err, &e) && e.Reason == TooManyRequestsReason && e.RetryAfter > 0 {
nextDelay = e.RetryAfter
l.Warn("Notify attempt failed, honoring Retry-After", "attempts", i, "retry_after", e.RetryAfter, "err", err)
} else if iErr == nil || err.Error() != iErr.Error() {
// Log the error if the context isn't done and the error isn't the same as before.
l.Warn("Notify attempt failed, will retry later", "attempts", i, "err", err)
}

// not really needed since the set b.MaxElapsedTime = 0,
// but just in case the backoff configuration changes in the future.
if nextDelay == backoff.Stop {
return ctx, nil, fmt.Errorf("%s/%s: notify retry stopped after %d attempts: %w", r.groupName, r.integration.String(), i, err)
}

attemptTimer.Reset(nextDelay)
// Save this error to be able to return the last seen error by an
// integration upon context timeout.
iErr = err
Expand Down
109 changes: 109 additions & 0 deletions notify/notify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,115 @@ func TestRetryStageWithContextCanceled(t *testing.T) {
require.NotNil(t, resctx)
}

func TestRetryStageHonorsRetryAfter(t *testing.T) {
attempts := 0
i := Integration{
name: "test",
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
attempts++
if attempts < 4 {
err := NewErrorWithReason(TooManyRequestsReason, errors.New("received 429 Too Many Requests"))
err.RetryAfter = 10 * time.Millisecond
return true, err
}
return false, nil
}),
rs: sendResolved(false),
}
r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))

alerts := []*types.Alert{{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
}}

// The default exponential backoff starts at 500ms after the first immediate
// attempt, so 4 attempts can only complete within this timeout when
// Retry-After is actually honored.
ctx, cancel := context.WithTimeout(context.Background(), 400*time.Millisecond)
defer cancel()
ctx = WithFiringAlerts(ctx, []uint64{0})

start := time.Now()
_, _, err := r.Exec(ctx, promslog.NewNopLogger(), alerts...)
elapsed := time.Since(start)
require.NoError(t, err)
require.Equal(t, 4, attempts)
require.GreaterOrEqual(t, elapsed, 30*time.Millisecond)
require.Less(t, elapsed, 350*time.Millisecond)
}

func TestRetryStageRecalculatesBackoffAfterRetryAfter(t *testing.T) {
attempts := 0
i := Integration{
name: "test",
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
attempts++
switch attempts {
case 1:
err := NewErrorWithReason(TooManyRequestsReason, errors.New("received 429 Too Many Requests"))
err.RetryAfter = 10 * time.Millisecond
return true, err
case 2:
return true, errors.New("temporary failure")
default:
return false, nil
}
}),
rs: sendResolved(false),
}
r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))

alerts := []*types.Alert{{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
}}

ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
ctx = WithFiringAlerts(ctx, []uint64{0})

_, _, err := r.Exec(ctx, promslog.NewNopLogger(), alerts...)
require.Error(t, err)
require.Contains(t, err.Error(), "notify retry canceled after 2 attempts")
require.Equal(t, 2, attempts)
}

func TestRetryStageWithoutRetryAfterUsesExponentialBackoff(t *testing.T) {
attempts := 0
i := Integration{
name: "test",
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
attempts++
if attempts < 4 {
return true, NewErrorWithReason(TooManyRequestsReason, errors.New("received 429 Too Many Requests"))
}
return false, nil
}),
rs: sendResolved(false),
}
r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))

alerts := []*types.Alert{{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
}}

// Without Retry-After we should follow the default backoff ticker, whose
// first interval after the initial attempt is far larger than this timeout.
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
ctx = WithFiringAlerts(ctx, []uint64{0})

_, _, err := r.Exec(ctx, promslog.NewNopLogger(), alerts...)
require.Error(t, err)
require.Contains(t, err.Error(), "notify retry canceled after 1 attempts")
require.Equal(t, 1, attempts)
}

func TestRetryStageNoResolved(t *testing.T) {
sent := []*types.Alert{}
i := Integration{
Expand Down
6 changes: 3 additions & 3 deletions notify/opsgenie/opsgenie.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error)
if err != nil {
return true, err
}
shouldRetry, err := n.retrier.Check(resp.StatusCode, resp.Body)
shouldRetry, errWithReason := n.retrier.Check(resp)
notify.Drain(resp)
if err != nil {
return shouldRetry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
if errWithReason != nil {
return shouldRetry, errWithReason
}
}
return true, nil
Expand Down
2 changes: 1 addition & 1 deletion notify/opsgenie/opsgenie_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func TestOpsGenieRetry(t *testing.T) {

retryCodes := append(test.DefaultRetryCodes(), http.StatusTooManyRequests)
for statusCode, expected := range test.RetryTests(retryCodes) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "error on status %d", statusCode)
}
}
Expand Down
9 changes: 3 additions & 6 deletions notify/pagerduty/pagerduty.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ func (n *Notifier) notifyV1(
}
defer notify.Drain(resp)

return n.retrier.Check(resp.StatusCode, resp.Body)
return n.retrier.Check(resp)
}

func (n *Notifier) notifyV2(
Expand Down Expand Up @@ -293,11 +293,8 @@ func (n *Notifier) notifyV2(
}
defer notify.Drain(resp)

retry, err := n.retrier.Check(resp.StatusCode, resp.Body)
if err != nil {
return retry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
}
return retry, err
retry, errWithReason := n.retrier.Check(resp)
return retry, errWithReason
}

// Notify implements the Notifier interface.
Expand Down
4 changes: 2 additions & 2 deletions notify/pagerduty/pagerduty_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func TestPagerDutyRetryV1(t *testing.T) {

retryCodes := append(test.DefaultRetryCodes(), http.StatusForbidden)
for statusCode, expected := range test.RetryTests(retryCodes) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retryv1 - error on status %d", statusCode)
}
}
Expand All @@ -71,7 +71,7 @@ func TestPagerDutyRetryV2(t *testing.T) {

retryCodes := append(test.DefaultRetryCodes(), http.StatusTooManyRequests)
for statusCode, expected := range test.RetryTests(retryCodes) {
actual, _ := notifier.retrier.Check(statusCode, nil)
actual, _ := notifier.retrier.Check(test.HTTPResponseForStatusCode(statusCode))
require.Equal(t, expected, actual, "retryv2 - error on status %d", statusCode)
}
}
Expand Down
Loading
Loading