From a7e9153b7df095bdd85b8b2ec9a47c0a9611730b Mon Sep 17 00:00:00 2001 From: Brandur Date: Fri, 25 Apr 2025 21:00:16 -0700 Subject: [PATCH] Fix intermittent test failures checking job `retryable` We got a lot of the remaining intermittent test failures in #848, but one that I was punting on because I couldn't immediately see the problem is this one [1]: --- FAIL: Test_Client_JobCompletion (0.00s) --- FAIL: Test_Client_JobCompletion/JobThatReturnsErrIsRetryable (1.65s) client_test.go:5561: Reusing idle schema "river_2025_04_26t03_40_43_schema_31" after cleaning in 194.808714ms [32 generated] [28 reused] logger.go:256: time=2025-04-26T03:40:53.907Z level=INFO msg="River client started" client_id=fv-az1970-529_2025_04_26T03_40_53_541948 logger.go:256: time=2025-04-26T03:40:53.966Z level=WARN msg="jobexecutor.JobExecutor: Job errored; retrying" error=oops job_id=2 job_kind=callback logger.go:256: time=2025-04-26T03:40:54.947Z level=INFO msg="maintenance.JobScheduler: Ran successfully" num_jobs_scheduled=1 client_test.go:5573: Error Trace: /home/runner/work/river/river/client_test.go:5573 Error: Not equal: expected: "retryable" actual : "available" Diff: --- Expected +++ Actual @@ -1,2 +1,2 @@ -(rivertype.JobState) (len=9) "retryable" +(rivertype.JobState) (len=9) "available" Test: Test_Client_JobCompletion/JobThatReturnsErrIsRetryable logger.go:256: time=2025-04-26T03:40:54.997Z level=INFO msg="River client stopped" client_id=fv-az1970-529_2025_04_26T03_40_53_541948 FAIL FAIL github.com/riverqueue/river 23.010s This test fails fairly often in CI, but won't fail locally no matter how many times you run it (or with counts up to 1,000 anyway). I think I finally see the issue, which is that when checking `retryable`, by the time we go to look up the job by ID, in slow environments like GitHub Actions it's possible for the test case goroutine to have been paused for more than one second, which means that the job may have already been scheduled to be available again because the first retry backoff is 1 second. I've even seen failures where the job is actually running again already because it went to available, then was picked up by the client to be run again. Here, fix the problem by defining a special `RetryPolicySlow` retry policy that retries very slowly and makes this problem impossible. We don't use this everywhere for things to stay realistic, but configure the client with it in the failing test case. [1] https://github.com/riverqueue/river/actions/runs/14677353625/job/41195546754 --- client_test.go | 5 ++++- .../retrypolicytest/retrypolicytest.go | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/client_test.go b/client_test.go index ab3002dd..1c089af5 100644 --- a/client_test.go +++ b/client_test.go @@ -5652,6 +5652,9 @@ func Test_Client_JobCompletion(t *testing.T) { }) config.RetryPolicy = &retrypolicytest.RetryPolicyNoJitter{} + retryPolicy := &retrypolicytest.RetryPolicySlow{} // make sure job isn't set back to available/running by the time we check it below + config.RetryPolicy = retryPolicy + client, bundle := setup(t, config) now := client.baseService.Time.StubNowUTC(time.Now().UTC()) @@ -5667,7 +5670,7 @@ func Test_Client_JobCompletion(t *testing.T) { require.NoError(err) require.Equal(rivertype.JobStateRetryable, reloadedJob.State) - require.WithinDuration(now.Add(1*time.Second), reloadedJob.ScheduledAt, time.Microsecond) + require.WithinDuration(now.Add(retryPolicy.Interval()), reloadedJob.ScheduledAt, time.Microsecond) require.Nil(reloadedJob.FinalizedAt) }) diff --git a/internal/riverinternaltest/retrypolicytest/retrypolicytest.go b/internal/riverinternaltest/retrypolicytest/retrypolicytest.go index bb150234..7b761223 100644 --- a/internal/riverinternaltest/retrypolicytest/retrypolicytest.go +++ b/internal/riverinternaltest/retrypolicytest/retrypolicytest.go @@ -58,3 +58,23 @@ func (p *RetryPolicyNoJitter) retrySecondsWithoutJitter(attempt int) float64 { retrySeconds := math.Pow(float64(attempt), 4) return min(retrySeconds, maxDurationSeconds) } + +const retryPolicySlowInterval = 1 * time.Hour + +// RetryPolicySlow is a retry policy that has a very slow retry interval. This +// is used in tests that check retries to make sure that in slower environments +// (like GitHub Actions), jobs aren't accidentally set back to available or +// running before a paused test case can check to make sure it's in a retryable +// state. +type RetryPolicySlow struct{} + +// Interval is the slow retry interval exposed for use in test case assertions. +// Unlike the standard retry policy, RetryPolicySlow's interval is constant +// regardless of which attempt number the job was on. +func (p *RetryPolicySlow) Interval() time.Duration { + return retryPolicySlowInterval +} + +func (p *RetryPolicySlow) NextRetry(job *rivertype.JobRow) time.Time { + return job.AttemptedAt.Add(retryPolicySlowInterval) +}