From a7e9153b7df095bdd85b8b2ec9a47c0a9611730b Mon Sep 17 00:00:00 2001
From: Brandur <brandur@brandur.org>
Date: Fri, 25 Apr 2025 21:00:16 -0700
Subject: [PATCH] Fix intermittent test failures checking job `retryable`

We got a lot of the remaining intermittent test failures in #848, but
one that I was punting on because I couldn't immediately see the problem
is this one [1]:

    --- FAIL: Test_Client_JobCompletion (0.00s)
        --- FAIL: Test_Client_JobCompletion/JobThatReturnsErrIsRetryable (1.65s)
            client_test.go:5561: Reusing idle schema "river_2025_04_26t03_40_43_schema_31" after cleaning in 194.808714ms [32 generated] [28 reused]
            logger.go:256: time=2025-04-26T03:40:53.907Z level=INFO msg="River client started" client_id=fv-az1970-529_2025_04_26T03_40_53_541948
            logger.go:256: time=2025-04-26T03:40:53.966Z level=WARN msg="jobexecutor.JobExecutor: Job errored; retrying" error=oops job_id=2 job_kind=callback
            logger.go:256: time=2025-04-26T03:40:54.947Z level=INFO msg="maintenance.JobScheduler: Ran successfully" num_jobs_scheduled=1
            client_test.go:5573:
                    Error Trace:	/home/runner/work/river/river/client_test.go:5573
                    Error:      	Not equal:
                                    expected: "retryable"
                                    actual  : "available"

                                    Diff:
                                    --- Expected
                                    +++ Actual
                                    @@ -1,2 +1,2 @@
                                    -(rivertype.JobState) (len=9) "retryable"
                                    +(rivertype.JobState) (len=9) "available"

                    Test:       	Test_Client_JobCompletion/JobThatReturnsErrIsRetryable
            logger.go:256: time=2025-04-26T03:40:54.997Z level=INFO msg="River client stopped" client_id=fv-az1970-529_2025_04_26T03_40_53_541948
    FAIL
    FAIL	github.com/riverqueue/river	23.010s

This test fails fairly often in CI, but won't fail locally no matter how
many times you run it (or with counts up to 1,000 anyway).

I think I finally see the issue, which is that when checking `retryable`,
by the time we go to look up the job by ID, in slow environments like
GitHub Actions it's possible for the test case goroutine to have been
paused for more than one second, which means that the job may have
already been scheduled to be available again because the first retry
backoff is 1 second. I've even seen failures where the job is actually
running again already because it went to available, then was picked up
by the client to be run again.

Here, fix the problem by defining a special `RetryPolicySlow` retry
policy that retries very slowly and makes this problem impossible. We
don't use this everywhere for things to stay realistic, but configure
the client with it in the failing test case.

[1] https://github.com/riverqueue/river/actions/runs/14677353625/job/41195546754
---
 client_test.go                                |  5 ++++-
 .../retrypolicytest/retrypolicytest.go        | 20 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/client_test.go b/client_test.go
index ab3002dd..1c089af5 100644
--- a/client_test.go
+++ b/client_test.go
@@ -5652,6 +5652,9 @@ func Test_Client_JobCompletion(t *testing.T) {
 		})
 		config.RetryPolicy = &retrypolicytest.RetryPolicyNoJitter{}
 
+		retryPolicy := &retrypolicytest.RetryPolicySlow{} // make sure job isn't set back to available/running by the time we check it below
+		config.RetryPolicy = retryPolicy
+
 		client, bundle := setup(t, config)
 
 		now := client.baseService.Time.StubNowUTC(time.Now().UTC())
@@ -5667,7 +5670,7 @@ func Test_Client_JobCompletion(t *testing.T) {
 		require.NoError(err)
 
 		require.Equal(rivertype.JobStateRetryable, reloadedJob.State)
-		require.WithinDuration(now.Add(1*time.Second), reloadedJob.ScheduledAt, time.Microsecond)
+		require.WithinDuration(now.Add(retryPolicy.Interval()), reloadedJob.ScheduledAt, time.Microsecond)
 		require.Nil(reloadedJob.FinalizedAt)
 	})
 
diff --git a/internal/riverinternaltest/retrypolicytest/retrypolicytest.go b/internal/riverinternaltest/retrypolicytest/retrypolicytest.go
index bb150234..7b761223 100644
--- a/internal/riverinternaltest/retrypolicytest/retrypolicytest.go
+++ b/internal/riverinternaltest/retrypolicytest/retrypolicytest.go
@@ -58,3 +58,23 @@ func (p *RetryPolicyNoJitter) retrySecondsWithoutJitter(attempt int) float64 {
 	retrySeconds := math.Pow(float64(attempt), 4)
 	return min(retrySeconds, maxDurationSeconds)
 }
+
+const retryPolicySlowInterval = 1 * time.Hour
+
+// RetryPolicySlow is a retry policy that has a very slow retry interval. This
+// is used in tests that check retries to make sure that in slower environments
+// (like GitHub Actions), jobs aren't accidentally set back to available or
+// running before a paused test case can check to make sure it's in a retryable
+// state.
+type RetryPolicySlow struct{}
+
+// Interval is the slow retry interval exposed for use in test case assertions.
+// Unlike the standard retry policy, RetryPolicySlow's interval is constant
+// regardless of which attempt number the job was on.
+func (p *RetryPolicySlow) Interval() time.Duration {
+	return retryPolicySlowInterval
+}
+
+func (p *RetryPolicySlow) NextRetry(job *rivertype.JobRow) time.Time {
+	return job.AttemptedAt.Add(retryPolicySlowInterval)
+}