Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added `Cancel` and `CancelTx` to the `Client` to enable cancellation of jobs. [PR #141](https://github.com/riverqueue/river/pull/141).
- Added `ClientFromContext` and `ClientWithContextSafely` helpers to extract the `Client` from the worker's context, where it is now available to workers. This simplifies making the River client available within your workers, e.g. for enqueueing additional jobs. [PR #145](https://github.com/riverqueue/river/pull/145).

## [0.0.16] - 2024-01-06
Expand Down
105 changes: 105 additions & 0 deletions client.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,11 @@ func (ts *clientTestSignals) Init() {
}

var (
// ErrNotFound is returned when a query by ID does not match any existing
// rows. For example, attempting to cancel a job that doesn't exist will
// return this error.
ErrNotFound = errors.New("not found")

errMissingConfig = errors.New("missing config")
errMissingDatabasePoolWithQueues = errors.New("must have a non-nil database pool to execute jobs (either use a driver with database pool or don't configure Queues)")
errMissingDriver = errors.New("missing database driver (try wrapping a Pgx pool with river/riverdriver/riverpgxv5.New)")
Expand Down Expand Up @@ -935,6 +940,106 @@ func (c *Client[TTx]) runProducers(fetchNewWorkCtx, workCtx context.Context) {
}
}

// Cancel cancels the job with the given ID. The provided context applies only
// to the underlying Postgres update and may be used to abort the operation or
// impose a timeout.
//
// How cancellation proceeds depends on the job's current state:
//
//   - Jobs still waiting in the queue (available, scheduled, or retryable) are
//     marked cancelled right away and will never be retried.
//   - Jobs already in a finalized state (cancelled, completed, or discarded)
//     are left untouched.
//   - Running jobs are flagged for cancellation, and the client executing the
//     job is notified (via LISTEN/NOTIFY) to cancel the running job's context.
//     Because Go offers no way to interrupt a running goroutine, the job keeps
//     running until it returns; as always, workers should honor context
//     cancellation and return promptly once the job's context is done.
//
// After the executing client observes the cancellation signal, any error
// returned by the job finalizes it as cancelled with no retry, while a nil
// error completes the job as usual.
//
// If the running job happens to finish executing _after_ this update was made
// but _before_ the cancellation signal is received, the outcome depends on the
// state the job is transitioning into (based on its return error):
//
//   - Successful completion, cancellation from within, or discard due to
//     exceeding max attempts all proceed as usual.
//   - A snooze or a retryable error instead finalizes the job as cancelled,
//     and it will not be attempted again.
//
// Returns the up-to-date JobRow for the specified jobID if it exists, or
// ErrNotFound if no such job exists.
func (c *Client[TTx]) Cancel(ctx context.Context, jobID int64) (*rivertype.JobRow, error) {
	job, err := c.adapter.JobCancel(ctx, jobID)
	switch {
	case errors.Is(err, riverdriver.ErrNoRows):
		// Translate the driver's sentinel into the client-level ErrNotFound.
		return nil, ErrNotFound
	case err != nil:
		return nil, err
	}

	return dbsqlc.JobRowFromInternal(job), nil
}

// CancelTx cancels the job with the given ID within the specified transaction.
// This variant lets a caller cancel a job atomically alongside other database
// changes. A cancelled job doesn't take effect until the transaction commits,
// and if the transaction rolls back, the cancellation is rolled back as well.
//
// If possible, the job is cancelled immediately and will not be retried. The
// provided context is used for the underlying Postgres update and can be used
// to cancel the operation or apply a timeout.
//
// If the job is still in the queue (available, scheduled, or retryable), it is
// immediately marked as cancelled and will not be retried.
//
// If the job is already finalized (cancelled, completed, or discarded), no
// changes are made.
//
// If the job is currently running, it is not immediately cancelled, but is
// instead marked for cancellation. The client running the job will also be
// notified (via LISTEN/NOTIFY) to cancel the running job's context. Although
// the job's context will be cancelled, since Go does not provide a mechanism to
// interrupt a running goroutine the job will continue running until it returns.
// As always, it is important for workers to respect context cancellation and
// return promptly when the job context is done.
//
// Once the cancellation signal is received by the client running the job, any
// error returned by that job will result in it being cancelled permanently and
// not retried. However if the job returns no error, it will be completed as
// usual.
//
// In the event the running job finishes executing _before_ the cancellation
// signal is received but _after_ this update was made, the behavior depends on
// which state the job is being transitioned into (based on its return error):
//
//   - If the job completed successfully, was cancelled from within, or was
//     discarded due to exceeding its max attempts, the job will be updated as
//     usual.
//   - If the job was snoozed to run again later or encountered a retryable
//     error, the job will be marked as cancelled and will not be attempted
//     again.
//
// Returns the up-to-date JobRow for the specified jobID if it exists. Returns
// ErrNotFound if the job doesn't exist.
func (c *Client[TTx]) CancelTx(ctx context.Context, tx TTx, jobID int64) (*rivertype.JobRow, error) {
	job, err := c.adapter.JobCancelTx(ctx, c.driver.UnwrapTx(tx), jobID)
	if err != nil {
		// Translate the driver's sentinel into the client-level ErrNotFound so
		// callers don't need to depend on riverdriver. Same structure as Cancel.
		if errors.Is(err, riverdriver.ErrNoRows) {
			return nil, ErrNotFound
		}
		return nil, err
	}

	return dbsqlc.JobRowFromInternal(job), nil
}

func insertParamsFromArgsAndOptions(args JobArgs, insertOpts *InsertOpts) (*dbadapter.JobInsertParams, error) {
encodedArgs, err := json.Marshal(args)
if err != nil {
Expand Down
128 changes: 126 additions & 2 deletions client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ func Test_Client(t *testing.T) {
riverinternaltest.WaitOrTimeout(t, workedChan)
})

t.Run("JobCancel", func(t *testing.T) {
t.Run("JobCancelErrorReturned", func(t *testing.T) {
t.Parallel()

client, bundle := setup(t)
Expand Down Expand Up @@ -245,7 +245,7 @@ func Test_Client(t *testing.T) {
require.WithinDuration(t, time.Now(), *updatedJob.FinalizedAt, 2*time.Second)
})

t.Run("JobSnooze", func(t *testing.T) {
t.Run("JobSnoozeErrorReturned", func(t *testing.T) {
t.Parallel()

client, bundle := setup(t)
Expand Down Expand Up @@ -274,6 +274,130 @@ func Test_Client(t *testing.T) {
require.WithinDuration(t, time.Now().Add(15*time.Minute), updatedJob.ScheduledAt, 2*time.Second)
})

// This helper is used to test cancelling a job both _in_ a transaction and
// _outside of_ a transaction. The exact same test logic applies to each case,
// the only difference is a different cancelFunc provided by the specific
// subtest.
cancelRunningJobTestHelper := func(t *testing.T, cancelFunc func(ctx context.Context, client *Client[pgx.Tx], jobID int64) (*rivertype.JobRow, error)) { //nolint:thelper
	client, bundle := setup(t)

	// The worker signals its job ID on this channel as soon as it starts,
	// then blocks until its context is cancelled.
	jobStartedChan := make(chan int64)

	type JobArgs struct {
		JobArgsReflectKind[JobArgs]
	}

	AddWorker(client.config.Workers, WorkFunc(func(ctx context.Context, job *Job[JobArgs]) error {
		jobStartedChan <- job.ID
		<-ctx.Done()
		return ctx.Err()
	}))

	// Wait for the client's internal components to report healthy before
	// proceeding: the notifier must be up for the remote cancellation signal
	// to be delivered, otherwise the test can flake.
	statusUpdateCh := client.monitor.RegisterUpdates()
	startClient(ctx, t, client)
	waitForClientHealthy(ctx, t, statusUpdateCh)
Copy link
Contributor Author

@bgentry bgentry Jan 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got a flaky failure on these tests in this run. There wasn't anything particularly helpful in the failure logs, other than that the job kept running and didn't get cancelled. My guess, however, is that the notifier wasn't up and running before the cancel signal was sent.

I tried reproducing it with thousands of runs locally but couldn't get any failures. Ultimately I added this logic to wait for internal components to come up and be healthy prior to proceeding with the test.

It makes me wonder if there's a more systemic issue here; should we be waiting for some components to come up before returning from Start()? Or should our startClient test helper always be waiting for the client to become healthy before returning?

I'm re-running CI several times just to get more confidence that this fixed the issue, no more failures yet.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes me wonder if there's a more systemic issue here; should we be waiting for some components to come up before returning from Start()? Or should our startClient test helper always be waiting for the client to become healthy before returning?

Yeah, good question. I kind of suspect that ... yes, a Start that returned only when things were really healthy and ready to go would overall be better. Shorter term, I like the idea of a test helper that waits for everything to be fully healthy, like you've done here, but we may want to make it even more widespread with an easy way to get at it from riverinternaltest.

Copy link
Contributor Author

@bgentry bgentry Jan 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could easily update startClient to wait for the client to become healthy. That should reduce flakiness and shouldn't impact test timing much as everything starts quickly. However since there is no client health exposed externally yet, it's purely an internal solution for testing.

There's the goal of making sure the client is healthy and fully booted before proceeding; this may be desirable sometimes, but other times users may wish to allow the client to finish booting asynchronously while their app does other initialization work. But then there's the separate goal of monitoring the client's health as it operates, and any of these internal components could in theory have issues or connectivity issues during operation.

I'm sure there are other libraries which try to tackle these problems, but one particular instance I have recent experience with is LaunchDarkly's SDK. It's a bit poorly documented and feels overly abstracted / Java-like, but the concepts there are reasonably well-solved and that's what I was intending to do when I initially built the client monitor stuff internally. I don't claim that the design is perfect or finished but at least the general problem is somewhat solved by it. And maybe some of it can/should be baked in as part of Start() so users don't need to think about it as much.

We should probably take a look at this both for our own tests but also for users' sake in follow up PRs.


	insertedJob, err := client.Insert(ctx, &JobArgs{}, nil)
	require.NoError(t, err)

	// Ensure the job is actually running before attempting to cancel it.
	startedJobID := riverinternaltest.WaitOrTimeout(t, jobStartedChan)
	require.Equal(t, insertedJob.ID, startedJobID)

	// Cancel the job:
	updatedJob, err := cancelFunc(ctx, client, insertedJob.ID)
	require.NoError(t, err)
	require.NotNil(t, updatedJob)
	// Job is still actively running at this point because the query wouldn't
	// modify that column for a running job:
	require.Equal(t, rivertype.JobStateRunning, updatedJob.State)

	// Once the worker's context is cancelled remotely, it returns ctx.Err(),
	// which finalizes the job as cancelled and emits this subscriber event.
	event := riverinternaltest.WaitOrTimeout(t, bundle.subscribeChan)
	require.Equal(t, EventKindJobCancelled, event.Kind)
	require.Equal(t, JobStateCancelled, event.Job.State)
	require.WithinDuration(t, time.Now(), *event.Job.FinalizedAt, 2*time.Second)

	// Double-check the final state directly in the database as well.
	jobAfterCancel, err := bundle.queries.JobGetByID(ctx, client.driver.GetDBPool(), insertedJob.ID)
	require.NoError(t, err)
	require.Equal(t, dbsqlc.JobStateCancelled, jobAfterCancel.State)
	require.WithinDuration(t, time.Now(), *jobAfterCancel.FinalizedAt, 2*time.Second)
}

t.Run("CancelRunningJob", func(t *testing.T) {
	t.Parallel()

	// Non-transactional variant: cancel directly through the client.
	cancelViaClient := func(ctx context.Context, client *Client[pgx.Tx], jobID int64) (*rivertype.JobRow, error) {
		return client.Cancel(ctx, jobID)
	}
	cancelRunningJobTestHelper(t, cancelViaClient)
})

t.Run("CancelRunningJobInTx", func(t *testing.T) {
	t.Parallel()

	// Transactional variant: wrap CancelTx in a pgx transaction so the
	// cancellation is committed atomically before the helper inspects it.
	cancelRunningJobTestHelper(t, func(ctx context.Context, client *Client[pgx.Tx], jobID int64) (*rivertype.JobRow, error) {
		var (
			cancelledJob *rivertype.JobRow
			cancelErr    error
		)
		txErr := pgx.BeginFunc(ctx, client.driver.GetDBPool(), func(tx pgx.Tx) error {
			cancelledJob, cancelErr = client.CancelTx(ctx, tx, jobID)
			return cancelErr
		})
		require.NoError(t, txErr)
		return cancelledJob, cancelErr
	})
})

t.Run("CancelScheduledJob", func(t *testing.T) {
	t.Parallel()

	client, _ := setup(t)

	jobStartedChan := make(chan int64)

	type JobArgs struct {
		JobArgsReflectKind[JobArgs]
	}

	// This worker should never actually run: the job is scheduled well into
	// the future and gets cancelled first.
	AddWorker(client.config.Workers, WorkFunc(func(ctx context.Context, job *Job[JobArgs]) error {
		jobStartedChan <- job.ID
		<-ctx.Done()
		return ctx.Err()
	}))

	startClient(ctx, t, client)

	scheduledJob, err := client.Insert(ctx, &JobArgs{}, &InsertOpts{ScheduledAt: time.Now().Add(5 * time.Minute)})
	require.NoError(t, err)

	// Cancelling a job that hasn't started yet finalizes it immediately:
	cancelledJob, err := client.Cancel(ctx, scheduledJob.ID)
	require.NoError(t, err)
	require.NotNil(t, cancelledJob)
	require.Equal(t, rivertype.JobStateCancelled, cancelledJob.State)
	require.WithinDuration(t, time.Now(), *cancelledJob.FinalizedAt, 2*time.Second)
})

t.Run("CancelNonExistentJob", func(t *testing.T) {
t.Parallel()

client, _ := setup(t)
startClient(ctx, t, client)

// Cancel an unknown job ID:
jobAfter, err := client.Cancel(ctx, 0)
require.ErrorIs(t, err, ErrNotFound)
require.Nil(t, jobAfter)

// Cancel an unknown job ID, within a transaction:
err = pgx.BeginFunc(ctx, client.driver.GetDBPool(), func(tx pgx.Tx) error {
jobAfter, err := client.CancelTx(ctx, tx, 0)
require.ErrorIs(t, err, ErrNotFound)
require.Nil(t, jobAfter)
return nil
})
require.NoError(t, err)
})

t.Run("AlternateSchema", func(t *testing.T) {
t.Parallel()

Expand Down
102 changes: 102 additions & 0 deletions example_cancel_from_client_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package river_test

import (
"context"
"errors"
"log/slog"
"time"

"github.com/jackc/pgx/v5/pgxpool"

"github.com/riverqueue/river"
"github.com/riverqueue/river/internal/riverinternaltest"
"github.com/riverqueue/river/internal/util/slogutil"
"github.com/riverqueue/river/riverdriver/riverpgxv5"
)

// SleepingArgs is a job args type for this example file.
//
// NOTE(review): SleepingArgs appears unused within this example — the worker
// below and the Insert call both use CancellingArgs instead. Confirm whether
// this type is needed or left over.
type SleepingArgs struct{}

// Kind satisfies river.JobArgs. Note the kind string is "SleepingWorker"
// (the worker's name) rather than one derived from the args type name.
func (args SleepingArgs) Kind() string { return "SleepingWorker" }

// SleepingWorker works CancellingArgs jobs by blocking until its context is
// cancelled (or an internal timeout elapses), reporting each started job's ID
// on jobChan so the example can synchronize with it.
type SleepingWorker struct {
	river.WorkerDefaults[CancellingArgs]
	// jobChan receives the ID of each job as its Work method begins.
	jobChan chan int64
}

// Work reports the job's ID on jobChan, then blocks until either the job's
// context is cancelled (the expected path, triggered remotely via Cancel) or
// a five-second safety timeout elapses.
func (w *SleepingWorker) Work(ctx context.Context, job *river.Job[CancellingArgs]) error {
	w.jobChan <- job.ID

	select {
	case <-time.After(5 * time.Second):
		// Guard against hanging forever if the cancellation never arrives.
		return errors.New("sleeping worker timed out")
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Example_cancelJobFromClient demonstrates how to permanently cancel a job from
// any Client using Cancel.
func Example_cancelJobFromClient() {
	ctx := context.Background()

	dbPool, err := pgxpool.NewWithConfig(ctx, riverinternaltest.DatabaseConfig("river_testdb_example"))
	if err != nil {
		panic(err)
	}
	defer dbPool.Close()

	// Required for the purpose of this test, but not necessary in real usage.
	if err := riverinternaltest.TruncateRiverTables(ctx, dbPool); err != nil {
		panic(err)
	}

	// The worker reports each started job's ID here so the example can wait
	// until the job is actually running before cancelling it.
	jobChan := make(chan int64)

	workers := river.NewWorkers()
	river.AddWorker(workers, &SleepingWorker{jobChan: jobChan})

	riverClient, err := river.NewClient(riverpgxv5.New(dbPool), &river.Config{
		Logger: slog.New(&slogutil.SlogMessageOnlyHandler{Level: slog.LevelWarn}),
		Queues: map[string]river.QueueConfig{
			river.QueueDefault: {MaxWorkers: 10},
		},
		Workers: workers,
	})
	if err != nil {
		panic(err)
	}

	// Not strictly needed, but used to help this test wait until job is worked.
	subscribeChan, subscribeCancel := riverClient.Subscribe(river.EventKindJobCancelled)
	defer subscribeCancel()

	if err := riverClient.Start(ctx); err != nil {
		panic(err)
	}
	job, err := riverClient.Insert(ctx, CancellingArgs{ShouldCancel: true}, nil)
	if err != nil {
		panic(err)
	}
	// Block until the worker has picked up the job and begun running it.
	select {
	case <-jobChan:
	case <-time.After(2 * time.Second):
		panic("no jobChan signal received")
	}

	// There is presently no way to wait for the client to be 100% ready, so we
	// sleep for a bit to give it time to start up. This is only needed in this
	// example because we need the notifier to be ready for it to receive the
	// cancellation signal.
	time.Sleep(500 * time.Millisecond)

	// Cancel the running job; its context will be cancelled remotely.
	if _, err = riverClient.Cancel(ctx, job.ID); err != nil {
		panic(err)
	}
	// Wait for the EventKindJobCancelled event confirming finalization.
	waitForNJobs(subscribeChan, 1)

	if err := riverClient.Stop(ctx); err != nil {
		panic(err)
	}

	// Output:
	// jobExecutor: job cancelled remotely
}
Loading