From ee36923d7bf74c69bf4b95c75494bdb70d2716dc Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Mon, 22 Sep 2025 16:46:35 -0400 Subject: [PATCH 1/4] Adds retry logic to PreservationSubmissionJob Why are these changes being introduced: * The APT lambda will return 502 errors when it hasn't been called for a few days as AWS makes the lambda `inactive` * This logic retries the jobs after 5 minutes to hopefully give enough time for the lambda to become active again. It retries up to 10 times. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/ETD-677 How does this address that need: * Adds a custom error class for APTBadGatewayErrors * Configures ActiveJob to retry on that error class up to 10 times with a wait of 5 minutes between retries Document any side effects to this change: * While this will reduce the number of failed jobs and manual cleanup, it will create additional files in S3 and additional records in the ETD database due to that logic being included in the job that is being retried. This is a tradeoff we are willing to make at this time as it is much better than a manual cleanup after failure and we are hopeful Infra will be able to add a Cron-like job to keep the lambda from fully deprovisioning into the `inactive` state soon so this should be a temporary workaround but still worth having in our codebase. * We should manually ensure the Lambda is not `inactive` before running our jobs for a while until we better understand how long passes before it becomes `inactive` and how long it takes to become active again. This code is more of a safety net for when we forget. 
--- app/jobs/preservation_submission_job.rb | 27 +++++++-- test/jobs/preservation_submission_job_test.rb | 57 ++++++++++++++++--- 2 files changed, 71 insertions(+), 13 deletions(-) diff --git a/app/jobs/preservation_submission_job.rb b/app/jobs/preservation_submission_job.rb index 13ab5f1c..594db539 100644 --- a/app/jobs/preservation_submission_job.rb +++ b/app/jobs/preservation_submission_job.rb @@ -4,6 +4,16 @@ class PreservationSubmissionJob < ActiveJob::Base queue_as :default + # Custom error class for 502 Bad Gateway responses from APT + class APTBadGatewayError < StandardError; end + + # Retry up to 10 times for transient 502 Bad Gateway responses. These 502 errors are generally caused by the lambda + # entering `Inactive` state after a long period of inactivity, which can take several minutes to recover from. + # We are using a fixed wait time of 5 minutes between retries to give the lambda time to warm up, rather than + # retrying immediately with exponential backoff as we expect the first few retries to fail in that scenario so this + # longer time is more effective. + retry_on APTBadGatewayError, wait: 5.minutes, attempts: 10 + def perform(theses) Rails.logger.info("Preparing to send #{theses.count} theses to preservation") results = { total: theses.count, processed: 0, errors: [] } @@ -13,7 +23,10 @@ def perform(theses) preserve_payload(payload) Rails.logger.info("Thesis #{thesis.id} has been sent to preservation") results[:processed] += 1 - rescue StandardError, Aws::Errors => e + rescue StandardError => e + # Explicitly re-raise the 502-specific error so ActiveJob can retry it. + raise e if e.is_a?(APTBadGatewayError) + preservation_error = "Thesis #{thesis.id} could not be preserved: #{e}" Rails.logger.info(preservation_error) results[:errors] << preservation_error @@ -40,6 +53,12 @@ def post_payload(payload) http.request(request) end + # If the remote endpoint returns 502, raise a APTBadGatewayError so ActiveJob can retry. 
+ if response.code.to_s == '502' + Rails.logger.warn("Received 502 from APT for payload #{payload.id}; raising for retry") + raise APTBadGatewayError, 'APT returned 502 Bad Gateway' + end + validate_response(response) end @@ -49,8 +68,8 @@ def validate_response(response) end result = JSON.parse(response.body) - unless result['success'] == true - raise "APT failed to create a bag: #{response.body}" - end + return if result['success'] == true + + raise "APT failed to create a bag: #{response.body}" end end diff --git a/test/jobs/preservation_submission_job_test.rb b/test/jobs/preservation_submission_job_test.rb index 66aee414..dd1eb4b1 100644 --- a/test/jobs/preservation_submission_job_test.rb +++ b/test/jobs/preservation_submission_job_test.rb @@ -2,7 +2,6 @@ require 'webmock/minitest' class PreservationSubmissionJobTest < ActiveJob::TestCase - # because we need to actually use the file it's easier to attach it in the test rather # than use our fixtures as the fixtures oddly don't account for the file actually being # where ActiveStorage expects them to be. 
We also need this to be a record that looks like @@ -27,24 +26,24 @@ def setup_thesis_two end def stub_apt_lambda_success - stub_request(:post, ENV['APT_LAMBDA_URL']) + stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil)) .to_return( status: 200, body: { success: true, bag: { entries: { - "manifest-md5.txt" => { md5: "fakehash" } + 'manifest-md5.txt' => { md5: 'fakehash' } } }, - output_zip_s3_uri: "s3://my-bucket/apt-testing/test-one-medium.zip" + output_zip_s3_uri: 's3://my-bucket/apt-testing/test-one-medium.zip' }.to_json, headers: { 'Content-Type' => 'application/json' } ) end def stub_apt_lambda_failure - stub_request(:post, ENV['APT_LAMBDA_URL']) + stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil)) .to_return( status: 500, headers: { 'Content-Type' => 'application/json' } @@ -52,13 +51,13 @@ def stub_apt_lambda_failure end def stub_apt_lambda_200_failure - stub_request(:post, ENV['APT_LAMBDA_URL']) + stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil)) .to_return( status: 200, body: { success: false, - error: "An error occurred (404) when calling the HeadObject operation: Not Found", - bag: { entries: {}}, + error: 'An error occurred (404) when calling the HeadObject operation: Not Found', + bag: { entries: {} } }.to_json, headers: { 'Content-Type' => 'application/json' } ) @@ -177,7 +176,6 @@ def stub_apt_lambda_200_failure assert_empty another_good_thesis.archivematica_payloads PreservationSubmissionJob.perform_now([good_thesis, bad_thesis, another_good_thesis]) - # first thesis should succeed and have a payload assert_equal 1, good_thesis.archivematica_payloads.count @@ -187,4 +185,45 @@ def stub_apt_lambda_200_failure # third thesis should succeed and have a payload, despite prior failure assert_equal 1, another_good_thesis.archivematica_payloads.count end + + test 'throws exceptions and probably creates payloads when a bad key is provided' do + skip('Test not implemented yet') + # Our lambda returns 400 Bad Request with a error body of Invalid input 
payload + # This should never happen as we submit it via ENV, but just in case we should understand what it looks like + end + + test 'retries on 502 Bad Gateway errors' do + # This syntax will cause the first two calls to return 502, and the third to succeed. + stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil)) + .to_return( + { status: 502 }, + { status: 502 }, + { status: 200, body: { + success: true, + bag: { + entries: { + 'manifest-md5.txt' => { md5: 'fakehash' } + } + }, + output_zip_s3_uri: 's3://my-bucket/apt-testing/test-one-medium.zip' + }.to_json, headers: { 'Content-Type' => 'application/json' } } + ) + + thesis = setup_thesis + assert_equal 0, thesis.archivematica_payloads.count + + # We need to wrap this in perform_enqueued_jobs so that the retries are actually executed. Our normal syntax only + # runs a single job, but because we are testing retries this wrapper executes the jobs this job queues. + perform_enqueued_jobs do + PreservationSubmissionJob.perform_now([thesis]) + end + + # 3 payloads were created. 2 for the failures, 1 for the success. + # This isn't ideal, but it's the current behavior. A refactor to using a pre-preservation job to create + # metadata.csv prior to this job would fix this but is out of scope for now. + assert_equal 3, thesis.archivematica_payloads.count + + # The last payload should be marked as preserved. 
+ assert_equal 'preserved', thesis.archivematica_payloads.last.preservation_status + end end From 537d6f4d75529e77910a0a84a4a53db0ca141036 Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Tue, 23 Sep 2025 10:15:57 -0400 Subject: [PATCH 2/4] Updates documentation and rewords two tests Why are these changes being introduced: * Two tests were misleading as to what they were testing * Documentation to reflect it is best to manual check the lambda state will help us avoid confusion in the future Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/ETD-677 How does this address that need: * Updates docs to include how to run dss * Updates docs to include note to check lambda state manually * Rewords two test names to reflect what they are actually testing Document any side effects to this change: * There are additional error states from APT we should detect and handle but that is out of scope for this ticket --- README.md | 14 ++++++++--- test/jobs/preservation_submission_job_test.rb | 25 ++++++------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 2d26dbda..111c0c20 100644 --- a/README.md +++ b/README.md @@ -374,11 +374,19 @@ processors must create a corresponding Archivematica accession number. 1. Following the processing workflow, stakeholders choose a term to publish (Process theses - Select term - Select Publication Review - Publish) 2. ETD will now automatically send data to DSS via the SQS queue -3. DSS runs (as of this writing that is a manual process documented in the - [DSS repo](https://github.com/MITLibraries/dspace-submission-service#run-stage)) +3. 
DSS runs (as of this writing that is no longer documented in the + [DSS repo](https://github.com/MITLibraries/dspace-submission-service)) + * From the root of a local checkout of `dspace-submission-service` + * Python dependencies do *not* need to be installed + * Authenticate your terminal to appropriate AWS account and role (dssManagement) + * `make run-prod` (or `make run-stage`) 4. ETD processes output queue to update records and send email to stakeholders with summary data and list of any error records. As of now this is a manual process, but can be triggered via rake task using the following heroku-cli command: + >!IMPORTANT + >Check the AWS Console to ensure the APT Lambda is not inactive before proceeding. + >If you do not, the preservation jobs will retry but the initial email to stakeholders will indicate preservation has failed. + ```shell # run the output queue processing job heroku run rails dss:process_output_queue --app TARGET-HEROKU-APP @@ -397,7 +405,7 @@ You can publish a single thesis that is already in `Publication review` or `Pend task: ```shell -heroku run -s standard-2x rails dss:publish_thesis_by_id[THESIS_ID] --app TARGET-HEROKU-APP +heroku run rails dss:publish_thesis_by_id[THESIS_ID] --app TARGET-HEROKU-APP ``` Note: `Pending publication` is allowed here, but not expected to be a normal occurence, to handle the edge case of the app thinking data was sent to SQS but the data not arriving for any reason. diff --git a/test/jobs/preservation_submission_job_test.rb b/test/jobs/preservation_submission_job_test.rb index dd1eb4b1..4ce72456 100644 --- a/test/jobs/preservation_submission_job_test.rb +++ b/test/jobs/preservation_submission_job_test.rb @@ -145,44 +145,33 @@ def stub_apt_lambda_200_failure end end - test 'does not create payloads if job fails' do + # This is not the desired behavior, but it confirms the current behavior. 
+ test 'create payloads even if APT job fails' do stub_apt_lambda_failure - bad_thesis = theses(:one) good_thesis = setup_thesis another_good_thesis = setup_thesis_two - assert_empty bad_thesis.archivematica_payloads assert_empty good_thesis.archivematica_payloads assert_empty another_good_thesis.archivematica_payloads - PreservationSubmissionJob.perform_now([good_thesis, bad_thesis, another_good_thesis]) + PreservationSubmissionJob.perform_now([good_thesis, another_good_thesis]) - # first thesis should succeed and have a payload assert_equal 1, good_thesis.archivematica_payloads.count - # second thesis should fail and not have a payload - assert_empty bad_thesis.archivematica_payloads - - # third thesis should succeed and have a payload, despite prior failure assert_equal 1, another_good_thesis.archivematica_payloads.count end - test 'does not create payloads if post succeeds but APT fails' do + # This is not the desired behavior, but it confirms the current behavior. + test 'creates payloads if post succeeds but APT fails' do stub_apt_lambda_200_failure - bad_thesis = theses(:one) good_thesis = setup_thesis another_good_thesis = setup_thesis_two - assert_empty bad_thesis.archivematica_payloads assert_empty good_thesis.archivematica_payloads assert_empty another_good_thesis.archivematica_payloads - PreservationSubmissionJob.perform_now([good_thesis, bad_thesis, another_good_thesis]) - # first thesis should succeed and have a payload - assert_equal 1, good_thesis.archivematica_payloads.count + PreservationSubmissionJob.perform_now([good_thesis, another_good_thesis]) - # second thesis should fail and not have a payload - assert_empty bad_thesis.archivematica_payloads + assert_equal 1, good_thesis.archivematica_payloads.count - # third thesis should succeed and have a payload, despite prior failure assert_equal 1, another_good_thesis.archivematica_payloads.count end From 9619216efe7e00ffffbfeeac8e43bbb6549ff4f1 Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Tue, 
23 Sep 2025 10:48:55 -0400 Subject: [PATCH 3/4] Fixup note in readme --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 111c0c20..09caf6aa 100644 --- a/README.md +++ b/README.md @@ -383,9 +383,8 @@ Publication Review - Publish) 4. ETD processes output queue to update records and send email to stakeholders with summary data and list of any error records. As of now this is a manual process, but can be triggered via rake task using the following heroku-cli command: - >!IMPORTANT - >Check the AWS Console to ensure the APT Lambda is not inactive before proceeding. - >If you do not, the preservation jobs will retry but the initial email to stakeholders will indicate preservation has failed. + > [!IMPORTANT] + > Check the AWS Console to ensure the APT Lambda is not inactive before proceeding. If you do not, the preservation jobs will retry but the initial email to stakeholders will indicate preservation has failed. ```shell # run the output queue processing job From 30f7a0365b633c36200a909d00e46834be5f8bf8 Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Tue, 23 Sep 2025 16:14:08 -0400 Subject: [PATCH 4/4] Fixup docs around checking lambda status --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 09caf6aa..6cf32dd5 100644 --- a/README.md +++ b/README.md @@ -384,7 +384,7 @@ Publication Review - Publish) of any error records. As of now this is a manual process, but can be triggered via rake task using the following heroku-cli command: > [!IMPORTANT] - > Check the AWS Console to ensure the APT Lambda is not inactive before proceeding. If you do not, the preservation jobs will retry but the initial email to stakeholders will indicate preservation has failed. + > Check the AWS Console to ensure the APT Lambda is active (i.e. not inactive) before proceeding. If you do not, the preservation jobs may fail if they have not run for a few days. 
```shell # run the output queue processing job