From 8a8ed011b61bad18203651850b28320dd55333ae Mon Sep 17 00:00:00 2001 From: jazairi <16103405+jazairi@users.noreply.github.com> Date: Fri, 11 Jul 2025 13:22:39 -0400 Subject: [PATCH 1/3] Wire preservation workflow to Archival Packaging Tool (APT) Why these changes are being introduced: DataEng has developed [APT](https://github.com/MITLibraries/archival-packaging-tool/) as middleware between ETD and Archivematica. This new application handles the BagIt logic, including creating bags in an S3 bucket connected to Archivematica. Thus, much of the SIP logic in ETD is no longer required. Relevant ticket(s): * [ETD-669](https://mitlibraries.atlassian.net/browse/ETD-669) How this addresses that need: This adds an Archivematica Payload model that effectively replaces the SIP model. The new model constructs the payload JSON expected by APT. Instantiations of the model generate and persist this JSON on create, along with the metadata CSV as an ActiveStorage attachment. The other significant change is in the Preservation Submission Job. Previously, this job invoked the Submission Information Package Zipper model to stream a serialized bag to S3. Now, it's responsible for POSTing the JSON data to APT and handling the response. Side effects of this change: * The tests that call APT use webmock and stubbed responses. We would normally use VCR for external API calls, but in this case it doesn't seem prudent to pollute the APT S3 bucket, as it's possible the current test bucket will become the bucket we use. * The SIP model is retained for historical purposes. This is not ideal in terms of maintainability, but it feels important to retain that data, at least for the time being. 
--- Gemfile | 1 + Gemfile.lock | 9 ++ README.md | 27 +++-- app/jobs/preservation_submission_job.rb | 41 +++++-- app/models/archivematica_payload.rb | 101 +++++++++++++++++ app/models/submission_information_package.rb | 4 + .../submission_information_package_zipper.rb | 54 --------- app/models/thesis.rb | 1 + config/environments/test.rb | 5 + ...624182142_create_archivematica_payloads.rb | 13 +++ db/schema.rb | 13 ++- test/jobs/preservation_submission_job_test.rb | 106 +++++++++++++++--- test/models/archivematica_payload_test.rb | 63 +++++++++++ ...mission_information_package_zipper_test.rb | 50 --------- 14 files changed, 353 insertions(+), 135 deletions(-) create mode 100644 app/models/archivematica_payload.rb delete mode 100644 app/models/submission_information_package_zipper.rb create mode 100644 db/migrate/20250624182142_create_archivematica_payloads.rb create mode 100644 test/models/archivematica_payload_test.rb delete mode 100644 test/models/submission_information_package_zipper_test.rb diff --git a/Gemfile b/Gemfile index 66ceee92..289cbdc0 100644 --- a/Gemfile +++ b/Gemfile @@ -36,6 +36,7 @@ gem 'sentry-ruby' gem 'simple_form' gem 'skylight' gem 'terser' +gem 'webmock' gem 'zip_tricks' group :production do diff --git a/Gemfile.lock b/Gemfile.lock index f6647010..8c5b5136 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -154,6 +154,9 @@ GEM cocoon (1.2.15) concurrent-ruby (1.3.5) connection_pool (2.5.3) + crack (1.0.0) + bigdecimal + rexml crass (1.0.6) date (3.4.1) delayed_job (4.1.13) @@ -182,6 +185,7 @@ GEM terminal-table (>= 1.8) globalid (1.2.1) activesupport (>= 6.1) + hashdiff (1.2.0) hashie (5.0.0) i18n (1.14.7) concurrent-ruby (~> 1.0) @@ -445,6 +449,10 @@ GEM activemodel (>= 6.0.0) bindex (>= 0.4.0) railties (>= 6.0.0) + webmock (3.25.1) + addressable (>= 2.8.0) + crack (>= 0.3.2) + hashdiff (>= 0.4.0, < 2.0.0) websocket (1.2.11) websocket-driver (0.7.7) base64 @@ -509,6 +517,7 @@ DEPENDENCIES terser timecop web-console + webmock zip_tricks RUBY 
VERSION diff --git a/README.md b/README.md index d2da0fc0..47593781 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,19 @@ polling by specifying a longer queue wait time. Defaults to 10 if unset. `SQS_RESULT_IDLE_TIMEOUT` - Configures the :idle_timeout arg of the AWS poll method, which specifies the maximum time in seconds to wait for a new message before the polling loop exists. Defaults to 0 if unset. +### Archival Packaging Tool (APT) configuration + +The following enviroment variables are needed to communicate with [APT](https://github.com/MITLibraries/archival-packaging-tool), which is used in the +[preservation workflow](#publishing-workflow). + +`APT_CHALLENGE_SECRET` - Secret value used to authenticate requests to the APT Lambda endpoint. +`APT_VERBOSE` - If set to `true`, enables verbose logging for APT requests. +`APT_CHECKSUMS_TO_GENERATE` - Array of checksum algorithms to generate for files (default: ['md5']). +`APT_COMPRESS_ZIP` - Boolean value to indicate whether the output bag should be compressed as a zip +file (default: true). +`APT_S3_BUCKET` - S3 bucket URI where APT output bags are stored. +`APT_LAMBDA_URL` - The URL of the APT Lambda endpoint for preservation requests. + ### Email configuration `SMTP_ADDRESS`, `SMTP_PASSWORD`, `SMTP_PORT`, `SMTP_USER` - all required to send mail. @@ -397,15 +410,15 @@ Note: `Pending publication` is allowed here, but not expected to be a normal occ ## Preservation workflow The publishing workflow will automatically trigger preservation for all of the published theses in the results queue. -At this point a submission information package is generated for each thesis, then a bag is constructed, zipped, and -streamed to an S3 bucket. (See the SubmissionInformationPackage and SubmissionInformationPackageZipper classes for more -details on this part of the process.) 
-Once they are in the S3 bucket, the bags are automatically replicated to the Digital Preservation S3 bucket, where they -can be ingested into Archivematica. +At this point, the preservation job will generate an Archivematica payload for each thesis, which +are then POSTed to [APT](https://github.com/MITLibraries/archival-packaging-tool) for further processing. Each payload includes a metadata CSV and a JSON object containing structural information about the thesis files. + +Once the payloads are sent to APT, each thesis is structured as a BagIt bag and saved to an S3 +bucket, where they can be ingested into Archivematica. -A thesis can be sent to preservation more than once. In order to track provenance across multiple preservation events, -we persist certain data about the SIP and audit the model using `paper_trail`. +A thesis can be sent to preservation more than once. In order to track provenance across multiple preservation events, we persist certain data about the Archivematica payload and audit the model +using `paper_trail`. ### Preserving a single thesis diff --git a/app/jobs/preservation_submission_job.rb b/app/jobs/preservation_submission_job.rb index df6c83aa..17b1e8af 100644 --- a/app/jobs/preservation_submission_job.rb +++ b/app/jobs/preservation_submission_job.rb @@ -1,4 +1,7 @@ class PreservationSubmissionJob < ActiveJob::Base + require 'net/http' + require 'uri' + queue_as :default def perform(theses) @@ -6,8 +9,8 @@ def perform(theses) results = { total: theses.count, processed: 0, errors: [] } theses.each do |thesis| Rails.logger.info("Thesis #{thesis.id} is now being prepared for preservation") - sip = thesis.submission_information_packages.create! - preserve_sip(sip) + payload = thesis.archivematica_payloads.create! 
+ preserve_payload(payload) Rails.logger.info("Thesis #{thesis.id} has been sent to preservation") results[:processed] += 1 rescue StandardError, Aws::Errors => e @@ -20,10 +23,34 @@ def perform(theses) private - def preserve_sip(sip) - SubmissionInformationPackageZipper.new(sip) - sip.preservation_status = 'preserved' - sip.preserved_at = DateTime.now - sip.save + def preserve_payload(payload) + post_payload(payload) + payload.preservation_status = 'preserved' + payload.preserved_at = DateTime.now + payload.save + end + + def post_payload(payload) + s3_url = ENV.fetch('APT_LAMBDA_URL', nil) + uri = URI.parse(s3_url) + request = Net::HTTP::Post.new(uri, { 'Content-Type' => 'application/json' }) + request.body = payload.payload_json + + response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http| + http.request(request) + end + + validate_response(response) + end + + def validate_response(response) + unless response.is_a?(Net::HTTPSuccess) + raise "Failed to post Archivematica payload to APT: #{response.code} #{response.body}" + end + + result = JSON.parse(response.body) + unless result['success'] == true + raise "APT failed to create a bag: #{response.body}" + end end end diff --git a/app/models/archivematica_payload.rb b/app/models/archivematica_payload.rb new file mode 100644 index 00000000..416d8ed1 --- /dev/null +++ b/app/models/archivematica_payload.rb @@ -0,0 +1,101 @@ +# == Schema Information +# +# Table name: archivematica_payloads +# +# id :integer not null, primary key +# preservation_status :integer default("unpreserved"), not null +# payload_json :text +# preserved_at :datetime +# thesis_id :integer not null +# created_at :datetime not null +# updated_at :datetime not null +# +# This class assembles a payload to send to the Archival Packaging Tool (APT), which then creates a bag for +# preservation. It includes the thesis files, metadata, and checksums. The payload is then serialized to JSON +# for transmission. 
+# +# Instances of this class are invalid without an associated thesis that has a DSpace handle, a copyright, and +# at least one attached file with no duplicate filenames. +# +# There is some intentional duplication between this and the SubmissionInformationPackage model. The +# SubmissionInformationPackage is the legacy model that was used to create the bag, but it is not +# used in the current APT workflow. We are retaining it for historical purposes. +class ArchivematicaPayload < ApplicationRecord + include Checksums + include Baggable + + has_paper_trail + belongs_to :thesis + has_one_attached :metadata_csv + + validates :baggable?, presence: true + + before_create :set_metadata_csv, :set_payload_json + + enum preservation_status: %i[unpreserved preserved] + + private + + # compress_zip is cast to a boolean to override the string value from ENV. APT strictly requires + # a boolean for this field. + def build_payload + { + action: 'create-bagit-zip', + challenge_secret: ENV.fetch('APT_CHALLENGE_SECRET', nil), + verbose: ENV.fetch('APT_VERBOSE', false), + input_files: build_input_files, + checksums_to_generate: ENV.fetch('APT_CHECKSUMS_TO_GENERATE', ['md5']), + output_zip_s3_uri: bag_output_uri, + compress_zip: ActiveModel::Type::Boolean.new.cast(ENV.fetch('APT_COMPRESS_ZIP', true)) + } + end + + # Build input_files array from thesis files and attached metadata CSV + def build_input_files + files = thesis.files.map { |file| build_file_entry(file) } + files << build_file_entry(metadata_csv) # Metadata CSV is the only file that is generated in this model + files + end + + # Build a file entry for each file, including the metadata CSV. + def build_file_entry(file) + { + uri: ["s3://#{ENV.fetch('AWS_S3_BUCKET')}", file.blob.key].join('/'), + filepath: set_filepath(file), + checksums: { + md5: base64_to_hex(file.blob.checksum) + } + } + end + + def set_filepath(file) + file == metadata_csv ? 
'metadata/metadata.csv' : file.filename.to_s + end + + # The bag_name has to be unique due to our using it as the basis of an ActiveStorage key. Using a UUID + # was not preferred as the target system of these bags adds it's own UUID to the file when it arrives there + # so the filename was unwieldy with two UUIDs embedded in it so we simply increment integers. + def bag_name + safe_handle = thesis.dspace_handle.gsub('/', '_') + "#{safe_handle}-thesis-#{thesis.submission_information_packages.count + 1}" + end + + # The bag_output_uri key is constructed to match the expected format for Archivematica. + def bag_output_uri + key = "etdsip/#{thesis.graduation_year}/#{thesis.graduation_month}-#{thesis.accession_number}/#{bag_name}.zip" + [ENV.fetch('APT_S3_BUCKET'), key].join('/') + end + + def baggable? + baggable_thesis?(thesis) + end + + def set_metadata_csv + csv_data = ArchivematicaMetadata.new(thesis).to_csv + metadata_csv.attach(io: StringIO.new(csv_data), filename: 'metadata.csv', content_type: 'text/csv') + end + + def set_payload_json + self.payload_json = build_payload.to_json + end +end diff --git a/app/models/submission_information_package.rb b/app/models/submission_information_package.rb index 893d0699..f440e5dc 100644 --- a/app/models/submission_information_package.rb +++ b/app/models/submission_information_package.rb @@ -14,6 +14,10 @@ # updated_at :datetime not null # +# This model is no longer used, but it is retained for historical purposes and to preserve existing +# data. Its functionality has been replaced by the ArchivematicaPayload model, which is used in the +# current preservation workflow. +# # Creates the structure for an individual thesis to be preserved in Archivematica according to the BagIt spec: # https://datatracker.ietf.org/doc/html/rfc8493. 
# diff --git a/app/models/submission_information_package_zipper.rb b/app/models/submission_information_package_zipper.rb deleted file mode 100644 index bdf179df..00000000 --- a/app/models/submission_information_package_zipper.rb +++ /dev/null @@ -1,54 +0,0 @@ -# SubmissionInformationPackageZipper creates a temporary zip file containing a bag and then attaches it via -# ActiveStorage. The `keygen` method creates the `path` to the file in S3. We are relying on S3 Replication to copy -# the objects from the ETD bucket to the Archivematica bucket. There is no automatic communication back from S3 or -# Archivematica to ETD, so we treat the successful save of the zip file as `preserved`. -class SubmissionInformationPackageZipper - def initialize(sip) - Tempfile.create([sip.bag_name.to_s, '.zip'], binmode: true) do |tmpfile| - bagamatic(sip, tmpfile) - - sip.bag.attach(io: File.open(tmpfile), - key: keygen(sip), - filename: "#{sip.bag_name}.zip", - content_type: 'application/zip') - end - end - - private - - # This key needs to be unique. By default, ActiveStorage generates a UUID, but since we want a file path for our - # Archivematica needs, we are generating a key. We handle uniqueness on the `bag_name` side. - def keygen(sip) - thesis = sip.thesis - "etdsip/#{thesis.graduation_year}/#{thesis.graduation_month}-#{thesis.accession_number}/#{sip.bag_name}.zip" - end - - # bagamatic takes a sip, creates a temporary zip file, and returns that file - def bagamatic(sip, tmpfile) - ZipTricks::Streamer.open(tmpfile) do |zip| - # bag manifest - zip.write_deflated_file('manifest-md5.txt') do |sink| - sink << sip.manifest - end - - # bag_declaration - zip.write_deflated_file('bagit.txt') do |sink| - sink << sip.bag_declaration - end - - # files. 
metadata.csv is just a string so we have to treat it differently than the binary stored files - sip.data.each do |data| - if data[1].is_a?(String) - zip.write_deflated_file(data[0]) do |sink| - sink << data[1] - end - else - zip.write_stored_file(data[0]) do |sink| - sink << data[1].download - end - end - end - end - tmpfile.close - end -end diff --git a/app/models/thesis.rb b/app/models/thesis.rb index 1c2ee202..2c2b7390 100644 --- a/app/models/thesis.rb +++ b/app/models/thesis.rb @@ -48,6 +48,7 @@ class Thesis < ApplicationRecord has_many :users, through: :authors has_many :submission_information_packages, dependent: :destroy + has_many :archivematica_payloads, dependent: :destroy has_many_attached :files has_one_attached :dspace_metadata diff --git a/config/environments/test.rb b/config/environments/test.rb index 0324c05f..03a8489b 100644 --- a/config/environments/test.rb +++ b/config/environments/test.rb @@ -40,9 +40,14 @@ ENV['SQS_RESULT_WAIT_TIME_SECONDS'] = '10' ENV['SQS_RESULT_IDLE_TIMEOUT'] = '0' ENV['AWS_REGION'] = 'us-east-1' + ENV['AWS_S3_BUCKET'] = 'fake-etd-bucket' ENV['DSPACE_DOCTORAL_HANDLE'] = '1721.1/999999' ENV['DSPACE_GRADUATE_HANDLE'] = '1721.1/888888' ENV['DSPACE_UNDERGRADUATE_HANDLE'] = '1721.1/777777' + ENV['APT_CHALLENGE_SECRET'] = 'fake-challenge-secret' + ENV['APT_S3_BUCKET'] = 's3://fake-apt-bucket' + ENV['APT_LAMBDA_URL'] = 'https://fake-lambda.example.com/' + ENV['APT_COMPRESS_ZIP'] = 'true' # While tests run files are not watched, reloading is not necessary. 
config.enable_reloading = false diff --git a/db/migrate/20250624182142_create_archivematica_payloads.rb b/db/migrate/20250624182142_create_archivematica_payloads.rb new file mode 100644 index 00000000..12ee7d29 --- /dev/null +++ b/db/migrate/20250624182142_create_archivematica_payloads.rb @@ -0,0 +1,13 @@ +class CreateArchivematicaPayloads < ActiveRecord::Migration[7.1] + def change + create_table :archivematica_payloads do |t| + t.integer :preservation_status, null: false, default: 0 + t.text :payload_json + t.datetime :preserved_at + + t.references :thesis, null: false, foreign_key: true + + t.timestamps + end + end +end diff --git a/db/schema.rb b/db/schema.rb index 1d1610c4..08e8f3a7 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.1].define(version: 2025_02_21_202836) do +ActiveRecord::Schema[7.1].define(version: 2025_06_24_182142) do create_table "active_storage_attachments", force: :cascade do |t| t.string "name", null: false t.string "record_type", null: false @@ -65,6 +65,16 @@ t.index ["degree_period_id"], name: "index_archivematica_accessions_on_degree_period_id", unique: true end + create_table "archivematica_payloads", force: :cascade do |t| + t.integer "preservation_status", default: 0, null: false + t.text "payload_json" + t.datetime "preserved_at" + t.integer "thesis_id", null: false + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["thesis_id"], name: "index_archivematica_payloads_on_thesis_id" + end + create_table "authors", force: :cascade do |t| t.integer "user_id", null: false t.integer "thesis_id", null: false @@ -290,6 +300,7 @@ add_foreign_key "active_storage_attachments", "active_storage_blobs", column: "blob_id" add_foreign_key "active_storage_variant_records", "active_storage_blobs", column: "blob_id" add_foreign_key "archivematica_accessions", "degree_periods" + 
add_foreign_key "archivematica_payloads", "theses" add_foreign_key "authors", "theses" add_foreign_key "authors", "users" add_foreign_key "degrees", "degree_types" diff --git a/test/jobs/preservation_submission_job_test.rb b/test/jobs/preservation_submission_job_test.rb index 55108272..a4f4a26e 100644 --- a/test/jobs/preservation_submission_job_test.rb +++ b/test/jobs/preservation_submission_job_test.rb @@ -1,4 +1,5 @@ require 'test_helper' +require 'webmock/minitest' class PreservationSubmissionJobTest < ActiveJob::TestCase @@ -16,7 +17,55 @@ def setup_thesis thesis end - test 'sends report emails' do + def stub_apt_lambda_success + stub_request(:post, ENV['APT_LAMBDA_URL']) + .to_return( + status: 200, + body: { + success: true, + bag: { + entries: { + "manifest-md5.txt" => { md5: "fakehash" } + } + }, + output_zip_s3_uri: "s3://my-bucket/apt-testing/test-one-medium.zip" + }.to_json, + headers: { 'Content-Type' => 'application/json' } + ) + end + + def stub_apt_lambda_failure + stub_request(:post, ENV['APT_LAMBDA_URL']) + .to_return( + status: 500, + headers: { 'Content-Type' => 'application/json' } + ) + end + + def stub_apt_lambda_200_failure + stub_request(:post, ENV['APT_LAMBDA_URL']) + .to_return( + status: 200, + body: { + success: false, + error: "An error occurred (404) when calling the HeadObject operation: Not Found", + bag: { entries: {}}, + }.to_json, + headers: { 'Content-Type' => 'application/json' } + ) + end + + test 'sends report emails on success' do + stub_apt_lambda_success + ClimateControl.modify DISABLE_ALL_EMAIL: 'false' do + assert_difference('ActionMailer::Base.deliveries.size', 1) do + PreservationSubmissionJob.perform_now([setup_thesis]) + end + end + end + + test 'sends report emails on failure' do + stub_apt_lambda_success ClimateControl.modify DISABLE_ALL_EMAIL: 'false' do assert_difference('ActionMailer::Base.deliveries.size', 1) do PreservationSubmissionJob.perform_now([setup_thesis]) @@ -24,60 +73,85 @@ def setup_thesis end end - 
test 'creates a SIP' do + test 'sends report email if post succeeds but APT fails' do + stub_apt_lambda_200_failure + ClimateControl.modify DISABLE_ALL_EMAIL: 'false' do + assert_difference('ActionMailer::Base.deliveries.size', 1) do + PreservationSubmissionJob.perform_now([setup_thesis]) + end + end + end + + test 'creates an Archivematica payload' do + stub_apt_lambda_success thesis = setup_thesis - assert_equal 0, thesis.submission_information_packages.count + assert_equal 0, thesis.archivematica_payloads.count PreservationSubmissionJob.perform_now([thesis]) - assert_equal 1, thesis.submission_information_packages.count + assert_equal 1, thesis.archivematica_payloads.count end - test 'creates multiple SIPs' do + test 'creates multiple Archivematica payloads' do + stub_apt_lambda_success thesis_one = setup_thesis thesis_two = theses(:engineer) - assert_equal 0, thesis_one.submission_information_packages.count - assert_equal 0, thesis_two.submission_information_packages.count + assert_equal 0, thesis_one.archivematica_payloads.count + assert_equal 0, thesis_two.archivematica_payloads.count theses = [thesis_one, thesis_two] PreservationSubmissionJob.perform_now(theses) - assert_equal 1, thesis_one.submission_information_packages.count - assert_equal 1, thesis_two.submission_information_packages.count + assert_equal 1, thesis_one.archivematica_payloads.count + assert_equal 1, thesis_two.archivematica_payloads.count PreservationSubmissionJob.perform_now(theses) - assert_equal 2, thesis_one.submission_information_packages.count - assert_equal 2, thesis_two.submission_information_packages.count + assert_equal 2, thesis_one.archivematica_payloads.count + assert_equal 2, thesis_two.archivematica_payloads.count end test 'updates preservation_status to "preserved" after successfully processing a thesis' do + stub_apt_lambda_success thesis = setup_thesis PreservationSubmissionJob.perform_now([thesis]) - assert_equal 'preserved', 
thesis.submission_information_packages.last.preservation_status + assert_equal 'preserved', thesis.archivematica_payloads.last.preservation_status end test 'updates preserved_at to the current time after successfully processing a thesis' do + stub_apt_lambda_success time = DateTime.new.getutc Timecop.freeze(time) do thesis = setup_thesis PreservationSubmissionJob.perform_now([thesis]) - assert_equal time, thesis.submission_information_packages.last.preserved_at + assert_equal time, thesis.archivematica_payloads.last.preserved_at end end test 'throws exceptions when a thesis is unbaggable' do + stub_apt_lambda_failure assert_raises StandardError do PreservationSubmissionJob.perform_now([theses[:one]]) end + stub_apt_lambda_success assert_nothing_raised do PreservationSubmissionJob.perform_now([setup_thesis]) end end - test 'does not create a SIP if the job enters an error state' do + test 'does not create payload if job fails' do + stub_apt_lambda_failure + thesis = theses(:one) + assert_empty thesis.archivematica_payloads + + PreservationSubmissionJob.perform_now([thesis]) + assert_empty thesis.archivematica_payloads + end + + test 'does not create payload if post succeeds but APT fails' do + stub_apt_lambda_200_failure thesis = theses(:one) - assert_empty thesis.submission_information_packages + assert_empty thesis.archivematica_payloads PreservationSubmissionJob.perform_now([thesis]) - assert_empty thesis.submission_information_packages + assert_empty thesis.archivematica_payloads end end diff --git a/test/models/archivematica_payload_test.rb b/test/models/archivematica_payload_test.rb new file mode 100644 index 00000000..6de45836 --- /dev/null +++ b/test/models/archivematica_payload_test.rb @@ -0,0 +1,63 @@ +# == Schema Information +# +# Table name: archivematica_payloads +# +# id :integer not null, primary key +# preservation_status :integer default("unpreserved"), not null +# payload_json :text +# preserved_at :datetime +# thesis_id :integer not null +# 
created_at :datetime not null +# updated_at :datetime not null +# +require 'test_helper' + +class ArchivematicaPayloadTest < ActiveSupport::TestCase + def setup + @thesis = theses(:published) + @payload = @thesis.archivematica_payloads.create! + end + + test 'new payload has default preservation_status of unpreserved' do + assert_equal 'unpreserved', @payload.preservation_status + end + + test 'payload_json exists and contains valid JSON' do + assert @payload.payload_json.present? + + json = @payload.payload_json + assert json.is_a?(String) # Payload JSON should be a string + + parsed = JSON.parse(json) + assert parsed.is_a?(Hash) # Parsed JSON should be a valid hash + end + + test 'payload_json contains expected structure' do + parsed = JSON.parse(@payload.payload_json) + assert_equal parsed['action'], 'create-bagit-zip' + assert parsed['challenge_secret'] + assert_not_nil parsed['verbose'] + assert parsed['input_files'].is_a?(Array) + assert_equal ['md5'], parsed['checksums_to_generate'] + assert parsed['output_zip_s3_uri'].end_with?('.zip') + assert parsed['compress_zip'].in?([true, false]) + # Check S3 URI format + parsed['input_files'].each do |file| + assert_match %r{^s3://#{ENV.fetch('AWS_S3_BUCKET', nil)}/}, file['uri'] + end + end + + test 'input_files includes thesis files and metadata_csv' do + parsed = JSON.parse(@payload.payload_json) + thesis_files = @thesis.files.count + assert_equal thesis_files + 1, parsed['input_files'].size + assert_equal @thesis.files.first.filename.to_s, parsed['input_files'].first['filepath'] + assert_equal 'metadata/metadata.csv', parsed['input_files'].last['filepath'] + end + + test 'metadata_csv is attached and has correct filename/content type' do + assert @payload.metadata_csv.attached? 
+ assert_equal 'metadata.csv', @payload.metadata_csv.filename.to_s + assert_equal 'text/csv', @payload.metadata_csv.content_type + end +end diff --git a/test/models/submission_information_package_zipper_test.rb b/test/models/submission_information_package_zipper_test.rb deleted file mode 100644 index 2857178f..00000000 --- a/test/models/submission_information_package_zipper_test.rb +++ /dev/null @@ -1,50 +0,0 @@ -require 'test_helper' - -class SubmissionInformationPackageZipperTest < ActiveSupport::TestCase - - # because we need to actually use the file it's easier to attach it in the test rather - # than use our fixtures as the fixtures oddly don't account for the file actually being - # where ActiveStorage expects them to be. We also need this to be a record that looks like - # a published record so we'll use the published fixture, remove the fixtured files, and attach - # one again. - def setup_thesis - thesis = theses(:published) - thesis.files = [] - thesis.save - file = Rails.root.join('test', 'fixtures', 'files', 'registrar_data_small_sample.csv') - thesis.files.attach(io: File.open(file), filename: 'registrar_data_small_sample.csv') - thesis - end - - test 'sip has an attached zipped bag' do - thesis = setup_thesis - sip = thesis.submission_information_packages.create - SubmissionInformationPackageZipper.new(sip) - - assert_equal("application/zip", thesis.submission_information_packages.first.bag.blob.content_type) - end - - # Failure to properly close the handles can allow the zip creation to appear correct but actually be invalid - # this is a regression test to ensure we avoid that situation - # You can see this test fail by removing the `tmpfile.close` from the end of bagamatic. 
- test 'zip is valid' do - thesis = setup_thesis - sip = thesis.submission_information_packages.create - SubmissionInformationPackageZipper.new(sip) - - blob = thesis.submission_information_packages.last.bag.blob - Zip::File.open(blob.service.send(:path_for, blob.key)) do |zipfile| - assert_nil(zipfile.find_entry("file_not_in_zipfile.txt")) - assert_equal('manifest-md5.txt', zipfile.find_entry("manifest-md5.txt").to_s) - end - end - - test 'sip has the correct key' do - thesis = setup_thesis - sip = thesis.submission_information_packages.create - SubmissionInformationPackageZipper.new(sip) - - blob = thesis.submission_information_packages.last.bag.blob - assert blob.key.starts_with? "etdsip/#{thesis.graduation_year}/#{thesis.graduation_month}-#{thesis.accession_number}" - end -end From d32eabc6b8e3c64294b145ab5c5f81c171ddf413 Mon Sep 17 00:00:00 2001 From: jazairi <16103405+jazairi@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:50:39 -0400 Subject: [PATCH 2/3] Address code review feedback --- app/jobs/preservation_submission_job.rb | 2 +- test/jobs/preservation_submission_job_test.rb | 53 +++++++++++++++---- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/app/jobs/preservation_submission_job.rb b/app/jobs/preservation_submission_job.rb index 17b1e8af..13ab5f1c 100644 --- a/app/jobs/preservation_submission_job.rb +++ b/app/jobs/preservation_submission_job.rb @@ -27,7 +27,7 @@ def preserve_payload(payload) post_payload(payload) payload.preservation_status = 'preserved' payload.preserved_at = DateTime.now - payload.save + payload.save! 
end def post_payload(payload) diff --git a/test/jobs/preservation_submission_job_test.rb b/test/jobs/preservation_submission_job_test.rb index a4f4a26e..66aee414 100644 --- a/test/jobs/preservation_submission_job_test.rb +++ b/test/jobs/preservation_submission_job_test.rb @@ -17,6 +17,15 @@ def setup_thesis thesis end + def setup_thesis_two + thesis = theses(:published_with_sip) + thesis.files = [] + thesis.save + file = Rails.root.join('test', 'fixtures', 'files', 'registrar_data_small_sample.csv') + thesis.files.attach(io: File.open(file), filename: 'registrar_data_small_sample.csv') + thesis + end + def stub_apt_lambda_success stub_request(:post, ENV['APT_LAMBDA_URL']) .to_return( @@ -137,21 +146,45 @@ def stub_apt_lambda_200_failure end end - test 'does not create payload if job fails' do + test 'does not create payloads if job fails' do stub_apt_lambda_failure - thesis = theses(:one) - assert_empty thesis.archivematica_payloads + bad_thesis = theses(:one) + good_thesis = setup_thesis + another_good_thesis = setup_thesis_two + assert_empty bad_thesis.archivematica_payloads + assert_empty good_thesis.archivematica_payloads + assert_empty another_good_thesis.archivematica_payloads - PreservationSubmissionJob.perform_now([thesis]) - assert_empty thesis.archivematica_payloads + PreservationSubmissionJob.perform_now([good_thesis, bad_thesis, another_good_thesis]) + + # first thesis should succeed and have a payload + assert_equal 1, good_thesis.archivematica_payloads.count + + # second thesis should fail and not have a payload + assert_empty bad_thesis.archivematica_payloads + + # third thesis should succeed and have a payload, despite prior failure + assert_equal 1, another_good_thesis.archivematica_payloads.count end - test 'does not create payload if post succeeds but APT fails' do + test 'does not create payloads if post succeeds but APT fails' do stub_apt_lambda_200_failure - thesis = theses(:one) - assert_empty thesis.archivematica_payloads + bad_thesis = 
theses(:one) + good_thesis = setup_thesis + another_good_thesis = setup_thesis_two + assert_empty bad_thesis.archivematica_payloads + assert_empty good_thesis.archivematica_payloads + assert_empty another_good_thesis.archivematica_payloads - PreservationSubmissionJob.perform_now([thesis]) - assert_empty thesis.archivematica_payloads + PreservationSubmissionJob.perform_now([good_thesis, bad_thesis, another_good_thesis]) + + # first thesis should succeed and have a payload + assert_equal 1, good_thesis.archivematica_payloads.count + + # second thesis should fail and not have a payload + assert_empty bad_thesis.archivematica_payloads + + # third thesis should succeed and have a payload, despite prior failure + assert_equal 1, another_good_thesis.archivematica_payloads.count end end From 04047b76aebc3e631beb37db995e7724aa0c47e6 Mon Sep 17 00:00:00 2001 From: jazairi <16103405+jazairi@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:43:50 -0400 Subject: [PATCH 3/3] Potential fix for TypeError --- app/models/archivematica_payload.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/archivematica_payload.rb b/app/models/archivematica_payload.rb index 416d8ed1..91efdf23 100644 --- a/app/models/archivematica_payload.rb +++ b/app/models/archivematica_payload.rb @@ -42,7 +42,7 @@ def build_payload { action: 'create-bagit-zip', challenge_secret: ENV.fetch('APT_CHALLENGE_SECRET', nil), - verbose: ENV.fetch('APT_VERBOSE', false), + verbose: ActiveModel::Type::Boolean.new.cast(ENV.fetch('APT_VERBOSE', false)), input_files: build_input_files, checksums_to_generate: ENV.fetch('APT_CHECKSUMS_TO_GENERATE', ['md5']), output_zip_s3_uri: bag_output_uri,