From c9f728bc4bfa941e3d0d59d86ae1523daf6d9608 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:03 -0500 Subject: [PATCH 01/97] add checksum URI values and methods --- .../edu/harvard/iq/dataverse/DataFile.java | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java index 45604a5472b..8a08cd15029 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java @@ -109,18 +109,22 @@ public class DataFile extends DvObject implements Comparable { * The list of types should be limited to the list above in the technote * because the string gets passed into MessageDigest.getInstance() and you * can't just pass in any old string. + * + * The URIs are used in the OAI_ORE export. They are taken from the associated XML Digital Signature standards. */ public enum ChecksumType { - MD5("MD5"), - SHA1("SHA-1"), - SHA256("SHA-256"), - SHA512("SHA-512"); + MD5("MD5", "http://www.w3.org/2001/04/xmldsig-more#md5"), + SHA1("SHA-1", "http://www.w3.org/2000/09/xmldsig#sha1"), + SHA256("SHA-256", "http://www.w3.org/2001/04/xmlenc#sha256"), + SHA512("SHA-512", "http://www.w3.org/2001/04/xmlenc#sha512"); private final String text; + private final String uri; - private ChecksumType(final String text) { + private ChecksumType(final String text, final String uri) { this.text = text; + this.uri = uri; } public static ChecksumType fromString(String text) { @@ -131,13 +135,30 @@ public static ChecksumType fromString(String text) { } } } - throw new IllegalArgumentException("ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + } + + public static ChecksumType fromUri(String uri) { + if (uri != null) { + for 
(ChecksumType checksumType : ChecksumType.values()) { + if (uri.equals(checksumType.uri)) { + return checksumType; + } + } + } + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); } @Override public String toString() { return text; } + + public String toUri() { + return uri; + } } //@Expose From a25e47b12cdd4fcb0050a69f0119e9abf4c59183 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:24 -0500 Subject: [PATCH 02/97] update version and use checksum URIs --- src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 4cbc2aa7b9a..aa011e2c70a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -49,7 +49,7 @@ public class OREMap { public static final String NAME = "OREMap"; //NOTE: Update this value whenever the output of this class is changed - private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.1"; + private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.2"; //v1.0.1 - added versionNote private static final String DATAVERSE_SOFTWARE_NAME = "Dataverse"; private static final String DATAVERSE_SOFTWARE_URL = "https://github.com/iqss/dataverse"; @@ -280,7 +280,7 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { JsonObject checksum = null; // Add checksum. 
RDA recommends SHA-512 if (df.getChecksumType() != null && df.getChecksumValue() != null) { - checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toString()) + checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toUri()) .add("@value", df.getChecksumValue()).build(); aggRes.add(JsonLDTerm.checksum.getLabel(), checksum); } From 6c0cb49513f7748cf6cf026d0b9892005820fbb5 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:48 -0500 Subject: [PATCH 03/97] handle multiline descriptions and org names --- .../iq/dataverse/util/bagit/BagGenerator.java | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f24ebdb8655..69e9c686133 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -548,7 +548,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromString( + ChecksumType childHashType = ChecksumType.fromUri( child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { //If one wasn't set as a default, pick up what the first child with one uses @@ -828,7 +828,7 @@ private String generateInfoFile() { // ToDo - make configurable info.append(CRLF); - info.append("Organization-Address: " + WordUtils.wrap(orgAddress, 78, CRLF + " ", true)); + info.append("Organization-Address: " + multilineWrap(orgAddress)); info.append(CRLF); @@ -846,10 +846,8 @@ private String generateInfoFile() { if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append( - // FixMe - handle description having subfields better - 
WordUtils.wrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), - descriptionTextTerm.getLabel()), 78, CRLF + " ", true)); + info.append(multilineWrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), + descriptionTextTerm.getLabel()))); info.append(CRLF); } @@ -883,6 +881,20 @@ private String generateInfoFile() { } + private String multilineWrap(String value) { + // Normalize line breaks and ensure all lines after the first are indented + String[] lines =value.split("\\r?\\n"); + StringBuilder wrappedValue = new StringBuilder(); + for (int i = 0; i < lines.length; i++) { + String wrapped = WordUtils.wrap(lines[i].trim(), 78, CRLF + " ", true); + wrappedValue.append(wrapped); + if (i < lines.length - 1) { + wrappedValue.append(CRLF).append(" "); + } + } + return wrappedValue.toString(); + } + /** * Kludge - compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. From 7a34db8078b4f1605968163bf839267bdd9e5d19 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:01:09 -0500 Subject: [PATCH 04/97] drop blank lines in multiline values Spec doesn't allow empty lines, dropping whitespace-only lines seems reasonable as well (users can't see from the Dataverse display whether an empty line would appear in bag-info.txt or not if we allow whitespace-only lines (or whitespace beyond the 78 char wrap limit) --- .../iq/dataverse/util/bagit/BagGenerator.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 69e9c686133..cf5bea08d99 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -886,10 +886,15 @@ private String multilineWrap(String value) { String[] lines =value.split("\\r?\\n");
StringBuilder wrappedValue = new StringBuilder(); for (int i = 0; i < lines.length; i++) { - String wrapped = WordUtils.wrap(lines[i].trim(), 78, CRLF + " ", true); - wrappedValue.append(wrapped); - if (i < lines.length - 1) { - wrappedValue.append(CRLF).append(" "); + // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, + // While trailing whitespace or whitespace-only lines appear to be allowed, it's not clear that handling them adds value (visually identical entries in Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt file + String line = lines[i].trim(); + if (line.length() > 0) { + String wrapped = WordUtils.wrap(line, 78, CRLF + " ", true); + wrappedValue.append(wrapped); + if (i < lines.length - 1) { + wrappedValue.append(CRLF).append(" "); + } } } return wrappedValue.toString(); From b0daad7393a5663b5244ac89e04b0de9c630f9bf Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:02:01 -0500 Subject: [PATCH 05/97] remove title as a folder affects manifest and pid-mapping files as well as data file placement --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index cf5bea08d99..31ae06677c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -482,14 +482,6 @@ public static String getValidName(String bagName) { private void processContainer(JsonObject item, String currentPath) throws IOException { JsonArray children = getChildren(item); HashSet titles = new HashSet(); - String title = null; - if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { - title = item.get("Title").getAsString(); - } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { - title = 
item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); - } - logger.fine("Adding " + title + "/ to path " + currentPath); - currentPath = currentPath + title + "/"; int containerIndex = -1; try { createDir(currentPath); From e5457a8026f4e2e311b2ef84bea7d60f9f8020b4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:02:19 -0500 Subject: [PATCH 06/97] handle null deaccession reason --- src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index aa011e2c70a..426d5c9aa5f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -130,7 +130,8 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { if(vs.equals(VersionState.DEACCESSIONED)) { JsonObjectBuilder deaccBuilder = Json.createObjectBuilder(); deaccBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), vs.name()); - deaccBuilder.add(JsonLDTerm.DVCore("reason").getLabel(), version.getDeaccessionNote()); + // Reason is supposed to not be null, but historically this has not been enforced (in the API) + addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("reason"), version.getDeaccessionNote()); addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("forwardUrl"), version.getDeaccessionLink()); aggBuilder.add(JsonLDTerm.schemaOrg("creativeWorkStatus").getLabel(), deaccBuilder); From 10b0556e1de1c52a9a9cf9a32c9a3c07582ce60a Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 10 Dec 2025 09:55:50 -0500 Subject: [PATCH 07/97] use static to simplify testing --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java 
b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 31ae06677c3..4f3d0e00280 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -35,7 +35,6 @@ import java.util.logging.Logger; import java.util.zip.ZipEntry; -import edu.harvard.iq.dataverse.util.BundleUtil; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.archivers.zip.ParallelScatterZipCreator; import org.apache.commons.compress.archivers.zip.ScatterZipOutputStream; @@ -77,7 +76,6 @@ import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; -import java.util.Optional; public class BagGenerator { @@ -873,7 +871,7 @@ private String generateInfoFile() { } - private String multilineWrap(String value) { + static private String multilineWrap(String value) { // Normalize line breaks and ensure all lines after the first are indented String[] lines =value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); From 6d241851d8860ddde6d6b1aac952c12ea426eb62 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 10 Dec 2025 13:49:17 -0500 Subject: [PATCH 08/97] Sanitize/split multiline catalog entry, add Dataverse-Bag-Version --- .../iq/dataverse/util/bagit/BagGenerator.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 4f3d0e00280..122ca0b6aba 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -77,6 +77,15 @@ import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; import 
edu.harvard.iq.dataverse.util.json.JsonLDTerm; +/** + * Creates an archival zipped Bag for long-term storage. It is intended to + * include all the information needed to reconstruct the dataset version in a + * new Dataverse instance. + * + * Note that the Dataverse-Bag-Version written in the generateInfoFile() method + * should be updated any time the content/structure of the bag is changed. + * + */ public class BagGenerator { private static final Logger logger = Logger.getLogger(BagGenerator.class.getCanonicalName()); @@ -864,9 +873,13 @@ private String generateInfoFile() { if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString()); + catalog=catalog.trim().replaceAll("[\\r\\n:]","_"); + info.append(catalog + ":" + multilineWrap(aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); + //Add a version number for our bag type - should be updated with any change to the bag content/structure + info.append("Dataverse-Bag-Version: 1.0"); + info.append(CRLF); return info.toString(); } From c4daf28099d4f91705edbe94efcaeecf229ff274 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:00:07 +0100 Subject: [PATCH 09/97] Added unit tests for multilineWrap --- .../bagit/BagGeneratorMultilineWrapTest.java | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java new file mode 100644 index 00000000000..39a713c14e4 --- /dev/null +++ 
b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -0,0 +1,102 @@ +package edu.harvard.iq.dataverse.util.bagit; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * Tests adapted for DD-2093: verify the behavior of BagGenerator.multilineWrap. + */ +public class BagGeneratorMultilineWrapTest { + + private static Method multilineWrap; + + @BeforeAll + static void setUp() throws NoSuchMethodException { + // Access the private static method via reflection + multilineWrap = BagGenerator.class.getDeclaredMethod("multilineWrap", String.class); + multilineWrap.setAccessible(true); + } + + private String callMultilineWrap(String input) { + try { + return (String) multilineWrap.invoke(null, input); + } catch (IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException(e); + } + } + + @Test + void shortLine_noWrap() { + String input = "Hello world"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo("Hello world"); + } + + @Test + void exactBoundary_78chars_noWrap() { + String input = repeat('a', 78); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(input); + } + + @Test + void longSingleWord_wrapsAt78WithIndent() { + String input = repeat('a', 100); + String expected = repeat('a', 78) + "\r\n " + repeat('a', 22); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_input_indentsSecondAndSubsequentOriginalLines() { + String input = "Line1\nLine2"; + String expected = "Line1\r\n Line2"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withCRLF_normalizedAndIndented() { + String input = "First line\r\nSecond line"; + String expected = "First line\r\n Second line"; + String out 
= callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void emptyLines_trimmedAndSkipped() { + String input = "Line1\n\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void whitespaceOnlyLines_ignored() { + String input = "Line1\n \n\t\t\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void longSecondLine_preservesIndentOnWraps() { + String line1 = "Header"; + String line2 = repeat('b', 90); + String input = line1 + "\n" + line2; + String expected = "Header\r\n " + repeat('b', 78) + "\r\n " + repeat('b', 12); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + private static String repeat(char c, int n) { + StringBuilder sb = new StringBuilder(n); + for (int i = 0; i < n; i++) sb.append(c); + return sb.toString(); + } +} From e76bc9135fabbbdd4cb79f8fea7ed98e518f57f8 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:09:00 +0100 Subject: [PATCH 10/97] Removed unnecessary repeat helper method --- .../bagit/BagGeneratorMultilineWrapTest.java | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index 39a713c14e4..a212cac6316 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -39,15 +39,15 @@ void shortLine_noWrap() { @Test void exactBoundary_78chars_noWrap() { - String input = repeat('a', 78); + String input = "a".repeat(78); String out = callMultilineWrap(input); assertThat(out).isEqualTo(input); } @Test void longSingleWord_wrapsAt78WithIndent() { - String 
input = repeat('a', 100); - String expected = repeat('a', 78) + "\r\n " + repeat('a', 22); + String input = "a".repeat(100); + String expected = "a".repeat(78) + "\r\n " + "a".repeat(22); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -87,16 +87,10 @@ void whitespaceOnlyLines_ignored() { @Test void longSecondLine_preservesIndentOnWraps() { String line1 = "Header"; - String line2 = repeat('b', 90); + String line2 = "b".repeat(90); String input = line1 + "\n" + line2; - String expected = "Header\r\n " + repeat('b', 78) + "\r\n " + repeat('b', 12); + String expected = "Header\r\n " + "b".repeat(78) + "\r\n " + "b".repeat(12); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } - - private static String repeat(char c, int n) { - StringBuilder sb = new StringBuilder(n); - for (int i = 0; i < n; i++) sb.append(c); - return sb.toString(); - } } From 108c912ee037d23456650e6d5c49c5a943d5ef42 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:17:42 +0100 Subject: [PATCH 11/97] Aligned test names with actual test being done --- .../util/bagit/BagGeneratorMultilineWrapTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index a212cac6316..71ceec61adf 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -54,15 +54,15 @@ void longSingleWord_wrapsAt78WithIndent() { @Test void multiline_input_indentsSecondAndSubsequentOriginalLines() { - String input = "Line1\nLine2"; - String expected = "Line1\r\n Line2"; + String input = "Line1\nLine2\nLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; String out = callMultilineWrap(input);
assertThat(out).isEqualTo(expected); } @Test - void multiline_withCRLF_normalizedAndIndented() { - String input = "First line\r\nSecond line"; + void multiline_withLF_normalizedAndIndented() { + String input = "First line\nSecond line"; String expected = "First line\r\n Second line"; String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); From 884b81b2f0f4aa951d38b18ce8f832643275c542 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 16 Dec 2025 09:25:50 -0500 Subject: [PATCH 12/97] DD-2098 - allow archivalstatus calls on deaccessioned versions --- src/main/java/edu/harvard/iq/dataverse/api/Datasets.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 2378388c540..12dd984775d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -5006,7 +5006,7 @@ public Response getDatasetVersionArchivalStatus(@Context ContainerRequestContext } DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers); + headers, true); if (dsv.getArchivalCopyLocation() == null) { return error(Status.NOT_FOUND, "This dataset version has not been archived"); @@ -5048,7 +5048,7 @@ public Response setDatasetVersionArchivalStatus(@Context ContainerRequestContext DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), - uriInfo, headers); + uriInfo, headers, true); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); @@ -5095,7 +5095,7 @@ public Response deleteDatasetVersionArchivalStatus(@Context ContainerRequestCont DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, 
findDatasetOrDie(datasetId), uriInfo, - headers); + headers, true); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); } From 3076d69b2074326aee55d5d050b8c7628bdaee92 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 17 Dec 2025 15:36:16 -0500 Subject: [PATCH 13/97] set array properly --- .../java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 122ca0b6aba..473e2bab034 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -763,7 +763,6 @@ private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); - JsonArray contactsArray = new JsonArray(); /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change * so we need to find the labels used. 
*/ @@ -775,6 +774,7 @@ private String generateInfoFile() { JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); if (contacts.isJsonArray()) { + JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { info.append("Contact-Name: "); JsonElement person = contactsArray.get(i); From 1a7dafa9bb71412361890d519af21a9549b7f4da Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 09:59:26 -0500 Subject: [PATCH 14/97] DD-2212 - use configured checksum when no files are present --- .../iq/dataverse/util/bagit/BagGenerator.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 473e2bab034..b9de58dce90 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -75,7 +75,10 @@ import edu.harvard.iq.dataverse.pidproviders.PidUtil; import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; + +import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import jakarta.enterprise.inject.spi.CDI; /** * Creates an archival zipped Bag for long-term storage. 
It is intended to @@ -153,7 +156,6 @@ public class BagGenerator { public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { this.oremap = oreMap; this.oremapObject = oreMap.getOREMap(); - //(JsonObject) new JsonParser().parse(oreMap.getOREMap().toString()); this.dataciteXml = dataciteXml; try { @@ -189,10 +191,6 @@ public void setIgnoreHashes(boolean val) { ignorehashes = val; } - public void setDefaultCheckSumType(ChecksumType type) { - hashtype=type; - } - public static void println(String s) { System.out.println(s); System.out.flush(); @@ -278,6 +276,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { String path = sha1Entry.getKey(); sha1StringBuffer.append(sha1Entry.getValue() + " " + path); } + if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. + try { + //Use the current type if we can retrieve it + hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); + } catch (Exception e) { + // Default to MD5 if we can't + hashtype=DataFile.ChecksumType.MD5; + } + } if (!(hashtype == null)) { String manifestName = "manifest-"; if (hashtype.equals(DataFile.ChecksumType.SHA1)) { From 7eea57c648f462e58fe1d776dfa7fdcee6c3dc68 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 10:37:37 -0500 Subject: [PATCH 15/97] Revert "DD-2098 - allow archivalstatus calls on deaccessioned versions" This reverts commit 884b81b2f0f4aa951d38b18ce8f832643275c542. 
--- src/main/java/edu/harvard/iq/dataverse/api/Datasets.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 12dd984775d..2378388c540 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -5006,7 +5006,7 @@ public Response getDatasetVersionArchivalStatus(@Context ContainerRequestContext } DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv.getArchivalCopyLocation() == null) { return error(Status.NOT_FOUND, "This dataset version has not been archived"); @@ -5048,7 +5048,7 @@ public Response setDatasetVersionArchivalStatus(@Context ContainerRequestContext DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), - uriInfo, headers, true); + uriInfo, headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); @@ -5095,7 +5095,7 @@ public Response deleteDatasetVersionArchivalStatus(@Context ContainerRequestCont DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); } From 2477cf97a2232ca68f8702dcc3706d25fa7216ec Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 12:01:50 -0500 Subject: [PATCH 16/97] add Source-Org as a potential multiline case, remove change to Int Id --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java 
b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b9de58dce90..e78d1f3edf7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -830,7 +830,7 @@ private String generateInfoFile() { String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append("Source-Organization: " + orgName); + info.append("Source-Organization: " + multilineWrap(orgName)); // ToDo - make configurable info.append(CRLF); @@ -880,8 +880,7 @@ private String generateInfoFile() { if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - catalog=catalog.trim().replaceAll("[\\r\\n:]","_"); - info.append(catalog + ":" + multilineWrap(aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); + info.append(multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); //Add a version number for our bag type - should be updated with any change to the bag content/structure From 3f3908f7ccaed5c961b6bcce057b71f4208bc656 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 12:08:05 -0500 Subject: [PATCH 17/97] release note --- doc/release-notes/12063-ORE-and-Bag-updates.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 doc/release-notes/12063-ORE-and-Bag-updates.md diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md new file mode 100644 index 00000000000..e276232f33a --- /dev/null +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -0,0 +1,13 @@ +This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: + +OAI-ORE +- 
now uses URI for checksum algorithms +- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API) has been fixed. +- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed + +Archival Bag +- for dataset versions with no files, the (empty) manifest-.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" +- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed +- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). +- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) +- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's archival bag generation \ No newline at end of file From aa44c0895f4cba1dbc6b145b721f2d8b79406440 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 13:58:34 -0500 Subject: [PATCH 18/97] use constants, pass labelLength to wrapping, start custom lineWrap --- .../iq/dataverse/util/bagit/BagGenerator.java | 284 +++++++++++++----- 1 file changed, 205 insertions(+), 79 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index e78d1f3edf7..b253f961b8c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -33,6 +33,8 @@ import
java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.zip.ZipEntry; import org.apache.commons.codec.digest.DigestUtils; @@ -44,7 +46,6 @@ import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; -import org.apache.commons.text.WordUtils; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; @@ -137,6 +138,20 @@ public class BagGenerator { static PrintWriter pw = null; + // Bag-info.txt field labels + private static final String CONTACT_NAME = "Contact-Name: "; + private static final String CONTACT_EMAIL = "Contact-Email: "; + private static final String SOURCE_ORGANIZATION = "Source-Organization: "; + private static final String ORGANIZATION_ADDRESS = "Organization-Address: "; + private static final String ORGANIZATION_EMAIL = "Organization-Email: "; + private static final String EXTERNAL_DESCRIPTION = "External-Description: "; + private static final String BAGGING_DATE = "Bagging-Date: "; + private static final String EXTERNAL_IDENTIFIER = "External-Identifier: "; + private static final String BAG_SIZE = "Bag-Size: "; + private static final String PAYLOAD_OXUM = "Payload-Oxum: "; + private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; + private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also @@ -149,8 +164,9 @@ public class BagGenerator { * and zipping are done in parallel, using a connection pool. The required space * on disk is ~ n+1/n of the final bag size, e.g. 125% of the bag size for a * 4-way parallel zip operation. 
- * @throws Exception - * @throws JsonSyntaxException + * + * @throws Exception + * @throws JsonSyntaxException */ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { @@ -159,8 +175,13 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio this.dataciteXml = dataciteXml; try { - // Using Dataverse, all the URLs to be retrieved should be on the current server, so allowing self-signed certs and not verifying hostnames are useful in testing and - // shouldn't be a significant security issue. This should not be allowed for arbitrary OREMap sources. + /* + * Using Dataverse, all the URLs to be retrieved should be on the current + * server, so allowing self-signed certs and not verifying hostnames are useful + * in testing and shouldn't be a significant security issue. This should not be + * allowed for arbitrary OREMap sources. + * + */ SSLContextBuilder builder = new SSLContextBuilder(); try { builder.loadTrustMaterial(null, new TrustSelfSignedStrategy()); @@ -168,10 +189,11 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio e.printStackTrace(); } - SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), NoopHostnameVerifier.INSTANCE); + SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), + NoopHostnameVerifier.INSTANCE); Registry registry = RegistryBuilder.create() - .register("http", PlainConnectionSocketFactory.getSocketFactory()) + .register("http", PlainConnectionSocketFactory.getSocketFactory()) .register("https", sslConnectionFactory).build(); cm = new PoolingHttpClientConnectionManager(registry); @@ -190,7 +212,7 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio public void setIgnoreHashes(boolean val) { ignorehashes = val; } - + public static void println(String s) { System.out.println(s); System.out.flush(); @@ -208,18 +230,18 @@ public 
static void println(String s) { * @return success true/false */ public boolean generateBag(OutputStream outputStream) throws Exception { - File tmp = File.createTempFile("qdr-scatter-dirs", "tmp"); dirs = ScatterZipOutputStream.fileBased(tmp); - // The oremapObject is javax.json.JsonObject and we need com.google.gson.JsonObject for the aggregation object - aggregation = (JsonObject) new JsonParser().parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + // The oremapObject is javax.json.JsonObject and we need + // com.google.gson.JsonObject for the aggregation object + aggregation = (JsonObject) new JsonParser() + .parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); - String pidString=PidUtil.parseAsGlobalID(pidUrlString).asString(); - bagID = pidString + "v." - + aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); - + String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); + bagID = pidString + "v." + aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); + logger.info("Generating Bag: " + bagID); try { // Create valid filename from identifier and extend path with @@ -278,11 +300,11 @@ public boolean generateBag(OutputStream outputStream) throws Exception { } if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. 
try { - //Use the current type if we can retrieve it + // Use the current type if we can retrieve it hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); } catch (Exception e) { // Default to MD5 if we can't - hashtype=DataFile.ChecksumType.MD5; + hashtype = DataFile.ChecksumType.MD5; } } if (!(hashtype == null)) { @@ -300,7 +322,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { } createFileFromString(manifestName, sha1StringBuffer.toString()); } else { - logger.warning("No Hash values (no files?) sending empty manifest to nominally comply with BagIT specification requirement"); + logger.warning("No Hash value defined sending empty manifest-md5 to nominally comply with BagIT specification requirement"); createFileFromString("manifest-md5.txt", ""); } // bagit.txt - Required by spec @@ -383,7 +405,7 @@ public boolean generateBag(String bagName, boolean temp) { // Create an output stream backed by the file bagFileOS = new FileOutputStream(bagFile); if (generateBag(bagFileOS)) { - //The generateBag call sets this.bagName to the correct value + // The generateBag call sets this.bagName to the correct value validateBagFile(bagFile); if (usetemp) { logger.fine("Moving tmp zip"); @@ -395,7 +417,7 @@ public boolean generateBag(String bagName, boolean temp) { return false; } } catch (Exception e) { - logger.log(Level.SEVERE,"Bag Exception: ", e); + logger.log(Level.SEVERE, "Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; @@ -452,9 +474,9 @@ public void validateBag(String bagId) { logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { - logger.log(Level.SEVERE,"Could not validate Hashes", io); + logger.log(Level.SEVERE, "Could not validate Hashes", io); } catch (Exception e) { - logger.log(Level.SEVERE,"Could not validate Hashes", e); + 
logger.log(Level.SEVERE, "Could not validate Hashes", e); } finally { IOUtils.closeQuietly(zf); } @@ -479,7 +501,7 @@ public File getBagFile(String bagID) throws Exception { private void validateBagFile(File bagFile) throws IOException { // Run a confirmation test - should verify all files and hashes - + // Check files calculates the hashes and file sizes and reports on // whether hashes are correct checkFiles(checksumMap, bagFile); @@ -547,28 +569,27 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } String childPath = currentPath + childTitle; JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if(directoryLabel!=null) { - childPath=currentPath + directoryLabel.getAsString() + "/" + childTitle; + if (directoryLabel != null) { + childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle; } - String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromUri( - child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + ChecksumType childHashType = ChecksumType + .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { - //If one wasn't set as a default, pick up what the first child with one uses + // If one wasn't set as a default, pick up what the first child with one uses hashtype = childHashType; } if (hashtype != null && !hashtype.equals(childHashType)) { logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() - + " hashes for " + childTitle); + + " hashes for " + childTitle); } else { childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); if (checksumMap.containsValue(childHash)) { // Something else has this hash logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); + + childHash + " in: " + 
bagID); } logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); checksumMap.put(childPath, childHash); @@ -736,7 +757,7 @@ private void checkFiles(HashMap shaMap, File bagFile) { } } catch (InterruptedException e) { logger.log(Level.SEVERE, "Hash Calculations interrupted", e); - } + } } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); @@ -770,39 +791,41 @@ private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); - /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. - */ + /* + * Contact, and it's subfields, are terms from citation.tsv whose mapping to a + * formal vocabulary and label in the oremap may change so we need to find the + * labels used. + */ JsonLDTerm contactTerm = oremap.getContactTerm(); if ((contactTerm != null) && aggregation.has(contactTerm.getLabel())) { JsonElement contacts = aggregation.get(contactTerm.getLabel()); JsonLDTerm contactNameTerm = oremap.getContactNameTerm(); JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); - + if (contacts.isJsonArray()) { JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { - info.append("Contact-Name: "); + info.append(CONTACT_NAME); JsonElement person = contactsArray.get(i); if (person.isJsonPrimitive()) { info.append(person.getAsString()); info.append(CRLF); } else { - if(contactNameTerm != null) { - info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); + info.append(CRLF); } - if ((contactEmailTerm!=null) &&((JsonObject) person).has(contactEmailTerm.getLabel())) { - info.append("Contact-Email: "); + if ((contactEmailTerm != null) && ((JsonObject) 
person).has(contactEmailTerm.getLabel())) { + info.append(CONTACT_EMAIL); info.append(((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString()); info.append(CRLF); } } } } else { - info.append("Contact-Name: "); + info.append(CONTACT_NAME); if (contacts.isJsonPrimitive()) { info.append((String) contacts.getAsString()); @@ -810,12 +833,12 @@ private String generateInfoFile() { } else { JsonObject person = contacts.getAsJsonObject(); - if(contactNameTerm != null) { - info.append(person.get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(person.get(contactNameTerm.getLabel()).getAsString()); + info.append(CRLF); } - if ((contactEmailTerm!=null) && (person.has(contactEmailTerm.getLabel()))) { - info.append("Contact-Email: "); + if ((contactEmailTerm != null) && (person.has(contactEmailTerm.getLabel()))) { + info.append(CONTACT_EMAIL); info.append(person.get(contactEmailTerm.getLabel()).getAsString()); info.append(CRLF); } @@ -826,80 +849,92 @@ private String generateInfoFile() { logger.warning("No contact info available for BagIt Info file"); } - String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class).orElse("Dataverse Installation ()"); + String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class) + .orElse("Dataverse Installation ()"); String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append("Source-Organization: " + multilineWrap(orgName)); + info.append(SOURCE_ORGANIZATION + multilineWrap(orgName, SOURCE_ORGANIZATION.length())); // ToDo - make configurable info.append(CRLF); - info.append("Organization-Address: " + multilineWrap(orgAddress)); + info.append(ORGANIZATION_ADDRESS + multilineWrap(orgAddress, ORGANIZATION_ADDRESS.length())); info.append(CRLF); // Not a BagIt standard name - 
info.append("Organization-Email: " + orgEmail); + info.append(ORGANIZATION_EMAIL + multilineWrap(orgEmail, ORGANIZATION_EMAIL.length())); info.append(CRLF); - info.append("External-Description: "); - - /* Description, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. + info.append(EXTERNAL_DESCRIPTION); + + /* + * Description, and it's subfields, are terms from citation.tsv whose mapping to + * a formal vocabulary and label in the oremap may change so we need to find the + * labels used. */ JsonLDTerm descriptionTerm = oremap.getDescriptionTerm(); JsonLDTerm descriptionTextTerm = oremap.getDescriptionTextTerm(); if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append(multilineWrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), - descriptionTextTerm.getLabel()))); + info.append(multilineWrap( + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()), + EXTERNAL_DESCRIPTION.length())); info.append(CRLF); } - info.append("Bagging-Date: "); + info.append(BAGGING_DATE); info.append((new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime()))); info.append(CRLF); - info.append("External-Identifier: "); + info.append(EXTERNAL_IDENTIFIER); info.append(aggregation.get("@id").getAsString()); info.append(CRLF); - info.append("Bag-Size: "); + info.append(BAG_SIZE); info.append(byteCountToDisplaySize(totalDataSize)); info.append(CRLF); - info.append("Payload-Oxum: "); + info.append(PAYLOAD_OXUM); info.append(Long.toString(totalDataSize)); info.append("."); info.append(Long.toString(dataCount)); info.append(CRLF); - info.append("Internal-Sender-Identifier: "); + info.append(INTERNAL_SENDER_IDENTIFIER); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = 
aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append(multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); + info.append( + multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(), + INTERNAL_SENDER_IDENTIFIER.length())); info.append(CRLF); - //Add a version number for our bag type - should be updated with any change to the bag content/structure - info.append("Dataverse-Bag-Version: 1.0"); + // Add a version number for our bag type - should be updated with any change to + // the bag content/structure + info.append(DATAVERSE_BAG_VERSION + "1.0"); info.append(CRLF); return info.toString(); } - static private String multilineWrap(String value) { + static private String multilineWrap(String value, int labelLength) { // Normalize line breaks and ensure all lines after the first are indented - String[] lines =value.split("\\r?\\n"); + String[] lines = value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); for (int i = 0; i < lines.length; i++) { // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, - // While trailing whitespace or whitespace-only lines appear to be allowed, it's not clear that handling them adds value (visually identical entries in Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt file + // While trailing whitespace or whitespace-only lines appear to be allowed, it's + // not clear that handling them adds value (visually identical entries in + // Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt + // file String line = lines[i].trim(); if (line.length() > 0) { - String wrapped = WordUtils.wrap(line, 78, CRLF + " ", true); + // Recommended line length, including the label or indents is 79, so we'll wrap + // at 78 to assure subsequent lines with a space are still < 79 total + String wrapped = lineWrap(line, 79, CRLF + 
" ", true); wrappedValue.append(wrapped); if (i < lines.length - 1) { wrappedValue.append(CRLF).append(" "); @@ -909,25 +944,117 @@ static private String multilineWrap(String value) { return wrappedValue.toString(); } + public static String lineWrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords) { + if (str == null) { + return null; + } + if (newLineStr == null) { + newLineStr = System.lineSeparator(); + } + if (wrapLength < 1) { + wrapLength = 1; + } + String wrapOn = " "; + final Pattern patternToWrapOn = Pattern.compile(wrapOn); + final int inputLineLength = str.length(); + int offset = 0; + final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32); + int matcherSize = -1; + + while (offset < inputLineLength) { + int spaceToWrapAt = -1; + Matcher matcher = patternToWrapOn.matcher(str.substring(offset, + Math.min((int) Math.min(Integer.MAX_VALUE, offset + wrapLength + 1L), inputLineLength))); + if (matcher.find()) { + if (matcher.start() == 0) { + matcherSize = matcher.end(); + if (matcherSize != 0) { + offset += matcher.end(); + continue; + } + offset += 1; + } + spaceToWrapAt = matcher.start() + offset; + } + + // only last line without leading spaces is left + if (inputLineLength - offset <= wrapLength) { + break; + } + + while (matcher.find()) { + spaceToWrapAt = matcher.start() + offset; + } + + if (spaceToWrapAt >= offset) { + // normal case + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + + } else // really long word or URL + if (wrapLongWords) { + if (matcherSize == 0) { + offset--; + } + // wrap really long word one line at a time + wrappedLine.append(str, offset, wrapLength + offset); + wrappedLine.append(newLineStr); + offset += wrapLength; + matcherSize = -1; + } else { + // do not wrap really long word, just extend beyond limit + matcher = patternToWrapOn.matcher(str.substring(offset + wrapLength)); + if (matcher.find()) { + 
matcherSize = matcher.end() - matcher.start(); + spaceToWrapAt = matcher.start() + offset + wrapLength; + } + + if (spaceToWrapAt >= 0) { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + } else { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, str.length()); + offset = inputLineLength; + matcherSize = -1; + } + } + } + + if (matcherSize == 0 && offset < inputLineLength) { + offset--; + } + + // Whatever is left in line is short enough to just pass through + wrappedLine.append(str, offset, str.length()); + + return wrappedLine.toString(); + } + /** * Kludge - compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. * - * @param jsonElement - * - the root json object - * @param key - * - the key to find a value(s) for + * @param jsonElement - the root json object + * @param key - the key to find a value(s) for * @return - a single string */ String getSingleValue(JsonElement jsonElement, String key) { String val = ""; - if(jsonElement.isJsonObject()) { - JsonObject jsonObject=jsonElement.getAsJsonObject(); + if (jsonElement.isJsonObject()) { + JsonObject jsonObject = jsonElement.getAsJsonObject(); val = jsonObject.get(key).getAsString(); } else if (jsonElement.isJsonArray()) { - + Iterator iter = jsonElement.getAsJsonArray().iterator(); ArrayList stringArray = new ArrayList(); while (iter.hasNext()) { @@ -1127,8 +1254,7 @@ public InputStream get() { * Returns a human-readable version of the file size, where the input represents * a specific number of bytes. 
* - * @param size - * the number of bytes + * @param size the number of bytes * @return a human-readable display value (includes units) */ public static String byteCountToDisplaySize(long size) { From 8227edff5601ec95ea4f8f2851d630265f23cfd4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 14:28:19 -0500 Subject: [PATCH 19/97] update to handle overall 79 char length --- .../iq/dataverse/util/bagit/BagGenerator.java | 53 +++++++------ .../bagit/BagGeneratorMultilineWrapTest.java | 74 +++++++++++++++++-- 2 files changed, 101 insertions(+), 26 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b253f961b8c..847bcc08141 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -854,20 +854,18 @@ private String generateInfoFile() { String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append(SOURCE_ORGANIZATION + multilineWrap(orgName, SOURCE_ORGANIZATION.length())); + info.append(multilineWrap(SOURCE_ORGANIZATION + orgName)); // ToDo - make configurable info.append(CRLF); - info.append(ORGANIZATION_ADDRESS + multilineWrap(orgAddress, ORGANIZATION_ADDRESS.length())); + info.append(multilineWrap(ORGANIZATION_ADDRESS + orgAddress)); info.append(CRLF); // Not a BagIt standard name - info.append(ORGANIZATION_EMAIL + multilineWrap(orgEmail, ORGANIZATION_EMAIL.length())); + info.append(multilineWrap(ORGANIZATION_EMAIL + orgEmail)); info.append(CRLF); - info.append(EXTERNAL_DESCRIPTION); - /* * Description, and it's subfields, are terms from citation.tsv whose mapping to * a formal vocabulary and label in the oremap may change so we need to find the @@ -878,9 +876,8 @@ private String generateInfoFile() { if 
(descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append(multilineWrap( - getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()), - EXTERNAL_DESCRIPTION.length())); + info.append(multilineWrap(EXTERNAL_DESCRIPTION + + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()))); info.append(CRLF); } @@ -902,14 +899,12 @@ private String generateInfoFile() { info.append(Long.toString(dataCount)); info.append(CRLF); - info.append(INTERNAL_SENDER_IDENTIFIER); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append( - multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(), - INTERNAL_SENDER_IDENTIFIER.length())); + info.append(multilineWrap(INTERNAL_SENDER_IDENTIFIER + catalog + ":" + + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); // Add a version number for our bag type - should be updated with any change to @@ -920,7 +915,7 @@ private String generateInfoFile() { } - static private String multilineWrap(String value, int labelLength) { + static private String multilineWrap(String value) { // Normalize line breaks and ensure all lines after the first are indented String[] lines = value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); @@ -932,8 +927,7 @@ static private String multilineWrap(String value, int labelLength) { // file String line = lines[i].trim(); if (line.length() > 0) { - // Recommended line length, including the label or indents is 79, so we'll wrap - // at 78 to assure subsequent lines with a space are still < 79 total + // Recommended line length, including the label or indents is 79 String wrapped = lineWrap(line, 79, CRLF + " ", true); 
wrappedValue.append(wrapped); if (i < lines.length - 1) { @@ -944,6 +938,7 @@ static private String multilineWrap(String value, int labelLength) { return wrappedValue.toString(); } + /** Adapted from Apache WordUtils.wrap() - make subsequent lines shorter by the length of any spaces in newLineStr*/ public static String lineWrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords) { if (str == null) { return null; @@ -954,17 +949,30 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt if (wrapLength < 1) { wrapLength = 1; } + + // Calculate the indent length (characters after CRLF in newLineStr) + int indentLength = 0; + int crlfIndex = newLineStr.lastIndexOf("\n"); + if (crlfIndex != -1) { + indentLength = newLineStr.length() - crlfIndex -1; + } + String wrapOn = " "; final Pattern patternToWrapOn = Pattern.compile(wrapOn); final int inputLineLength = str.length(); int offset = 0; final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32); int matcherSize = -1; + boolean isFirstLine = true; while (offset < inputLineLength) { + // Adjust wrap length based on whether this is the first line or subsequent + // lines + int currentWrapLength = isFirstLine ? 
wrapLength : (wrapLength - indentLength); + int spaceToWrapAt = -1; Matcher matcher = patternToWrapOn.matcher(str.substring(offset, - Math.min((int) Math.min(Integer.MAX_VALUE, offset + wrapLength + 1L), inputLineLength))); + Math.min((int) Math.min(Integer.MAX_VALUE, offset + currentWrapLength + 1L), inputLineLength))); if (matcher.find()) { if (matcher.start() == 0) { matcherSize = matcher.end(); @@ -978,7 +986,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt } // only last line without leading spaces is left - if (inputLineLength - offset <= wrapLength) { + if (inputLineLength - offset <= currentWrapLength) { break; } @@ -991,6 +999,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt wrappedLine.append(str, offset, spaceToWrapAt); wrappedLine.append(newLineStr); offset = spaceToWrapAt + 1; + isFirstLine = false; } else // really long word or URL if (wrapLongWords) { @@ -998,16 +1007,17 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt offset--; } // wrap really long word one line at a time - wrappedLine.append(str, offset, wrapLength + offset); + wrappedLine.append(str, offset, currentWrapLength + offset); wrappedLine.append(newLineStr); - offset += wrapLength; + offset += currentWrapLength; matcherSize = -1; + isFirstLine = false; } else { // do not wrap really long word, just extend beyond limit - matcher = patternToWrapOn.matcher(str.substring(offset + wrapLength)); + matcher = patternToWrapOn.matcher(str.substring(offset + currentWrapLength)); if (matcher.find()) { matcherSize = matcher.end() - matcher.start(); - spaceToWrapAt = matcher.start() + offset + wrapLength; + spaceToWrapAt = matcher.start() + offset + currentWrapLength; } if (spaceToWrapAt >= 0) { @@ -1017,6 +1027,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt wrappedLine.append(str, offset, spaceToWrapAt); wrappedLine.append(newLineStr); offset = 
spaceToWrapAt + 1; + isFirstLine = false; } else { if (matcherSize == 0 && offset != 0) { offset--; diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index 71ceec61adf..19d478f4b0d 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -1,3 +1,4 @@ + package edu.harvard.iq.dataverse.util.bagit; import static org.assertj.core.api.Assertions.assertThat; @@ -47,7 +48,7 @@ void exactBoundary_78chars_noWrap() { @Test void longSingleWord_wrapsAt78WithIndent() { String input = "a".repeat(100); - String expected = "a".repeat(78) + "\r\n " + "a".repeat(22); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -62,8 +63,8 @@ void multiline_input_indentsSecondAndSubsequentOriginalLines() { @Test void multiline_withLF_normalizedAndIndented() { - String input = "First line\nSecond line"; - String expected = "First line\r\n Second line"; + String input = "a".repeat(200); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(43); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -89,8 +90,71 @@ void longSecondLine_preservesIndentOnWraps() { String line1 = "Header"; String line2 = "b".repeat(90); String input = line1 + "\n" + line2; - String expected = "Header\r\n " + "b".repeat(78) + "\r\n " + "b".repeat(12); + String expected = "Header\r\n " + "b".repeat(79) + "\r\n " + "b".repeat(11); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_reducesFirstLineMaxLength() { + // With a label of length 20, first line should wrap at 78-20=58 chars + String label = "l".repeat(20); + String input = label + 
"a".repeat(150); + // First line: 58 chars, subsequent lines: 78 + String expected = label + "a".repeat(59) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(13); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_zero_behavesAsDefault() { + String input = "a".repeat(100); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_withMultipleLines_onlyAffectsFirstLine() { + String label = "l".repeat(15); + String input = label + "a".repeat(100) + "\nSecond line content"; + // First line wraps at 79-15=64, then continues at 78 per line + // Second line starts fresh and wraps normally + String expected = label + "a".repeat(64) + "\r\n " + "a".repeat(36) + "\r\n Second line content"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_notMidWord() { + // Create a string with a word boundary at position 75 + // "a" repeated 75 times, then a space, then more characters + String input = "a".repeat(75) + " " + "b".repeat(20); + // Should wrap at the space (position 75), not at position 79 + String expected = "a".repeat(75) + "\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_multipleSpaces() { + // Test with word boundary closer to the limit + String input = "a".repeat(70) + " word " + "b".repeat(20); + // Should wrap after "word" (at position 76) + String expected = "a".repeat(70) + " word\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_withLabelLength() { + String label = "l".repeat(20); + // With label length=20, first line wraps at 78-20=58 + // Create string with word boundary at position 55 + String input = label + "a".repeat(55) + 
" " + "b".repeat(30); + // Should wrap at the space (position 55) + String expected = label + "a".repeat(55) + "\r\n " + "b".repeat(30); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } -} +} \ No newline at end of file From d0749fcd39abefcf0ee13c6fcb042d235f6119dd Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 14:33:41 -0500 Subject: [PATCH 20/97] wrap any other potentially long values --- .../iq/dataverse/util/bagit/BagGenerator.java | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 847bcc08141..b4a80d4d9a9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -806,40 +806,36 @@ private String generateInfoFile() { if (contacts.isJsonArray()) { JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { - info.append(CONTACT_NAME); + JsonElement person = contactsArray.get(i); if (person.isJsonPrimitive()) { - info.append(person.getAsString()); + info.append(multilineWrap(CONTACT_NAME + person.getAsString())); info.append(CRLF); } else { if (contactNameTerm != null) { - info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_NAME + ((JsonObject) person).get(contactNameTerm.getLabel()).getAsString())); info.append(CRLF); } if ((contactEmailTerm != null) && ((JsonObject) person).has(contactEmailTerm.getLabel())) { - info.append(CONTACT_EMAIL); - info.append(((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_EMAIL + ((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } } } else { - info.append(CONTACT_NAME); - if (contacts.isJsonPrimitive()) { - 
info.append((String) contacts.getAsString()); + info.append(multilineWrap(CONTACT_NAME + (String) contacts.getAsString())); info.append(CRLF); } else { JsonObject person = contacts.getAsJsonObject(); if (contactNameTerm != null) { - info.append(person.get(contactNameTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_NAME + person.get(contactNameTerm.getLabel()).getAsString())); info.append(CRLF); } if ((contactEmailTerm != null) && (person.has(contactEmailTerm.getLabel()))) { - info.append(CONTACT_EMAIL); - info.append(person.get(contactEmailTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_EMAIL + person.get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } @@ -885,8 +881,7 @@ private String generateInfoFile() { info.append((new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime()))); info.append(CRLF); - info.append(EXTERNAL_IDENTIFIER); - info.append(aggregation.get("@id").getAsString()); + info.append(multilineWrap(EXTERNAL_IDENTIFIER + aggregation.get("@id").getAsString())); info.append(CRLF); info.append(BAG_SIZE); From 24a625f187ecb662b242d613e3fe8d48dd9a9e92 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 15:03:14 -0500 Subject: [PATCH 21/97] cleanup deprecated code, auto-gen comments --- .../iq/dataverse/util/bagit/BagGenerator.java | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b4a80d4d9a9..adca7dd40c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -235,8 +235,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { dirs = ScatterZipOutputStream.fileBased(tmp); // The oremapObject is javax.json.JsonObject and we need // com.google.gson.JsonObject for the 
aggregation object - aggregation = (JsonObject) new JsonParser() - .parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + aggregation = (JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); @@ -394,7 +394,6 @@ public boolean generateBag(OutputStream outputStream) throws Exception { public boolean generateBag(String bagName, boolean temp) { usetemp = temp; - FileOutputStream bagFileOS = null; try { File origBagFile = getBagFile(bagName); File bagFile = origBagFile; @@ -403,36 +402,36 @@ public boolean generateBag(String bagName, boolean temp) { logger.fine("Writing to: " + bagFile.getAbsolutePath()); } // Create an output stream backed by the file - bagFileOS = new FileOutputStream(bagFile); - if (generateBag(bagFileOS)) { - // The generateBag call sets this.bagName to the correct value - validateBagFile(bagFile); - if (usetemp) { - logger.fine("Moving tmp zip"); - origBagFile.delete(); - bagFile.renameTo(origBagFile); + try (FileOutputStream bagFileOS = new FileOutputStream(bagFile)) { + if (generateBag(bagFileOS)) { + // The generateBag call sets this.bagName to the correct value + validateBagFile(bagFile); + if (usetemp) { + logger.fine("Moving tmp zip"); + origBagFile.delete(); + bagFile.renameTo(origBagFile); + } + return true; + } else { + return false; } - return true; - } else { - return false; } } catch (Exception e) { logger.log(Level.SEVERE, "Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; - } finally { - IOUtils.closeQuietly(bagFileOS); } } + @SuppressWarnings("deprecation") public void validateBag(String bagId) { logger.info("Validating Bag"); ZipFile zf = null; InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = new 
ZipFile(bagFile); + zf = ZipFile.builder().setFile(bagFile).get(); ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { logger.info("SHA1 hashes used"); @@ -602,9 +601,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce try { if ((childHash == null) | ignorehashes) { // Generate missing hashInputStream inputStream = null; - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { if (hashtype != null) { if (hashtype.equals(DataFile.ChecksumType.SHA1)) { @@ -621,8 +618,6 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } catch (IOException e) { logger.severe("Failed to read " + childPath); throw e; - } finally { - IOUtils.closeQuietly(inputStream); } if (childHash != null) { JsonObject childHashObject = new JsonObject(); @@ -732,9 +727,7 @@ private void createFileFromURL(final String relPath, final String uri) private void checkFiles(HashMap shaMap, File bagFile) { ExecutorService executor = Executors.newFixedThreadPool(numConnections); - ZipFile zf = null; - try { - zf = new ZipFile(bagFile); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { BagValidationJob.setZipFile(zf); BagValidationJob.setBagGenerator(this); @@ -759,10 +752,7 @@ private void checkFiles(HashMap shaMap, File bagFile) { logger.log(Level.SEVERE, "Hash Calculations interrupted", e); } } catch (IOException e1) { - // TODO Auto-generated catch block e1.printStackTrace(); - } finally { - IOUtils.closeQuietly(zf); } logger.fine("Hash Validations Completed"); @@ -1153,10 +1143,8 @@ private HttpGet createNewGetRequest(URI url, String returnType) { urlString = urlString + ((urlString.indexOf('?') != -1) ? 
"&key=" : "?key=") + apiKey; request = new HttpGet(new URI(urlString)); } catch (MalformedURLException e) { - // TODO Auto-generated catch block e.printStackTrace(); } catch (URISyntaxException e) { - // TODO Auto-generated catch block e.printStackTrace(); } } else { @@ -1211,7 +1199,6 @@ public InputStream get() { } } catch (ClientProtocolException e) { tries += 5; - // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // Retry if this is a potentially temporary error such @@ -1228,7 +1215,6 @@ public InputStream get() { } } catch (URISyntaxException e) { - // TODO Auto-generated catch block e.printStackTrace(); } logger.severe("Could not read: " + uriString); From bf036f3f85066a6a148af9fff3119d8156e63d0b Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 16:26:17 -0500 Subject: [PATCH 22/97] update comment --- .../java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index adca7dd40c3..3c82a9719d3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -1035,7 +1035,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt } /** - * Kludge - compound values (e.g. for descriptions) are sent as an array of + * Compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. 
From be65611fb9578c96ed4a1aa28e730a693b85f437 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 16:26:54 -0500 Subject: [PATCH 23/97] add tests --- .../util/bagit/BagGeneratorInfoFileTest.java | 295 ++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java new file mode 100644 index 00000000000..dbbf3241318 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java @@ -0,0 +1,295 @@ + +package edu.harvard.iq.dataverse.util.bagit; + +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import com.google.gson.JsonParser; + +import jakarta.json.Json; +import jakarta.json.JsonArrayBuilder; +import jakarta.json.JsonObject; +import jakarta.json.JsonObjectBuilder; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +public class BagGeneratorInfoFileTest { + + private BagGenerator bagGenerator; + private JsonObjectBuilder testAggregationBuilder; + + @Mock + private OREMap mockOreMap; + + @BeforeEach + public void setUp() throws Exception { + MockitoAnnotations.openMocks(this); + + // Create base test aggregation builder with required fields + testAggregationBuilder = Json.createObjectBuilder(); + testAggregationBuilder.add("@id", "doi:10.5072/FK2/TEST123"); + testAggregationBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), "Test Dataset"); + testAggregationBuilder.add(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel(), "Test Catalog"); + } + + /** + * Helper method to finalize the aggregation 
and create the BagGenerator + */ + private void initializeBagGenerator() throws Exception { + JsonObject testAggregation = testAggregationBuilder.build(); + + JsonObjectBuilder oremapJsonBuilder = Json.createObjectBuilder(); + oremapJsonBuilder.add(JsonLDTerm.ore("describes").getLabel(), testAggregation); + JsonObject oremapObject = oremapJsonBuilder.build(); + // Mock the OREMap.getOREMap() method to return the built JSON + when(mockOreMap.getOREMap()).thenReturn(oremapObject); + + // Initialize BagGenerator with test data + bagGenerator = new BagGenerator(mockOreMap, ""); + setPrivateField(bagGenerator, "aggregation", (com.google.gson.JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString())); + setPrivateField(bagGenerator, "totalDataSize", 1024000L); + setPrivateField(bagGenerator, "dataCount", 10L); + } + + @Test + public void testGenerateInfoFileWithSingleContact() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonObjectBuilder contactBuilder = Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "John Doe"); + contactBuilder.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + } + + @Test + public void testGenerateInfoFileWithMultipleContacts() throws Exception { + // Arrange + 
JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonArrayBuilder contactsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder contact1 = Json.createObjectBuilder(); + contact1.add(contactNameTerm.getLabel(), "John Doe"); + contact1.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + + JsonObjectBuilder contact2 = Json.createObjectBuilder(); + contact2.add(contactNameTerm.getLabel(), "Jane Smith"); + contact2.add(contactEmailTerm.getLabel(), "jane.smith@example.com"); + + JsonObjectBuilder contact3 = Json.createObjectBuilder(); + contact3.add(contactNameTerm.getLabel(), "Bob Johnson"); + contact3.add(contactEmailTerm.getLabel(), "bob.johnson@example.com"); + + contactsBuilder.add(contact1); + contactsBuilder.add(contact2); + contactsBuilder.add(contact3); + + testAggregationBuilder.add(contactTerm.getLabel(), contactsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + assertTrue(infoFile.contains("Contact-Name: Jane Smith")); + assertTrue(infoFile.contains("Contact-Email: jane.smith@example.com")); + assertTrue(infoFile.contains("Contact-Name: Bob Johnson")); + assertTrue(infoFile.contains("Contact-Email: bob.johnson@example.com")); + } + + @Test + public void testGenerateInfoFileWithSingleDescription() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + 
when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "This is a test dataset description."); + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("External-Description: This is a test dataset description.")); + } + + @Test + public void testGenerateInfoFileWithMultipleDescriptions() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonArrayBuilder descriptionsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder desc1 = Json.createObjectBuilder(); + desc1.add(descriptionTextTerm.getLabel(), "First description of the dataset."); + + JsonObjectBuilder desc2 = Json.createObjectBuilder(); + desc2.add(descriptionTextTerm.getLabel(), "Second description with additional details."); + + JsonObjectBuilder desc3 = Json.createObjectBuilder(); + desc3.add(descriptionTextTerm.getLabel(), "Third description for completeness."); + + descriptionsBuilder.add(desc1); + descriptionsBuilder.add(desc2); + descriptionsBuilder.add(desc3); + + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + // Assert + assertNotNull(infoFile); + // Multiple descriptions should be concatenated with commas as per getSingleValue method + assertTrue(infoFile.contains("External-Description: First description of the dataset.,Second 
description with\r\n additional details.,Third description for completeness.")); + } + + @Test + public void testGenerateInfoFileWithRequiredFields() throws Exception { + // Arrange - minimal setup with required fields already in setUp() + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder contactBuilder = Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "Test Contact"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "Test description"); + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: Test Contact")); + assertTrue(infoFile.contains("External-Description: Test description")); + assertTrue(infoFile.contains("Source-Organization:")); + assertTrue(infoFile.contains("Organization-Address:")); + assertTrue(infoFile.contains("Organization-Email:")); + assertTrue(infoFile.contains("Bagging-Date:")); + assertTrue(infoFile.contains("External-Identifier: doi:10.5072/FK2/TEST123")); + assertTrue(infoFile.contains("Bag-Size:")); + assertTrue(infoFile.contains("Payload-Oxum: 1024000.10")); + assertTrue(infoFile.contains("Internal-Sender-Identifier: Test 
Catalog:Test Dataset")); + } + + @Test + public void testGenerateInfoFileWithDifferentBagSizes() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(null); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(null); + + initializeBagGenerator(); + + // Test with bytes + setPrivateField(bagGenerator, "totalDataSize", 512L); + setPrivateField(bagGenerator, "dataCount", 5L); + String infoFile1 = invokeGenerateInfoFile(); + assertTrue(infoFile1.contains("Bag-Size: 512 bytes")); + assertTrue(infoFile1.contains("Payload-Oxum: 512.5")); + + // Test with KB + setPrivateField(bagGenerator, "totalDataSize", 2048L); + setPrivateField(bagGenerator, "dataCount", 3L); + String infoFile2 = invokeGenerateInfoFile(); + assertTrue(infoFile2.contains("Bag-Size: 2.05 KB")); + assertTrue(infoFile2.contains("Payload-Oxum: 2048.3")); + + // Test with MB + setPrivateField(bagGenerator, "totalDataSize", 5242880L); + setPrivateField(bagGenerator, "dataCount", 100L); + String infoFile3 = invokeGenerateInfoFile(); + assertTrue(infoFile3.contains("Bag-Size: 5.24 MB")); + assertTrue(infoFile3.contains("Payload-Oxum: 5242880.100")); + + // Test with GB + setPrivateField(bagGenerator, "totalDataSize", 2147483648L); + setPrivateField(bagGenerator, "dataCount", 1000L); + + String infoFile4 = invokeGenerateInfoFile(); + assertTrue(infoFile4.contains("Bag-Size: 2.15 GB")); + assertTrue(infoFile4.contains("Payload-Oxum: 2147483648.1000")); + } + + // Helper methods + + /** + * Invokes the private generateInfoFile method using reflection + */ + private String invokeGenerateInfoFile() throws Exception { + Method method = BagGenerator.class.getDeclaredMethod("generateInfoFile"); + method.setAccessible(true); + return (String) method.invoke(bagGenerator); + } + + /** + * Sets a private field value 
using reflection + */ + private void setPrivateField(Object target, String fieldName, Object value) throws Exception { + Field field = BagGenerator.class.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } +} \ No newline at end of file From 24d098a0f70dff33c6ca48049ed0e668e8809792 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 17:00:19 -0500 Subject: [PATCH 24/97] QDR updates to apache 5, better fault tolerance for file retrieval --- .../iq/dataverse/util/bagit/BagGenerator.java | 172 +++++++++++------- 1 file changed, 111 insertions(+), 61 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 3c82a9719d3..5c5b88a521b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -4,12 +4,15 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; +import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.InterruptedIOException; import java.io.OutputStream; import java.io.PrintWriter; import java.net.MalformedURLException; +import java.net.SocketTimeoutException; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; @@ -46,23 +49,24 @@ import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; -import org.apache.http.client.ClientProtocolException; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.config.Registry; -import 
org.apache.http.config.RegistryBuilder; -import org.apache.http.conn.socket.ConnectionSocketFactory; -import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.NoopHostnameVerifier; -import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.conn.ssl.TrustSelfSignedStrategy; -import org.apache.http.ssl.SSLContextBuilder; -import org.apache.http.util.EntityUtils; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.ClientProtocolException; +import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.config.RequestConfig; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.protocol.HttpClientContext; +import org.apache.hc.client5.http.socket.ConnectionSocketFactory; +import org.apache.hc.client5.http.socket.PlainConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.NoopHostnameVerifier; +import org.apache.hc.client5.http.ssl.SSLConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.TrustSelfSignedStrategy; +import org.apache.hc.core5.http.HttpEntity; +import org.apache.hc.core5.http.config.Registry; +import org.apache.hc.core5.http.config.RegistryBuilder; +import org.apache.hc.core5.ssl.SSLContextBuilder; +import org.apache.hc.core5.util.Timeout; import org.json.JSONArray; import com.google.gson.JsonArray; import com.google.gson.JsonElement; @@ -103,10 +107,11 @@ public class BagGenerator { private HashMap pidMap = new LinkedHashMap(); private HashMap checksumMap = new LinkedHashMap(); - private int timeout = 60; - private RequestConfig 
config = RequestConfig.custom().setConnectTimeout(timeout * 1000) - .setConnectionRequestTimeout(timeout * 1000).setSocketTimeout(timeout * 1000) - .setCookieSpec(CookieSpecs.STANDARD).build(); + private int timeout = 300; + private RequestConfig config = RequestConfig.custom() + .setConnectionRequestTimeout(Timeout.ofSeconds(timeout)) + .setResponseTimeout(Timeout.ofSeconds(timeout)) + .build(); protected CloseableHttpClient client; private PoolingHttpClientConnectionManager cm = null; @@ -131,7 +136,7 @@ public class BagGenerator { private boolean usetemp = false; - private int numConnections = 8; + private static int numConnections = 2; public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); private OREMap oremap; @@ -152,6 +157,11 @@ public class BagGenerator { private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + // Implement exponential backoff with jitter + static final long baseWaitTimeMs = 1000; // Start with 1 second + static final long maxWaitTimeMs = 30000; // Cap at 30 seconds + + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also @@ -189,8 +199,10 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio e.printStackTrace(); } - SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), - NoopHostnameVerifier.INSTANCE); + SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory( + builder.build(), + NoopHostnameVerifier.INSTANCE + ); Registry registry = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.getSocketFactory()) @@ -200,11 +212,14 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio cm.setDefaultMaxPerRoute(numConnections); cm.setMaxTotal(numConnections > 20 ? 
numConnections : 20); - client = HttpClients.custom().setConnectionManager(cm).setDefaultRequestConfig(config).build(); + client = HttpClients.custom() + .setConnectionManager(cm) + .setDefaultRequestConfig(config) + .build(); scatterZipCreator = new ParallelScatterZipCreator(Executors.newFixedThreadPool(numConnections)); } catch (NoSuchAlgorithmException | KeyManagementException e) { - logger.warning("Aint gonna work"); + logger.warning("Failed to initialize HTTP client"); e.printStackTrace(); } } @@ -424,7 +439,6 @@ public boolean generateBag(String bagName, boolean temp) { } } - @SuppressWarnings("deprecation") public void validateBag(String bagId) { logger.info("Validating Bag"); ZipFile zf = null; @@ -1156,6 +1170,10 @@ private HttpGet createNewGetRequest(URI url, String returnType) { return request; } + /** Get a stream supplier for the given URI. + * + * Caller must close the stream when done. + */ InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { @@ -1168,56 +1186,88 @@ public InputStream get() { logger.fine("Get # " + tries + " for " + uriString); HttpGet getFile = createNewGetRequest(uri, null); - logger.finest("Retrieving " + tries + ": " + uriString); - CloseableHttpResponse response = null; + try { - response = client.execute(getFile); - // Note - if we ever need to pass an HttpClientContext, we need a new one per - // thread. 
- int statusCode = response.getStatusLine().getStatusCode(); + // Execute the request directly and keep the response open + final CloseableHttpResponse response = (CloseableHttpResponse) client.executeOpen(null, getFile, HttpClientContext.create()); + int statusCode = response.getCode(); + if (statusCode == 200) { logger.finest("Retrieved: " + uri); - return response.getEntity().getContent(); - } - logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString - + " : " + statusCode); - if (statusCode < 500) { - logger.fine("Will not retry for 40x errors"); - tries += 5; + // Return a wrapped stream that will close the response when the stream is closed + final HttpEntity entity = response.getEntity(); + if (entity != null) { + // Create a wrapper stream that closes the response when the stream is closed + return new FilterInputStream(entity.getContent()) { + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + response.close(); + } + } + }; + } else { + response.close(); + logger.warning("No content in response for: " + uriString); + return null; + } } else { + // Close the response for non-200 responses + response.close(); + + logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString + + " : " + statusCode); tries++; - } - // Error handling - if (response != null) { try { - EntityUtils.consumeQuietly(response.getEntity()); - response.close(); - } catch (IOException io) { - logger.warning( - "Exception closing response after status: " + statusCode + " on " + uri); + // Calculate exponential backoff: 2^tries * baseWaitTimeMs (1 sec) + long waitTime = (long) (Math.pow(2, tries) * baseWaitTimeMs); + + // Add jitter: random value between 0-30% of the wait time + long jitter = (long) (waitTime * 0.3 * Math.random()); + waitTime = waitTime + jitter; + + // Cap the wait time at maxWaitTimeMs (30 seconds) + waitTime = Math.min(waitTime, maxWaitTimeMs); + + 
logger.fine("Sleeping for " + waitTime + "ms before retry attempt " + tries); + Thread.sleep(waitTime); + } catch (InterruptedException ie) { + logger.log(Level.SEVERE, "InterruptedException during retry delay for file: " + uriString, ie); + Thread.currentThread().interrupt(); // Restore interrupt status + tries += 5; // Skip remaining attempts } } } catch (ClientProtocolException e) { tries += 5; - e.printStackTrace(); + logger.log(Level.SEVERE, "ClientProtocolException when retrieving file: " + uriString + " (attempt " + tries + ")", e); + } catch (SocketTimeoutException e) { + // Specific handling for timeout exceptions + tries++; + logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString + " (attempt " + tries + " of 5) - Request exceeded timeout", e); + if (tries == 5) { + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries due to timeouts: " + uriString, e); + } + } catch (InterruptedIOException e) { + // Catches interruptions during I/O operations + tries += 5; + logger.log(Level.SEVERE, "InterruptedIOException when retrieving file: " + uriString + " - Operation was interrupted", e); + Thread.currentThread().interrupt(); // Restore interrupt status } catch (IOException e) { - // Retry if this is a potentially temporary error such - // as a timeout + // Retry if this is a potentially temporary error such as a timeout tries++; - logger.log(Level.WARNING, "Attempt# " + tries + " : Unable to retrieve file: " + uriString, - e); + logger.log(Level.WARNING, "IOException when retrieving file: " + uriString + " (attempt " + tries + " of 5)", e); if (tries == 5) { - logger.severe("Final attempt failed for " + uriString); + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries: " + uriString, e); } - e.printStackTrace(); } } - } catch (URISyntaxException e) { - e.printStackTrace(); + logger.log(Level.SEVERE, "URISyntaxException for file: " + uriString + " - Invalid URI 
format", e); } - logger.severe("Could not read: " + uriString); + logger.severe("FAILED TO RETRIEVE FILE after all retries: " + uriString); return null; } }; @@ -1268,9 +1318,9 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } - public void setNumConnections(int numConnections) { - this.numConnections = numConnections; - logger.fine("BagGenerator will use " + numConnections + " threads"); + public static void setNumConnections(int numConnections) { + BagGenerator.numConnections = numConnections; + logger.fine("All BagGenerators will use " + numConnections + " threads"); } } \ No newline at end of file From b4a3799ca82aa48e299e8d5a4351da62b4cad29c Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 17:06:56 -0500 Subject: [PATCH 25/97] release note update --- doc/release-notes/12063-ORE-and-Bag-updates.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md index e276232f33a..b2926f40c96 100644 --- a/doc/release-notes/12063-ORE-and-Bag-updates.md +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -10,4 +10,5 @@ Archival Bag - a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed - values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). 
- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) -- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation \ No newline at end of file +- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation +- improvements to file retrieval w.r.t. retries on errors or throttling \ No newline at end of file From f8f7739423c1f1af8fa7b1d1092b73523181a285 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 7 Jan 2026 13:35:19 -0500 Subject: [PATCH 26/97] initial impl --- .../impl/AbstractSubmitToArchiveCommand.java | 83 +++++++++++++++++-- .../settings/SettingsServiceBean.java | 6 ++ .../ArchivalSubmissionWorkflowStep.java | 2 +- 3 files changed, 82 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 29c27d0396d..b4400e7b957 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -15,15 +15,21 @@ import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.pidproviders.doi.datacite.DOIDataCiteRegisterService; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key; +import edu.harvard.iq.dataverse.util.ListSplitUtil; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.workflow.step.Failure; import 
edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; import java.io.IOException; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.security.DigestInputStream; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.logging.Logger; @@ -45,14 +51,16 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion @Override public DatasetVersion execute(CommandContext ctxt) throws CommandException { + + String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); - String[] settingsArray = settings.split(","); - for (String setting : settingsArray) { - setting = setting.trim(); - if (!setting.startsWith(":")) { - logger.warning("Invalid Archiver Setting: " + setting); + List settingsList = ListSplitUtil.split(settings); + for (String settingName : settingsList) { + Key setting = Key.parse(settingName); + if (setting == null) { + logger.warning("Invalid Archiver Setting: " + settingName); } else { - requestedSettings.put(setting, ctxt.settings().get(setting)); + requestedSettings.put(settingName, ctxt.settings().getValueForKey(setting)); } } @@ -62,22 +70,81 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { //No un-expired token token = ctxt.authentication().generateApiTokenForUser(user); } - performArchiveSubmission(version, token, requestedSettings); + runArchivingProcess(version, token, requestedSettings); return ctxt.em().merge(version); } + /** + * Note that this method may be called from the execute method above OR from a + * workflow in which execute() is never called and therefore in which all + * variables must be sent as method parameters. (Nominally version is set in the + * constructor and could be dropped from the parameter list.) 
+ * @param ctxt + * + * @param version - the DatasetVersion to archive + * @param token - an API Token for the user performing this action + * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). + */ + public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map requestedSetttings) { + // Check if earlier versions must be archived first + String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived.toString()); + boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue); + if (requireEarlierArchived) { + + Dataset dataset = version.getDataset(); + List versions = dataset.getVersions(); + + // Check all earlier versions (those with version numbers less than current) + for (DatasetVersion earlierVersion : versions) { + // Skip the current version and any versions that come after it + if (earlierVersion.getId().equals(version.getId())) { + continue; + } + + // Compare version numbers to ensure we only check earlier versions + if (earlierVersion.getVersionNumber() != null && version.getVersionNumber() != null) { + if (earlierVersion.getVersionNumber() < version.getVersionNumber() + || (earlierVersion.getVersionNumber().equals(version.getVersionNumber()) + && earlierVersion.getMinorVersionNumber() < version.getMinorVersionNumber())) { + + // Check if this earlier version has been successfully archived + String archivalStatus = earlierVersion.getArchivalCopyLocationStatus(); + if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) +// || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE) + ) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, 
DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + return new Failure( + "Earlier versions must be successfully archived first", + "Archival prerequisites not met" + ); + } + } + } + } + } + // Delegate to the archiver-specific implementation + return performArchiveSubmission(version, token, requestedSettings); + } + + /** * This method is the only one that should be overwritten by other classes. Note * that this method may be called from the execute method above OR from a * workflow in which execute() is never called and therefore in which all * variables must be sent as method parameters. (Nominally version is set in the * constructor and could be dropped from the parameter list.) + * @param ctxt * * @param version - the DatasetVersion to archive * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). 
*/ - abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, Map requestedSetttings); + protected abstract WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, + Map requestedSettings); protected int getNumberOfBagGeneratorThreads() { if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 37d26995017..d8495a2dc8a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -485,6 +485,12 @@ Whether Harvesting (OAI) service is enabled */ ArchiverClassName, + /* + * Only create an archival Bag for a dataset version if all prior versions have + * been successfully archived + */ + ArchiverOnlyIfEarlierVersionsAreArchived, + /** * Custom settings for each archiver. See list below. 
*/ diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java index b0567bff107..3e3962d0334 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java @@ -49,7 +49,7 @@ public WorkflowStepResult run(WorkflowContext context) { String className = requestedSettings.get(SettingsServiceBean.Key.ArchiverClassName.toString()); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, dvr, context.getDataset().getReleasedVersion()); if (archiveCommand != null) { - return (archiveCommand.performArchiveSubmission(context.getDataset().getReleasedVersion(), context.getApiToken(), requestedSettings)); + return (archiveCommand.runArchivingProcess(context.getDataset().getReleasedVersion(), context.getApiToken(), requestedSettings)); } else { logger.severe("No Archiver instance could be created for name: " + className); return new Failure("No Archiver", "Could not create instance of class: " + className); From 5bd6f8d92581ed8ee6e65b4cb394d0f67be804cc Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 8 Jan 2026 09:25:41 -0500 Subject: [PATCH 27/97] fix requestedSettings handling --- .../impl/AbstractSubmitToArchiveCommand.java | 15 ++++++--------- .../command/impl/DRSSubmitToArchiveCommand.java | 5 ++--- .../impl/DuraCloudSubmitToArchiveCommand.java | 3 +-- .../impl/GoogleCloudSubmitToArchiveCommand.java | 2 +- .../command/impl/LocalSubmitToArchiveCommand.java | 3 +-- .../command/impl/S3SubmitToArchiveCommand.java | 3 +-- .../dataverse/workflow/WorkflowServiceBean.java | 4 ++-- 7 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java 
b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index b4400e7b957..bcb8f37dede 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -3,7 +3,6 @@ import edu.harvard.iq.dataverse.DataCitation; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.SettingsWrapper; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; @@ -37,7 +36,7 @@ public abstract class AbstractSubmitToArchiveCommand extends AbstractCommand { private final DatasetVersion version; - private final Map requestedSettings = new HashMap(); + protected final Map requestedSettings = new HashMap(); protected boolean success=false; private static final Logger logger = Logger.getLogger(AbstractSubmitToArchiveCommand.class.getName()); private static final int MAX_ZIP_WAIT = 20000; @@ -50,8 +49,6 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion @Override public DatasetVersion execute(CommandContext ctxt) throws CommandException { - - String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); List settingsList = ListSplitUtil.split(settings); @@ -85,7 +82,9 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). 
*/ - public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map requestedSetttings) { + public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map requestedSettings) { + // this.requestedSettings won't be set yet in the workflow case, so set it now (used in getNumberOfBagGeneratorThreads) + this.requestedSettings.putAll(requestedSettings); // Check if earlier versions must be archived first String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived.toString()); boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue); @@ -127,7 +126,7 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t } } // Delegate to the archiver-specific implementation - return performArchiveSubmission(version, token, requestedSettings); + return performArchiveSubmission(version, token); } @@ -141,10 +140,8 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t * * @param version - the DatasetVersion to archive * @param token - an API Token for the user performing this action - * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). 
*/ - protected abstract WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, - Map requestedSettings); + protected abstract WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token); protected int getNumberOfBagGeneratorThreads() { if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java index 78e8454255b..01b9b4621e1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java @@ -82,8 +82,7 @@ public DRSSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion versi } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In DRSSubmitToArchiveCommand..."); JsonObject drsConfigObject = null; @@ -113,7 +112,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t JsonObject collectionConfig = adminMetadata.getJsonObject(COLLECTIONS).getJsonObject(alias); - WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token, requestedSettings); + WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token); JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index fe4a25091d7..71855abd927 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -49,8 +49,7 @@ public DuraCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { String port = requestedSettings.get(DURACLOUD_PORT) != null ? requestedSettings.get(DURACLOUD_PORT) : DEFAULT_PORT; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 7dfb9f07e19..5d27e71583b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -45,7 +45,7 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 462879f2ec9..d590e605985 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -36,8 +36,7 @@ public LocalSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion ver } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In LocalCloudSubmitToArchive..."); String localPath = requestedSettings.get(BagItLocalPath.toString()); String zipName = null; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index 65531d775c8..e3d5a0d8ae0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -78,8 +78,7 @@ public S3SubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion versio } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In S3SubmitToArchiveCommand..."); JsonObject configObject = null; diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java index ae1175f0e1d..fce13d1c181 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java @@ -180,12 +180,12 @@ private Map retrieveRequestedSettings(Map requir break; } case "boolean": { - retrievedSettings.put(setting, settings.isTrue(settingType, false)); + retrievedSettings.put(setting, settings.isTrue(setting, false)); break; } case "long": { 
retrievedSettings.put(setting, - settings.getValueForKeyAsLong(SettingsServiceBean.Key.valueOf(setting))); + settings.getValueForKeyAsLong(SettingsServiceBean.Key.parse(setting))); break; } } From 4aaf6ca3ceff1f772dad5821e3f7a8b76342060d Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 8 Jan 2026 09:26:00 -0500 Subject: [PATCH 28/97] efficiency improvement --- .../iq/dataverse/settings/SettingsServiceBean.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index d8495a2dc8a..1c67cb85060 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -802,16 +802,13 @@ public static SettingsServiceBean.Key parse(String key) { // Cut off the ":" we verified is present before String normalizedKey = key.substring(1); - // Iterate through all the known keys and return on match (case sensitive!) // We are case sensitive here because Dataverse implicitely uses case sensitive keys everywhere! 
- for (SettingsServiceBean.Key k : SettingsServiceBean.Key.values()) { - if (k.name().equals(normalizedKey)) { - return k; - } + try { + return SettingsServiceBean.Key.valueOf(normalizedKey); + } catch (IllegalArgumentException e) { + // Fall through on no match - return null for invalid keys + return null; } - - // Fall through on no match - return null; } } From 7cdef818079a6c3aa253063e1c68b41a54c4c0ed Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 8 Jan 2026 09:40:53 -0500 Subject: [PATCH 29/97] QDR fixes transx timeout, ignored bag thread setting, add deletable --- .../impl/AbstractSubmitToArchiveCommand.java | 18 +++++++++++++++++- .../iq/dataverse/util/bagit/BagGenerator.java | 8 ++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index bcb8f37dede..98e9dfb68e1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -20,6 +20,9 @@ import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; + +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; @@ -48,6 +51,7 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion } @Override + @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public DatasetVersion execute(CommandContext ctxt) throws CommandException { String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); @@ -174,8 +178,8 @@ public Thread startBagThread(DatasetVersion dv, PipedInputStream in, DigestInput public 
void run() { try (PipedOutputStream out = new PipedOutputStream(in)) { // Generate bag + BagGenerator.setNumConnections(getNumberOfBagGeneratorThreads()); BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); bagger.setAuthenticationKey(token.getTokenString()); bagger.generateBag(out); success = true; @@ -247,4 +251,16 @@ public static boolean isSingleVersion(SettingsWrapper settingsWrapper) { public static boolean isSingleVersion(SettingsServiceBean settingsService) { return false; } + + /** Whether the archiver can delete existing archival files (and thus can retry when the existing files are incomplete/obsolete) + * A static version supports calls via reflection while the instance method supports inheritance for use on actual command instances (see DatasetPage for both use cases). + * @return + */ + public static boolean supportsDelete() { + return false; + } + + public boolean canDelete() { + return supportsDelete(); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f24ebdb8655..3035694ae3d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -120,7 +120,7 @@ public class BagGenerator { private boolean usetemp = false; - private int numConnections = 8; + private static int numConnections = 2; public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); private OREMap oremap; @@ -1124,9 +1124,9 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } - public void setNumConnections(int numConnections) { - this.numConnections = numConnections; - logger.fine("BagGenerator will use " + numConnections + " threads"); + public static void setNumConnections(int numConnections) { + BagGenerator.numConnections = numConnections; 
+ logger.fine("All BagGenerators will use " + numConnections + " threads"); } } \ No newline at end of file From ce974dc1dbae0c282a7f34bd11978c23e198481a Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 19 Jan 2026 10:55:20 -0500 Subject: [PATCH 30/97] create lock in finalize --- .../command/impl/FinalizeDatasetPublicationCommand.java | 9 +++++++++ .../iq/dataverse/workflow/WorkflowServiceBean.java | 8 +++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index 1ef68ae4853..e867b69ae10 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -28,6 +28,7 @@ import edu.harvard.iq.dataverse.privateurl.PrivateUrl; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.BundleUtil; +import edu.harvard.iq.dataverse.workflow.WorkflowContext; import edu.harvard.iq.dataverse.workflow.WorkflowContext.TriggerType; import java.awt.datatransfer.StringSelection; @@ -248,6 +249,14 @@ public Dataset execute(CommandContext ctxt) throws CommandException { //Should this be in onSuccess()? 
ctxt.workflows().getDefaultWorkflow(TriggerType.PostPublishDataset).ifPresent(wf -> { try { + // Create the workflow lock BEFORE starting the workflow + DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, (AuthenticatedUser) getRequest().getUser()); + workflowLock.setDataset(ds); + ctxt.datasets().addDatasetLock(ds, workflowLock); + + // Build context with the lock attached + WorkflowContext context = buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased); + context.setLockId(ds.getLockFor(DatasetLock.Reason.Workflow).getId()); ctxt.workflows().start(wf, buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased), false); } catch (CommandException ex) { ctxt.datasets().removeDatasetLocks(ds, DatasetLock.Reason.Workflow); diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java index ae1175f0e1d..4974f1c6dde 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java @@ -133,8 +133,8 @@ public void start(Workflow wf, WorkflowContext ctxt, boolean findDataset) throws * (e.g. if this method is not asynchronous) * */ - - if (!findDataset) { + boolean isLocked = ctxt.getLockId()!=null; + if (!findDataset && !isLocked) { /* * Sleep here briefly to make sure the database update from the callers * transaction completes which avoids any concurrency/optimistic lock issues. @@ -152,7 +152,9 @@ public void start(Workflow wf, WorkflowContext ctxt, boolean findDataset) throws } //Refresh will only em.find the dataset if findDataset is true. 
(otherwise the dataset is em.merged) ctxt = refresh(ctxt, retrieveRequestedSettings( wf.getRequiredSettings()), getCurrentApiToken(ctxt.getRequest().getAuthenticatedUser()), findDataset); - lockDataset(ctxt, new DatasetLock(DatasetLock.Reason.Workflow, ctxt.getRequest().getAuthenticatedUser())); + if(!isLocked) { + lockDataset(ctxt, new DatasetLock(DatasetLock.Reason.Workflow, ctxt.getRequest().getAuthenticatedUser())); + } forward(wf, ctxt); } From 900033c2b00e1ef220956db30cba25db81a8cc15 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 20 Jan 2026 12:36:59 -0500 Subject: [PATCH 31/97] add lock before workflow in publish and api --- .../edu/harvard/iq/dataverse/api/Datasets.java | 14 +++++++++++--- .../engine/command/impl/PublishDatasetCommand.java | 14 ++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 1b3016ec2f4..cf18d343aaa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -1388,17 +1388,25 @@ public Response publishMigratedDataset(@Context ContainerRequestContext crc, Str */ String errorMsg = null; Optional prePubWf = wfService.getDefaultWorkflow(TriggerType.PrePublishDataset); - + DataverseRequest dataverseRequest = createDataverseRequest(user); try { // ToDo - should this be in onSuccess()? 
May relate to todo above if (prePubWf.isPresent()) { + // Create the workflow lock BEFORE starting the workflow + DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, user); + workflowLock.setDataset(ds); + datasetSvc.addDatasetLock(ds, workflowLock); + + // Build context with the lock attached + WorkflowContext context = new WorkflowContext(dataverseRequest, ds, TriggerType.PrePublishDataset, !contactPIDProvider); + context.setLockId(ds.getLockFor(DatasetLock.Reason.Workflow).getId()); // Start the workflow, the workflow will call FinalizeDatasetPublication later wfService.start(prePubWf.get(), - new WorkflowContext(createDataverseRequest(user), ds, TriggerType.PrePublishDataset, !contactPIDProvider), + new WorkflowContext(dataverseRequest, ds, TriggerType.PrePublishDataset, !contactPIDProvider), false); } else { FinalizeDatasetPublicationCommand cmd = new FinalizeDatasetPublicationCommand(ds, - createDataverseRequest(user), !contactPIDProvider); + dataverseRequest, !contactPIDProvider); ds = commandEngine.submit(cmd); } } catch (CommandException ex) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java index 915ef6ea2a1..62974534cc8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java @@ -12,6 +12,7 @@ import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.workflow.Workflow; +import edu.harvard.iq.dataverse.workflow.WorkflowContext; import edu.harvard.iq.dataverse.workflow.WorkflowContext.TriggerType; import jakarta.persistence.OptimisticLockException; @@ -111,10 +112,15 @@ public PublishDatasetResult execute(CommandContext ctxt) throws CommandException if ( 
prePubWf.isPresent() ) { // We start a workflow try { - theDataset = ctxt.em().merge(theDataset); - ctxt.em().flush(); - ctxt.workflows().start(prePubWf.get(), - buildContext(theDataset, TriggerType.PrePublishDataset, datasetExternallyReleased), true); + // Create the workflow lock BEFORE starting the workflow + DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, (AuthenticatedUser) getRequest().getUser()); + workflowLock.setDataset(theDataset); + ctxt.datasets().addDatasetLock(theDataset, workflowLock); + theDataset = ctxt.em().merge(theDataset); + ctxt.em().flush(); + WorkflowContext context = buildContext(theDataset, TriggerType.PrePublishDataset, datasetExternallyReleased); + context.setLockId(theDataset.getLockFor(DatasetLock.Reason.Workflow).getId()); + ctxt.workflows().start(prePubWf.get(), context, true); return new PublishDatasetResult(theDataset, Status.Workflow); } catch (OptimisticLockException e) { throw new CommandException(e.getMessage(), e, this); From d1be22e9384f6b914915afb730f2f3f6fd3e2391 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 21 Jan 2026 12:47:25 -0500 Subject: [PATCH 32/97] use update context --- .../engine/command/impl/FinalizeDatasetPublicationCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index e867b69ae10..8500ca0ff3b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -257,7 +257,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { // Build context with the lock attached WorkflowContext context = buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased); 
context.setLockId(ds.getLockFor(DatasetLock.Reason.Workflow).getId()); - ctxt.workflows().start(wf, buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased), false); + ctxt.workflows().start(wf, context, false); } catch (CommandException ex) { ctxt.datasets().removeDatasetLocks(ds, DatasetLock.Reason.Workflow); logger.log(Level.SEVERE, "Error invoking post-publish workflow: " + ex.getMessage(), ex); From e6426c9cdefe8162c5f373bded10f8153e8aa069 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 21 Jan 2026 12:51:29 -0500 Subject: [PATCH 33/97] move post wf to onSuccess --- .../FinalizeDatasetPublicationCommand.java | 51 ++++++++----------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index 8500ca0ff3b..5a59508741b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -248,20 +248,10 @@ public Dataset execute(CommandContext ctxt) throws CommandException { //Should this be in onSuccess()? 
ctxt.workflows().getDefaultWorkflow(TriggerType.PostPublishDataset).ifPresent(wf -> { - try { - // Create the workflow lock BEFORE starting the workflow - DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, (AuthenticatedUser) getRequest().getUser()); - workflowLock.setDataset(ds); - ctxt.datasets().addDatasetLock(ds, workflowLock); - - // Build context with the lock attached - WorkflowContext context = buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased); - context.setLockId(ds.getLockFor(DatasetLock.Reason.Workflow).getId()); - ctxt.workflows().start(wf, context, false); - } catch (CommandException ex) { - ctxt.datasets().removeDatasetLocks(ds, DatasetLock.Reason.Workflow); - logger.log(Level.SEVERE, "Error invoking post-publish workflow: " + ex.getMessage(), ex); - } + // Create the workflow lock BEFORE starting the workflow + DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, (AuthenticatedUser) getRequest().getUser()); + workflowLock.setDataset(ds); + ctxt.datasets().addDatasetLock(ds, workflowLock); }); Dataset readyDataset = ctxt.em().merge(ds); @@ -297,6 +287,22 @@ public boolean onSuccess(CommandContext ctxt, Object r) { } catch (Exception e) { logger.warning("Failure to send dataset published messages for : " + dataset.getId() + " : " + e.getMessage()); } + + final Dataset ds = dataset; + ctxt.workflows().getDefaultWorkflow(TriggerType.PostPublishDataset).ifPresent(wf -> { + // Build context with the lock attached + WorkflowContext context = buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased); + context.setLockId(ds.getLockFor(DatasetLock.Reason.Workflow).getId()); + try { + ctxt.workflows().start(wf, context, false); + } catch (CommandException e) { + logger.log(Level.SEVERE, "Error invoking post-publish workflow: " + e.getMessage(), e); + } + }); + // Metadata export: + ctxt.datasets().reExportDatasetAsync(dataset); + + ctxt.index().asyncIndexDataset(dataset, true); 
//re-indexing dataverses that have additional subjects if (!dataversesToIndex.isEmpty()){ @@ -312,23 +318,6 @@ public boolean onSuccess(CommandContext ctxt, Object r) { } } - // Metadata export: - - try { - ExportService instance = ExportService.getInstance(); - instance.exportAllFormats(dataset); - dataset = ctxt.datasets().merge(dataset); - } catch (Exception ex) { - // Something went wrong! - // Just like with indexing, a failure to export is not a fatal - // condition. We'll just log the error as a warning and keep - // going: - logger.log(Level.WARNING, "Finalization: exception caught while exporting: "+ex.getMessage(), ex); - // ... but it is important to only update the export time stamp if the - // export was indeed successful. - } - ctxt.index().asyncIndexDataset(dataset, true); - return retVal; } From baaa1db25c5f27fd21e4e686cd01120a7ecce3f4 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 21 Jan 2026 13:36:26 -0500 Subject: [PATCH 34/97] prepub wf in onSuccess --- .../command/impl/PublishDatasetCommand.java | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java index 62974534cc8..8282aa076ca 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java @@ -18,6 +18,7 @@ import jakarta.persistence.OptimisticLockException; import java.util.Optional; +import java.util.logging.Level; import java.util.logging.Logger; import static java.util.stream.Collectors.joining; import static edu.harvard.iq.dataverse.engine.command.impl.PublishDatasetResult.Status; @@ -107,25 +108,21 @@ public PublishDatasetResult execute(CommandContext ctxt) throws CommandException } } - //ToDo - should this be in onSuccess()? 
May relate to todo above Optional prePubWf = ctxt.workflows().getDefaultWorkflow(TriggerType.PrePublishDataset); - if ( prePubWf.isPresent() ) { + if (prePubWf.isPresent()) { // We start a workflow try { - // Create the workflow lock BEFORE starting the workflow - DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, (AuthenticatedUser) getRequest().getUser()); - workflowLock.setDataset(theDataset); - ctxt.datasets().addDatasetLock(theDataset, workflowLock); - theDataset = ctxt.em().merge(theDataset); - ctxt.em().flush(); - WorkflowContext context = buildContext(theDataset, TriggerType.PrePublishDataset, datasetExternallyReleased); - context.setLockId(theDataset.getLockFor(DatasetLock.Reason.Workflow).getId()); - ctxt.workflows().start(prePubWf.get(), context, true); + // Create the workflow lock BEFORE starting the workflow + DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, (AuthenticatedUser) getRequest().getUser()); + workflowLock.setDataset(theDataset); + ctxt.datasets().addDatasetLock(theDataset, workflowLock); + theDataset = ctxt.em().merge(theDataset); + ctxt.em().flush(); + return new PublishDatasetResult(theDataset, Status.Workflow); } catch (OptimisticLockException e) { throw new CommandException(e.getMessage(), e, this); } - } else{ // We will skip trying to register the global identifiers for datafiles // if "dependent" file-level identifiers are requested, AND the naming @@ -137,7 +134,7 @@ public PublishDatasetResult execute(CommandContext ctxt) throws CommandException // than the configured limit number of files, then call Finalize // asychronously (default is 10) // ... - // Additionaly in 4.9.3 we have added a system variable to disable + // Additionally in 4.9.3 we have added a system variable to disable // registering file PIDs on the installation level. 
boolean registerGlobalIdsForFiles = ctxt.systemConfig().isFilePIDsEnabledForCollection(getDataset().getOwner()) && @@ -263,10 +260,23 @@ public boolean onSuccess(CommandContext ctxt, Object r) { dataset = ((PublishDatasetResult) r).getDataset(); } + final Dataset ds = dataset; + if (dataset != null) { + Optional prePubWf = ctxt.workflows().getDefaultWorkflow(TriggerType.PrePublishDataset); - //A pre-publication workflow will call FinalizeDatasetPublicationCommand itself when it completes - if (! prePubWf.isPresent() ) { + // A pre-publication workflow will call FinalizeDatasetPublicationCommand itself when it completes + if (prePubWf.isPresent()) { + WorkflowContext context = buildContext(ds, TriggerType.PrePublishDataset, datasetExternallyReleased); + context.setLockId(ds.getLockFor(DatasetLock.Reason.Workflow).getId()); + try { + ctxt.workflows().start(prePubWf.get(), context, true); + } catch (CommandException e) { + logger.log(Level.SEVERE, "Error invoking pre-publish workflow: " + e.getMessage(), e); + return false; + } + } + else { logger.fine("From onSuccess, calling FinalizeDatasetPublicationCommand for dataset " + dataset.getGlobalId().asString()); ctxt.datasets().callFinalizePublishCommandAsynchronously(dataset.getId(), ctxt, request, datasetExternallyReleased); } From f64a80e30a29e907a90314aa17820c937af767bd Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 21 Jan 2026 13:38:07 -0500 Subject: [PATCH 35/97] Cleanup, fix generic message for pre and post pub wf --- .../engine/command/impl/FinalizeDatasetPublicationCommand.java | 1 - src/main/java/propertyFiles/Bundle.properties | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index 5a59508741b..7cc5bb47d97 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -246,7 +246,6 @@ public Dataset execute(CommandContext ctxt) throws CommandException { //Remove any pre-pub workflow lock (not needed as WorkflowServiceBean.workflowComplete() should already have removed it after setting the finalizePublication lock?) ctxt.datasets().removeDatasetLocks(ds, DatasetLock.Reason.Workflow); - //Should this be in onSuccess()? ctxt.workflows().getDefaultWorkflow(TriggerType.PostPublishDataset).ifPresent(wf -> { // Create the workflow lock BEFORE starting the workflow DatasetLock workflowLock = new DatasetLock(DatasetLock.Reason.Workflow, (AuthenticatedUser) getRequest().getUser()); diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index f6c0054a43a..23b4a895363 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -1649,7 +1649,7 @@ dataset.share.datasetShare=Share Dataset dataset.share.datasetShare.tip=Share this dataset on your favorite social media networks. dataset.share.datasetShare.shareText=View this dataset. dataset.locked.message=Dataset Locked -dataset.locked.message.details=This dataset is locked until publication. +dataset.locked.message.details=This dataset is temporarily locked while background processing related to publication completes. dataset.locked.inReview.message=Submitted for Review dataset.locked.ingest.message=The tabular data files uploaded are being processed and converted into the archival format dataset.unlocked.ingest.message=The tabular files have been ingested. @@ -1680,7 +1680,6 @@ dataset.compute.computeBatchListHeader=Compute Batch dataset.compute.computeBatchRestricted=This dataset contains restricted files you may not compute on because you have not been granted access. 
dataset.delete.error=Could not deaccession the dataset because the {0} update failed. dataset.publish.workflow.message=Publish in Progress -dataset.publish.workflow.inprogress=This dataset is locked until publication. dataset.pidRegister.workflow.inprogress=The dataset is locked while the persistent identifiers are being registered or updated, and/or the physical files are being validated. dataset.versionUI.draft=Draft dataset.versionUI.inReview=In Review From 6e1138282a2e27f9531ede25a7cbd770189508d7 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 21 Jan 2026 14:03:33 -0500 Subject: [PATCH 36/97] handle possible OLE --- .../harvard/iq/dataverse/DatasetServiceBean.java | 15 +++++++++++++++ .../harvest/server/OAIRecordServiceBean.java | 7 ++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index a58dad4f4c7..8b820fbc7a4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -1140,4 +1140,19 @@ public void saveStorageQuota(Dataset target, Long allocation) { } em.flush(); } + + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + public void setLastExportTimeInNewTransaction(Long datasetId, Date lastExportTime) { + try { + Dataset currentDataset = find(datasetId); + if (currentDataset != null) { + currentDataset.setLastExportTime(lastExportTime); + merge(currentDataset); + } else { + logger.log(Level.SEVERE, "Could not find Dataset with id={0} to retry setting the last export time after OptimisticLockException.", datasetId); + } + } catch (Exception e) { + logger.log(Level.SEVERE, "Failed to retry export after OptimisticLockException for dataset id=" + datasetId, e); + } + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java 
b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java index cc15d4c978b..b31268725b0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java @@ -26,6 +26,7 @@ import static jakarta.ejb.TransactionAttributeType.REQUIRES_NEW; import jakarta.inject.Named; import jakarta.persistence.EntityManager; +import jakarta.persistence.OptimisticLockException; import jakarta.persistence.PersistenceContext; import jakarta.persistence.TypedQuery; import jakarta.persistence.TemporalType; @@ -262,7 +263,11 @@ public void exportAllFormatsInNewTransaction(Dataset dataset) throws ExportExcep try { ExportService exportServiceInstance = ExportService.getInstance(); exportServiceInstance.exportAllFormats(dataset); - dataset = datasetService.merge(dataset); + //Use em.merge or the jakarta OLE we want to catch will be wrapped + dataset = em.merge(dataset); + em.flush(); + } catch (OptimisticLockException ole) { + datasetService.setLastExportTimeInNewTransaction(dataset.getId(), dataset.getLastExportTime()); } catch (Exception e) { logger.log(Level.FINE, "Caught unknown exception while trying to export", e); throw new ExportException(e.getMessage()); From 709b4da87c0186efc049a9b7625fd04fdddd9797 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 24 Nov 2025 16:16:15 -0500 Subject: [PATCH 37/97] try async command for archiving --- .../edu/harvard/iq/dataverse/DatasetPage.java | 25 ++++++++---------- .../iq/dataverse/EjbDataverseEngine.java | 26 +++++++++++++++++++ src/main/java/propertyFiles/Bundle.properties | 1 + 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 20617160a1c..b97b8ec6578 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -6101,20 +6101,17 @@ public void archiveVersion(Long id) { AbstractSubmitToArchiveCommand cmd = ArchiverUtil.createSubmitToArchiveCommand(className, dvRequestService.getDataverseRequest(), dv); if (cmd != null) { try { - DatasetVersion version = commandEngine.submit(cmd); - if (!version.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { - logger.info( - "DatasetVersion id=" + version.getId() + " submitted to Archive, status: " + dv.getArchivalCopyLocationStatus()); - } else { - logger.severe("Error submitting version " + version.getId() + " due to conflict/error at Archive"); - } - if (version.getArchivalCopyLocation() != null) { - setVersionTabList(resetVersionTabList()); - this.setVersionTabListForPostLoad(getVersionTabList()); - JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.success")); - } else { - JsfHelper.addErrorMessage(BundleUtil.getStringFromBundle("datasetversion.archive.failure")); - } + commandEngine.submitAsync(cmd); + + // Set initial pending status + dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); + + logger.info( + "DatasetVersion id=" + dv.getId() + " submitted to Archive, status: " + dv.getArchivalCopyLocationStatus()); + setVersionTabList(resetVersionTabList()); + this.setVersionTabListForPostLoad(getVersionTabList()); + JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.inprogress")); + } catch (CommandException ex) { logger.log(Level.SEVERE, "Unexpected Exception calling submit archive command", ex); JsfHelper.addErrorMessage(BundleUtil.getStringFromBundle("datasetversion.archive.failure")); diff --git a/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java b/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java index 4d6d59cb013..5a3f105497d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java @@ -31,6 +31,9 @@ import java.util.Map; import java.util.Set; + +import jakarta.ejb.AsyncResult; +import jakarta.ejb.Asynchronous; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.inject.Named; @@ -45,6 +48,7 @@ import java.util.Arrays; import java.util.EnumSet; import java.util.Stack; +import java.util.concurrent.Future; import java.util.logging.Level; import java.util.logging.Logger; import jakarta.annotation.Resource; @@ -348,6 +352,28 @@ public R submit(Command aCommand) throws CommandException { logSvc.log(logRec); } } + + /** + * Submits a command for asynchronous execution. + * The command will be executed in a separate thread and won't block the caller. + * + * @param The return type of the command + * @param aCommand The command to execute + * @param user The user executing the command + * @return A Future representing the pending result + * @throws CommandException if the command cannot be submitted + */ + @Asynchronous + public Future submitAsync(Command aCommand) throws CommandException { + try { + logger.log(Level.INFO, "Submitting async command: {0}", aCommand.getClass().getSimpleName()); + R result = submit(aCommand); + return new AsyncResult<>(result); + } catch (Exception e) { + logger.log(Level.SEVERE, "Async command execution failed: " + aCommand.getClass().getSimpleName(), e); + throw e; + } + } protected void completeCommand(Command command, Object r, Stack called) { diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index f6c0054a43a..d9b9fd7bc48 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -2699,6 +2699,7 @@ dataset.notlinked.msg=There was a problem linking this dataset to yours: dataset.linking.popop.already.linked.note=Note: This dataset is already linked to the following dataverse(s): dataset.linking.popup.not.linked.note=Note: This 
dataset is not linked to any of your accessible dataverses datasetversion.archive.success=Archival copy of Version successfully submitted +datasetversion.archive.inprogress= Data Project archiving has been started datasetversion.archive.failure=Error in submitting an archival copy datasetversion.update.failure=Dataset Version Update failed. Changes are still in the DRAFT version. datasetversion.update.archive.failure=Dataset Version Update succeeded, but the attempt to update the archival copy failed. From 6487c1433f1c960d645250cea421c1659120d3c9 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 24 Nov 2025 17:07:48 -0500 Subject: [PATCH 38/97] save status --- src/main/java/edu/harvard/iq/dataverse/DatasetPage.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index b97b8ec6578..0bf0db42728 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -6101,10 +6101,12 @@ public void archiveVersion(Long id) { AbstractSubmitToArchiveCommand cmd = ArchiverUtil.createSubmitToArchiveCommand(className, dvRequestService.getDataverseRequest(), dv); if (cmd != null) { try { - commandEngine.submitAsync(cmd); - + // Set initial pending status dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); + dv = datasetVersionService.merge(dv); + + commandEngine.submitAsync(cmd); logger.info( "DatasetVersion id=" + dv.getId() + " submitted to Archive, status: " + dv.getArchivalCopyLocationStatus()); From 9d32051fe76d0914fc35d21f693211054fc0c38a Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 8 Jan 2026 13:07:23 -0500 Subject: [PATCH 39/97] refactor, use persistArchivalCopyLocation everywhere --- .../edu/harvard/iq/dataverse/DatasetPage.java | 2 +- .../iq/dataverse/DatasetVersionServiceBean.java | 17 +++++++++++++++++ 
.../edu/harvard/iq/dataverse/api/Datasets.java | 1 + .../impl/AbstractSubmitToArchiveCommand.java | 3 ++- 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 0bf0db42728..281734cd66e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -6104,7 +6104,7 @@ public void archiveVersion(Long id) { // Set initial pending status dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); - dv = datasetVersionService.merge(dv); + datasetVersionService.persistArchivalCopyLocation(dv); commandEngine.submitAsync(cmd); diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 60df1fd3dfd..7656f975d2a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -1333,4 +1333,21 @@ public Long getDatasetVersionCount(Long datasetId, boolean canViewUnpublishedVer return em.createQuery(cq).getSingleResult(); } + + + /** + * Update the archival copy location for a specific version of a dataset. Archiving can be long-running and other parallel updates to the datasetversion have likely occurred + * + * @param dv + * The dataset version whose archival copy location we want to update. Must not be {@code null}. + * @param archivalStatusPending + * the JSON status string, may be {@code null}. 
+ */ + public void persistArchivalCopyLocation(DatasetVersion dv) { + em.createNativeQuery( + "UPDATE datasetversion SET archivalcopylocation = ?1 WHERE id = ?2") + .setParameter(1, dv.getArchivalCopyLocation()) + .setParameter(2, dv.getId()) + .executeUpdate(); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 1b3016ec2f4..c8e66115575 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -1280,6 +1280,7 @@ public Response publishDataset(@Context ContainerRequestContext crc, @PathParam( if (archiveCommand != null) { // Delete the record of any existing copy since it is now out of date/incorrect updateVersion.setArchivalCopyLocation(null); + datasetVersionSvc.persistArchivalCopyLocation(updateVersion); /* * Then try to generate and submit an archival copy. Note that running this * command within the CuratePublishedDatasetVersionCommand was causing an error: diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 29c27d0396d..7e39a8e7b85 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -63,7 +63,8 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { token = ctxt.authentication().generateApiTokenForUser(user); } performArchiveSubmission(version, token, requestedSettings); - return ctxt.em().merge(version); + ctxt.datasetVersion().persistArchivalCopyLocation(version); + return version; } /** From ec5046cc161193fd102481a9a53cb439c5768f94 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 12 Jan 2026 10:55:48 -0500 Subject: [PATCH 40/97] catch OLE when 
persisting archivalcopylocation --- .../dataverse/DatasetVersionServiceBean.java | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 7656f975d2a..b5e964e5673 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -33,6 +33,7 @@ import jakarta.json.JsonObjectBuilder; import jakarta.persistence.EntityManager; import jakarta.persistence.NoResultException; +import jakarta.persistence.OptimisticLockException; import jakarta.persistence.PersistenceContext; import jakarta.persistence.Query; import jakarta.persistence.TypedQuery; @@ -1336,18 +1337,25 @@ public Long getDatasetVersionCount(Long datasetId, boolean canViewUnpublishedVer /** - * Update the archival copy location for a specific version of a dataset. Archiving can be long-running and other parallel updates to the datasetversion have likely occurred + * Update the archival copy location for a specific version of a dataset. Archiving can be long-running and other parallel updates to the datasetversion have likely occurred so this method will check + * for OptimisticLockExceptions and retry the update with the latest version. * * @param dv * The dataset version whose archival copy location we want to update. Must not be {@code null}. - * @param archivalStatusPending - * the JSON status string, may be {@code null}. 
*/ public void persistArchivalCopyLocation(DatasetVersion dv) { - em.createNativeQuery( - "UPDATE datasetversion SET archivalcopylocation = ?1 WHERE id = ?2") - .setParameter(1, dv.getArchivalCopyLocation()) - .setParameter(2, dv.getId()) - .executeUpdate(); + try { + em.merge(dv); + em.flush(); // Force the update and version check immediately + } catch (OptimisticLockException ole) { + logger.log(Level.INFO, "OptimisticLockException while persisting archival copy location for DatasetVersion id={0}. Retrying on latest version.", dv.getId()); + DatasetVersion currentVersion = find(dv.getId()); + if (currentVersion != null) { + currentVersion.setArchivalCopyLocation(dv.getArchivalCopyLocation()); + em.merge(currentVersion); + } else { + logger.log(Level.SEVERE, "Could not find DatasetVersion with id={0} to retry persisting archival copy location after OptimisticLockException.", dv.getId()); + } + } } } From c1055b87cd3445adc0a21f4248c1ec2fb4442774 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 25 Nov 2025 14:23:22 -0500 Subject: [PATCH 41/97] Add obsolete state, update display, add supportsDelete --- .../edu/harvard/iq/dataverse/DatasetPage.java | 81 ++++++++++++------- .../harvard/iq/dataverse/DatasetVersion.java | 1 + .../impl/AbstractSubmitToArchiveCommand.java | 4 + .../GoogleCloudSubmitToArchiveCommand.java | 39 ++++++++- src/main/java/propertyFiles/Bundle.properties | 1 + src/main/webapp/dataset-versions.xhtml | 8 +- 6 files changed, 103 insertions(+), 31 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 281734cd66e..0832560eafb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -42,6 +42,7 @@ import edu.harvard.iq.dataverse.engine.command.impl.UpdateDatasetVersionCommand; import edu.harvard.iq.dataverse.export.ExportService; import 
edu.harvard.iq.dataverse.util.cache.CacheFactoryBean; +import edu.harvard.iq.dataverse.util.json.JsonUtil; import io.gdcc.spi.export.ExportException; import io.gdcc.spi.export.Exporter; import edu.harvard.iq.dataverse.ingest.IngestRequest; @@ -105,6 +106,9 @@ import jakarta.faces.view.ViewScoped; import jakarta.inject.Inject; import jakarta.inject.Named; +import jakarta.json.Json; +import jakarta.json.JsonObject; +import jakarta.json.JsonObjectBuilder; import jakarta.persistence.OptimisticLockException; import org.apache.commons.lang3.StringUtils; @@ -2992,27 +2996,40 @@ public String updateCurrentVersion() { String className = settingsService.get(SettingsServiceBean.Key.ArchiverClassName.toString()); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, dvRequestService.getDataverseRequest(), updateVersion); if (archiveCommand != null) { - // Delete the record of any existing copy since it is now out of date/incorrect - updateVersion.setArchivalCopyLocation(null); - /* - * Then try to generate and submit an archival copy. Note that running this - * command within the CuratePublishedDatasetVersionCommand was causing an error: - * "The attribute [id] of class - * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary - * key column in the database. Updates are not allowed." To avoid that, and to - * simplify reporting back to the GUI whether this optional step succeeded, I've - * pulled this out as a separate submit(). 
- */ - try { - updateVersion = commandEngine.submit(archiveCommand); - if (!updateVersion.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.success"); - } else { - errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure"); + //There is an archiver configured, so now decide what to do: + // If a successful copy exists, don't automatically update, just note the old copy is obsolete (and enable the superadmin button in the display to allow a ~manual update if desired) + // If pending or an obsolete copy exists, do nothing (nominally if a pending run succeeds and we're updating the current version here, it should be marked as obsolete - ignoring for now since updates within the time an archiving run is pending should be rare + // If a failure or null, rerun archiving now. If a failure is due to an existing copy in the repo, we'll fail again + String status = updateVersion.getArchivalCopyLocationStatus(); + if((status==null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)){ + // Delete the record of any existing copy since it is now out of date/incorrect + updateVersion.setArchivalCopyLocation(null); + /* + * Then try to generate and submit an archival copy. Note that running this + * command within the CuratePublishedDatasetVersionCommand was causing an error: + * "The attribute [id] of class + * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary + * key column in the database. Updates are not allowed." To avoid that, and to + * simplify reporting back to the GUI whether this optional step succeeded, I've + * pulled this out as a separate submit(). 
+ */ + try { + updateVersion = commandEngine.submit(archiveCommand); + if (!updateVersion.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { + successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.success"); + } else { + errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure"); + } + } catch (CommandException ex) { + errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); + logger.severe(ex.getMessage()); } - } catch (CommandException ex) { - errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); - logger.severe(ex.getMessage()); + } else if(status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { + JsonObject archivalLocation = JsonUtil.getJsonObject(updateVersion.getArchivalCopyLocation()); + JsonObjectBuilder job = Json.createObjectBuilder(archivalLocation); + job.add(DatasetVersion.ARCHIVAL_STATUS,DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); + updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + datasetVersionService.merge(updateVersion); } } } @@ -6094,14 +6111,16 @@ public void refreshPaginator() { * * @param id - the id of the datasetversion to archive. 
*/ - public void archiveVersion(Long id) { + public void archiveVersion(Long id, boolean force) { if (session.getUser() instanceof AuthenticatedUser) { DatasetVersion dv = datasetVersionService.retrieveDatasetVersionByVersionId(id).getDatasetVersion(); String className = settingsWrapper.getValueForKey(SettingsServiceBean.Key.ArchiverClassName, null); AbstractSubmitToArchiveCommand cmd = ArchiverUtil.createSubmitToArchiveCommand(className, dvRequestService.getDataverseRequest(), dv); if (cmd != null) { try { - + String status = dv.getArchivalCopyLocationStatus(); + if(status == null || (force && cmd.supportsDelete())){ + // Set initial pending status dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); datasetVersionService.persistArchivalCopyLocation(dv); @@ -6113,7 +6132,7 @@ public void archiveVersion(Long id) { setVersionTabList(resetVersionTabList()); this.setVersionTabListForPostLoad(getVersionTabList()); JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.inprogress")); - + } } catch (CommandException ex) { logger.log(Level.SEVERE, "Unexpected Exception calling submit archive command", ex); JsfHelper.addErrorMessage(BundleUtil.getStringFromBundle("datasetversion.archive.failure")); @@ -6146,21 +6165,26 @@ public boolean isArchivable() { return archivable; } + /** Method to decide if a 'Submit' button should be enabled for archiving a dataset version. 
*/ public boolean isVersionArchivable() { if (versionArchivable == null) { // If this dataset isn't in an archivable collection return false versionArchivable = false; if (isArchivable()) { - boolean checkForArchivalCopy = false; + // Otherwise, we need to know if the archiver is single-version-only // If it is, we have to check for an existing archived version to answer the // question String className = settingsWrapper.getValueForKey(SettingsServiceBean.Key.ArchiverClassName, null); if (className != null) { try { + boolean checkForArchivalCopy = false; Class clazz = Class.forName(className); Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class); + Method m2 = clazz.getMethod("supportsDelete"); + Object[] params = { settingsWrapper }; + boolean supportsDelete = (Boolean) m2.invoke(null); checkForArchivalCopy = (Boolean) m.invoke(null, params); if (checkForArchivalCopy) { @@ -6168,9 +6192,12 @@ public boolean isVersionArchivable() { // one version is already archived (or attempted - any non-null status) versionArchivable = !isSomeVersionArchived(); } else { - // If we allow multiple versions or didn't find one that has had archiving run - // on it, we can archive, so return true - versionArchivable = true; + // If we didn't find one that has had archiving run + // on it, or we archiving per version is supported and either + // the status is null or the archiver can delete prior runs and status isn't success, + // we can archive, so return true + String status = workingVersion.getArchivalCopyLocationStatus(); + versionArchivable = (status == null) || ((!status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) && (!status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING)) && supportsDelete)); } } catch (ClassNotFoundException | IllegalAccessException | IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) { diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java 
b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 93b0ccfef61..0de0dedc860 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -132,6 +132,7 @@ public enum VersionState { public static final String ARCHIVAL_STATUS_PENDING = "pending"; public static final String ARCHIVAL_STATUS_SUCCESS = "success"; public static final String ARCHIVAL_STATUS_FAILURE = "failure"; + public static final String ARCHIVAL_STATUS_OBSOLETE = "obsolete"; @Id @GeneratedValue(strategy = GenerationType.IDENTITY) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 7e39a8e7b85..f7716534b7f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -184,4 +184,8 @@ public static boolean isSingleVersion(SettingsWrapper settingsWrapper) { public static boolean isSingleVersion(SettingsServiceBean settingsService) { return false; } + + public static boolean supportsDelete() { + return false; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 7dfb9f07e19..97ca104f01c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -44,6 +44,11 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi super(aRequest, version); } + @Override + public static boolean supportsDelete() { + return true; + } + @Override public WorkflowStepResult 
performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); @@ -73,6 +78,34 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') .replace('.', '-').toLowerCase(); + // Check for and delete existing files for this version + String dataciteFileName = spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml"; + String bagFileName = spaceName + "/" + spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + + logger.fine("Checking for existing files in archive..."); + + try { + Blob existingDatacite = bucket.get(dataciteFileName); + if (existingDatacite != null && existingDatacite.exists()) { + logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); + existingDatacite.delete(); + logger.fine("Deleted existing datacite.xml"); + } + } catch (StorageException se) { + logger.warning("Error checking/deleting existing datacite.xml: " + se.getMessage()); + } + + try { + Blob existingBag = bucket.get(bagFileName); + if (existingBag != null && existingBag.exists()) { + logger.fine("Found existing bag file, deleting: " + bagFileName); + existingBag.delete(); + logger.fine("Deleted existing bag file"); + } + } catch (StorageException se) { + logger.warning("Error checking/deleting existing bag file: " + se.getMessage()); + } + String dataciteXml = getDataCiteXml(dv); MessageDigest messageDigest = MessageDigest.getInstance("MD5"); try (PipedInputStream dataciteIn = new PipedInputStream(); @@ -102,7 +135,7 @@ public void run() { Thread.sleep(10); i++; } - Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + Blob dcXml = bucket.create(dataciteFileName, digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); dcThread.join(); 
String checksum = dcXml.getMd5ToHexString(); @@ -131,7 +164,7 @@ public void run() { try (PipedInputStream in = new PipedInputStream(100000); DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", + Blob bag = bucket.create(bagFileName, digestInputStream2, "application/zip", Bucket.BlobWriteOption.doesNotExist()); if (bag.getSize() == 0) { throw new IOException("Empty Bag"); @@ -139,7 +172,7 @@ public void run() { bagThread.join(); checksum = bag.getMd5ToHexString(); - logger.fine("Bag: " + fileName + " added with checksum: " + checksum); + logger.fine("Bag: " + bagFileName + " added with checksum: " + checksum); localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); if (!success || !checksum.equals(localchecksum)) { logger.severe(success ? checksum + " not equal to " + localchecksum diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index d9b9fd7bc48..dbc2ce40657 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -2141,6 +2141,7 @@ file.dataFilesTab.versions.headers.contributors.withheld=Contributor name(s) wit file.dataFilesTab.versions.headers.published=Published on file.dataFilesTab.versions.headers.archived=Archival Status file.dataFilesTab.versions.headers.archived.success=Archived +file.dataFilesTab.versions.headers.archived.obsolete=Original Version Archived file.dataFilesTab.versions.headers.archived.pending=Pending file.dataFilesTab.versions.headers.archived.failure=Failed file.dataFilesTab.versions.headers.archived.notarchived=Not Archived diff --git a/src/main/webapp/dataset-versions.xhtml b/src/main/webapp/dataset-versions.xhtml index 9e5f0a9b24d..1f33675bd3d 100644 --- 
a/src/main/webapp/dataset-versions.xhtml +++ b/src/main/webapp/dataset-versions.xhtml @@ -169,9 +169,15 @@ + + + + + + - From f912fd043945850ac87d396833cdc9c94d62f56c Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 25 Nov 2025 14:34:32 -0500 Subject: [PATCH 42/97] doc that api doesn't handls supportsDelete yet --- src/main/java/edu/harvard/iq/dataverse/api/Admin.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java index 18f28569d7d..10aadde57b6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java @@ -2067,6 +2067,7 @@ public Response submitDatasetVersionToArchive(@Context ContainerRequestContext c if(dv==null) { return error(Status.BAD_REQUEST, "Requested version not found."); } + //ToDo - allow forcing with a non-success status for archivers that supportsDelete() if (dv.getArchivalCopyLocation() == null) { String className = settingsService.getValueForKey(SettingsServiceBean.Key.ArchiverClassName); // Note - the user is being sent via the createDataverseRequest(au) call to the @@ -2132,7 +2133,7 @@ public Response archiveAllUnarchivedDatasetVersions(@Context ContainerRequestCon try { AuthenticatedUser au = getRequestAuthenticatedUserOrDie(crc); - + //ToDo - allow forcing with a non-success status for archivers that supportsDelete() List dsl = datasetversionService.getUnarchivedDatasetVersions(); if (dsl != null) { if (listonly) { From 00f115e23e50f8d70338256fbd34d8270a9900a1 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 25 Nov 2025 14:55:51 -0500 Subject: [PATCH 43/97] support reflective and instance calls re: delete capability --- .../java/edu/harvard/iq/dataverse/DatasetPage.java | 2 +- .../impl/AbstractSubmitToArchiveCommand.java | 14 +++++++++++--- .../impl/GoogleCloudSubmitToArchiveCommand.java | 5 ++++- 3 files changed, 16 insertions(+), 5 
deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 0832560eafb..09669fb789e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -6119,7 +6119,7 @@ public void archiveVersion(Long id, boolean force) { if (cmd != null) { try { String status = dv.getArchivalCopyLocationStatus(); - if(status == null || (force && cmd.supportsDelete())){ + if(status == null || (force && cmd.canDelete())){ // Set initial pending status dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index f7716534b7f..aaeef193ff4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -185,7 +185,15 @@ public static boolean isSingleVersion(SettingsServiceBean settingsService) { return false; } - public static boolean supportsDelete() { - return false; - } + /** Whether the archiver can delete existing archival files (and thus can retry when the existing files are incomplete/obsolete) + * A static version supports calls via reflection while the instance method supports inheritance for use on actual command instances (see DatasetPage for both use cases). 
+ * @return + */ + public static boolean supportsDelete() { + return false; + } + + public boolean canDelete() { + return supportsDelete(); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 97ca104f01c..61a38cffc99 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -44,10 +44,13 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi super(aRequest, version); } - @Override public static boolean supportsDelete() { return true; } + @Override + public boolean canDelete() { + return supportsDelete(); + } @Override public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { From bc403703ab672d1ac30ba16d928a3eaa1de87214 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 10 Dec 2025 16:14:30 -0500 Subject: [PATCH 44/97] use query to update status, async everywhere --- .../java/edu/harvard/iq/dataverse/DatasetPage.java | 14 +++++--------- .../edu/harvard/iq/dataverse/api/Datasets.java | 10 +++------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 09669fb789e..db9e9caa671 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -3014,21 +3014,18 @@ public String updateCurrentVersion() { * pulled this out as a separate submit(). 
*/ try { - updateVersion = commandEngine.submit(archiveCommand); - if (!updateVersion.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.success"); - } else { - errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure"); - } + commandEngine.submitAsync(archiveCommand); + JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.inprogress")); } catch (CommandException ex) { errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); logger.severe(ex.getMessage()); } } else if(status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { + //Not automatically replacing the old archival copy as creating it is expensive JsonObject archivalLocation = JsonUtil.getJsonObject(updateVersion.getArchivalCopyLocation()); JsonObjectBuilder job = Json.createObjectBuilder(archivalLocation); job.add(DatasetVersion.ARCHIVAL_STATUS,DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); - updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + datasetVersionService.setArchivalCopyLocation(updateVersion, JsonUtil.prettyPrint(job.build())); datasetVersionService.merge(updateVersion); } } @@ -6122,8 +6119,7 @@ public void archiveVersion(Long id, boolean force) { if(status == null || (force && cmd.canDelete())){ // Set initial pending status - dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); - datasetVersionService.persistArchivalCopyLocation(dv); + datasetVersionService.setArchivalCopyLocation(dv, DatasetVersion.ARCHIVAL_STATUS_PENDING); commandEngine.submitAsync(cmd); diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index c8e66115575..bf0f7c6668a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ 
-1279,7 +1279,7 @@ public Response publishDataset(@Context ContainerRequestContext crc, @PathParam( AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, createDataverseRequest(user), updateVersion); if (archiveCommand != null) { // Delete the record of any existing copy since it is now out of date/incorrect - updateVersion.setArchivalCopyLocation(null); + datasetVersionSvc.setArchivalCopyLocation(updateVersion, null); datasetVersionSvc.persistArchivalCopyLocation(updateVersion); /* * Then try to generate and submit an archival copy. Note that running this @@ -1291,12 +1291,8 @@ public Response publishDataset(@Context ContainerRequestContext crc, @PathParam( * pulled this out as a separate submit(). */ try { - updateVersion = commandEngine.submit(archiveCommand); - if (!updateVersion.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.success"); - } else { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure"); - } + commandEngine.submitAsync(archiveCommand); + successMsg = BundleUtil.getStringFromBundle("datasetversion.archive.inprogress"); } catch (CommandException ex) { successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); logger.severe(ex.getMessage()); From df9b5cec3c83ec066dc274d35edea9ee9f9e98a6 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 12 Dec 2025 18:23:56 -0500 Subject: [PATCH 45/97] fixes for dataset page re: archiving --- src/main/webapp/dataset-versions.xhtml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/webapp/dataset-versions.xhtml b/src/main/webapp/dataset-versions.xhtml index 1f33675bd3d..89a8162c135 100644 --- a/src/main/webapp/dataset-versions.xhtml +++ b/src/main/webapp/dataset-versions.xhtml @@ -170,14 +170,14 @@ - + - From a64e1f749c2f44c14b4386e1c22195e1c65d8ea8 
Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 16 Jan 2026 13:33:19 -0500 Subject: [PATCH 46/97] merge issues --- src/main/java/edu/harvard/iq/dataverse/DatasetPage.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index db9e9caa671..4b559af3878 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -6101,10 +6101,7 @@ public void refreshPaginator() { /** * This method can be called from *.xhtml files to allow archiving of a dataset - * version from the user interface. It is not currently (11/18) used in the IQSS/develop - * branch, but is used by QDR and is kept here in anticipation of including a - * GUI option to archive (already published) versions after other dataset page - * changes have been completed. + * version from the user interface. * * @param id - the id of the datasetversion to archive. 
*/ From c55230ee81481b465323b16800e98679fe5fa36c Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 21 Jan 2026 17:38:37 -0500 Subject: [PATCH 47/97] merge fix of persistArchivalCopy method refactors --- .../edu/harvard/iq/dataverse/DatasetPage.java | 30 +++++++++---------- .../harvard/iq/dataverse/DatasetVersion.java | 24 ++++++++++----- .../harvard/iq/dataverse/api/Datasets.java | 2 +- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 4b559af3878..fe17a137361 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -3022,10 +3022,8 @@ public String updateCurrentVersion() { } } else if(status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { //Not automatically replacing the old archival copy as creating it is expensive - JsonObject archivalLocation = JsonUtil.getJsonObject(updateVersion.getArchivalCopyLocation()); - JsonObjectBuilder job = Json.createObjectBuilder(archivalLocation); - job.add(DatasetVersion.ARCHIVAL_STATUS,DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); - datasetVersionService.setArchivalCopyLocation(updateVersion, JsonUtil.prettyPrint(job.build())); + updateVersion.setArchivalStatus(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); + datasetVersionService.persistArchivalCopyLocation(updateVersion); datasetVersionService.merge(updateVersion); } } @@ -6113,18 +6111,18 @@ public void archiveVersion(Long id, boolean force) { if (cmd != null) { try { String status = dv.getArchivalCopyLocationStatus(); - if(status == null || (force && cmd.canDelete())){ - - // Set initial pending status - datasetVersionService.setArchivalCopyLocation(dv, DatasetVersion.ARCHIVAL_STATUS_PENDING); - - commandEngine.submitAsync(cmd); - - logger.info( - "DatasetVersion id=" + dv.getId() + " submitted to Archive, status: " + dv.getArchivalCopyLocationStatus()); - 
setVersionTabList(resetVersionTabList()); - this.setVersionTabListForPostLoad(getVersionTabList()); - JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.inprogress")); + if (status == null || (force && cmd.canDelete())) { + + // Set initial pending status + dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); + datasetVersionService.persistArchivalCopyLocation(dv); + commandEngine.submitAsync(cmd); + + logger.info( + "DatasetVersion id=" + dv.getId() + " submitted to Archive, status: " + dv.getArchivalCopyLocationStatus()); + setVersionTabList(resetVersionTabList()); + this.setVersionTabListForPostLoad(getVersionTabList()); + JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.inprogress")); } } catch (CommandException ex) { logger.log(Level.SEVERE, "Unexpected Exception calling submit archive command", ex); diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 0de0dedc860..1248a8266ab 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -232,8 +232,9 @@ public enum VersionState { @Transient private DatasetVersionDifference dvd; + //The Json version of the archivalCopyLocation string @Transient - private JsonObject archivalStatus; + private JsonObject archivalCopyLocationJson; public Long getId() { return this.id; @@ -384,24 +385,24 @@ public String getArchivalCopyLocation() { public String getArchivalCopyLocationStatus() { populateArchivalStatus(false); - if(archivalStatus!=null) { - return archivalStatus.getString(ARCHIVAL_STATUS); + if(archivalCopyLocationJson!=null) { + return archivalCopyLocationJson.getString(ARCHIVAL_STATUS); } return null; } public String getArchivalCopyLocationMessage() { populateArchivalStatus(false); - if(archivalStatus!=null) { - return archivalStatus.getString(ARCHIVAL_STATUS_MESSAGE); + 
if(archivalCopyLocationJson!=null) { + return archivalCopyLocationJson.getString(ARCHIVAL_STATUS_MESSAGE); } return null; } private void populateArchivalStatus(boolean force) { - if(archivalStatus ==null || force) { + if(archivalCopyLocationJson ==null || force) { if(archivalCopyLocation!=null) { try { - archivalStatus = JsonUtil.getJsonObject(archivalCopyLocation); + archivalCopyLocationJson = JsonUtil.getJsonObject(archivalCopyLocation); } catch(Exception e) { logger.warning("DatasetVersion id: " + id + "has a non-JsonObject value, parsing error: " + e.getMessage()); logger.fine(archivalCopyLocation); @@ -415,6 +416,15 @@ public void setArchivalCopyLocation(String location) { populateArchivalStatus(true); } + // COnvenience method to set only the status + public void setArchivalStatus(String status) { + populateArchivalStatus(false); + JsonObjectBuilder job = Json.createObjectBuilder(archivalCopyLocationJson); + job.add(DatasetVersion.ARCHIVAL_STATUS, status); + archivalCopyLocationJson = job.build(); + archivalCopyLocation = JsonUtil.prettyPrint(archivalCopyLocationJson); + } + public String getDeaccessionLink() { return deaccessionLink; } diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index bf0f7c6668a..dba4b36d4da 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -1279,7 +1279,7 @@ public Response publishDataset(@Context ContainerRequestContext crc, @PathParam( AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, createDataverseRequest(user), updateVersion); if (archiveCommand != null) { // Delete the record of any existing copy since it is now out of date/incorrect - datasetVersionSvc.setArchivalCopyLocation(updateVersion, null); + updateVersion.setArchivalCopyLocation(null); datasetVersionSvc.persistArchivalCopyLocation(updateVersion); /* * Then 
try to generate and submit an archival copy. Note that running this From 905570a81563b8428042398ac1778fd4d380b61d Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 22 Jan 2026 12:57:38 -0500 Subject: [PATCH 48/97] add flag, docs --- doc/sphinx-guides/source/installation/config.rst | 10 ++++++++++ .../java/edu/harvard/iq/dataverse/DatasetPage.java | 3 ++- .../harvard/iq/dataverse/settings/FeatureFlags.java | 13 +++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a9d5c7c0041..68982881d77 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2263,6 +2263,9 @@ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, Local All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). +Archival Bags are created per dataset version. By default, if a version is republished (via the superuser-only 'Update Current Version' publication option in the UI/API), a new archival bag is not created for the version. +If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`:feature.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. + .. _Duracloud Configuration: Duracloud Configuration @@ -4031,6 +4034,13 @@ dataverse.feature.only-update-datacite-when-needed Only contact DataCite to update a DOI after checking to see if DataCite has outdated information (for efficiency, lighter load on DataCite, especially when using file DOIs). +.. 
_dataverse.feature.archive-on-version-update: + +dataverse.feature.archive-on-version-update ++++++++++++++++++++++++++++++++++++++++++++ + +Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, +i.e via the Update-Current-Version publication option. Setting the flag true only works if the archiver being used supports deleting existing archival bags. diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index fe17a137361..a091005b392 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -164,6 +164,7 @@ import edu.harvard.iq.dataverse.search.SearchFields; import edu.harvard.iq.dataverse.search.SearchUtil; import edu.harvard.iq.dataverse.search.SolrClientService; +import edu.harvard.iq.dataverse.settings.FeatureFlags; import edu.harvard.iq.dataverse.settings.JvmSettings; import edu.harvard.iq.dataverse.util.SignpostingResources; import edu.harvard.iq.dataverse.util.FileMetadataUtil; @@ -3001,7 +3002,7 @@ public String updateCurrentVersion() { // If pending or an obsolete copy exists, do nothing (nominally if a pending run succeeds and we're updating the current version here, it should be marked as obsolete - ignoring for now since updates within the time an archiving run is pending should be rare // If a failure or null, rerun archiving now. 
If a failure is due to an exiting copy in the repo, we'll fail again String status = updateVersion.getArchivalCopyLocationStatus(); - if((status==null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)){ + if((status==null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE) || (FeatureFlags.ARCHIVE_ON_VERSION_UPDATE.enabled() && archiveCommand.canDelete())){ // Delete the record of any existing copy since it is now out of date/incorrect updateVersion.setArchivalCopyLocation(null); /* diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java index 2e86fae610e..fdbdb257dbe 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java @@ -249,6 +249,19 @@ public enum FeatureFlags { * @since Dataverse 6.9 */ ONLY_UPDATE_DATACITE_WHEN_NEEDED("only-update-datacite-when-needed"), + /** + * Indicates whether archival bag creation should be triggered (if configured) when a version + * is updated and was already successfully archived, i.e via the Update-Current-Version publication option. + * Since archiving can be resource intensive, it may not be worthwhile to automatically re-archive for the + * types of minor changes "Update-Current-Version" is intended for. Note that this flag is only effective + * for archivers that support deletion of existing files. When the flag is false, or the archiver cannot + * delete, the existing archival status will be changed to "Obsolete". 
+ * + * * @apiNote Raise flag by setting "dataverse.feature.archive-on-version-update" + * + * @since Dataverse 6.10 + */ + ARCHIVE_ON_VERSION_UPDATE("archive-on-version-update"), ; From 521fbf68f2d6ba72b06343c32cf6154b027c899f Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 22 Jan 2026 15:01:50 -0500 Subject: [PATCH 49/97] add delete to local and S3 --- .../impl/LocalSubmitToArchiveCommand.java | 49 +++++++++- .../impl/S3SubmitToArchiveCommand.java | 94 ++++++++++++++++--- 2 files changed, 129 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 462879f2ec9..34fadbed703 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -34,6 +34,14 @@ public class LocalSubmitToArchiveCommand extends AbstractSubmitToArchiveCommand public LocalSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version); } + + public static boolean supportsDelete() { + return true; + } + @Override + public boolean canDelete() { + return supportsDelete(); + } @Override public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, @@ -57,15 +65,52 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') .replace('.', '-').toLowerCase(); + // Define file paths + String dataciteFileName = localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"; + zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; + + // Check for and delete existing files for this version + logger.fine("Checking for existing files in archive..."); + + File 
existingDatacite = new File(dataciteFileName); + if (existingDatacite.exists()) { + logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); + if (existingDatacite.delete()) { + logger.fine("Deleted existing datacite.xml"); + } else { + logger.warning("Failed to delete existing datacite.xml: " + dataciteFileName); + } + } + + File existingBag = new File(zipName); + if (existingBag.exists()) { + logger.fine("Found existing bag file, deleting: " + zipName); + if (existingBag.delete()) { + logger.fine("Deleted existing bag file"); + } else { + logger.warning("Failed to delete existing bag file: " + zipName); + } + } + + // Also check for and delete the .partial file if it exists + File existingPartial = new File(zipName + ".partial"); + if (existingPartial.exists()) { + logger.fine("Found existing partial bag file, deleting: " + zipName + ".partial"); + if (existingPartial.delete()) { + logger.fine("Deleted existing partial bag file"); + } else { + logger.warning("Failed to delete existing partial bag file: " + zipName + ".partial"); + } + } + String dataciteXml = getDataCiteXml(dv); FileUtils.writeStringToFile( - new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), + new File(dataciteFileName), dataciteXml, StandardCharsets.UTF_8); BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); bagger.setNumConnections(getNumberOfBagGeneratorThreads()); bagger.setAuthenticationKey(token.getTokenString()); - zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; //ToDo: generateBag(File f, true) seems to do the same thing (with a .tmp extension) - since we don't have to use a stream here, could probably just reuse the existing code? 
bagger.generateBag(new FileOutputStream(zipName + ".partial")); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index 65531d775c8..768d5d03e1d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -14,9 +14,7 @@ import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; -import java.io.ByteArrayInputStream; import java.io.File; -import java.io.FileInputStream; import java.nio.charset.StandardCharsets; import java.util.Map; import java.util.concurrent.CompletableFuture; @@ -38,18 +36,15 @@ import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider; import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; import software.amazon.awssdk.core.async.AsyncRequestBody; -import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3AsyncClient; import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.GetObjectAttributesRequest; -import software.amazon.awssdk.services.s3.model.GetObjectAttributesResponse; -import software.amazon.awssdk.services.s3.model.ObjectAttributes; +import software.amazon.awssdk.services.s3.model.DeleteObjectRequest; +import software.amazon.awssdk.services.s3.model.DeleteObjectResponse; +import software.amazon.awssdk.services.s3.model.HeadObjectRequest; +import software.amazon.awssdk.services.s3.model.NoSuchKeyException; import software.amazon.awssdk.services.s3.model.PutObjectRequest; import software.amazon.awssdk.services.s3.model.PutObjectResponse; -import 
software.amazon.awssdk.services.s3.S3ClientBuilder; -import software.amazon.awssdk.services.s3.S3Configuration; import software.amazon.awssdk.http.async.SdkAsyncHttpClient; import software.amazon.awssdk.http.nio.netty.NettyNioAsyncHttpClient; import software.amazon.awssdk.utils.StringUtils; @@ -76,6 +71,14 @@ public class S3SubmitToArchiveCommand extends AbstractSubmitToArchiveCommand { public S3SubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version); } + + public static boolean supportsDelete() { + return true; + } + @Override + public boolean canDelete() { + return supportsDelete(); + } @Override public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, @@ -105,10 +108,78 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t if (dataset.getLockFor(Reason.finalizePublication) == null) { spaceName = getSpaceName(dataset); - String dataciteXml = getDataCiteXml(dv); - // Add datacite.xml file + + // Define keys for datacite.xml and bag file String dcKey = spaceName + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; + String bagKey = spaceName + "/" + getFileName(spaceName, dv) + ".zip"; + + // Check for and delete existing files for this version + logger.fine("Checking for existing files in archive..."); + + try { + HeadObjectRequest headDcRequest = HeadObjectRequest.builder() + .bucket(bucketName) + .key(dcKey) + .build(); + + s3.headObject(headDcRequest).join(); + + // If we get here, the object exists, so delete it + logger.fine("Found existing datacite.xml, deleting: " + dcKey); + DeleteObjectRequest deleteDcRequest = DeleteObjectRequest.builder() + .bucket(bucketName) + .key(dcKey) + .build(); + + CompletableFuture deleteDcFuture = s3.deleteObject(deleteDcRequest); + DeleteObjectResponse deleteDcResponse = deleteDcFuture.join(); + + if (deleteDcResponse.sdkHttpResponse().isSuccessful()) { + logger.fine("Deleted existing datacite.xml"); + } else { + 
logger.warning("Failed to delete existing datacite.xml: " + dcKey); + } + } catch (Exception e) { + if (e.getCause() instanceof NoSuchKeyException) { + logger.fine("No existing datacite.xml found"); + } else { + logger.warning("Error checking/deleting existing datacite.xml: " + e.getMessage()); + } + } + try { + HeadObjectRequest headBagRequest = HeadObjectRequest.builder() + .bucket(bucketName) + .key(bagKey) + .build(); + + s3.headObject(headBagRequest).join(); + + // If we get here, the object exists, so delete it + logger.fine("Found existing bag file, deleting: " + bagKey); + DeleteObjectRequest deleteBagRequest = DeleteObjectRequest.builder() + .bucket(bucketName) + .key(bagKey) + .build(); + + CompletableFuture deleteBagFuture = s3.deleteObject(deleteBagRequest); + DeleteObjectResponse deleteBagResponse = deleteBagFuture.join(); + + if (deleteBagResponse.sdkHttpResponse().isSuccessful()) { + logger.fine("Deleted existing bag file"); + } else { + logger.warning("Failed to delete existing bag file: " + bagKey); + } + } catch (Exception e) { + if (e.getCause() instanceof NoSuchKeyException) { + logger.fine("No existing bag file found"); + } else { + logger.warning("Error checking/deleting existing bag file: " + e.getMessage()); + } + } + + String dataciteXml = getDataCiteXml(dv); + // Add datacite.xml file PutObjectRequest putRequest = PutObjectRequest.builder() .bucket(bucketName) .key(dcKey) @@ -128,7 +199,6 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t // Store BagIt file String fileName = getFileName(spaceName, dv); - String bagKey = spaceName + "/" + fileName + ".zip"; // Add BagIt ZIP file // Google uses MD5 as one way to verify the // transfer From ba04ba2455529ed7f8f5bba5cf5818fc255f364e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 27 Jan 2026 16:50:42 -0500 Subject: [PATCH 50/97] fix doc ref --- doc/sphinx-guides/source/installation/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 68982881d77..d0b4eac6ab2 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2264,7 +2264,7 @@ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, Local All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). Archival Bags are created per dataset version. By default, if a version is republished (via the superuser-only 'Update Current Version' publication option in the UI/API), a new archival bag is not created for the version. -If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`:feature.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. +If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`:dataverse.feature.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. .. 
_Duracloud Configuration: From 7a186693a02683b752f898b18eb425d3ea84134d Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 27 Jan 2026 17:11:32 -0500 Subject: [PATCH 51/97] remove errant : char --- doc/sphinx-guides/source/installation/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index d0b4eac6ab2..d6cea5b16e3 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2264,7 +2264,7 @@ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, Local All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). Archival Bags are created per dataset version. By default, if a version is republished (via the superuser-only 'Update Current Version' publication option in the UI/API), a new archival bag is not created for the version. -If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`:dataverse.feature.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. +If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`dataverse.feature.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. .. 
_Duracloud Configuration: From ae91b78dbbec09899c9040730567e978698d406c Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 23 Jan 2026 15:05:21 -0500 Subject: [PATCH 52/97] no transaction time limit during bagging from command (not workflow) --- .../impl/AbstractSubmitToArchiveCommand.java | 63 +++- .../impl/DRSSubmitToArchiveCommand.java | 78 ++++- .../impl/DuraCloudSubmitToArchiveCommand.java | 295 +++++++++--------- .../GoogleCloudSubmitToArchiveCommand.java | 215 +++++++------ .../impl/LocalSubmitToArchiveCommand.java | 129 ++++---- .../impl/S3SubmitToArchiveCommand.java | 219 ++++++------- .../iq/dataverse/util/bagit/BagGenerator.java | 32 +- .../workflow/WorkflowServiceBean.java | 2 +- .../ArchivalSubmissionWorkflowStep.java | 40 ++- 9 files changed, 600 insertions(+), 473 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index aaeef193ff4..ffa79456902 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -2,8 +2,9 @@ import edu.harvard.iq.dataverse.DataCitation; import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetFieldConstant; +import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.SettingsWrapper; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; @@ -17,7 +18,11 @@ import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import 
edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; +import jakarta.json.JsonObject; import java.io.IOException; import java.io.PipedInputStream; @@ -30,8 +35,8 @@ @RequiredPermissions(Permission.PublishDataset) public abstract class AbstractSubmitToArchiveCommand extends AbstractCommand { - private final DatasetVersion version; - private final Map requestedSettings = new HashMap(); + protected final DatasetVersion version; + protected final Map requestedSettings = new HashMap(); protected boolean success=false; private static final Logger logger = Logger.getLogger(AbstractSubmitToArchiveCommand.class.getName()); private static final int MAX_ZIP_WAIT = 20000; @@ -43,8 +48,16 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion } @Override + @TransactionAttribute(TransactionAttributeType.REQUIRED) public DatasetVersion execute(CommandContext ctxt) throws CommandException { + // Check for locks while we're still in a transaction + Dataset dataset = version.getDataset(); + if (dataset.getLockFor(Reason.finalizePublication) != null + || dataset.getLockFor(Reason.FileValidationFailed) != null) { + throw new CommandException("Dataset is locked and cannot be archived", this); + } + String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); String[] settingsArray = settings.split(","); for (String setting : settingsArray) { @@ -62,11 +75,40 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { //No un-expired token token = ctxt.authentication().generateApiTokenForUser(user); } - performArchiveSubmission(version, token, requestedSettings); - ctxt.datasetVersion().persistArchivalCopyLocation(version); + String dataCiteXml = getDataCiteXml(version); + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + Map terms = getJsonLDTerms(oreMap); + 
performArchivingAndPersist(ctxt, version, dataCiteXml, ore, terms, token, requestedSettings); return version; } + // While we have a transaction context, get the terms needed to create the baginfo file + public Map getJsonLDTerms(OREMap oreMap) { + Map terms = new HashMap(); + terms.put(DatasetFieldConstant.datasetContact, oreMap.getContactTerm()); + terms.put(DatasetFieldConstant.datasetContactName, oreMap.getContactNameTerm()); + terms.put(DatasetFieldConstant.datasetContactEmail, oreMap.getContactEmailTerm()); + terms.put(DatasetFieldConstant.description, oreMap.getDescriptionTerm()); + terms.put(DatasetFieldConstant.descriptionText, oreMap.getDescriptionTextTerm()); + + return terms; + } + + @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) + public WorkflowStepResult performArchivingAndPersist(CommandContext ctxt, DatasetVersion version, String dataCiteXml, JsonObject ore, Map terms, ApiToken token, Map requestedSetttings) { + // This runs OUTSIDE any transaction + BagGenerator.setNumConnections(getNumberOfBagGeneratorThreads()); + WorkflowStepResult wfsr = performArchiveSubmission(version, dataCiteXml, ore, terms, token, requestedSettings); + persistResult(ctxt, version); + return wfsr; + } + + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + private void persistResult(CommandContext ctxt, DatasetVersion versionWithStatus) { + // New transaction just for this quick operation + ctxt.datasetVersion().persistArchivalCopyLocation(versionWithStatus); + } /** * This method is the only one that should be overwritten by other classes. Note * that this method may be called from the execute method above OR from a @@ -75,10 +117,13 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { * constructor and could be dropped from the parameter list.) 
* * @param version - the DatasetVersion to archive + * @param ore + * @param dataCiteXml + * @param terms * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). */ - abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, Map requestedSetttings); + abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, String dataCiteXml, JsonObject ore, Map terms, ApiToken token, Map requestedSetttings); protected int getNumberOfBagGeneratorThreads() { if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) { @@ -98,7 +143,7 @@ public String describe() { + version.getFriendlyVersionNumber()+")]"; } - String getDataCiteXml(DatasetVersion dv) { + public String getDataCiteXml(DatasetVersion dv) { DataCitation dc = new DataCitation(dv); Map metadata = dc.getDataCiteMetadata(); return DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, @@ -106,13 +151,13 @@ String getDataCiteXml(DatasetVersion dv) { } public Thread startBagThread(DatasetVersion dv, PipedInputStream in, DigestInputStream digestInputStream2, - String dataciteXml, ApiToken token) throws IOException, InterruptedException { + String dataciteXml, JsonObject ore, Map terms, ApiToken token) throws IOException, InterruptedException { Thread bagThread = new Thread(new Runnable() { public void run() { try (PipedOutputStream out = new PipedOutputStream(in)) { // Generate bag - BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); bagger.setNumConnections(getNumberOfBagGeneratorThreads()); + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); bagger.setAuthenticationKey(token.getTokenString()); bagger.generateBag(out); success = true; diff --git 
a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java index 78e8454255b..81bcbc25dda 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java @@ -4,13 +4,19 @@ import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.SettingsWrapper; +import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; +import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; import edu.harvard.iq.dataverse.branding.BrandingUtil; import edu.harvard.iq.dataverse.engine.command.Command; +import edu.harvard.iq.dataverse.engine.command.CommandContext; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -34,6 +40,8 @@ import java.util.Set; import java.util.logging.Logger; +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; import jakarta.json.Json; import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; @@ -77,13 +85,73 @@ public class DRSSubmitToArchiveCommand extends S3SubmitToArchiveCommand implemen private static final String TRUST_CERT = "trust_cert"; private static final String TIMEOUT = 
"timeout"; + private String archivableAncestorAlias; + public DRSSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version); } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + @TransactionAttribute(TransactionAttributeType.REQUIRED) + public DatasetVersion execute(CommandContext ctxt) throws CommandException { + + + // Check for locks while we're still in a transaction + Dataset dataset = version.getDataset(); + if (dataset.getLockFor(Reason.finalizePublication) != null + || dataset.getLockFor(Reason.FileValidationFailed) != null) { + throw new CommandException("Dataset is locked and cannot be archived", this); + } + + String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); + String[] settingsArray = settings.split(","); + for (String setting : settingsArray) { + setting = setting.trim(); + if (!setting.startsWith(":")) { + logger.warning("Invalid Archiver Setting: " + setting); + } else { + requestedSettings.put(setting, ctxt.settings().get(setting)); + } + } + + // Compute archivable ancestor while we're in a transaction and entities are managed + JsonObject drsConfigObject = null; + try { + drsConfigObject = JsonUtil.getJsonObject(requestedSettings.get(DRS_CONFIG)); + } catch (Exception e) { + logger.warning("Unable to parse " + DRS_CONFIG + " setting as a Json object"); + } + + if (drsConfigObject != null) { + JsonObject adminMetadata = drsConfigObject.getJsonObject(ADMIN_METADATA); + if (adminMetadata != null) { + JsonObject collectionsObj = adminMetadata.getJsonObject(COLLECTIONS); + if (collectionsObj != null) { + Set collections = collectionsObj.keySet(); + Dataverse ancestor = dataset.getOwner(); + // Compute this while entities are still managed + archivableAncestorAlias = getArchivableAncestor(ancestor, collections); + } + } + } + + AuthenticatedUser user = getRequest().getAuthenticatedUser(); + 
ApiToken token = ctxt.authentication().findApiTokenByUser(user); + if (token == null) { + //No un-expired token + token = ctxt.authentication().generateApiTokenForUser(user); + } + String dataCiteXml = getDataCiteXml(version); + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + Map terms = getJsonLDTerms(oreMap); + performArchivingAndPersist(ctxt, version, dataCiteXml, ore, terms, token, requestedSettings); + return version; + } + + @Override + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { logger.fine("In DRSSubmitToArchiveCommand..."); JsonObject drsConfigObject = null; @@ -97,7 +165,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t Set collections = adminMetadata.getJsonObject(COLLECTIONS).keySet(); Dataset dataset = dv.getDataset(); Dataverse ancestor = dataset.getOwner(); - String alias = getArchivableAncestor(ancestor, collections); + String alias = archivableAncestorAlias; // Use the pre-computed alias instead of calling getArchivableAncestor again String spaceName = getSpaceName(dataset); String packageId = getFileName(spaceName, dv); @@ -113,7 +181,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t JsonObject collectionConfig = adminMetadata.getJsonObject(COLLECTIONS).getJsonObject(alias); - WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token, requestedSettings); + WorkflowStepResult s3Result = super.performArchiveSubmission(dv, dataciteXml, ore, terms, token, requestedSettings); JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); @@ -242,7 +310,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t logger.severe("DRS Ingest Failed for: " + packageId + " - response does not include status and 
message"); return new Failure( - "DRS Archiver fail in Ingest call \" - response does not include status and message"); + "DRS Archiver fail in Ingest call - response does not include status and message"); } } else { logger.severe("DRS Ingest Failed for: " + packageId + " with status code: " + code); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index fe4a25091d7..b1fa777478b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -2,7 +2,6 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; @@ -10,6 +9,8 @@ import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudContext; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudHost; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudPort; + +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -49,8 +50,8 @@ public DuraCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, jakarta.json.JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { String port = 
requestedSettings.get(DURACLOUD_PORT) != null ? requestedSettings.get(DURACLOUD_PORT) : DEFAULT_PORT; @@ -64,173 +65,165 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t // This will make the archivalCopyLocation non-null after a failure which should // stop retries - if (dataset.getLockFor(Reason.finalizePublication) == null - && dataset.getLockFor(Reason.FileValidationFailed) == null) { - // Use Duracloud client classes to login - ContentStoreManager storeManager = new ContentStoreManagerImpl(host, port, dpnContext); - Credential credential = new Credential(System.getProperty("duracloud.username"), - System.getProperty("duracloud.password")); - storeManager.login(credential); + // Use Duracloud client classes to login + ContentStoreManager storeManager = new ContentStoreManagerImpl(host, port, dpnContext); + Credential credential = new Credential(System.getProperty("duracloud.username"), + System.getProperty("duracloud.password")); + storeManager.login(credential); + /* + * Aliases can contain upper case characters which are not allowed in space + * names. Similarly, aliases can contain '_' which isn't allowed in a space + * name. The line below replaces any upper case chars with lowercase and + * replaces any '_' with '.-' . The '-' after the dot assures we don't break the + * rule that + * "The last period in a aspace may not immediately be followed by a number". + * (Although we could check, it seems better to just add '.-' all the time.As + * written the replaceAll will also change any chars not valid in a spaceName to + * '.' which would avoid code breaking if the alias constraints change. That + * said, this line may map more than one alias to the same spaceName, e.g. + * "test" and "Test" aliases both map to the "test" space name. This does not + * break anything but does potentially put bags from more than one collection in + * the same space. 
+ */ + String spaceName = dataset.getOwner().getAlias().toLowerCase().replaceAll("[^a-z0-9-]", ".dcsafe"); + String baseFileName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') + .replace('.', '-').toLowerCase() + "_v" + dv.getFriendlyVersionNumber(); + + ContentStore store; + //Set a failure status that will be updated if we succeed + JsonObjectBuilder statusObject = Json.createObjectBuilder(); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); + + try { /* - * Aliases can contain upper case characters which are not allowed in space - * names. Similarly, aliases can contain '_' which isn't allowed in a space - * name. The line below replaces any upper case chars with lowercase and - * replaces any '_' with '.-' . The '-' after the dot assures we don't break the - * rule that - * "The last period in a aspace may not immediately be followed by a number". - * (Although we could check, it seems better to just add '.-' all the time.As - * written the replaceAll will also change any chars not valid in a spaceName to - * '.' which would avoid code breaking if the alias constraints change. That - * said, this line may map more than one alias to the same spaceName, e.g. - * "test" and "Test" aliases both map to the "test" space name. This does not - * break anything but does potentially put bags from more than one collection in - * the same space. 
+ * If there is a failure in creating a space, it is likely that a prior version + * has not been fully processed (snapshot created, archiving completed and files + * and space deleted - currently manual operations done at the project's + * duracloud website) */ - String spaceName = dataset.getOwner().getAlias().toLowerCase().replaceAll("[^a-z0-9-]", ".dcsafe"); - String baseFileName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') - .replace('.', '-').toLowerCase() + "_v" + dv.getFriendlyVersionNumber(); - - ContentStore store; - //Set a failure status that will be updated if we succeed - JsonObjectBuilder statusObject = Json.createObjectBuilder(); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - - try { - /* - * If there is a failure in creating a space, it is likely that a prior version - * has not been fully processed (snapshot created, archiving completed and files - * and space deleted - currently manual operations done at the project's - * duracloud website) - */ - store = storeManager.getPrimaryContentStore(); - // Create space to copy archival files to - if (!store.spaceExists(spaceName)) { - store.createSpace(spaceName); - } - String dataciteXml = getDataCiteXml(dv); - - MessageDigest messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream dataciteIn = new PipedInputStream(); - DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { - // Add datacite.xml file - - Thread dcThread = new Thread(new Runnable() { - public void run() { - try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { - - dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); - dataciteOut.close(); - success=true; - } catch (Exception e) { - logger.severe("Error creating datacite.xml: " + e.getMessage()); - // TODO Auto-generated catch block - 
e.printStackTrace(); - } + store = storeManager.getPrimaryContentStore(); + // Create space to copy archival files to + if (!store.spaceExists(spaceName)) { + store.createSpace(spaceName); + } + + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream dataciteIn = new PipedInputStream(); + DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { + // Add datacite.xml file + + Thread dcThread = new Thread(new Runnable() { + public void run() { + try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { + + dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); + dataciteOut.close(); + success=true; + } catch (Exception e) { + logger.severe("Error creating datacite.xml: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); } - }); - dcThread.start(); - // Have seen Pipe Closed errors for other archivers when used as a workflow - // without this delay loop - int i = 0; - while (digestInputStream.available() <= 0 && i < 100) { - Thread.sleep(10); - i++; } - String checksum = store.addContent(spaceName, baseFileName + "_datacite.xml", digestInputStream, - -1l, null, null, null); - logger.fine("Content: datacite.xml added with checksum: " + checksum); - dcThread.join(); - String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + }); + dcThread.start(); + // Have seen Pipe Closed errors for other archivers when used as a workflow + // without this delay loop + int i = 0; + while (digestInputStream.available() <= 0 && i < 100) { + Thread.sleep(10); + i++; + } + String checksum = store.addContent(spaceName, baseFileName + "_datacite.xml", digestInputStream, + -1l, null, null, null); + logger.fine("Content: datacite.xml added with checksum: " + checksum); + dcThread.join(); + String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + if (!success || !checksum.equals(localchecksum)) { + 
logger.severe("Failure on " + baseFileName); + logger.severe(success ? checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); + try { + store.deleteContent(spaceName, baseFileName + "_datacite.xml"); + } catch (ContentStoreException cse) { + logger.warning(cse.getMessage()); + } + return new Failure("Error in transferring DataCite.xml file to DuraCloud", + "DuraCloud Submission Failure: incomplete metadata transfer"); + } + + // Store BagIt file + success = false; + String fileName = baseFileName + ".zip"; + + // Add BagIt ZIP file + // Although DuraCloud uses SHA-256 internally, it's API uses MD5 to verify the + // transfer + + messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream in = new PipedInputStream(100000); + DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { + Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, ore, terms, token); + checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null); + bagThread.join(); + if (success) { + logger.fine("Content: " + fileName + " added with checksum: " + checksum); + localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); + } if (!success || !checksum.equals(localchecksum)) { - logger.severe("Failure on " + baseFileName); + logger.severe("Failure on " + fileName); logger.severe(success ? 
checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); try { + store.deleteContent(spaceName, fileName); store.deleteContent(spaceName, baseFileName + "_datacite.xml"); } catch (ContentStoreException cse) { logger.warning(cse.getMessage()); } - return new Failure("Error in transferring DataCite.xml file to DuraCloud", - "DuraCloud Submission Failure: incomplete metadata transfer"); - } - - // Store BagIt file - success = false; - String fileName = baseFileName + ".zip"; - - // Add BagIt ZIP file - // Although DuraCloud uses SHA-256 internally, it's API uses MD5 to verify the - // transfer - - messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null); - bagThread.join(); - if (success) { - logger.fine("Content: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); - } - if (!success || !checksum.equals(localchecksum)) { - logger.severe("Failure on " + fileName); - logger.severe(success ? 
checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); - try { - store.deleteContent(spaceName, fileName); - store.deleteContent(spaceName, baseFileName + "_datacite.xml"); - } catch (ContentStoreException cse) { - logger.warning(cse.getMessage()); - } - return new Failure("Error in transferring Zip file to DuraCloud", - "DuraCloud Submission Failure: incomplete archive transfer"); - } + return new Failure("Error in transferring Zip file to DuraCloud", + "DuraCloud Submission Failure: incomplete archive transfer"); } + } - logger.fine("DuraCloud Submission step: Content Transferred"); + logger.fine("DuraCloud Submission step: Content Transferred"); - // Document the location of dataset archival copy location (actually the URL - // where you can - // view it as an admin) - StringBuffer sb = new StringBuffer("https://"); - sb.append(host); - if (!port.equals("443")) { - sb.append(":" + port); - } - sb.append("/duradmin/spaces/sm/"); - sb.append(store.getStoreId()); - sb.append("/" + spaceName + "/" + fileName); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); - - logger.fine("DuraCloud Submission step complete: " + sb.toString()); - } catch (ContentStoreException | IOException e) { - // TODO Auto-generated catch block - logger.warning(e.getMessage()); - e.printStackTrace(); - return new Failure("Error in transferring file to DuraCloud", - "DuraCloud Submission Failure: archive file not transferred"); - } catch (InterruptedException e) { - logger.warning(e.getLocalizedMessage()); - e.printStackTrace(); + // Document the location of dataset archival copy location (actually the URL + // where you can + // view it as an admin) + StringBuffer sb = new StringBuffer("https://"); + sb.append(host); + if (!port.equals("443")) { + sb.append(":" + port); } - } catch (ContentStoreException e) { + sb.append("/duradmin/spaces/sm/"); + 
sb.append(store.getStoreId()); + sb.append("/" + spaceName + "/" + fileName); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); + + logger.fine("DuraCloud Submission step complete: " + sb.toString()); + } catch (ContentStoreException | IOException e) { + // TODO Auto-generated catch block logger.warning(e.getMessage()); e.printStackTrace(); - String mesg = "DuraCloud Submission Failure"; - if (!(1 == dv.getVersion()) || !(0 == dv.getMinorVersionNumber())) { - mesg = mesg + ": Prior Version archiving not yet complete?"; - } - return new Failure("Unable to create DuraCloud space with name: " + baseFileName, mesg); - } catch (NoSuchAlgorithmException e) { - logger.severe("MD5 MessageDigest not available!"); + return new Failure("Error in transferring file to DuraCloud", + "DuraCloud Submission Failure: archive file not transferred"); + } catch (InterruptedException e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); } - finally { - dv.setArchivalCopyLocation(statusObject.build().toString()); + } catch (ContentStoreException e) { + logger.warning(e.getMessage()); + e.printStackTrace(); + String mesg = "DuraCloud Submission Failure"; + if (!(1 == dv.getVersion()) || !(0 == dv.getMinorVersionNumber())) { + mesg = mesg + ": Prior Version archiving not yet complete?"; } - } else { - logger.warning( - "DuraCloud Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed"); - return new Failure("Dataset locked"); + return new Failure("Unable to create DuraCloud space with name: " + baseFileName, mesg); + } catch (NoSuchAlgorithmException e) { + logger.severe("MD5 MessageDigest not available!"); + } + finally { + dv.setArchivalCopyLocation(statusObject.build().toString()); } return WorkflowStepResult.OK; } else { diff --git 
a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 61a38cffc99..f662de36792 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -7,7 +7,6 @@ import com.google.cloud.storage.StorageException; import com.google.cloud.storage.StorageOptions; import edu.harvard.iq.dataverse.Dataset; -import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; @@ -16,11 +15,15 @@ import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudBucket; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudProject; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; + import org.apache.commons.codec.binary.Hex; import jakarta.json.Json; +import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; import java.io.File; import java.io.FileInputStream; @@ -53,7 +56,7 @@ public boolean canDelete() { } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, Map terms, ApiToken token, Map requestedSettings) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); String projectName = 
requestedSettings.get(GOOGLECLOUD_PROJECT); @@ -76,135 +79,127 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t Bucket bucket = storage.get(bucketName); Dataset dataset = dv.getDataset(); - if (dataset.getLockFor(Reason.finalizePublication) == null) { - String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') - .replace('.', '-').toLowerCase(); + String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') + .replace('.', '-').toLowerCase(); - // Check for and delete existing files for this version - String dataciteFileName = spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml"; - String bagFileName = spaceName + "/" + spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + // Check for and delete existing files for this version + String dataciteFileName = spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml"; + String bagFileName = spaceName + "/" + spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; - logger.fine("Checking for existing files in archive..."); + logger.fine("Checking for existing files in archive..."); - try { - Blob existingDatacite = bucket.get(dataciteFileName); - if (existingDatacite != null && existingDatacite.exists()) { - logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); - existingDatacite.delete(); - logger.fine("Deleted existing datacite.xml"); - } - } catch (StorageException se) { - logger.warning("Error checking/deleting existing datacite.xml: " + se.getMessage()); + try { + Blob existingDatacite = bucket.get(dataciteFileName); + if (existingDatacite != null && existingDatacite.exists()) { + logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); + existingDatacite.delete(); + logger.fine("Deleted existing datacite.xml"); } + } catch (StorageException se) { + logger.warning("Error checking/deleting existing datacite.xml: " + se.getMessage()); + } - try { - Blob 
existingBag = bucket.get(bagFileName); - if (existingBag != null && existingBag.exists()) { - logger.fine("Found existing bag file, deleting: " + bagFileName); - existingBag.delete(); - logger.fine("Deleted existing bag file"); - } - } catch (StorageException se) { - logger.warning("Error checking/deleting existing bag file: " + se.getMessage()); + try { + Blob existingBag = bucket.get(bagFileName); + if (existingBag != null && existingBag.exists()) { + logger.fine("Found existing bag file, deleting: " + bagFileName); + existingBag.delete(); + logger.fine("Deleted existing bag file"); } + } catch (StorageException se) { + logger.warning("Error checking/deleting existing bag file: " + se.getMessage()); + } - String dataciteXml = getDataCiteXml(dv); - MessageDigest messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream dataciteIn = new PipedInputStream(); - DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { - // Add datacite.xml file - - Thread dcThread = new Thread(new Runnable() { - public void run() { - try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { - - dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); - dataciteOut.close(); - success = true; - } catch (Exception e) { - logger.severe("Error creating datacite.xml: " + e.getMessage()); - // TODO Auto-generated catch block - e.printStackTrace(); - // throw new RuntimeException("Error creating datacite.xml: " + e.getMessage()); - } + // Upload datacite.xml + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream dataciteIn = new PipedInputStream(); + DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { + // Add datacite.xml file + + Thread dcThread = new Thread(new Runnable() { + public void run() { + try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { + + dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); + 
dataciteOut.close(); + success = true; + } catch (Exception e) { + logger.severe("Error creating datacite.xml: " + e.getMessage()); + e.printStackTrace(); } - }); - dcThread.start(); - // Have seen Pipe Closed errors for other archivers when used as a workflow - // without this delay loop - int i = 0; - while (digestInputStream.available() <= 0 && i < 100) { - Thread.sleep(10); - i++; } - Blob dcXml = bucket.create(dataciteFileName, digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + }); + dcThread.start(); + // Have seen Pipe Closed errors for other archivers when used as a workflow + // without this delay loop + int i = 0; + while (digestInputStream.available() <= 0 && i < 100) { + Thread.sleep(10); + i++; + } + Blob dcXml = bucket.create(dataciteFileName, digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + + dcThread.join(); + String checksum = dcXml.getMd5ToHexString(); + logger.fine("Content: datacite.xml added with checksum: " + checksum); + String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + if (!success || !checksum.equals(localchecksum)) { + logger.severe("Failure on " + spaceName); + logger.severe(success ? 
checksum + " not equal to " + localchecksum : "datacite.xml transfer did not succeed"); + try { + dcXml.delete(Blob.BlobSourceOption.generationMatch()); + } catch (StorageException se) { + logger.warning(se.getMessage()); + } + return new Failure("Error in transferring DataCite.xml file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete metadata transfer"); + } - dcThread.join(); - String checksum = dcXml.getMd5ToHexString(); - logger.fine("Content: datacite.xml added with checksum: " + checksum); - String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + // Store BagIt file + success = false; + + // Add BagIt ZIP file + // Google uses MD5 as one way to verify the + // transfer + messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream in = new PipedInputStream(100000); + DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { + Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, ore, terms, token); + Blob bag = bucket.create(bagFileName, digestInputStream2, "application/zip", + Bucket.BlobWriteOption.doesNotExist()); + if (bag.getSize() == 0) { + throw new IOException("Empty Bag"); + } + bagThread.join(); + + checksum = bag.getMd5ToHexString(); + logger.fine("Bag: " + bagFileName + " added with checksum: " + checksum); + localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); if (!success || !checksum.equals(localchecksum)) { - logger.severe("Failure on " + spaceName); - logger.severe(success ? checksum + " not equal to " + localchecksum : "datacite.xml transfer did not succeed"); + logger.severe(success ? 
checksum + " not equal to " + localchecksum + : "bag transfer did not succeed"); try { - dcXml.delete(Blob.BlobSourceOption.generationMatch()); + bag.delete(Blob.BlobSourceOption.generationMatch()); } catch (StorageException se) { logger.warning(se.getMessage()); } - return new Failure("Error in transferring DataCite.xml file to GoogleCloud", - "GoogleCloud Submission Failure: incomplete metadata transfer"); + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete archive transfer"); } + } - // Store BagIt file - success = false; - String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; - - // Add BagIt ZIP file - // Google uses MD5 as one way to verify the - // transfer - messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - Blob bag = bucket.create(bagFileName, digestInputStream2, "application/zip", - Bucket.BlobWriteOption.doesNotExist()); - if (bag.getSize() == 0) { - throw new IOException("Empty Bag"); - } - bagThread.join(); - - checksum = bag.getMd5ToHexString(); - logger.fine("Bag: " + bagFileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); - if (!success || !checksum.equals(localchecksum)) { - logger.severe(success ? 
checksum + " not equal to " + localchecksum - : "bag transfer did not succeed"); - try { - bag.delete(Blob.BlobSourceOption.generationMatch()); - } catch (StorageException se) { - logger.warning(se.getMessage()); - } - return new Failure("Error in transferring Zip file to GoogleCloud", - "GoogleCloud Submission Failure: incomplete archive transfer"); - } - } + logger.fine("GoogleCloud Submission step: Content Transferred"); - logger.fine("GoogleCloud Submission step: Content Transferred"); + // Document the location of dataset archival copy location (actually the URL + // where you can view it as an admin) + // Changed to point at bucket where the zip and datacite.xml are visible - // Document the location of dataset archival copy location (actually the URL - // where you can view it as an admin) - // Changed to point at bucket where the zip and datacite.xml are visible + StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); + sb.append(bucketName + "/" + spaceName); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); - StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); - sb.append(bucketName + "/" + spaceName); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); - - } - } else { - logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister"); - return new Failure("Dataset locked"); } } catch (Exception e) { logger.warning(e.getLocalizedMessage()); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 34fadbed703..38951c8a218 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -2,7 +2,6 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.Command; @@ -10,7 +9,7 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagItLocalPath; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; -import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -19,6 +18,7 @@ import java.util.logging.Logger; import jakarta.json.Json; +import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; import java.io.File; @@ -44,94 +44,91 @@ public boolean canDelete() { } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { - logger.fine("In LocalCloudSubmitToArchive..."); + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { + logger.fine("In LocalSubmitToArchive..."); String localPath = requestedSettings.get(BagItLocalPath.toString()); String zipName = null; - //Set a failure status that will be updated if we succeed + // Set a failure status that will be updated if we succeed JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); 
statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); try { - Dataset dataset = dv.getDataset(); - if (dataset.getLockFor(Reason.finalizePublication) == null - && dataset.getLockFor(Reason.FileValidationFailed) == null) { - - String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') - .replace('.', '-').toLowerCase(); + String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') + .replace('.', '-').toLowerCase(); - // Define file paths - String dataciteFileName = localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"; - zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; + // Define file paths + String dataciteFileName = localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"; + zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; - // Check for and delete existing files for this version - logger.fine("Checking for existing files in archive..."); - - File existingDatacite = new File(dataciteFileName); - if (existingDatacite.exists()) { - logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); - if (existingDatacite.delete()) { - logger.fine("Deleted existing datacite.xml"); - } else { - logger.warning("Failed to delete existing datacite.xml: " + dataciteFileName); - } - } + // Check for and delete existing files for this version + logger.fine("Checking for existing files in archive..."); - File existingBag = new File(zipName); - if (existingBag.exists()) { - logger.fine("Found existing bag file, deleting: " + zipName); - if (existingBag.delete()) { - logger.fine("Deleted existing bag file"); - } else { - logger.warning("Failed to delete existing bag file: " + zipName); - } + File existingDatacite = new File(dataciteFileName); + if (existingDatacite.exists()) { + logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); + 
if (existingDatacite.delete()) { + logger.fine("Deleted existing datacite.xml"); + } else { + logger.warning("Failed to delete existing datacite.xml: " + dataciteFileName); } + } - // Also check for and delete the .partial file if it exists - File existingPartial = new File(zipName + ".partial"); - if (existingPartial.exists()) { - logger.fine("Found existing partial bag file, deleting: " + zipName + ".partial"); - if (existingPartial.delete()) { - logger.fine("Deleted existing partial bag file"); - } else { - logger.warning("Failed to delete existing partial bag file: " + zipName + ".partial"); - } + File existingBag = new File(zipName); + if (existingBag.exists()) { + logger.fine("Found existing bag file, deleting: " + zipName); + if (existingBag.delete()) { + logger.fine("Deleted existing bag file"); + } else { + logger.warning("Failed to delete existing bag file: " + zipName); } + } - String dataciteXml = getDataCiteXml(dv); - - FileUtils.writeStringToFile( - new File(dataciteFileName), - dataciteXml, StandardCharsets.UTF_8); - BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); - bagger.setAuthenticationKey(token.getTokenString()); - //ToDo: generateBag(File f, true) seems to do the same thing (with a .tmp extension) - since we don't have to use a stream here, could probably just reuse the existing code? 
- bagger.generateBag(new FileOutputStream(zipName + ".partial")); - - File srcFile = new File(zipName + ".partial"); - File destFile = new File(zipName); - - if (srcFile.renameTo(destFile)) { - logger.fine("Localhost Submission step: Content Transferred"); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "file://" + zipName); + // Also check for and delete the .partial file if it exists + File existingPartial = new File(zipName + ".partial"); + if (existingPartial.exists()) { + logger.fine("Found existing partial bag file, deleting: " + zipName + ".partial"); + if (existingPartial.delete()) { + logger.fine("Deleted existing partial bag file"); } else { - logger.warning("Unable to move " + zipName + ".partial to " + zipName); + logger.warning("Failed to delete existing partial bag file: " + zipName + ".partial"); } + } + + // Write datacite.xml file + FileUtils.writeStringToFile(new File(dataciteFileName), dataciteXml, StandardCharsets.UTF_8); + logger.fine("Datacite XML written to: " + dataciteFileName); + + // Generate bag + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); + bagger.setAuthenticationKey(token.getTokenString()); + + boolean bagSuccess = bagger.generateBag(new FileOutputStream(zipName + ".partial")); + + if (!bagSuccess) { + logger.severe("Bag generation failed for " + zipName); + return new Failure("Local Submission Failure", "Bag generation failed"); + } + + File srcFile = new File(zipName + ".partial"); + File destFile = new File(zipName); + + if (srcFile.renameTo(destFile)) { + logger.fine("Localhost Submission step: Content Transferred to " + zipName); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "file://" + zipName); } else { - logger.warning( - "Localhost Submision Workflow aborted: Dataset locked for finalizePublication, 
or because file validation failed"); - return new Failure("Dataset locked"); + logger.severe("Unable to move " + zipName + ".partial to " + zipName); + return new Failure("Local Submission Failure", "Unable to rename partial file to final file"); } } catch (Exception e) { logger.warning("Failed to archive " + zipName + " : " + e.getLocalizedMessage()); e.printStackTrace(); + return new Failure("Local Submission Failure", e.getLocalizedMessage() + ": check log for details"); } finally { dv.setArchivalCopyLocation(statusObject.build().toString()); } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index 768d5d03e1d..4198cb19fe9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -11,6 +11,7 @@ import edu.harvard.iq.dataverse.util.bagit.BagGenerator; import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.util.json.JsonUtil; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -81,8 +82,8 @@ public boolean canDelete() { } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { logger.fine("In S3SubmitToArchiveCommand..."); JsonObject configObject = null; @@ -105,139 +106,127 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t try { Dataset dataset = dv.getDataset(); - if (dataset.getLockFor(Reason.finalizePublication) == null) { - - spaceName = getSpaceName(dataset); - - // 
Define keys for datacite.xml and bag file - String dcKey = spaceName + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; - String bagKey = spaceName + "/" + getFileName(spaceName, dv) + ".zip"; - - // Check for and delete existing files for this version - logger.fine("Checking for existing files in archive..."); - - try { - HeadObjectRequest headDcRequest = HeadObjectRequest.builder() - .bucket(bucketName) - .key(dcKey) - .build(); - - s3.headObject(headDcRequest).join(); - - // If we get here, the object exists, so delete it - logger.fine("Found existing datacite.xml, deleting: " + dcKey); - DeleteObjectRequest deleteDcRequest = DeleteObjectRequest.builder() - .bucket(bucketName) - .key(dcKey) - .build(); - - CompletableFuture deleteDcFuture = s3.deleteObject(deleteDcRequest); - DeleteObjectResponse deleteDcResponse = deleteDcFuture.join(); - - if (deleteDcResponse.sdkHttpResponse().isSuccessful()) { - logger.fine("Deleted existing datacite.xml"); - } else { - logger.warning("Failed to delete existing datacite.xml: " + dcKey); - } - } catch (Exception e) { - if (e.getCause() instanceof NoSuchKeyException) { - logger.fine("No existing datacite.xml found"); - } else { - logger.warning("Error checking/deleting existing datacite.xml: " + e.getMessage()); - } - } + spaceName = getSpaceName(dataset); - try { - HeadObjectRequest headBagRequest = HeadObjectRequest.builder() - .bucket(bucketName) - .key(bagKey) - .build(); - - s3.headObject(headBagRequest).join(); - - // If we get here, the object exists, so delete it - logger.fine("Found existing bag file, deleting: " + bagKey); - DeleteObjectRequest deleteBagRequest = DeleteObjectRequest.builder() - .bucket(bucketName) - .key(bagKey) - .build(); - - CompletableFuture deleteBagFuture = s3.deleteObject(deleteBagRequest); - DeleteObjectResponse deleteBagResponse = deleteBagFuture.join(); - - if (deleteBagResponse.sdkHttpResponse().isSuccessful()) { - logger.fine("Deleted existing bag file"); - } else { - 
logger.warning("Failed to delete existing bag file: " + bagKey); - } - } catch (Exception e) { - if (e.getCause() instanceof NoSuchKeyException) { - logger.fine("No existing bag file found"); - } else { - logger.warning("Error checking/deleting existing bag file: " + e.getMessage()); - } - } + // Define keys for datacite.xml and bag file + String dcKey = spaceName + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; + String bagKey = spaceName + "/" + getFileName(spaceName, dv) + ".zip"; + + // Check for and delete existing files for this version + logger.fine("Checking for existing files in archive..."); + + try { + HeadObjectRequest headDcRequest = HeadObjectRequest.builder() + .bucket(bucketName) + .key(dcKey) + .build(); + + s3.headObject(headDcRequest).join(); - String dataciteXml = getDataCiteXml(dv); - // Add datacite.xml file - PutObjectRequest putRequest = PutObjectRequest.builder() + // If we get here, the object exists, so delete it + logger.fine("Found existing datacite.xml, deleting: " + dcKey); + DeleteObjectRequest deleteDcRequest = DeleteObjectRequest.builder() .bucket(bucketName) .key(dcKey) .build(); - CompletableFuture putFuture = s3.putObject(putRequest, - AsyncRequestBody.fromString(dataciteXml, StandardCharsets.UTF_8)); + CompletableFuture deleteDcFuture = s3.deleteObject(deleteDcRequest); + DeleteObjectResponse deleteDcResponse = deleteDcFuture.join(); + + if (deleteDcResponse.sdkHttpResponse().isSuccessful()) { + logger.fine("Deleted existing datacite.xml"); + } else { + logger.warning("Failed to delete existing datacite.xml: " + dcKey); + } + } catch (Exception e) { + if (e.getCause() instanceof NoSuchKeyException) { + logger.fine("No existing datacite.xml found"); + } else { + logger.warning("Error checking/deleting existing datacite.xml: " + e.getMessage()); + } + } + + try { + HeadObjectRequest headBagRequest = HeadObjectRequest.builder() + .bucket(bucketName) + .key(bagKey) + .build(); + + s3.headObject(headBagRequest).join(); + + // If 
we get here, the object exists, so delete it + logger.fine("Found existing bag file, deleting: " + bagKey); + DeleteObjectRequest deleteBagRequest = DeleteObjectRequest.builder() + .bucket(bucketName) + .key(bagKey) + .build(); - // Wait for the put operation to complete - PutObjectResponse putResponse = putFuture.join(); + CompletableFuture deleteBagFuture = s3.deleteObject(deleteBagRequest); + DeleteObjectResponse deleteBagResponse = deleteBagFuture.join(); - if (!putResponse.sdkHttpResponse().isSuccessful()) { - logger.warning("Could not write datacite xml to S3"); - return new Failure("S3 Archiver failed writing datacite xml file"); + if (deleteBagResponse.sdkHttpResponse().isSuccessful()) { + logger.fine("Deleted existing bag file"); + } else { + logger.warning("Failed to delete existing bag file: " + bagKey); + } + } catch (Exception e) { + if (e.getCause() instanceof NoSuchKeyException) { + logger.fine("No existing bag file found"); + } else { + logger.warning("Error checking/deleting existing bag file: " + e.getMessage()); } + } + + // Add datacite.xml file + PutObjectRequest putRequest = PutObjectRequest.builder() + .bucket(bucketName) + .key(dcKey) + .build(); + + CompletableFuture putFuture = s3.putObject(putRequest, + AsyncRequestBody.fromString(dataciteXml, StandardCharsets.UTF_8)); + + // Wait for the put operation to complete + PutObjectResponse putResponse = putFuture.join(); - // Store BagIt file - String fileName = getFileName(spaceName, dv); + if (!putResponse.sdkHttpResponse().isSuccessful()) { + logger.warning("Could not write datacite xml to S3"); + return new Failure("S3 Archiver failed writing datacite xml file"); + } - // Add BagIt ZIP file - // Google uses MD5 as one way to verify the - // transfer + // Store BagIt file + String fileName = getFileName(spaceName, dv); - // Generate bag - BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setAuthenticationKey(token.getTokenString()); - if 
(bagger.generateBag(fileName, false)) { - File bagFile = bagger.getBagFile(fileName); + // Generate bag + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); + bagger.setAuthenticationKey(token.getTokenString()); + if (bagger.generateBag(fileName, false)) { + File bagFile = bagger.getBagFile(fileName); - UploadFileRequest uploadFileRequest = UploadFileRequest.builder() - .putObjectRequest(req -> req.bucket(bucketName).key(bagKey)).source(bagFile.toPath()) - .build(); + UploadFileRequest uploadFileRequest = UploadFileRequest.builder() + .putObjectRequest(req -> req.bucket(bucketName).key(bagKey)).source(bagFile.toPath()) + .build(); - FileUpload fileUpload = tm.uploadFile(uploadFileRequest); + FileUpload fileUpload = tm.uploadFile(uploadFileRequest); - CompletedFileUpload uploadResult = fileUpload.completionFuture().join(); + CompletedFileUpload uploadResult = fileUpload.completionFuture().join(); - if (uploadResult.response().sdkHttpResponse().isSuccessful()) { - logger.fine("S3 Submission step: Content Transferred"); + if (uploadResult.response().sdkHttpResponse().isSuccessful()) { + logger.fine("S3 Submission step: Content Transferred"); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, - String.format("https://%s.s3.amazonaws.com/%s", bucketName, bagKey)); - } else { - logger.severe("Error sending file to S3: " + fileName); - return new Failure("Error in transferring Bag file to S3", - "S3 Submission Failure: incomplete transfer"); - } + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + String.format("https://%s.s3.amazonaws.com/%s", bucketName, bagKey)); } else { - logger.warning("Could not write local Bag file " + fileName); - return new Failure("S3 Archiver fail writing temp local bag"); + logger.severe("Error sending file to S3: " + fileName); + 
return new Failure("Error in transferring Bag file to S3", + "S3 Submission Failure: incomplete transfer"); } - } else { - logger.warning( - "S3 Archiver Submision Workflow aborted: Dataset locked for publication/pidRegister"); - return new Failure("Dataset locked"); + logger.warning("Could not write local Bag file " + fileName); + return new Failure("S3 Archiver fail writing temp local bag"); } + } catch (Exception e) { logger.warning(e.getLocalizedMessage()); e.printStackTrace(); @@ -253,7 +242,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t return WorkflowStepResult.OK; } else { return new Failure( - "S3 Submission not configured - no \":S3ArchivalProfile\" and/or \":S3ArchivalConfig\" or no bucket-name defined in config."); + "S3 Submission not configured - no \":S3ArchivalProfile\" and/or \":S3ArchivalConfig\" or no bucket-name defined in config."); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f24ebdb8655..12501d170d6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -24,6 +24,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.Map.Entry; @@ -72,6 +73,7 @@ import com.google.gson.JsonSyntaxException; import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.DatasetFieldConstant; import edu.harvard.iq.dataverse.DataFile.ChecksumType; import edu.harvard.iq.dataverse.pidproviders.PidUtil; import edu.harvard.iq.dataverse.settings.JvmSettings; @@ -120,10 +122,10 @@ public class BagGenerator { private boolean usetemp = false; - private int numConnections = 8; - public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); + private Map terms; - 
private OREMap oremap; + private static int numConnections = 8; + public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); static PrintWriter pw = null; @@ -139,15 +141,15 @@ public class BagGenerator { * and zipping are done in parallel, using a connection pool. The required space * on disk is ~ n+1/n of the final bag size, e.g. 125% of the bag size for a * 4-way parallel zip operation. + * @param terms * @throws Exception * @throws JsonSyntaxException */ - public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { - this.oremap = oreMap; - this.oremapObject = oreMap.getOREMap(); - //(JsonObject) new JsonParser().parse(oreMap.getOREMap().toString()); + public BagGenerator(jakarta.json.JsonObject oremapObject, String dataciteXml, Map terms) throws JsonSyntaxException, Exception { + this.oremapObject = oremapObject; this.dataciteXml = dataciteXml; + this.terms = terms; try { // Using Dataverse, all the URLs to be retrieved should be on the current server, so allowing self-signed certs and not verifying hostnames are useful in testing and @@ -768,12 +770,12 @@ private String generateInfoFile() { /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change * so we need to find the labels used. 
*/ - JsonLDTerm contactTerm = oremap.getContactTerm(); + JsonLDTerm contactTerm = terms.get(DatasetFieldConstant.datasetContact); if ((contactTerm != null) && aggregation.has(contactTerm.getLabel())) { JsonElement contacts = aggregation.get(contactTerm.getLabel()); - JsonLDTerm contactNameTerm = oremap.getContactNameTerm(); - JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); + JsonLDTerm contactNameTerm = terms.get(DatasetFieldConstant.datasetContactName); + JsonLDTerm contactEmailTerm = terms.get(DatasetFieldConstant.datasetContactEmail); if (contacts.isJsonArray()) { for (int i = 0; i < contactsArray.size(); i++) { @@ -841,8 +843,8 @@ private String generateInfoFile() { /* Description, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change * so we need to find the labels used. */ - JsonLDTerm descriptionTerm = oremap.getDescriptionTerm(); - JsonLDTerm descriptionTextTerm = oremap.getDescriptionTextTerm(); + JsonLDTerm descriptionTerm = terms.get(DatasetFieldConstant.description); + JsonLDTerm descriptionTextTerm = terms.get(DatasetFieldConstant.descriptionText); if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { @@ -1124,9 +1126,9 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } - public void setNumConnections(int numConnections) { - this.numConnections = numConnections; - logger.fine("BagGenerator will use " + numConnections + " threads"); + public static void setNumConnections(int numConnections) { + BagGenerator.numConnections = numConnections; + logger.fine("All BagGenerators will now use " + numConnections + " threads"); } } \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java index ae1175f0e1d..d7fc3f96b02 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java @@ -290,7 +290,7 @@ private void executeSteps(Workflow wf, WorkflowContext ctxt, int initialStepIdx try { if (res == WorkflowStepResult.OK) { logger.log(Level.INFO, "Workflow {0} step {1}: OK", new Object[]{ctxt.getInvocationId(), stepIdx}); - em.merge(ctxt.getDataset()); + // The dataset is merged in refresh(ctxt) ctxt = refresh(ctxt); } else if (res instanceof Failure) { logger.log(Level.WARNING, "Workflow {0} failed: {1}", new Object[]{ctxt.getInvocationId(), ((Failure) res).getReason()}); diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java index b0567bff107..c6a5c8626ae 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java @@ -1,9 +1,14 @@ package edu.harvard.iq.dataverse.workflow.internalspi; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetLock.Reason; +import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.impl.AbstractSubmitToArchiveCommand; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.ArchiverUtil; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.WorkflowContext; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStep; @@ -14,6 +19,7 @@ import java.util.logging.Level; import java.util.logging.Logger; +import jakarta.json.JsonObject; import 
jakarta.servlet.http.HttpServletRequest; /** @@ -45,11 +51,43 @@ public WorkflowStepResult run(WorkflowContext context) { } } + Dataset d = context.getDataset(); + if (d.isLockedFor(Reason.FileValidationFailed)) { + logger.severe("Dataset locked for file validation failure - will not archive"); + return new Failure("File Validation Lock", "Dataset has file validation problem - will not archive"); + } DataverseRequest dvr = new DataverseRequest(context.getRequest().getAuthenticatedUser(), (HttpServletRequest) null); String className = requestedSettings.get(SettingsServiceBean.Key.ArchiverClassName.toString()); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, dvr, context.getDataset().getReleasedVersion()); if (archiveCommand != null) { - return (archiveCommand.performArchiveSubmission(context.getDataset().getReleasedVersion(), context.getApiToken(), requestedSettings)); + // Generate the required components for archiving + DatasetVersion version = context.getDataset().getReleasedVersion(); + + // Generate DataCite XML + String dataCiteXml = archiveCommand.getDataCiteXml(version); + + // Generate OREMap + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + + // Get JSON-LD terms + Map terms = archiveCommand.getJsonLDTerms(oreMap); + + // Call the updated method with all required parameters + /* + * Note: because this must complete before the workflow can complete and update the version status in the db a long-running archive submission via workflow could hit a transaction timeout and fail. + * The commands themselves have been updated to run archive submission outside of any transaction and update the status in a separate transaction, so archiving a given version that way could succeed + * where this workflow failed. 
+ */ + return archiveCommand.performArchiveSubmission( + version, + dataCiteXml, + ore, + terms, + context.getApiToken(), + requestedSettings + ); + } else { logger.severe("No Archiver instance could be created for name: " + className); return new Failure("No Archiver", "Could not create instance of class: " + className); From d2a25c392c4434d960871f13a6ed8f86458fc3f0 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sat, 24 Jan 2026 09:58:33 -0500 Subject: [PATCH 53/97] use new transaction to start --- .../dataverse/DatasetVersionServiceBean.java | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index b5e964e5673..9c04acd6c5e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -28,6 +28,8 @@ import jakarta.ejb.EJB; import jakarta.ejb.EJBException; import jakarta.ejb.Stateless; +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; import jakarta.inject.Named; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; @@ -1337,25 +1339,21 @@ public Long getDatasetVersionCount(Long datasetId, boolean canViewUnpublishedVer /** - * Update the archival copy location for a specific version of a dataset. Archiving can be long-running and other parallel updates to the datasetversion have likely occurred so this method will check - * for OptimisticLockExceptions and retry the update with the latest version. + * Update the archival copy location for a specific version of a dataset. 
+ * Archiving can be long-running and other parallel updates to the datasetversion have likely occurred + * so this method will just re-find the version rather than risking an + * OptimisticLockException and then having to retry in yert another transaction (since the OLE rolls this one back). * * @param dv * The dataset version whose archival copy location we want to update. Must not be {@code null}. */ + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void persistArchivalCopyLocation(DatasetVersion dv) { - try { - em.merge(dv); - em.flush(); // Force the update and version check immediately - } catch (OptimisticLockException ole) { - logger.log(Level.INFO, "OptimisticLockException while persisting archival copy location for DatasetVersion id={0}. Retrying on latest version.", dv.getId()); - DatasetVersion currentVersion = find(dv.getId()); - if (currentVersion != null) { - currentVersion.setArchivalCopyLocation(dv.getArchivalCopyLocation()); - em.merge(currentVersion); - } else { - logger.log(Level.SEVERE, "Could not find DatasetVersion with id={0} to retry persisting archival copy location after OptimisticLockException.", dv.getId()); - } + DatasetVersion currentVersion = find(dv.getId()); + if (currentVersion != null) { + currentVersion.setArchivalCopyLocation(dv.getArchivalCopyLocation()); + } else { + logger.log(Level.SEVERE, "Could not find DatasetVersion with id={0} to retry persisting archival copy location after OptimisticLockException.", dv.getId()); } } } From a45b76b2cc4ad1d6e6ea324eeb51cb3cfcc37189 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sat, 24 Jan 2026 09:59:00 -0500 Subject: [PATCH 54/97] typo --- .../edu/harvard/iq/dataverse/DatasetVersionServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 9c04acd6c5e..a5dd724104f 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -1342,7 +1342,7 @@ public Long getDatasetVersionCount(Long datasetId, boolean canViewUnpublishedVer * Update the archival copy location for a specific version of a dataset. * Archiving can be long-running and other parallel updates to the datasetversion have likely occurred * so this method will just re-find the version rather than risking an - * OptimisticLockException and then having to retry in yert another transaction (since the OLE rolls this one back). + * OptimisticLockException and then having to retry in yet another transaction (since the OLE rolls this one back). * * @param dv * The dataset version whose archival copy location we want to update. Must not be {@code null}. From a4c583e1e6ab3db3cb5c8e570ca7ffbd1867d567 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sat, 24 Jan 2026 10:38:13 -0500 Subject: [PATCH 55/97] Use pending, use JSON --- .../java/edu/harvard/iq/dataverse/DatasetPage.java | 14 ++++++++++---- .../edu/harvard/iq/dataverse/DatasetVersion.java | 4 ++-- .../edu/harvard/iq/dataverse/api/Datasets.java | 4 +++- .../ArchivalSubmissionWorkflowStep.java | 12 +++++++++--- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index a091005b392..7e168047f05 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -3004,7 +3004,11 @@ public String updateCurrentVersion() { String status = updateVersion.getArchivalCopyLocationStatus(); if((status==null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE) || (FeatureFlags.ARCHIVE_ON_VERSION_UPDATE.enabled() && archiveCommand.canDelete())){ // Delete the record of any existing copy since it is now out of date/incorrect - 
updateVersion.setArchivalCopyLocation(null); + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); + updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + //Persist to db now + datasetVersionService.persistArchivalCopyLocation(updateVersion); /* * Then try to generate and submit an archival copy. Note that running this * command within the CuratePublishedDatasetVersionCommand was causing an error: @@ -3023,9 +3027,8 @@ public String updateCurrentVersion() { } } else if(status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { //Not automatically replacing the old archival copy as creating it is expensive - updateVersion.setArchivalStatus(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); + updateVersion.setArchivalStatusOnly(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); datasetVersionService.persistArchivalCopyLocation(updateVersion); - datasetVersionService.merge(updateVersion); } } } @@ -6115,7 +6118,10 @@ public void archiveVersion(Long id, boolean force) { if (status == null || (force && cmd.canDelete())) { // Set initial pending status - dv.setArchivalCopyLocation(DatasetVersion.ARCHIVAL_STATUS_PENDING); + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); + dv.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + //Persist now datasetVersionService.persistArchivalCopyLocation(dv); commandEngine.submitAsync(cmd); diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 1248a8266ab..4ff6ae5e723 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -416,8 +416,8 @@ public void setArchivalCopyLocation(String location) { populateArchivalStatus(true); } - // COnvenience method to set only the status - public void 
setArchivalStatus(String status) { + // Convenience method to just change the status without changing the location + public void setArchivalStatusOnly(String status) { populateArchivalStatus(false); JsonObjectBuilder job = Json.createObjectBuilder(archivalCopyLocationJson); job.add(DatasetVersion.ARCHIVAL_STATUS, status); diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index dba4b36d4da..155522bbb5b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -1279,7 +1279,9 @@ public Response publishDataset(@Context ContainerRequestContext crc, @PathParam( AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, createDataverseRequest(user), updateVersion); if (archiveCommand != null) { // Delete the record of any existing copy since it is now out of date/incorrect - updateVersion.setArchivalCopyLocation(null); + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); + updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); datasetVersionSvc.persistArchivalCopyLocation(updateVersion); /* * Then try to generate and submit an archival copy. 
Note that running this diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java index c6a5c8626ae..9e9b434ba03 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java @@ -75,9 +75,15 @@ public WorkflowStepResult run(WorkflowContext context) { // Call the updated method with all required parameters /* - * Note: because this must complete before the workflow can complete and update the version status in the db a long-running archive submission via workflow could hit a transaction timeout and fail. - * The commands themselves have been updated to run archive submission outside of any transaction and update the status in a separate transaction, so archiving a given version that way could succeed - * where this workflow failed. + * Note: because this must complete before the workflow can complete and update the version status + * in the db a long-running archive submission via workflow could hit a transaction timeout and fail. + * The commands themselves have been updated to run archive submission outside of any transaction + * and update the status in a separate transaction, so archiving a given version that way could + * succeed where this workflow failed. + * + * Another difference when running in a workflow - this step has no way to set the archiving status to + * pending as is done when running archiving from the UI/API. Instead, there is a generic workflow + * lock on the dataset. 
*/ return archiveCommand.performArchiveSubmission( version, From 305f7e3b73f7ec299bb25a86bddf645f34f23607 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 21 Jan 2026 17:38:37 -0500 Subject: [PATCH 56/97] merge fix of persistArchivalCopy method refactors --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 4ff6ae5e723..8a4a0cf3f53 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -392,7 +392,7 @@ public String getArchivalCopyLocationStatus() { } public String getArchivalCopyLocationMessage() { populateArchivalStatus(false); - if(archivalCopyLocationJson!=null) { + if(archivalCopyLocationJson!=null && archivalCopyLocationJson.containsKey(ARCHIVAL_STATUS_MESSAGE)) { return archivalCopyLocationJson.getString(ARCHIVAL_STATUS_MESSAGE); } return null; From d2282d9d02280f27491f3a87dbb9ac39ee6794ac Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 28 Jan 2026 15:20:14 -0500 Subject: [PATCH 57/97] combined release note --- doc/release-notes/12122-archiving updates.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 doc/release-notes/12122-archiving updates.md diff --git a/doc/release-notes/12122-archiving updates.md b/doc/release-notes/12122-archiving updates.md new file mode 100644 index 00000000000..2dd4eb6909f --- /dev/null +++ b/doc/release-notes/12122-archiving updates.md @@ -0,0 +1,8 @@ +## Notifications + +This release includes multiple updates to the process of creating archival bags including +- performance/scaling improvements for large datasets (multiple changes) +- bug fixes for when superusers see the "Submit" button to launch archiving from the dataset page version table +- new functionality to optionally suppress an archiving workflow when using the 
Update Current Version functionality and mark the current archive as out of date +- new functionality to support recreating an archival bag when Update Current Version has been used, which is available for archivers that can delete existing files +- \ No newline at end of file From 236fca47f9f5e57792c8201fd34fadc992f2c6ec Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 28 Jan 2026 15:30:21 -0500 Subject: [PATCH 58/97] missed change to static --- .../engine/command/impl/AbstractSubmitToArchiveCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index ffa79456902..2b049f1c42a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -156,7 +156,7 @@ public Thread startBagThread(DatasetVersion dv, PipedInputStream in, DigestInput public void run() { try (PipedOutputStream out = new PipedOutputStream(in)) { // Generate bag - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); + BagGenerator.setNumConnections(getNumberOfBagGeneratorThreads()); BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); bagger.setAuthenticationKey(token.getTokenString()); bagger.generateBag(out); From 1b429780634c9cb0140fdc2a4cd5475320559cd4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 28 Jan 2026 15:51:35 -0500 Subject: [PATCH 59/97] suppress counting file retrieval to bag as a download in gb table --- doc/release-notes/12063-ORE-and-Bag-updates.md | 3 ++- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md index 
b2926f40c96..bbc22b22182 100644 --- a/doc/release-notes/12063-ORE-and-Bag-updates.md +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -11,4 +11,5 @@ Archival Bag - values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). - the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) - a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation -- improvements to file retrieval w.r.t. retries on errors or throttling \ No newline at end of file +- improvements to file retrieval w.r.t. retries on errors or throttling +- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 5c5b88a521b..1864361d755 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -1179,12 +1179,14 @@ InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { public InputStream get() { try { - URI uri = new URI(uriString); - + // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) + String modifiedUriString = uriString + (uriString.contains("?") ? 
"&" : "?") + "gbrecs=true"; + URI uri = new URI(modifiedUriString); + logger.finest("Final URI used (with gbrecs param): " + modifiedUriString); int tries = 0; while (tries < 5) { - logger.fine("Get # " + tries + " for " + uriString); + logger.finest("Get # " + tries + " for " + uriString); HttpGet getFile = createNewGetRequest(uri, null); try { From 67e01e050d933005f69b4bae93678c865671130a Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 16 Dec 2025 10:35:40 -0500 Subject: [PATCH 60/97] archival submit fix - per version cache --- .../edu/harvard/iq/dataverse/DatasetPage.java | 28 +++++++++++++------ src/main/webapp/dataset-versions.xhtml | 4 +-- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 20617160a1c..8eba6cbeab9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -387,7 +387,7 @@ public void setSelectedHostDataverse(Dataverse selectedHostDataverse) { private boolean showIngestSuccess; private Boolean archivable = null; - private Boolean versionArchivable = null; + private HashMap versionArchivable = new HashMap<>(); private Boolean someVersionArchived = null; public boolean isShowIngestSuccess() { @@ -6147,10 +6147,11 @@ public boolean isArchivable() { return archivable; } - public boolean isVersionArchivable() { - if (versionArchivable == null) { + public boolean isVersionArchivable(Long id) { + Boolean thisVersionArchivable = versionArchivable.get(id); + if (thisVersionArchivable == null) { // If this dataset isn't in an archivable collection return false - versionArchivable = false; + thisVersionArchivable = false; if (isArchivable()) { boolean checkForArchivalCopy = false; // Otherwise, we need to know if the archiver is single-version-only @@ -6167,11 +6168,19 @@ public boolean isVersionArchivable() { if (checkForArchivalCopy) { // If we 
have to check (single version archiving), we can't allow archiving if // one version is already archived (or attempted - any non-null status) - versionArchivable = !isSomeVersionArchived(); + thisVersionArchivable = !isSomeVersionArchived(); } else { - // If we allow multiple versions or didn't find one that has had archiving run - // on it, we can archive, so return true - versionArchivable = true; + // If we didn't find one that has had archiving run + // on it, or archiving per version is supported and either + // the status is null or the archiver can delete prior runs and status isn't success, + // we can archive, so return true + // Find the specific version by id + DatasetVersion targetVersion = dataset.getVersions().stream() + .filter(v -> v.getId().equals(id)) + .findFirst() + .orElse(null); + String status = targetVersion.getArchivalCopyLocationStatus(); + thisVersionArchivable = (status == null) || ((!status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) && (!status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING)) && supportsDelete)); } } catch (ClassNotFoundException | IllegalAccessException | IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) { @@ -6180,8 +6189,9 @@ public boolean isVersionArchivable() { } } } + versionArchivable.put(id, thisVersionArchivable); } - return versionArchivable; + return thisVersionArchivable; } public boolean isSomeVersionArchived() { diff --git a/src/main/webapp/dataset-versions.xhtml b/src/main/webapp/dataset-versions.xhtml index 9e5f0a9b24d..ee726bb5eee 100644 --- a/src/main/webapp/dataset-versions.xhtml +++ b/src/main/webapp/dataset-versions.xhtml @@ -171,11 +171,11 @@ - - + From 50e8c61a8e1a878a7056f0da980e1a9e5271f957 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 29 Jan 2026 17:20:31 -0500 Subject: [PATCH 61/97] Add check to display submit button only if prior versions are archvd --- .../edu/harvard/iq/dataverse/DatasetPage.java | 34 +++++++++++++------ 
.../dataverse/FileMetadataVersionsHelper.java | 14 ++------ .../iq/dataverse/dataset/DatasetUtil.java | 17 ++++++++++ .../iq/dataverse/util/ArchiverUtil.java | 11 ++++++ 4 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 8eba6cbeab9..375489484c0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -387,6 +387,8 @@ public void setSelectedHostDataverse(Dataverse selectedHostDataverse) { private boolean showIngestSuccess; private Boolean archivable = null; + private Boolean checkForArchivalCopy; + private Boolean supportsDelete; private HashMap versionArchivable = new HashMap<>(); private Boolean someVersionArchived = null; @@ -6152,19 +6154,33 @@ public boolean isVersionArchivable(Long id) { if (thisVersionArchivable == null) { // If this dataset isn't in an archivable collection return false thisVersionArchivable = false; + boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived, false); if (isArchivable()) { - boolean checkForArchivalCopy = false; // Otherwise, we need to know if the archiver is single-version-only // If it is, we have to check for an existing archived version to answer the // question String className = settingsWrapper.getValueForKey(SettingsServiceBean.Key.ArchiverClassName, null); if (className != null) { try { - Class clazz = Class.forName(className); - Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class); - Object[] params = { settingsWrapper }; - checkForArchivalCopy = (Boolean) m.invoke(null, params); - + DatasetVersion targetVersion = dataset.getVersions().stream() + .filter(v -> v.getId().equals(id)).findFirst().orElse(null); + if (requiresEarlierVersionsToBeArchived) {// Find the specific version by id + DatasetVersion 
priorVersion = DatasetUtil.getPriorVersion(targetVersion); + + if (priorVersion== null || (isVersionArchivable(priorVersion.getId()) + && ArchiverUtil.isVersionArchived(priorVersion))) { + thisVersionArchivable = true; + } + } + if (checkForArchivalCopy == null) { + //Only check once + Class clazz = Class.forName(className); + Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class); + Method m2 = clazz.getMethod("supportsDelete"); + Object[] params = { settingsWrapper }; + checkForArchivalCopy = (Boolean) m.invoke(null, params); + supportsDelete = (Boolean) m2.invoke(null); + } if (checkForArchivalCopy) { // If we have to check (single version archiving), we can't allow archiving if // one version is already archived (or attempted - any non-null status) @@ -6175,16 +6191,12 @@ public boolean isVersionArchivable(Long id) { // the status is null or the archiver can delete prior runs and status isn't success, // we can archive, so return true // Find the specific version by id - DatasetVersion targetVersion = dataset.getVersions().stream() - .filter(v -> v.getId().equals(id)) - .findFirst() - .orElse(null); String status = targetVersion.getArchivalCopyLocationStatus(); thisVersionArchivable = (status == null) || ((!status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) && (!status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING)) && supportsDelete)); } } catch (ClassNotFoundException | IllegalAccessException | IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) { - logger.warning("Failed to call isSingleVersion on configured archiver class: " + className); + logger.warning("Failed to call methods on configured archiver class: " + className); e.printStackTrace(); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java index 4d408a72c8c..cc632054642 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse; import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.dataset.DatasetUtil; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; @@ -95,18 +96,7 @@ private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, FileMeta //TODO: this could use some refactoring to cut down on the number of for loops! private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, DatasetVersion currentversion) { List allfiles = allRelatedFiles(fileMetadata); - boolean foundCurrent = false; - DatasetVersion priorVersion = null; - for (DatasetVersion versionLoop : fileMetadata.getDatasetVersion().getDataset().getVersions()) { - if (foundCurrent) { - priorVersion = versionLoop; - break; - } - if (versionLoop.equals(currentversion)) { - foundCurrent = true; - } - - } + DatasetVersion priorVersion = DatasetUtil.getPriorVersion(fileMetadata.getDatasetVersion()); if (priorVersion != null && priorVersion.getFileMetadatasSorted() != null) { for (FileMetadata fmdTest : priorVersion.getFileMetadatasSorted()) { for (DataFile fileTest : allfiles) { diff --git a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java index 2ce5471a523..79451a61a84 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java @@ -740,4 +740,21 @@ public static String getLocaleCurationStatusLabelFromString(String label) { } return localizedName; } + + // Find the prior version - relies on version sorting by major/minor numbers + public static DatasetVersion getPriorVersion(DatasetVersion version) { + boolean foundCurrent = false; + DatasetVersion 
priorVersion = null; + for (DatasetVersion versionLoop : version.getDataset().getVersions()) { + if (foundCurrent) { + priorVersion = versionLoop; + break; + } + if (versionLoop.equals(version)) { + foundCurrent = true; + } + + } + return priorVersion; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java index 18ec6243d5a..7d03004f3f7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java @@ -71,5 +71,16 @@ public static boolean isSomeVersionArchived(Dataset dataset) { return someVersionArchived; } + + /** + * Checks if a version has been successfully archived. + * + * @param version the version to check + * @return true if the version has been successfully archived, false otherwise + */ + public static boolean isVersionArchived(DatasetVersion version) { + String status = version.getArchivalCopyLocationStatus(); + return status != null && status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + } } \ No newline at end of file From 06428970ac2a7644f7ca7b5d6c929ad639c20c31 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 29 Jan 2026 18:06:43 -0500 Subject: [PATCH 62/97] setting name tweak, add docs, release note --- doc/release-notes/12122-archiving in sequence.md | 3 +++ doc/sphinx-guides/source/installation/config.rst | 12 ++++++++++++ .../java/edu/harvard/iq/dataverse/DatasetPage.java | 2 +- .../command/impl/AbstractSubmitToArchiveCommand.java | 2 +- .../iq/dataverse/settings/SettingsServiceBean.java | 2 +- 5 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 doc/release-notes/12122-archiving in sequence.md diff --git a/doc/release-notes/12122-archiving in sequence.md b/doc/release-notes/12122-archiving in sequence.md new file mode 100644 index 00000000000..6f4373a1e31 --- /dev/null +++ b/doc/release-notes/12122-archiving in sequence.md @@ -0,0 +1,3 @@ +This release 
introduces an additional setting related to archival bag creation, ArchiveOnlyIfEarlierVersionsAreArchived (default false). +If it is true, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. +This is intended to support use cases where deduplication of files between dataset versions will be done (e.g. by a third-party service running at the archival copy location) and is a step towards supporting the Oxford Common File Layout (OCFL) as an archival format. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a9d5c7c0041..e0dffd10ac9 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2263,6 +2263,13 @@ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, Local All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). +Two settings that can be used with all current Archivers are: + +- \:BagGeneratorThreads - the number of threads to use when adding data files to the zipped bag. The default is 2. Values of 4 or more may increase performance on larger machines but may cause problems if file access is throttled. +- \:ArchiveOnlyIfEarlierVersionsAreArchived - when true, requires dataset versions to be archived in order by confirming that all prior versions have been successfully archived before allowing a new version to be archived. Default is false. + +These must be included in the \:ArchiverSettings for the Archiver to work. + +.. 
_Duracloud Configuration: Duracloud Configuration @@ -5333,6 +5340,11 @@ This setting specifies which storage system to use by identifying the particular For examples, see the specific configuration above in :ref:`BagIt Export`. +:ArchiveOnlyIfEarlierVersionsAreArchived +++++++++++++++++++++++++++++++++++++++++ + +This setting, if true, only allows creation of an archival Bag for a dataset version if all prior versions have been successfully archived. The default is false (any version can be archived independently as long as other settings allow it) + :ArchiverSettings +++++++++++++++++ diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 375489484c0..5b267007887 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -6154,7 +6154,7 @@ public boolean isVersionArchivable(Long id) { if (thisVersionArchivable == null) { // If this dataset isn't in an archivable collection return false thisVersionArchivable = false; - boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived, false); + boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived, false); if (isArchivable()) { // Otherwise, we need to know if the archiver is single-version-only // If it is, we have to check for an existing archived version to answer the diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 98e9dfb68e1..72f45ab5d2b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java 
@@ -90,7 +90,7 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t // this.requestedSettings won't be set yet in the workflow case, so set it now (used in getNumberOfBagGeneratorThreads) this.requestedSettings.putAll(requestedSettings); // Check if earlier versions must be archived first - String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived.toString()); + String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived.toString()); boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue); if (requireEarlierArchived) { diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index fac136042ce..1cc9fda7645 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -489,7 +489,7 @@ Whether Harvesting (OAI) service is enabled * Only create an archival Bag for a dataset version if all prior versions have * been successfully archived */ - ArchiverOnlyIfEarlierVersionsAreArchived, + ArchiveOnlyIfEarlierVersionsAreArchived, /** * Custom settings for each archiver. See list below. 
From ca0af05c7f80fc28114ebde71b460917199d2a60 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 29 Jan 2026 18:15:55 -0500 Subject: [PATCH 63/97] simplify --- .../impl/AbstractSubmitToArchiveCommand.java | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 72f45ab5d2b..8949f346567 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -97,36 +97,31 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t Dataset dataset = version.getDataset(); List versions = dataset.getVersions(); - // Check all earlier versions (those with version numbers less than current) - for (DatasetVersion earlierVersion : versions) { - // Skip the current version and any versions that come after it - if (earlierVersion.getId().equals(version.getId())) { - continue; - } - - // Compare version numbers to ensure we only check earlier versions - if (earlierVersion.getVersionNumber() != null && version.getVersionNumber() != null) { - if (earlierVersion.getVersionNumber() < version.getVersionNumber() - || (earlierVersion.getVersionNumber().equals(version.getVersionNumber()) - && earlierVersion.getMinorVersionNumber() < version.getMinorVersionNumber())) { + boolean foundCurrent = false; - // Check if this earlier version has been successfully archived - String archivalStatus = earlierVersion.getArchivalCopyLocationStatus(); - if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) + // versions are ordered, all versions after the current one have lower + // major/minor version numbers + for (DatasetVersion versionInLoop : versions) { + if 
(foundCurrent) { + // Once foundCurrent is true, we are looking at prior versions + // Check if this earlier version has been successfully archived + String archivalStatus = versionInLoop.getArchivalCopyLocationStatus(); + if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) // || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE) - ) { - JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); - statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); - statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, - "Successful archiving of earlier versions is required."); - version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); - return new Failure( - "Earlier versions must be successfully archived first", - "Archival prerequisites not met" - ); - } + ) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + return new Failure("Earlier versions must be successfully archived first", + "Archival prerequisites not met"); } } + if (versionInLoop.equals(version)) { + foundCurrent = true; + } + } } // Delegate to the archiver-specific implementation From 49f4818c4954f7caf677e7def281437b0b3a9ba5 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 11:17:15 -0500 Subject: [PATCH 64/97] basic fetch --- .../iq/dataverse/settings/JvmSettings.java | 4 + .../iq/dataverse/util/bagit/BagGenerator.java | 88 +++++++++++++++++-- 2 files changed, 83 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 
05390ba8a8c..b32b7a8d77d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -276,6 +276,10 @@ public enum JvmSettings { BAGIT_SOURCE_ORG_NAME(SCOPE_BAGIT_SOURCEORG, "name"), BAGIT_SOURCEORG_ADDRESS(SCOPE_BAGIT_SOURCEORG, "address"), BAGIT_SOURCEORG_EMAIL(SCOPE_BAGIT_SOURCEORG, "email"), + SCOPE_BAGIT_HOLEY(SCOPE_BAGIT, "holey"), + BAGIT_HOLEY_MAX_FILE_SIZE(SCOPE_BAGIT_HOLEY, "max-file-size"), + BAGIT_HOLEY_MAX_DATA_SIZE(SCOPE_BAGIT_HOLEY, "max-data-size"), + // STORAGE USE SETTINGS SCOPE_STORAGEUSE(PREFIX, "storageuse"), diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 1864361d755..e61ba6b7b0e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -142,6 +142,13 @@ public class BagGenerator { private OREMap oremap; static PrintWriter pw = null; + + //Holey Bags + private long maxDataFileSize = Long.MAX_VALUE; + private long maxTotalDataSize = Long.MAX_VALUE; + private long currentBagDataSize = 0; + private StringBuilder fetchFileContent = new StringBuilder(); + private boolean usingFetchFile = false; // Bag-info.txt field labels private static final String CONTACT_NAME = "Contact-Name: "; @@ -222,6 +229,13 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio logger.warning("Failed to initialize HTTP client"); e.printStackTrace(); } + initializeHoleyBagLimits(); + } + + private void initializeHoleyBagLimits() { + this.maxDataFileSize = JvmSettings.BAGIT_HOLEY_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.maxTotalDataSize = JvmSettings.BAGIT_HOLEY_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + ", 
maxTotalDataSize: " + maxTotalDataSize); } public void setIgnoreHashes(boolean val) { @@ -363,6 +377,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { logger.fine("Creating bag: " + bagName); + writeFetchFile(); + ZipArchiveOutputStream zipArchiveOutputStream = new ZipArchiveOutputStream(outputStream); /* @@ -570,7 +586,6 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } else { resourceUsed[index] = true; // add item - // ToDo String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); logger.fine("File url: " + dataUrl); String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); @@ -585,6 +600,15 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce if (directoryLabel != null) { childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle; } + // Get file size + Long fileSize = null; + if (child.has(JsonLDTerm.filesize.getLabel())) { + fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); + } + if(fileSize == null) { + logger.severe("File size missing for " + childPath); + throw new IOException("Unable to create bag due to missing file size"); + } String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { @@ -614,7 +638,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } try { if ((childHash == null) | ignorehashes) { - // Generate missing hashInputStream inputStream = null; + // Generate missing hash try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { if (hashtype != null) { @@ -644,17 +668,30 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } } - logger.fine("Requesting: " + childPath + " from " + dataUrl); - createFileFromURL(childPath, dataUrl); + + // Add file to bag or fetch file + if 
(shouldAddToFetchFile(fileSize)) { + // Add to fetch file instead of including in bag + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl); + addToFetchFile(dataUrl, fileSize, childPath); + usingFetchFile = true; + } else { + // Add file to bag as before + logger.fine("Requesting: " + childPath + " from " + dataUrl); + createFileFromURL(childPath, dataUrl); + if (fileSize != null) { + currentBagDataSize += fileSize; + } + } + dataCount++; if (dataCount % 1000 == 0) { logger.info("Retrieval in progress: " + dataCount + " files retrieved"); } - if (child.has(JsonLDTerm.filesize.getLabel())) { - Long size = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); - totalDataSize += size; - if (size > maxFileSize) { - maxFileSize = size; + if (fileSize != null) { + totalDataSize += fileSize; + if (fileSize > maxFileSize) { + maxFileSize = fileSize; } } if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { @@ -674,6 +711,39 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } } + // Helper method to determine if file should go to fetch file + private boolean shouldAddToFetchFile(long fileSize) { + + // Check individual file size limit + if (fileSize > maxDataFileSize) { + logger.fine("File size " + fileSize + " exceeds max data file size " + maxDataFileSize); + return true; + } + + // Check total bag size limit + if (currentBagDataSize + fileSize > maxTotalDataSize) { + logger.fine("Adding file would exceed max total data size. 
Current: " + currentBagDataSize + + ", File: " + fileSize + ", Max: " + maxTotalDataSize); + return true; + } + + return false; + } + + // Method to append to fetch file content + private void addToFetchFile(String url, long size, String filename) { + // Format: URL size filename + fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\n"); + } + + // Method to write fetch file to bag (call this before finalizing the bag) + private void writeFetchFile() throws IOException, ExecutionException, InterruptedException { + if (usingFetchFile && fetchFileContent.length() > 0) { + logger.info("Creating fetch.txt file for holey bag"); + createFileFromString("fetch.txt", fetchFileContent.toString()); + } + } + private int getUnusedIndexOf(String childId) { int index = resourceIndex.indexOf(childId); if (resourceUsed[index] != null) { From 7f5179f82535d997f68396b791bc283d1808d527 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 13:52:18 -0500 Subject: [PATCH 65/97] order by file size --- .../iq/dataverse/util/bagit/BagGenerator.java | 290 +++++++++++------- 1 file changed, 171 insertions(+), 119 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index e61ba6b7b0e..4b94fa44bbd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -23,10 +23,11 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; +import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.Map.Entry; @@ -296,7 +297,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { resourceUsed = new 
Boolean[aggregates.size() + 1]; // Process current container (the aggregation itself) and its // children - processContainer(aggregation, currentPath); + // Recursively collect all files from the entire tree, start with an empty set of processedContainers + List allFiles = new ArrayList<>(); + collectAllFiles(aggregation, currentPath, allFiles); + + // Sort files by size (smallest first) + Collections.sort(allFiles); + + // Process all files in sorted order + processAllFiles(allFiles); } // Create manifest files // pid-mapping.txt - a DataOne recommendation to connect ids and @@ -545,17 +554,31 @@ public static String getValidName(String bagName) { } private void processContainer(JsonObject item, String currentPath) throws IOException { + // Collect all files recursively and process containers to create dirs in the zip + private void collectAllFiles(JsonObject item, String currentPath, List allFiles) + throws IOException { JsonArray children = getChildren(item); HashSet titles = new HashSet(); + + String title = null; + if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { + title = item.get("Title").getAsString(); + } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { + title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + logger.fine("Collecting files from " + title + "/ at path " + currentPath); + currentPath = currentPath + title + "/"; + + // Mark this container as processed + String containerId = item.get("@id").getAsString(); + + // Create directory and update tracking for this container int containerIndex = -1; try { createDir(currentPath); - // Add containers to pid map and mark as 'used', but no sha1 hash - // value - containerIndex = getUnusedIndexOf(item.get("@id").getAsString()); + containerIndex = getUnusedIndexOf(containerId); resourceUsed[containerIndex] = true; - pidMap.put(item.get("@id").getAsString(), currentPath); - + pidMap.put(containerId, currentPath); } catch (InterruptedException | IOException | 
ExecutionException e) { e.printStackTrace(); logger.severe(e.getMessage()); @@ -563,14 +586,14 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce resourceUsed[containerIndex] = false; } throw new IOException("Unable to create bag"); - } + for (int i = 0; i < children.size(); i++) { // Find the ith child in the overall array of aggregated // resources String childId = children.get(i).getAsString(); - logger.fine("Processing: " + childId); + logger.fine("Examining: " + childId); int index = getUnusedIndexOf(childId); if (resourceUsed[index] != null) { System.out.println("Warning: reusing resource " + index); @@ -580,137 +603,147 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce // entries JsonObject child = aggregates.get(index - 1).getAsJsonObject(); if (childIsContainer(child)) { - // create dir and process children - // processContainer will mark this item as used - processContainer(child, currentPath); + // Recursively collect files from this container + collectAllFiles(child, currentPath, allFiles); } else { - resourceUsed[index] = true; - // add item - String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); - logger.fine("File url: " + dataUrl); - String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); - if (titles.contains(childTitle)) { - logger.warning("**** Multiple items with the same title in: " + currentPath); - logger.warning("**** Will cause failure in hash and size validation in: " + bagID); - } else { - titles.add(childTitle); - } - String childPath = currentPath + childTitle; - JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if (directoryLabel != null) { - childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle; - } // Get file size Long fileSize = null; if (child.has(JsonLDTerm.filesize.getLabel())) { fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); 
} - if(fileSize == null) { - logger.severe("File size missing for " + childPath); + if (fileSize == null) { + logger.severe("File size missing for child: " + childId); throw new IOException("Unable to create bag due to missing file size"); } - String childHash = null; - if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType - .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); - if (hashtype == null) { - // If one wasn't set as a default, pick up what the first child with one uses - hashtype = childHashType; - } - if (hashtype != null && !hashtype.equals(childHashType)) { - logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() - + " hashes for " + childTitle); - } else { - childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); - if (checksumMap.containsValue(childHash)) { - // Something else has this hash - logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); - } - logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); - checksumMap.put(childPath, childHash); - } + // Store minimal info for sorting - JsonObject is just a reference + allFiles.add(new FileEntry(fileSize, child, currentPath, index)); + } + } + } + + + // Process all files in sorted order + private void processAllFiles(List sortedFiles) + throws IOException, ExecutionException, InterruptedException { + + if ((hashtype == null) | ignorehashes) { + hashtype = DataFile.ChecksumType.SHA512; + } + + for (FileEntry entry : sortedFiles) { + // Extract all needed information from the JsonObject reference + JsonObject child = entry.jsonObject; + String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); + String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + + // Build full path using stored currentPath + 
String childPath = entry.currentPath + childTitle; + JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); + if (directoryLabel != null) { + childPath = entry.currentPath + directoryLabel.getAsString() + "/" + childTitle; + } + + // Get hash if exists + String childHash = null; + if (child.has(JsonLDTerm.checksum.getLabel())) { + ChecksumType childHashType = ChecksumType.fromString( + child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + if (hashtype == null) { + hashtype = childHashType; } - if ((hashtype == null) | ignorehashes) { - // Pick sha512 when ignoring hashes or none exist - hashtype = DataFile.ChecksumType.SHA512; + if (hashtype != null && !hashtype.equals(childHashType)) { + logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() + + " hashes for " + childTitle); + } else { + childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); } - try { - if ((childHash == null) | ignorehashes) { - // Generate missing hash - try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { - - if (hashtype != null) { - if (hashtype.equals(DataFile.ChecksumType.SHA1)) { - childHash = DigestUtils.sha1Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { - childHash = DigestUtils.sha256Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { - childHash = DigestUtils.sha512Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { - childHash = DigestUtils.md5Hex(inputStream); - } + } + + resourceUsed[entry.resourceIndex] = true; + + try { + if ((childHash == null) | ignorehashes) { + // Generate missing hash + InputStream inputStream = null; + try { + inputStream = getInputStreamSupplier(dataUrl).get(); + + if (hashtype != null) { + if (hashtype.equals(DataFile.ChecksumType.SHA1)) { + childHash = DigestUtils.sha1Hex(inputStream); + } else if 
(hashtype.equals(DataFile.ChecksumType.SHA256)) { + childHash = DigestUtils.sha256Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { + childHash = DigestUtils.sha512Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { + childHash = DigestUtils.md5Hex(inputStream); } - - } catch (IOException e) { - logger.severe("Failed to read " + childPath); - throw e; - } - if (childHash != null) { - JsonObject childHashObject = new JsonObject(); - childHashObject.addProperty("@type", hashtype.toString()); - childHashObject.addProperty("@value", childHash); - child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); - - checksumMap.put(childPath, childHash); - } else { - logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } + + } catch (IOException e) { + logger.severe("Failed to read " + childPath); + throw e; + } finally { + IOUtils.closeQuietly(inputStream); } - - // Add file to bag or fetch file - if (shouldAddToFetchFile(fileSize)) { - // Add to fetch file instead of including in bag - logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl); - addToFetchFile(dataUrl, fileSize, childPath); - usingFetchFile = true; + if (childHash != null) { + JsonObject childHashObject = new JsonObject(); + childHashObject.addProperty("@type", hashtype.toString()); + childHashObject.addProperty("@value", childHash); + child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); + + checksumMap.put(childPath, childHash); } else { - // Add file to bag as before - logger.fine("Requesting: " + childPath + " from " + dataUrl); - createFileFromURL(childPath, dataUrl); - if (fileSize != null) { - currentBagDataSize += fileSize; - } + logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } - - dataCount++; - if (dataCount % 1000 == 0) { - logger.info("Retrieval in progress: " + dataCount + " files retrieved"); - } - if (fileSize != null) { - totalDataSize += 
fileSize; - if (fileSize > maxFileSize) { - maxFileSize = fileSize; - } - } - if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { - mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); + } else { + // Hash already exists, add to checksumMap + if (checksumMap.containsValue(childHash)) { + logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + + " has hash: " + childHash + " in: " + bagID); } - - } catch (Exception e) { - resourceUsed[index] = false; - e.printStackTrace(); - throw new IOException("Unable to create bag"); + logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); + checksumMap.put(childPath, childHash); + } + + // Add file to bag or fetch file + if (shouldAddToFetchFile(entry.size)) { + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + addToFetchFile(dataUrl, entry.size, childPath); + usingFetchFile = true; + } else { + logger.fine("Requesting: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + createFileFromURL(childPath, dataUrl); + currentBagDataSize += entry.size; + } + + dataCount++; + if (dataCount % 1000 == 0) { + logger.info("Retrieval in progress: " + dataCount + " files retrieved"); + } + + totalDataSize += entry.size; + if (entry.size > maxFileSize) { + maxFileSize = entry.size; + } + + if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { + mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); } - // Check for nulls! 
- pidMap.put(child.get("@id").getAsString(), childPath); - + } catch (Exception e) { + resourceUsed[entry.resourceIndex] = false; + e.printStackTrace(); + throw new IOException("Unable to create bag"); } + + pidMap.put(child.get("@id").getAsString(), childPath); } } - + // Helper method to determine if file should go to fetch file private boolean shouldAddToFetchFile(long fileSize) { @@ -1394,5 +1427,24 @@ public static void setNumConnections(int numConnections) { BagGenerator.numConnections = numConnections; logger.fine("All BagGenerators will use " + numConnections + " threads"); } - + + // Inner class to hold file information before processing + private static class FileEntry implements Comparable { + final long size; + final JsonObject jsonObject; // Direct reference, not a copy + final String currentPath; // Parent directory path + final int resourceIndex; // Still need this for resourceUsed tracking + + FileEntry(long size, JsonObject jsonObject, String currentPath, int resourceIndex) { + this.size = size; + this.jsonObject = jsonObject; + this.currentPath = currentPath; + this.resourceIndex = resourceIndex; + } + + @Override + public int compareTo(FileEntry other) { + return Long.compare(this.size, other.size); + } + } } \ No newline at end of file From bc63285cb16a4215fefbc8a1e48bb12b8f60fdfe Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 15:10:28 -0500 Subject: [PATCH 66/97] only add subcollection folders (if they exist) --- .../iq/dataverse/util/bagit/BagGenerator.java | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 4b94fa44bbd..6de7d970605 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -299,7 +299,7 @@ public boolean generateBag(OutputStream 
outputStream) throws Exception { // children // Recursively collect all files from the entire tree, start with an empty set of processedContainers List allFiles = new ArrayList<>(); - collectAllFiles(aggregation, currentPath, allFiles); + collectAllFiles(aggregation, currentPath, allFiles, false); // Sort files by size (smallest first) Collections.sort(allFiles); @@ -555,20 +555,21 @@ public static String getValidName(String bagName) { private void processContainer(JsonObject item, String currentPath) throws IOException { // Collect all files recursively and process containers to create dirs in the zip - private void collectAllFiles(JsonObject item, String currentPath, List allFiles) + private void collectAllFiles(JsonObject item, String currentPath, List allFiles, boolean addTitle) throws IOException { JsonArray children = getChildren(item); HashSet titles = new HashSet(); - String title = null; - if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { - title = item.get("Title").getAsString(); - } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { - title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + if (addTitle) { + String title = null; + if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { + title = item.get("Title").getAsString(); + } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { + title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + logger.fine("Collecting files from " + title + "/ at path " + currentPath); + currentPath = currentPath + title + "/"; } - logger.fine("Collecting files from " + title + "/ at path " + currentPath); - currentPath = currentPath + title + "/"; - // Mark this container as processed String containerId = item.get("@id").getAsString(); @@ -602,9 +603,10 @@ private void collectAllFiles(JsonObject item, String currentPath, List Date: Fri, 30 Jan 2026 15:22:28 -0500 Subject: [PATCH 67/97] replace deprecated constructs --- 
.../iq/dataverse/util/bagit/BagGenerator.java | 74 +++++++++---------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 6de7d970605..bd65bd35340 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -48,6 +48,7 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveEntryRequest; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.compress.archivers.zip.ZipFile.Builder; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; import org.apache.hc.client5.http.ClientProtocolException; @@ -466,57 +467,54 @@ public boolean generateBag(String bagName, boolean temp) { public void validateBag(String bagId) { logger.info("Validating Bag"); - ZipFile zf = null; - InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = ZipFile.builder().setFile(bagFile).get(); - ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); - if (entry != null) { - logger.info("SHA1 hashes used"); - hashtype = DataFile.ChecksumType.SHA1; - } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { + ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { - logger.info("SHA512 hashes used"); - hashtype = DataFile.ChecksumType.SHA512; + logger.info("SHA1 hashes used"); + hashtype = DataFile.ChecksumType.SHA1; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); if (entry != null) { - logger.info("SHA256 hashes used"); 
- hashtype = DataFile.ChecksumType.SHA256; + logger.info("SHA512 hashes used"); + hashtype = DataFile.ChecksumType.SHA512; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); if (entry != null) { - logger.info("MD5 hashes used"); - hashtype = DataFile.ChecksumType.MD5; + logger.info("SHA256 hashes used"); + hashtype = DataFile.ChecksumType.SHA256; + } else { + entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + if (entry != null) { + logger.info("MD5 hashes used"); + hashtype = DataFile.ChecksumType.MD5; + } } } } + if (entry == null) + throw new IOException("No manifest file found"); + try (InputStream is = zf.getInputStream(entry)) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String line = br.readLine(); + while (line != null) { + logger.fine("Hash entry: " + line); + int breakIndex = line.indexOf(' '); + String hash = line.substring(0, breakIndex); + String path = line.substring(breakIndex + 1); + logger.fine("Adding: " + path + " with hash: " + hash); + checksumMap.put(path, hash); + line = br.readLine(); + } + } } - if (entry == null) - throw new IOException("No manifest file found"); - is = zf.getInputStream(entry); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String line = br.readLine(); - while (line != null) { - logger.fine("Hash entry: " + line); - int breakIndex = line.indexOf(' '); - String hash = line.substring(0, breakIndex); - String path = line.substring(breakIndex + 1); - logger.fine("Adding: " + path + " with hash: " + hash); - checksumMap.put(path, hash); - line = br.readLine(); - } - IOUtils.closeQuietly(is); logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { logger.log(Level.SEVERE, "Could not validate Hashes", io); } catch (Exception e) { logger.log(Level.SEVERE, "Could not validate Hashes", e); - } 
finally { - IOUtils.closeQuietly(zf); } return; } @@ -667,10 +665,8 @@ private void processAllFiles(List sortedFiles) try { if ((childHash == null) | ignorehashes) { // Generate missing hash - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); - + + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()){ if (hashtype != null) { if (hashtype.equals(DataFile.ChecksumType.SHA1)) { childHash = DigestUtils.sha1Hex(inputStream); @@ -686,8 +682,6 @@ private void processAllFiles(List sortedFiles) } catch (IOException e) { logger.severe("Failed to read " + childPath); throw e; - } finally { - IOUtils.closeQuietly(inputStream); } if (childHash != null) { JsonObject childHashObject = new JsonObject(); From 69c9a0d822dc8bc5904b5f08d0ff6e8516194979 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 16:35:36 -0500 Subject: [PATCH 68/97] restore name collision check --- .../iq/dataverse/util/bagit/BagGenerator.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index bd65bd35340..63969a21c5b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -25,6 +25,7 @@ import java.util.Calendar; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -151,7 +152,7 @@ public class BagGenerator { private long currentBagDataSize = 0; private StringBuilder fetchFileContent = new StringBuilder(); private boolean usingFetchFile = false; - + // Bag-info.txt field labels private static final String CONTACT_NAME = "Contact-Name: "; private static final String CONTACT_EMAIL = "Contact-Email: "; @@ -627,6 +628,9 @@ private void 
collectAllFiles(JsonObject item, String currentPath, List sortedFiles) throws IOException, ExecutionException, InterruptedException { + // Track titles to detect duplicates + Set titles = new HashSet<>(); + if ((hashtype == null) | ignorehashes) { hashtype = DataFile.ChecksumType.SHA512; } @@ -637,6 +641,14 @@ private void processAllFiles(List sortedFiles) String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + // Check for duplicate titles + if (titles.contains(childTitle)) { + logger.warning("**** Multiple items with the same title in: " + entry.currentPath); + logger.warning("**** Will cause failure in hash and size validation in: " + bagID); + } else { + titles.add(childTitle); + } + // Build full path using stored currentPath String childPath = entry.currentPath + childTitle; JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); From 422435a22c97b55b5d51aca13a287a77d0821022 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 16:35:53 -0500 Subject: [PATCH 69/97] add null check to quiet log/avoid exception --- .../harvard/iq/dataverse/util/bagit/OREMap.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 426d5c9aa5f..0d99a5bddd1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -506,11 +506,16 @@ private static void addCvocValue(String val, JsonArrayBuilder vals, JsonObject c for (String prefix : context.keySet()) { localContext.putIfAbsent(prefix, context.getString(prefix)); } - JsonObjectBuilder job = Json.createObjectBuilder(datasetFieldService.getExternalVocabularyValue(val)); - job.add("@id", val); - JsonObject extVal = job.build(); - 
logger.fine("Adding: " + extVal); - vals.add(extVal); + JsonObject cachedValue = datasetFieldService.getExternalVocabularyValue(val); + if (cachedValue != null) { + JsonObjectBuilder job = Json.createObjectBuilder(cachedValue); + job.add("@id", val); + JsonObject extVal = job.build(); + logger.fine("Adding: " + extVal); + vals.add(extVal); + } else { + vals.add(val); + } } else { vals.add(val); } From d9cfe1df63dd6be3677c603a5aa3339a0dfb4284 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 17:39:17 -0500 Subject: [PATCH 70/97] cleanup - checksum change --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 63969a21c5b..f23df2947bd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -49,7 +49,6 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveEntryRequest; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; -import org.apache.commons.compress.archivers.zip.ZipFile.Builder; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; import org.apache.hc.client5.http.ClientProtocolException; @@ -593,7 +592,7 @@ private void collectAllFiles(JsonObject item, String currentPath, List sortedFiles) // Get hash if exists String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromString( - child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + ChecksumType childHashType = ChecksumType + .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { 
hashtype = childHashType; } From 4895f80b6530489988828e72aa1149e984864c7c Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 18:09:17 -0500 Subject: [PATCH 71/97] cleanup, suppress downloads with gbrec for fetch file --- .../iq/dataverse/util/bagit/BagGenerator.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f23df2947bd..56116976e18 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -50,7 +50,6 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; -import org.apache.commons.compress.utils.IOUtils; import org.apache.hc.client5.http.ClientProtocolException; import org.apache.hc.client5.http.classic.methods.HttpGet; import org.apache.hc.client5.http.config.RequestConfig; @@ -551,14 +550,12 @@ public static String getValidName(String bagName) { return bagName.replaceAll("\\W", "-"); } - private void processContainer(JsonObject item, String currentPath) throws IOException { // Collect all files recursively and process containers to create dirs in the zip private void collectAllFiles(JsonObject item, String currentPath, List allFiles, boolean addTitle) throws IOException { JsonArray children = getChildren(item); - HashSet titles = new HashSet(); - if (addTitle) { + if (addTitle) { //For any sub-collections (non-Dataverse) String title = null; if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { title = item.get("Title").getAsString(); @@ -716,6 +713,7 @@ private void processAllFiles(List sortedFiles) // Add file to bag or fetch file if (shouldAddToFetchFile(entry.size)) { + dataUrl = suppressDownloadCounts(dataUrl); 
logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); addToFetchFile(dataUrl, entry.size, childPath); @@ -1291,7 +1289,7 @@ InputStreamSupplier getInputStreamSupplier(final String uriString) { public InputStream get() { try { // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) - String modifiedUriString = uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true"; + String modifiedUriString = suppressDownloadCounts(uriString); URI uri = new URI(modifiedUriString); logger.finest("Final URI used (with gbrecs param): " + modifiedUriString); int tries = 0; @@ -1386,6 +1384,11 @@ public void close() throws IOException { }; } + private String suppressDownloadCounts(String uriString ) { + // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) + return uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true"; + } + /** * Adapted from org/apache/commons/io/FileUtils.java change to SI - add 2 digits * of precision From 62a03b2f097860ba1c81fbd85d7bfd15e7dd9b31 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sun, 1 Feb 2026 14:21:52 -0500 Subject: [PATCH 72/97] add setting, refactor, for non-holey option --- .../iq/dataverse/settings/JvmSettings.java | 7 ++-- .../iq/dataverse/util/bagit/BagGenerator.java | 36 +++++++++++++------ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index b32b7a8d77d..086ed7929aa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -276,9 +276,10 @@ public enum JvmSettings { BAGIT_SOURCE_ORG_NAME(SCOPE_BAGIT_SOURCEORG, "name"), BAGIT_SOURCEORG_ADDRESS(SCOPE_BAGIT_SOURCEORG, "address"), 
BAGIT_SOURCEORG_EMAIL(SCOPE_BAGIT_SOURCEORG, "email"), - SCOPE_BAGIT_HOLEY(SCOPE_BAGIT, "holey"), - BAGIT_HOLEY_MAX_FILE_SIZE(SCOPE_BAGIT_HOLEY, "max-file-size"), - BAGIT_HOLEY_MAX_DATA_SIZE(SCOPE_BAGIT_HOLEY, "max-data-size"), + SCOPE_BAGIT_ZIP(SCOPE_BAGIT, "zip"), + BAGIT_ZIP_MAX_FILE_SIZE(SCOPE_BAGIT_ZIP, "max-file-size"), + BAGIT_ZIP_MAX_DATA_SIZE(SCOPE_BAGIT_ZIP, "max-data-size"), + BAGIT_ZIP_HOLEY(SCOPE_BAGIT_ZIP, "holey"), // STORAGE USE SETTINGS diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 56116976e18..2ca833ba839 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -144,12 +144,14 @@ public class BagGenerator { static PrintWriter pw = null; - //Holey Bags + // Size limits and holey Bags private long maxDataFileSize = Long.MAX_VALUE; private long maxTotalDataSize = Long.MAX_VALUE; private long currentBagDataSize = 0; private StringBuilder fetchFileContent = new StringBuilder(); private boolean usingFetchFile = false; + private boolean createHoleyBag = false; + private List oversizedFiles = new ArrayList<>(); // Bag-info.txt field labels private static final String CONTACT_NAME = "Contact-Name: "; @@ -234,9 +236,12 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio } private void initializeHoleyBagLimits() { - this.maxDataFileSize = JvmSettings.BAGIT_HOLEY_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); - this.maxTotalDataSize = JvmSettings.BAGIT_HOLEY_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); - logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + ", maxTotalDataSize: " + maxTotalDataSize); + this.maxDataFileSize = JvmSettings.BAGIT_ZIP_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.maxTotalDataSize = 
JvmSettings.BAGIT_ZIP_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.createHoleyBag = JvmSettings.BAGIT_ZIP_HOLEY.lookupOptional(Boolean.class).orElse(false); + logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + + ", maxTotalDataSize: " + maxTotalDataSize + + ", createHoleyBag: " + createHoleyBag); } public void setIgnoreHashes(boolean val) { @@ -603,6 +608,7 @@ private void collectAllFiles(JsonObject item, String currentPath, List sortedFiles) } // Add file to bag or fetch file - if (shouldAddToFetchFile(entry.size)) { + if (!addToZip(entry.size)) { + if(createHoleyBag) { dataUrl = suppressDownloadCounts(dataUrl); logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); addToFetchFile(dataUrl, entry.size, childPath); usingFetchFile = true; + } else { + // Add to list for archiver to retrieve + oversizedFiles.add(entry); + logger.fine("Adding " + childPath + " to oversized files list for archiver"); + } } else { logger.fine("Requesting: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); @@ -750,28 +762,28 @@ private void processAllFiles(List sortedFiles) } // Helper method to determine if file should go to fetch file - private boolean shouldAddToFetchFile(long fileSize) { + private boolean addToZip(long fileSize) { // Check individual file size limit if (fileSize > maxDataFileSize) { logger.fine("File size " + fileSize + " exceeds max data file size " + maxDataFileSize); - return true; + return false; } // Check total bag size limit if (currentBagDataSize + fileSize > maxTotalDataSize) { logger.fine("Adding file would exceed max total data size. 
Current: " + currentBagDataSize + ", File: " + fileSize + ", Max: " + maxTotalDataSize); - return true; + return false; } - return false; + return true; } // Method to append to fetch file content private void addToFetchFile(String url, long size, String filename) { // Format: URL size filename - fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\n"); + fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\r\n"); } // Method to write fetch file to bag (call this before finalizing the bag) @@ -1389,6 +1401,10 @@ private String suppressDownloadCounts(String uriString ) { return uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true"; } + public List getOversizedFiles() { + return oversizedFiles; + } + /** * Adapted from org/apache/commons/io/FileUtils.java change to SI - add 2 digits * of precision From 637b2e30e25b57d7fa87d5d1b2b70eeb08ac5ad5 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 4 Feb 2026 11:00:21 -0500 Subject: [PATCH 73/97] Update to track non-zipped files, add method --- .../iq/dataverse/util/bagit/BagGenerator.java | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 2ca833ba839..60cabc9ac99 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -640,8 +640,8 @@ private void processAllFiles(List sortedFiles) for (FileEntry entry : sortedFiles) { // Extract all needed information from the JsonObject reference JsonObject child = entry.jsonObject; - String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); - String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + + String childTitle = 
entry.getChildTitle(); // Check for duplicate titles if (titles.contains(childTitle)) { @@ -651,12 +651,7 @@ private void processAllFiles(List sortedFiles) titles.add(childTitle); } - // Build full path using stored currentPath - String childPath = entry.currentPath + childTitle; - JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if (directoryLabel != null) { - childPath = entry.currentPath + directoryLabel.getAsString() + "/" + childTitle; - } + String childPath= entry.getChildPath(childTitle); // Get hash if exists String childHash = null; @@ -675,6 +670,7 @@ private void processAllFiles(List sortedFiles) } resourceUsed[entry.resourceIndex] = true; + String dataUrl = entry.getDataUrl(); try { if ((childHash == null) | ignorehashes) { @@ -716,11 +712,9 @@ private void processAllFiles(List sortedFiles) logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); checksumMap.put(childPath, childHash); } - // Add file to bag or fetch file if (!addToZip(entry.size)) { if(createHoleyBag) { - dataUrl = suppressDownloadCounts(dataUrl); logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); addToFetchFile(dataUrl, entry.size, childPath); @@ -1300,10 +1294,7 @@ InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { public InputStream get() { try { - // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) - String modifiedUriString = suppressDownloadCounts(uriString); - URI uri = new URI(modifiedUriString); - logger.finest("Final URI used (with gbrecs param): " + modifiedUriString); + URI uri = new URI(uriString); int tries = 0; while (tries < 5) { @@ -1396,10 +1387,7 @@ public void close() throws IOException { }; } - private String suppressDownloadCounts(String uriString ) { - // Adding gbrecs to suppress counting this access as a download 
(archiving is not a download indicating scientific use) - return uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true"; - } + public List getOversizedFiles() { return oversizedFiles; @@ -1456,7 +1444,7 @@ public static void setNumConnections(int numConnections) { } // Inner class to hold file information before processing - private static class FileEntry implements Comparable { + public static class FileEntry implements Comparable { final long size; final JsonObject jsonObject; // Direct reference, not a copy final String currentPath; // Parent directory path @@ -1469,6 +1457,30 @@ private static class FileEntry implements Comparable { this.resourceIndex = resourceIndex; } + public String getDataUrl() { + return suppressDownloadCounts(jsonObject.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString()); + } + + public String getChildTitle() { + return jsonObject.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + + public String getChildPath(String title) { + // Build full path using stored currentPath + String childPath = currentPath + title; + JsonElement directoryLabel = jsonObject.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); + if (directoryLabel != null) { + childPath = currentPath + directoryLabel.getAsString() + "/" + title; + } + return childPath; + } + + private String suppressDownloadCounts(String uriString) { + // Adding gbrecs to suppress counting this access as a download (archiving is + // not a download indicating scientific use) + return uriString + (uriString.contains("?") ? 
"&" : "?") + "gbrecs=true"; + } + @Override public int compareTo(FileEntry other) { return Long.compare(this.size, other.size); From a6b05056401065e356dd2dfed13a4aa080702a7a Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 4 Feb 2026 14:20:28 -0500 Subject: [PATCH 74/97] reuse stream supplier, update archivers to send oversized files --- .../impl/DuraCloudSubmitToArchiveCommand.java | 65 +++++-- .../GoogleCloudSubmitToArchiveCommand.java | 177 +++++++++++++----- .../impl/LocalSubmitToArchiveCommand.java | 14 +- .../impl/S3SubmitToArchiveCommand.java | 47 ++++- .../iq/dataverse/util/bagit/BagGenerator.java | 6 +- 5 files changed, 244 insertions(+), 65 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index fe4a25091d7..b65f39fa484 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -7,16 +7,24 @@ import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; + import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudContext; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudHost; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudPort; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import 
java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -96,6 +104,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); + Path tempBagFile = null; + try { /* * If there is a failure in creating a space, it is likely that a prior version @@ -161,20 +171,38 @@ public void run() { // Add BagIt ZIP file // Although DuraCloud uses SHA-256 internally, it's API uses MD5 to verify the // transfer + Path bagFile = null; + - messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null); - bagThread.join(); - if (success) { - logger.fine("Content: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); + tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); + logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); + + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + // Generate bag to temporary file using the provided ore JsonObject + try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { + if (!bagger.generateBag(fos)) { + throw new IOException("Bag generation failed"); } - if (!success || !checksum.equals(localchecksum)) { + } + + // Store BagIt file + long bagSize = 
Files.size(tempBagFile); + logger.fine("Bag created successfully, size: " + bagSize + " bytes"); + + // Now upload the bag file + messageDigest = MessageDigest.getInstance("MD5"); + try (InputStream is = Files.newInputStream(bagFile); + DigestInputStream bagDigestInputStream = new DigestInputStream(is, messageDigest)) { + checksum = store.addContent(spaceName, fileName, bagDigestInputStream, bagFile.toFile().length(), "application/zip", null, null); + localchecksum = Hex.encodeHexString(bagDigestInputStream.getMessageDigest().digest()); + + if (checksum != null && checksum.equals(localchecksum)) { + logger.fine("Content: " + fileName + " added with checksum: " + checksum); + success = true; + } else { logger.severe("Failure on " + fileName); - logger.severe(success ? checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); + logger.severe(checksum + " not equal to " + localchecksum); try { store.deleteContent(spaceName, fileName); store.deleteContent(spaceName, baseFileName + "_datacite.xml"); @@ -185,9 +213,6 @@ public void run() { "DuraCloud Submission Failure: incomplete archive transfer"); } } - - logger.fine("DuraCloud Submission step: Content Transferred"); - // Document the location of dataset archival copy location (actually the URL // where you can // view it as an admin) @@ -223,8 +248,20 @@ public void run() { return new Failure("Unable to create DuraCloud space with name: " + baseFileName, mesg); } catch (NoSuchAlgorithmException e) { logger.severe("MD5 MessageDigest not available!"); + } catch (Exception e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); + return new Failure("Error in transferring file to DuraCloud", + "DuraCloud Submission Failure: internal error"); } finally { + if (tempBagFile != null) { + try { + Files.deleteIfExists(tempBagFile); + } catch (IOException e) { + logger.warning("Failed to delete temporary bag file: " + tempBagFile + " : " + e.getMessage()); + } + } 
dv.setArchivalCopyLocation(statusObject.build().toString()); } } else { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 7dfb9f07e19..21038a1eab6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -14,20 +14,29 @@ import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.settings.JvmSettings; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; + import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudBucket; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudProject; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.compress.parallel.InputStreamSupplier; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; import java.io.File; import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.util.Map; @@ -45,26 +54,28 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken 
token, Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, + Map requestedSettings) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); logger.fine("Project: " + projectName + " Bucket: " + bucketName); if (bucketName != null && projectName != null) { Storage storage; - //Set a failure status that will be updated if we succeed + // Set a failure status that will be updated if we succeed JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - + String cloudKeyFile = JvmSettings.FILES_DIRECTORY.lookup() + File.separator + "googlecloudkey.json"; - + + // Create temporary file for bag + Path tempBagFile = null; + try (FileInputStream cloudKeyStream = new FileInputStream(cloudKeyFile)) { storage = StorageOptions.newBuilder() - .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)) - .setProjectId(projectName) - .build() - .getService(); + .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)).setProjectId(projectName) + .build().getService(); Bucket bucket = storage.get(bucketName); Dataset dataset = dv.getDataset(); @@ -72,6 +83,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') .replace('.', '-').toLowerCase(); + String bagFileName = spaceName + "/" + spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; String dataciteXml = getDataCiteXml(dv); MessageDigest messageDigest = MessageDigest.getInstance("MD5"); @@ -102,7 +114,8 @@ public void run() { Thread.sleep(10); i++; } - Blob dcXml = bucket.create(spaceName + "/datacite.v" + 
dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", + digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); dcThread.join(); String checksum = dcXml.getMd5ToHexString(); @@ -110,7 +123,8 @@ public void run() { String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); if (!success || !checksum.equals(localchecksum)) { logger.severe("Failure on " + spaceName); - logger.severe(success ? checksum + " not equal to " + localchecksum : "datacite.xml transfer did not succeed"); + logger.severe(success ? checksum + " not equal to " + localchecksum + : "datacite.xml transfer did not succeed"); try { dcXml.delete(Blob.BlobSourceOption.generationMatch()); } catch (StorageException se) { @@ -119,55 +133,112 @@ public void run() { return new Failure("Error in transferring DataCite.xml file to GoogleCloud", "GoogleCloud Submission Failure: incomplete metadata transfer"); } + } + + tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); + logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); + + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + // Generate bag to temporary file using the provided ore JsonObject + try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { + if (!bagger.generateBag(fos)) { + throw new IOException("Bag generation failed"); + } + } + + // Store BagIt file + long bagSize = Files.size(tempBagFile); + logger.fine("Bag created successfully, size: " + bagSize + " bytes"); + + if (bagSize == 0) { + throw new IOException("Generated bag file is empty"); + } + + // Upload bag file and calculate checksum during upload + messageDigest = MessageDigest.getInstance("MD5"); + String localChecksum; + + try (FileInputStream fis = new 
FileInputStream(tempBagFile.toFile()); + DigestInputStream dis = new DigestInputStream(fis, messageDigest)) { + + logger.fine("Uploading bag to GoogleCloud: " + bagFileName); + + Blob bag = bucket.create(bagFileName, dis, "application/zip", + Bucket.BlobWriteOption.doesNotExist()); + + if (bag.getSize() == 0) { + throw new IOException("Uploaded bag has zero size"); + } + + // Get checksum after upload completes + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = bag.getMd5ToHexString(); + + logger.fine("Bag: " + bagFileName + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); - // Store BagIt file - success = false; - String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Bag checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); + try { + bag.delete(Blob.BlobSourceOption.generationMatch()); + } catch (StorageException se) { + logger.warning(se.getMessage()); + } + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: bag checksum mismatch"); + } + } - // Add BagIt ZIP file - // Google uses MD5 as one way to verify the - // transfer + logger.fine("GoogleCloud Submission step: Content Transferred Successfully"); + + // Now upload any files that were too large for the bag + for (FileEntry entry : bagger.getOversizedFiles()) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + logger.fine("Uploading oversized file to GoogleCloud: " + fileKey); messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - 
Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", - Bucket.BlobWriteOption.doesNotExist()); - if (bag.getSize() == 0) { - throw new IOException("Empty Bag"); + InputStreamSupplier supplier = bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get(); + DigestInputStream dis = new DigestInputStream(is, messageDigest)) { + Blob oversizedFileBlob = bucket.create(fileKey, dis, Bucket.BlobWriteOption.doesNotExist()); + if (oversizedFileBlob.getSize() == 0) { + throw new IOException("Uploaded oversized file has zero size: " + fileKey); } - bagThread.join(); - - checksum = bag.getMd5ToHexString(); - logger.fine("Bag: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); - if (!success || !checksum.equals(localchecksum)) { - logger.severe(success ? checksum + " not equal to " + localchecksum - : "bag transfer did not succeed"); + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = oversizedFileBlob.getMd5ToHexString(); + logger.fine("Oversized file: " + fileKey + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Oversized file checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); try { - bag.delete(Blob.BlobSourceOption.generationMatch()); + oversizedFileBlob.delete(Blob.BlobSourceOption.generationMatch()); } catch (StorageException se) { logger.warning(se.getMessage()); } - return new Failure("Error in transferring Zip file to GoogleCloud", - "GoogleCloud Submission Failure: incomplete archive transfer"); + return new Failure("Error in transferring oversized file to GoogleCloud", + "GoogleCloud Submission Failure: oversized file transfer incomplete"); } + } catch (IOException e) { + 
logger.warning("Failed to upload oversized file: " + childPath + " : " + e.getMessage()); + return new Failure("Error uploading oversized file to Google Cloud: " + childPath); } + } - logger.fine("GoogleCloud Submission step: Content Transferred"); - - // Document the location of dataset archival copy location (actually the URL - // where you can view it as an admin) - // Changed to point at bucket where the zip and datacite.xml are visible + // Document the location of dataset archival copy location (actually the URL + // to the bucket). + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + String.format("https://storage.cloud.google.com/%s/%s", bucketName, spaceName)); - StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); - sb.append(bucketName + "/" + spaceName); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); - - } } else { - logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister"); + logger.warning( + "GoogleCloud Archiver Submision Workflow aborted: Dataset locked for publication/pidRegister"); + return new Failure("Dataset locked"); } } catch (Exception e) { @@ -177,11 +248,21 @@ public void run() { e.getLocalizedMessage() + ": check log for details"); } finally { + if (tempBagFile != null) { + try { + Files.deleteIfExists(tempBagFile); + } catch (IOException e) { + logger.warning("Failed to delete temporary bag file: " + tempBagFile + " : " + e.getMessage()); + } + } dv.setArchivalCopyLocation(statusObject.build().toString()); } return WorkflowStepResult.OK; - } else { - return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); + } else + + { + return new Failure( + "GoogleCloud Submission not configured - no 
\":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 462879f2ec9..76d7ae87f38 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -10,6 +10,7 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagItLocalPath; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -23,6 +24,7 @@ import java.io.File; import java.io.FileOutputStream; +import java.io.InputStream; import org.apache.commons.io.FileUtils; @@ -63,12 +65,22 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), dataciteXml, StandardCharsets.UTF_8); BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); bagger.setAuthenticationKey(token.getTokenString()); zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; //ToDo: generateBag(File f, true) seems to do the same thing (with a .tmp extension) - since we don't have to use a stream here, could probably just reuse the existing code? 
bagger.generateBag(new FileOutputStream(zipName + ".partial")); + // Now download any files that were too large for the bag + for (FileEntry entry : bagger.getOversizedFiles()) { + String childPath = entry.getChildPath(entry.getChildTitle()); + File destFile = new File(localPath, localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + "/" + childPath); + logger.fine("Downloading oversized file to " + destFile.getAbsolutePath()); + destFile.getParentFile().mkdirs(); + try (InputStream is = bagger.getInputStreamSupplier(entry.getDataUrl()).get()) { + FileUtils.copyInputStreamToFile(is, destFile); + } + } + File srcFile = new File(zipName + ".partial"); File destFile = new File(zipName); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index 65531d775c8..072fd0edb48 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -9,6 +9,7 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.S3ArchiverConfig; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.workflow.step.Failure; @@ -17,9 +18,15 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import 
java.util.logging.Level; import java.util.logging.Logger; import jakarta.annotation.Resource; @@ -28,6 +35,7 @@ import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; +import org.apache.commons.compress.parallel.InputStreamSupplier; import org.eclipse.microprofile.config.Config; import org.eclipse.microprofile.config.ConfigProvider; @@ -55,8 +63,11 @@ import software.amazon.awssdk.utils.StringUtils; import software.amazon.awssdk.transfer.s3.S3TransferManager; import software.amazon.awssdk.transfer.s3.model.CompletedFileUpload; +import software.amazon.awssdk.transfer.s3.model.CompletedUpload; import software.amazon.awssdk.transfer.s3.model.FileUpload; +import software.amazon.awssdk.transfer.s3.model.Upload; import software.amazon.awssdk.transfer.s3.model.UploadFileRequest; +import software.amazon.awssdk.transfer.s3.model.UploadRequest; @RequiredPermissions(Permission.PublishDataset) public class S3SubmitToArchiveCommand extends AbstractSubmitToArchiveCommand { @@ -98,7 +109,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - + ExecutorService executor = Executors.newCachedThreadPool(); + try { Dataset dataset = dv.getDataset(); @@ -150,7 +162,39 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t if (uploadResult.response().sdkHttpResponse().isSuccessful()) { logger.fine("S3 Submission step: Content Transferred"); + List bigFiles = bagger.getOversizedFiles(); + + for (FileEntry entry : bigFiles) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + InputStreamSupplier supplier = bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get()) { + + PutObjectRequest 
filePutRequest = PutObjectRequest.builder().bucket(bucketName) + .key(fileKey).build(); + + UploadRequest uploadRequest = UploadRequest.builder() + .putObjectRequest(filePutRequest) + .requestBody(AsyncRequestBody.fromInputStream(is, entry.getSize(), executor)) + .build(); + + Upload upload = tm.upload(uploadRequest); + CompletedUpload completedUpload = upload.completionFuture().join(); + + if (completedUpload.response().sdkHttpResponse().isSuccessful()) { + logger.fine("Successfully uploaded oversized file: " + fileKey); + } else { + logger.warning("Failed to upload oversized file: " + fileKey); + return new Failure("Error uploading oversized file to S3: " + fileKey); + } + } catch (IOException e) { + logger.log(Level.WARNING, + "Failed to get input stream for oversized file: " + fileKey, e); + return new Failure("Error getting input stream for oversized file: " + fileKey); + } + } statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, String.format("https://%s.s3.amazonaws.com/%s", bucketName, bagKey)); } else { @@ -175,6 +219,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t e.getLocalizedMessage() + ": check log for details"); } finally { + executor.shutdown(); if (tm != null) { tm.close(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 60cabc9ac99..55235f85491 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -1289,7 +1289,7 @@ private HttpGet createNewGetRequest(URI url, String returnType) { * * Caller must close the stream when done. 
*/ - InputStreamSupplier getInputStreamSupplier(final String uriString) { + public InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { public InputStream get() { @@ -1485,5 +1485,9 @@ private String suppressDownloadCounts(String uriString) { public int compareTo(FileEntry other) { return Long.compare(this.size, other.size); } + + public long getSize() { + return size; + } } } \ No newline at end of file From 5739e3521fefd90c4f7a5c1c7940f25acd670294 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 4 Feb 2026 15:08:58 -0500 Subject: [PATCH 75/97] docs, release note update --- doc/release-notes/12144-un-holey-bags.md | 21 +++++++++++++++++++ .../source/admin/big-data-administration.rst | 1 + .../source/installation/config.rst | 17 +++++++++++++++ 3 files changed, 39 insertions(+) create mode 100644 doc/release-notes/12144-un-holey-bags.md diff --git a/doc/release-notes/12144-un-holey-bags.md b/doc/release-notes/12144-un-holey-bags.md new file mode 100644 index 00000000000..3c9c632eb6c --- /dev/null +++ b/doc/release-notes/12144-un-holey-bags.md @@ -0,0 +1,21 @@ +This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: + +OAI-ORE +- now uses URI for checksum algorithms +- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API). 
+- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed + +Archival Bag +- for dataset versions with no files, the (empty) manifest-.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" +- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed +- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). +- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) +- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's archival bag generation +- improvements to file retrieval w.r.t. retries on errors or throttling +- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse +- the size of data files and total dataset size that will be included in an archival bag can now be limited. Admins can choose whether files above these limits are transferred along with the zipped bag (creating a complete archival copy) or are just referenced (using the concept of a "holey" bag and just listing the oversized files and the Dataverse urls from which they can be retrieved.
In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials as needed) to make a complete copy + +### New JVM Options (MicroProfile Config Settings) +dataverse.bagit.zip.holey +dataverse.bagit.zip.max-data-size +dataverse.bagit.zip.max-file-size \ No newline at end of file diff --git a/doc/sphinx-guides/source/admin/big-data-administration.rst b/doc/sphinx-guides/source/admin/big-data-administration.rst index c4a98a6987a..c1d2a02c4a2 100644 --- a/doc/sphinx-guides/source/admin/big-data-administration.rst +++ b/doc/sphinx-guides/source/admin/big-data-administration.rst @@ -302,6 +302,7 @@ There are a broad range of options (that are not turned on by default) for impro - :ref:`:DisableSolrFacetsWithoutJsession` - disables facets for users who have disabled cookies (e.g. for bots) - :ref:`:DisableUncheckedTypesFacet` - only disables the facet showing the number of collections, datasets, files matching the query (this facet is potentially less useful than others) - :ref:`:StoreIngestedTabularFilesWithVarHeaders` - by default, Dataverse stores ingested files without headers and dynamically adds them back at download time. 
Once this setting is enabled, Dataverse will leave the headers in place (for newly ingested files), reducing the cost of downloads +- :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` - options to control the size and temporary storage requirements when generating archival Bags - see :ref:`BagIt Export` Scaling Infrastructure diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a9d5c7c0041..fff7a747063 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2259,6 +2259,8 @@ These archival Bags include all of the files and metadata in a given dataset ver The Dataverse Software offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse Software web interface. +The size of the zipped archival Bag can be limited, and files that don't fit within that limit can either be transferred separately (placed so that they are correctly positioned according to the BagIt specification when the zipped bag is unzipped in place) or just referenced for later download (using the BagIt concept of a 'holey' bag with a list of files in a ``fetch.txt`` file) can now be configured for all archivers. These settings allow for managing large datasets by excluding files over a certain size or total data size, which can be useful for archivers with size limitations or to reduce transfer times. See the :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` JVM options for more details.
+ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchive, and S3SubmitToArchiveCommand , which all extend the AbstractSubmitToArchiveCommand and use the configurable mechanisms discussed below. (A DRSSubmitToArchiveCommand, which works with Harvard's DRS also exists and, while specific to DRS, is a useful example of how Archivers can support single-version-only semantics and support archiving only from specified collections (with collection specific parameters)). All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). @@ -3868,6 +3870,21 @@ This can instead be restricted to only superusers who can publish the dataset us Example: ``dataverse.coar-notify.relationship-announcement.notify-superusers-only=true`` +.. _dataverse.bagit.zip.holey: + +``dataverse.bagit.zip.holey`` + A boolean that, if true, will cause the BagIt archiver to create a "holey" bag. In a holey bag, files that are not included in the bag are listed in the ``fetch.txt`` file with a URL from which they can be downloaded. This is used in conjunction with ``dataverse.bagit.zip.max-file-size`` and/or ``dataverse.bagit.zip.max-data-size``. Default: false. + +.. _dataverse.bagit.zip.max-data-size: + +``dataverse.bagit.zip.max-data-size`` + The maximum total (uncompressed) size of data files (in bytes) to include in a BagIt zip archive. If the total size of the dataset files exceeds this limit, files will be excluded from the zipped bag (starting from the largest) until the total size is under the limit. Excluded files will be handled as defined by ``dataverse.bagit.zip.holey`` - just listed if that setting is true or being transferred separately and placed next to the zipped bag. When not set, there is no limit. + +.. 
_dataverse.bagit.zip.max-file-size: + +``dataverse.bagit.zip.max-file-size`` + The maximum (uncompressed) size of a single file (in bytes) to include in a BagIt zip archive. Any file larger than this will be excluded. Excluded files will be handled as defined by ``dataverse.bagit.zip.holey`` - just listed if that setting is true or being transferred separately and placed next to the zipped bag. When not set, there is no limit. + .. _feature-flags: Feature Flags From 5c82ab8504579b6204105485dce96c12dea6fe89 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 4 Feb 2026 15:09:53 -0500 Subject: [PATCH 76/97] style fix --- .../command/impl/GoogleCloudSubmitToArchiveCommand.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 21038a1eab6..17e7b641cf6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -258,9 +258,7 @@ public void run() { dv.setArchivalCopyLocation(statusObject.build().toString()); } return WorkflowStepResult.OK; - } else - - { + } else { return new Failure( "GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); } From 949606bacde2dcf451a2f51f2de32c944eca5041 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 17 Feb 2026 10:39:54 -0500 Subject: [PATCH 77/97] merge fixes - refactor precondition check for prior versions --- .../edu/harvard/iq/dataverse/DatasetPage.java | 4 ++ .../impl/AbstractSubmitToArchiveCommand.java | 37 ++++++++++--------- .../impl/DRSSubmitToArchiveCommand.java | 17 ++++++++- .../impl/DuraCloudSubmitToArchiveCommand.java | 1 + .../GoogleCloudSubmitToArchiveCommand.java | 2 +- 
.../ArchivalSubmissionWorkflowStep.java | 31 +++++++++++++++- 6 files changed, 71 insertions(+), 21 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index e29224d2980..9d5a3e8822b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -39,6 +39,7 @@ import edu.harvard.iq.dataverse.engine.command.impl.UpdateDatasetVersionCommand; import edu.harvard.iq.dataverse.export.ExportService; import edu.harvard.iq.dataverse.util.cache.CacheFactoryBean; +import edu.harvard.iq.dataverse.util.json.JsonUtil; import io.gdcc.spi.export.ExportException; import io.gdcc.spi.export.Exporter; import edu.harvard.iq.dataverse.ingest.IngestRequest; @@ -102,6 +103,8 @@ import jakarta.faces.view.ViewScoped; import jakarta.inject.Inject; import jakarta.inject.Named; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; import jakarta.persistence.OptimisticLockException; import org.apache.commons.lang3.StringUtils; @@ -157,6 +160,7 @@ import edu.harvard.iq.dataverse.search.SearchFields; import edu.harvard.iq.dataverse.search.SearchUtil; import edu.harvard.iq.dataverse.search.SolrClientService; +import edu.harvard.iq.dataverse.settings.FeatureFlags; import edu.harvard.iq.dataverse.settings.JvmSettings; import edu.harvard.iq.dataverse.util.SignpostingResources; import edu.harvard.iq.dataverse.util.FileMetadataUtil; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 8056fc1023a..353f8ee7be7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -81,12 +81,25 @@ public DatasetVersion execute(CommandContext 
ctxt) throws CommandException { //No un-expired token token = ctxt.authentication().generateApiTokenForUser(user); } - runArchivingProcess(version, token, requestedSettings); + if (!preconditionsMet(version, token, requestedSettings)) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + } else { + + String dataCiteXml = getDataCiteXml(version); + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + Map terms = getJsonLDTerms(oreMap); + performArchivingAndPersist(ctxt, version, dataCiteXml, ore, terms, token, requestedSettings); + } return ctxt.em().merge(version); } // While we have a transaction context, get the terms needed to create the baginfo file - private Map getJsonLDTerms(OREMap oreMap) { + public Map getJsonLDTerms(OREMap oreMap) { Map terms = new HashMap(); terms.put(DatasetFieldConstant.datasetContact, oreMap.getContactTerm()); terms.put(DatasetFieldConstant.datasetContactName, oreMap.getContactNameTerm()); @@ -108,9 +121,7 @@ private Map getJsonLDTerms(OREMap oreMap) { * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). 
*/ - public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map requestedSettings) { - // this.requestedSettings won't be set yet in the workflow case, so set it now (used in getNumberOfBagGeneratorThreads) - this.requestedSettings.putAll(requestedSettings); + public boolean preconditionsMet(DatasetVersion version, ApiToken token, Map requestedSettings) { // Check if earlier versions must be archived first String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived.toString()); boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue); @@ -131,13 +142,7 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) // || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE) ) { - JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); - statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); - statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, - "Successful archiving of earlier versions is required."); - version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); - return new Failure("Earlier versions must be successfully archived first", - "Archival prerequisites not met"); + return false; } } if (versionInLoop.equals(version)) { @@ -146,11 +151,7 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t } } - String dataCiteXml = getDataCiteXml(version); - OREMap oreMap = new OREMap(version, false); - JsonObject ore = oreMap.getOREMap(); - Map terms = getJsonLDTerms(oreMap); - return performArchivingAndPersist(ctxt, version, dataCiteXml, ore, terms, token, requestedSettings); + return true; } @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) @@ -203,7 +204,7 @@ public String describe() { + 
version.getFriendlyVersionNumber()+")]"; } - String getDataCiteXml(DatasetVersion dv) { + public String getDataCiteXml(DatasetVersion dv) { DataCitation dc = new DataCitation(dv); Map metadata = dc.getDataCiteMetadata(); return DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java index d20dcb06e0c..1a49a68b097 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java @@ -15,6 +15,8 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -139,7 +141,20 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { //No un-expired token token = ctxt.authentication().generateApiTokenForUser(user); } - runArchivingProcess(version, token, requestedSettings); + if (!preconditionsMet(version, token, requestedSettings)) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + } else { + + String dataCiteXml = getDataCiteXml(version); + 
OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + Map terms = getJsonLDTerms(oreMap); + performArchivingAndPersist(ctxt, version, dataCiteXml, ore, terms, token, requestedSettings); + } return ctxt.em().merge(version); } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index e18bae58a31..f73dbba178d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -10,6 +10,7 @@ import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudHost; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudPort; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 1b09a802e8f..4e64fcfaabc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -169,7 +169,7 @@ public void run() { tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); - BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); bagger.setAuthenticationKey(token.getTokenString()); // Generate bag to temporary file 
using the provided ore JsonObject try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java index 3a77629c603..ba35ca0273a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java @@ -7,6 +7,8 @@ import edu.harvard.iq.dataverse.engine.command.impl.AbstractSubmitToArchiveCommand; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.ArchiverUtil; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.WorkflowContext; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStep; @@ -17,6 +19,7 @@ import java.util.logging.Level; import java.util.logging.Logger; +import jakarta.json.JsonObject; import jakarta.servlet.http.HttpServletRequest; /** @@ -57,6 +60,24 @@ public WorkflowStepResult run(WorkflowContext context) { String className = requestedSettings.get(SettingsServiceBean.Key.ArchiverClassName.toString()); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, dvr, context.getDataset().getReleasedVersion()); if (archiveCommand != null) { + // Generate the required components for archiving + DatasetVersion version = context.getDataset().getReleasedVersion(); + if (!archiveCommand.preconditionsMet(version, context.getApiToken(), requestedSettings)) { + return new Failure("Earlier versions must be successfully archived first", + "Archival prerequisites not met"); + } + + // Generate DataCite XML + String dataCiteXml = archiveCommand.getDataCiteXml(version); + + 
// Generate OREMap + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + + // Get JSON-LD terms + Map terms = archiveCommand.getJsonLDTerms(oreMap); + + // Call the updated method with all required parameters /* * Note: because this must complete before the workflow can complete and update the version status * in the db a long-running archive submission via workflow could hit a transaction timeout and fail. @@ -68,7 +89,15 @@ public WorkflowStepResult run(WorkflowContext context) { * pending as is done when running archiving from the UI/API. Instead, there is a generic workflow * lock on the dataset. */ - return (archiveCommand.runArchivingProcess(context.getDataset().getReleasedVersion(), context.getApiToken(), requestedSettings)); + return archiveCommand.performArchiveSubmission( + version, + dataCiteXml, + ore, + terms, + context.getApiToken(), + requestedSettings + ); + } else { logger.severe("No Archiver instance could be created for name: " + className); return new Failure("No Archiver", "Could not create instance of class: " + className); From de9ed31ee8eb79c75b5ba4e1044a1aaf7719780e Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 17 Feb 2026 13:42:02 -0500 Subject: [PATCH 78/97] test fix --- .../engine/command/impl/AbstractSubmitToArchiveCommand.java | 2 +- .../iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 353f8ee7be7..c4ec5bce736 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -99,7 +99,7 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { } // While we have a 
transaction context, get the terms needed to create the baginfo file - public Map getJsonLDTerms(OREMap oreMap) { + public static Map getJsonLDTerms(OREMap oreMap) { Map terms = new HashMap(); terms.put(DatasetFieldConstant.datasetContact, oreMap.getContactTerm()); terms.put(DatasetFieldConstant.datasetContactName, oreMap.getContactNameTerm()); diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java index dbbf3241318..05e83b8540d 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.util.bagit; +import edu.harvard.iq.dataverse.engine.command.impl.AbstractSubmitToArchiveCommand; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -52,7 +53,7 @@ private void initializeBagGenerator() throws Exception { when(mockOreMap.getOREMap()).thenReturn(oremapObject); // Initialize BagGenerator with test data - bagGenerator = new BagGenerator(mockOreMap, ""); + bagGenerator = new BagGenerator(oremapObject, "", AbstractSubmitToArchiveCommand.getJsonLDTerms(mockOreMap)); setPrivateField(bagGenerator, "aggregation", (com.google.gson.JsonObject) JsonParser .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString())); setPrivateField(bagGenerator, "totalDataSize", 1024000L); From ee87ab57aff46a44bab726bbeb90e84b8588265d Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 17 Feb 2026 13:42:48 -0500 Subject: [PATCH 79/97] style fix to separate submit button from status --- src/main/webapp/dataset-versions.xhtml | 5 +++-- src/main/webapp/resources/css/structure.css | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/webapp/dataset-versions.xhtml 
b/src/main/webapp/dataset-versions.xhtml index cb6e2d5a081..185009b8115 100644 --- a/src/main/webapp/dataset-versions.xhtml +++ b/src/main/webapp/dataset-versions.xhtml @@ -177,11 +177,12 @@ - + - + diff --git a/src/main/webapp/resources/css/structure.css b/src/main/webapp/resources/css/structure.css index cd2e7d33d10..27cb0d7e8bf 100644 --- a/src/main/webapp/resources/css/structure.css +++ b/src/main/webapp/resources/css/structure.css @@ -936,6 +936,9 @@ div.dvnDifferanceTable .versionValue { } div[id$="versionsTable"] tbody {word-break:break-word;} +.archive-submit-link { + display: block; +} /* DATATABLE + DROPDOWN BUTTON + OVERFLOW VISIBLE */ thead.ui-datatable-scrollable-theadclone {display:none} From 9840648f10d36b166032d42d9f5e874629201e22 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 17 Feb 2026 14:05:22 -0500 Subject: [PATCH 80/97] missing param --- src/main/webapp/dataset-versions.xhtml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/webapp/dataset-versions.xhtml b/src/main/webapp/dataset-versions.xhtml index 185009b8115..df5a39c09b7 100644 --- a/src/main/webapp/dataset-versions.xhtml +++ b/src/main/webapp/dataset-versions.xhtml @@ -179,7 +179,7 @@ + action="#{DatasetPage.archiveVersion(versionTab.id, !empty(versionTab.archivalCopyLocationStatus))}"> From 20008eca622d3aca95295aa3344463c1879b202a Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 18 Feb 2026 10:43:29 -0500 Subject: [PATCH 81/97] add sleep --- src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java b/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java index 709908ac6eb..22dfe61da07 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java @@ -954,7 +954,8 @@ public void testDeleteFiles() { reindexDataset4ToFindDatabaseId.then().assertThat() 
.statusCode(OK.getStatusCode()); Integer datasetId4 = JsonPath.from(reindexDataset4ToFindDatabaseId.asString()).getInt("data.id"); - + UtilIT.sleepForReindex(datasetPersistentId4, apiToken, 5); + Response destroyDataset4 = UtilIT.destroyDataset(datasetId4, apiToken); destroyDataset4.prettyPrint(); destroyDataset4.then().assertThat() From 6fcd84dad3283c9ffeee39df400e043e35f7682b Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 18 Feb 2026 13:51:28 -0500 Subject: [PATCH 82/97] release note updates --- doc/release-notes/12144-un-holey-bags.md | 80 +++++++++++++++++------- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/doc/release-notes/12144-un-holey-bags.md b/doc/release-notes/12144-un-holey-bags.md index 3c9c632eb6c..234f387fb0b 100644 --- a/doc/release-notes/12144-un-holey-bags.md +++ b/doc/release-notes/12144-un-holey-bags.md @@ -1,21 +1,59 @@ -This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: - -OAI-ORE -- now uses URI for checksum algorithms -- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API). -- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the out has changed - -Archival Bag -- for dataset versions with no files, the (empty) manifest-.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" -- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed -- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). 
-- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) -- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation -- improvements to file retrieval w.r.t. retries on errors or throttling -- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse -- the size of data files and total dataset size that will be included in an archival bag can now be limited. Admins can choose whether files above these limits are transferred along with the zipped bag (creating a complete archival copy) or are just referenced (using the concept of a "holey" bag and just listing the oversized files and the Dataverse urls from which they can be retrieved. In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials as needed) to make a complete copy - -### New JVM Options (MicroProfile Config Settings) -dataverse.bagit.zip.holey -dataverse.bagit.zip.max-data-size -dataverse.bagit.zip.max-file-size \ No newline at end of file +## Archiving, OAI-ORE, and BagIt Export + +This release includes multiple updates to the OAI_ORE metadata export and the process of creating archival bags, improving performance, fixing bugs, and adding significant new functionality. + +### General Archiving Improvements +- Multiple performance and scaling improvements have been made for creating archival bags for large datasets, including: + - the duration or archiving tasks triggered from the version table or API are no longer limited by the transaction time limit + - temporary storage space requirements have increased by 1/:BagGeneratorThreads of the zipped bag size. (This is a consequence of changes to avoid timeout errors on larger files/datasets.) 
+ - The size of individual data files and the total dataset size that will be included in an archival bag can now be limited. + Admins can choose whether files above these limits are transferred along with, but outside, the zipped bag (creating a complete archival copy) + or are just referenced (using the concept of a "holey" bag and just listing the oversized files and the Dataverse urls from which they can be + retrieved in a `fetch.txt` file). In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials + as needed) to make a complete copy. + - superusers can now see a pending status in the dataset version table while archiving is active + - workflows are now triggered outside the transactions related to publication, assuring that workflow locks and status updates are always recorded + - potential conflicts between archiving/workflows, indexing, and metadata exports after publication have been resolved, avoiding cases where the status/last update times for these actions were not recorded +- A bug has been fixed where superusers would incorrectly see the "Submit" button to launch archiving from the dataset page version table. +- The local, S3, and Google archivers have been updated to support deleting existing archival files for a version to allow re-creating the bag for a give version +- For archivers that support file deletion, it is now possible to recreate an archival bag after "Update Current Version" has been used (replacing the original bag). By default, Dataverse will mark the current version's archive as out-of-date, but will not automatically re-archive it. + - a new 'obsolete' status has been added to indicate when an archival bag exists for a version but it was created prior to an update current version change +- Improvements have been made to file retrieval for bagging, including retries on errors and when download requests are being throttled. 
+ - a bug causing :BagGeneratorThreads to be ignored has been fixed, and the default has been reduced to 2 +- Retrieval of files for inclusion in an archival bag is no longer counted as a download. +- It is now possible to require that all previous versions have been succesfully archived before archiving of a newly published version can succeed. (This is intended to support use cases where deduplication of files between dataset versions will be done and is a step towards supporting the Oxford Common File Layout (OCFL).) +- The pending status has changed to use the same JSON format as other statuses + +### OAI-ORE Export Updates +- The export now uses URIs for checksum algorithms, conforming with JSON-LD requirements. +- A bug causing failures with deaccessioned versions has been fixed. This occurred when the deaccession note ("Deaccession Reason" in the UI) was null, which is permissible via the API. +- The `https://schema.org/additionalType` has been updated to "Dataverse OREMap Format v1.0.2" to reflect format changes. + +### Archival Bag (BagIt) Updates +- The `bag-info.txt` file now correctly includes information for dataset contacts, fixing a bug where nothing was included when multiple contacts were defined. (Multiple contacts were always included in the OAI-ORE file in the bag, only the baginfo file was affected). +- Values used in the `bag-info.txt` file that may be multi-line (i.e. with embedded CR or LF characters) are now properly indented and wrapped per the BagIt specification (Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). +- The dataset name is no longer used as a subdirectory within the `data/` directory to reduce issues with unzipping long paths on some filesystems. +- For dataset versions with no files, the empty `manifest-.txt` file will now use the algorithm from the `:FileFixityChecksumAlgorithm` setting instead of defaulting to MD5. 
+- A new key, `Dataverse-Bag-Version`, has been added to `bag-info.txt` with the value "1.0" to allow for tracking changes to Dataverse's archival bag generation over time. +- When using the `holey` bag option discussed above, the required fetch.txt file will be included. + + +### New Configuration Settings + +This release introduces several new settings to control archival and bagging behavior. + +- **`dataverse.archive.archive-only-if-earlier-versions-are-archived`** (Default: `false`) + When set to `true`, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. + +- **`dataverse.feature.archive-on-version-update`** (Default: `false`) + Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, + i.e via the Update-Current-Version publication option. Setting the flag true only works if the archiver being used supports deleting existing archival bags. 
+ + +- **Bag Size Control And Holey Bag Support** + + The following JVM options (MicroProfile Config Settings) control this feature: + - `dataverse.bagit.zip.holey` + - `dataverse.bagit.zip.max-data-size` + - `dataverse.bagit.zip.max-file-size` + \ No newline at end of file From 17588c72067ed1d81e029352c5c6c7a85408fdf7 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 18 Feb 2026 13:54:24 -0500 Subject: [PATCH 83/97] tweaks, remove duplicates --- .../12063-ORE-and-Bag-updates.md | 15 --- .../12122-archiving in sequence.md | 3 - doc/release-notes/12122-archiving updates.md | 8 -- ....md => 12167-ore-bag-archiving-changes.md} | 112 +++++++++--------- 4 files changed, 53 insertions(+), 85 deletions(-) delete mode 100644 doc/release-notes/12063-ORE-and-Bag-updates.md delete mode 100644 doc/release-notes/12122-archiving in sequence.md delete mode 100644 doc/release-notes/12122-archiving updates.md rename doc/release-notes/{12144-un-holey-bags.md => 12167-ore-bag-archiving-changes.md} (58%) diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md deleted file mode 100644 index bbc22b22182..00000000000 --- a/doc/release-notes/12063-ORE-and-Bag-updates.md +++ /dev/null @@ -1,15 +0,0 @@ -This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: - -OAI-ORE -- now uses URI for checksum algorithms -- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API). 
-- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the out has changed - -Archival Bag -- for dataset versions with no files, the (empty) manifest-.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" -- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed -- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). -- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) -- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation -- improvements to file retrieval w.r.t. retries on errors or throttling -- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse \ No newline at end of file diff --git a/doc/release-notes/12122-archiving in sequence.md b/doc/release-notes/12122-archiving in sequence.md deleted file mode 100644 index 6f4373a1e31..00000000000 --- a/doc/release-notes/12122-archiving in sequence.md +++ /dev/null @@ -1,3 +0,0 @@ -This release introduces an additial setting related to archival bag creation, ArchiveOnlyIfEarlierVersionsAreArchived (default false). -If it is true, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. -This is intended to support use cases where deduplication of files between dataset versions will be done (i.e. 
by a third-party service running at the archival copy location) and is a step towards supporting the Oxford Common File Layout (OCFL) as an archival format. diff --git a/doc/release-notes/12122-archiving updates.md b/doc/release-notes/12122-archiving updates.md deleted file mode 100644 index 2dd4eb6909f..00000000000 --- a/doc/release-notes/12122-archiving updates.md +++ /dev/null @@ -1,8 +0,0 @@ -## Notifications - -This release includes multiple updates to the process of creating archival bags including -- performance/scaling improvements for large datasets (multiple changes) -- bug fixes for when superusers see the "Submit" button to launch archiving from the dataset page version table -- new functionality to optionally suppress an archiving workflow when using the Update Current Version functionality and mark the current archive as out of date -- new functionality to support recreating an archival bag when Update Current Version has been used, which is available for archivers that can delete existing files -- \ No newline at end of file diff --git a/doc/release-notes/12144-un-holey-bags.md b/doc/release-notes/12167-ore-bag-archiving-changes.md similarity index 58% rename from doc/release-notes/12144-un-holey-bags.md rename to doc/release-notes/12167-ore-bag-archiving-changes.md index 234f387fb0b..12d9a80463f 100644 --- a/doc/release-notes/12144-un-holey-bags.md +++ b/doc/release-notes/12167-ore-bag-archiving-changes.md @@ -1,59 +1,53 @@ -## Archiving, OAI-ORE, and BagIt Export - -This release includes multiple updates to the OAI_ORE metadata export and the process of creating archival bags, improving performance, fixing bugs, and adding significant new functionality. 
- -### General Archiving Improvements -- Multiple performance and scaling improvements have been made for creating archival bags for large datasets, including: - - the duration or archiving tasks triggered from the version table or API are no longer limited by the transaction time limit - - temporary storage space requirements have increased by 1/:BagGeneratorThreads of the zipped bag size. (This is a consequence of changes to avoid timeout errors on larger files/datasets.) - - The size of individual data files and the total dataset size that will be included in an archival bag can now be limited. - Admins can choose whether files above these limits are transferred along with, but outside, the zipped bag (creating a complete archival copy) - or are just referenced (using the concept of a "holey" bag and just listing the oversized files and the Dataverse urls from which they can be - retrieved in a `fetch.txt` file). In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials - as needed) to make a complete copy. - - superusers can now see a pending status in the dataset version table while archiving is active - - workflows are now triggered outside the transactions related to publication, assuring that workflow locks and status updates are always recorded - - potential conflicts between archiving/workflows, indexing, and metadata exports after publication have been resolved, avoiding cases where the status/last update times for these actions were not recorded -- A bug has been fixed where superusers would incorrectly see the "Submit" button to launch archiving from the dataset page version table. 
-- The local, S3, and Google archivers have been updated to support deleting existing archival files for a version to allow re-creating the bag for a give version -- For archivers that support file deletion, it is now possible to recreate an archival bag after "Update Current Version" has been used (replacing the original bag). By default, Dataverse will mark the current version's archive as out-of-date, but will not automatically re-archive it. - - a new 'obsolete' status has been added to indicate when an archival bag exists for a version but it was created prior to an update current version change -- Improvements have been made to file retrieval for bagging, including retries on errors and when download requests are being throttled. - - a bug causing :BagGeneratorThreads to be ignored has been fixed, and the default has been reduced to 2 -- Retrieval of files for inclusion in an archival bag is no longer counted as a download. -- It is now possible to require that all previous versions have been succesfully archived before archiving of a newly published version can succeed. (This is intended to support use cases where deduplication of files between dataset versions will be done and is a step towards supporting the Oxford Common File Layout (OCFL).) -- The pending status has changed to use the same JSON format as other statuses - -### OAI-ORE Export Updates -- The export now uses URIs for checksum algorithms, conforming with JSON-LD requirements. -- A bug causing failures with deaccessioned versions has been fixed. This occurred when the deaccession note ("Deaccession Reason" in the UI) was null, which is permissible via the API. -- The `https://schema.org/additionalType` has been updated to "Dataverse OREMap Format v1.0.2" to reflect format changes. - -### Archival Bag (BagIt) Updates -- The `bag-info.txt` file now correctly includes information for dataset contacts, fixing a bug where nothing was included when multiple contacts were defined. 
(Multiple contacts were always included in the OAI-ORE file in the bag, only the baginfo file was affected). -- Values used in the `bag-info.txt` file that may be multi-line (i.e. with embedded CR or LF characters) are now properly indented and wrapped per the BagIt specification (Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). -- The dataset name is no longer used as a subdirectory within the `data/` directory to reduce issues with unzipping long paths on some filesystems. -- For dataset versions with no files, the empty `manifest-.txt` file will now use the algorithm from the `:FileFixityChecksumAlgorithm` setting instead of defaulting to MD5. -- A new key, `Dataverse-Bag-Version`, has been added to `bag-info.txt` with the value "1.0" to allow for tracking changes to Dataverse's archival bag generation over time. -- When using the `holey` bag option discussed above, the required fetch.txt file will be included. - - -### New Configuration Settings - -This release introduces several new settings to control archival and bagging behavior. - -- **`dataverse.archive.archive-only-if-earlier-versions-are-archived`** (Default: `false`) - When set to `true`, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. - -- **`dataverse.feature.archive-on-version-update`** (Default: `false`) - Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, - i.e via the Update-Current-Version publication option. Setting the flag true only works if the archiver being used supports deleting existing archival bags. 
- - -- **Bag Size Control And Holey Bag Support** - - The following JVM options (MicroProfile Config Settings) control this feature: - - `dataverse.bagit.zip.holey` - - `dataverse.bagit.zip.max-data-size` - - `dataverse.bagit.zip.max-file-size` - \ No newline at end of file +## Archiving, OAI-ORE, and BagIt Export + +This release includes multiple updates to the OAI-ORE metadata export and the process of creating archival bags, improving performance, fixing bugs, and adding significant new functionality. + +### General Archiving Improvements +- Multiple performance and scaling improvements have been made for creating archival bags for large datasets, including: + - The duration of archiving tasks triggered from the version table or API are no longer limited by the transaction time limit. + - Temporary storage space requirements have increased by `1/:BagGeneratorThreads` of the zipped bag size. (This is a consequence of changes to avoid timeout errors on larger files/datasets.) + - The size of individual data files and the total dataset size that will be included in an archival bag can now be limited. Admins can choose whether files above these limits are transferred along with, but outside, the zipped bag (creating a complete archival copy) or are just referenced (using the concept of a "holey" bag and just listing the oversized files and the Dataverse URLs from which they can be retrieved in a `fetch.txt` file). In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials as needed) to make a complete copy. + - Superusers can now see a pending status in the dataset version table while archiving is active. + - Workflows are now triggered outside the transactions related to publication, assuring that workflow locks and status updates are always recorded. 
+ - Potential conflicts between archiving/workflows, indexing, and metadata exports after publication have been resolved, avoiding cases where the status/last update times for these actions were not recorded. +- A bug has been fixed where superusers would incorrectly see the "Submit" button to launch archiving from the dataset page version table. +- The local, S3, and Google archivers have been updated to support deleting existing archival files for a version to allow re-creating the bag for a given version. +- For archivers that support file deletion, it is now possible to recreate an archival bag after "Update Current Version" has been used (replacing the original bag). By default, Dataverse will mark the current version's archive as out-of-date, but will not automatically re-archive it. + - A new 'obsolete' status has been added to indicate when an archival bag exists for a version but it was created prior to an "Update Current Version" change. +- Improvements have been made to file retrieval for bagging, including retries on errors and when download requests are being throttled. + - A bug causing `:BagGeneratorThreads` to be ignored has been fixed, and the default has been reduced to 2. +- Retrieval of files for inclusion in an archival bag is no longer counted as a download. +- It is now possible to require that all previous versions have been successfully archived before archiving of a newly published version can succeed. (This is intended to support use cases where deduplication of files between dataset versions will be done and is a step towards supporting the Oxford Common File Layout (OCFL).) +- The pending status has changed to use the same JSON format as other statuses + +### OAI-ORE Export Updates +- The export now uses URIs for checksum algorithms, conforming with JSON-LD requirements. +- A bug causing failures with deaccessioned versions has been fixed. 
This occurred when the deaccession note ("Deaccession Reason" in the UI) was null, which is permissible via the API. +- The `https://schema.org/additionalType` has been updated to "Dataverse OREMap Format v1.0.2" to reflect format changes. + +### Archival Bag (BagIt) Updates +- The `bag-info.txt` file now correctly includes information for dataset contacts, fixing a bug where nothing was included when multiple contacts were defined. (Multiple contacts were always included in the OAI-ORE file in the bag; only the baginfo file was affected). +- Values used in the `bag-info.txt` file that may be multi-line (i.e. with embedded CR or LF characters) are now properly indented and wrapped per the BagIt specification (`Internal-Sender-Identifier`, `External-Description`, `Source-Organization`, `Organization-Address`). +- The dataset name is no longer used as a subdirectory within the `data/` directory to reduce issues with unzipping long paths on some filesystems. +- For dataset versions with no files, the empty `manifest-.txt` file will now use the algorithm from the `:FileFixityChecksumAlgorithm` setting instead of defaulting to MD5. +- A new key, `Dataverse-Bag-Version`, has been added to `bag-info.txt` with the value "1.0" to allow for tracking changes to Dataverse's archival bag generation over time. +- When using the `holey` bag option discussed above, the required `fetch.txt` file will be included. + + +### New Configuration Settings + +This release introduces several new settings to control archival and bagging behavior. + +- **`dataverse.archive.archive-only-if-earlier-versions-are-archived`** (Default: `false`) + When set to `true`, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. 
+ +- **`dataverse.feature.archive-on-version-update`** (Default: `false`) + Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, i.e., via the Update-Current-Version publication option. Setting the flag to `true` only works if the archiver being used supports deleting existing archival bags. + + +### Bag Size Control And Holey Bag Support + +The following JVM options (MicroProfile Config Settings) control this feature: +- `dataverse.bagit.zip.holey` +- `dataverse.bagit.zip.max-data-size` +- `dataverse.bagit.zip.max-file-size` \ No newline at end of file From 27b0d3785fcd72154188c3d18248fcc9cc4d87cc Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Feb 2026 13:32:58 -0500 Subject: [PATCH 84/97] switch to jvm setting --- .../source/installation/config.rst | 20 +++++++++---------- .../edu/harvard/iq/dataverse/DatasetPage.java | 2 +- .../iq/dataverse/settings/JvmSettings.java | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index b8f898521f2..f829b57d3e6 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2273,7 +2273,7 @@ Two settings that can be used with all current Archivers are: These must be included in the \:ArchiverSettings for the Archiver to work Archival Bags are created per dataset version. By default, if a version is republished (via the superuser-only 'Update Current Version' publication option in the UI/API), a new archival bag is not created for the version. -If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`dataverse.feature.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. 
+If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`dataverse.bagit.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. .. _Duracloud Configuration: @@ -3727,6 +3727,14 @@ The email for your institution that you'd like to appear in bag-info.txt. See :r Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_BAGIT_SOURCEORG_EMAIL``. +.. _dataverse.bagit.archive-on-version-update: + +dataverse.bagit.archive-on-version-update ++++++++++++++++++++++++++++++++++++++++++ + +Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, +i.e via the Update-Current-Version publication option. Setting the flag true only works if the archiver being used supports deleting existing archival bags. + .. _dataverse.files.globus-monitoring-server: dataverse.files.globus-monitoring-server @@ -4058,16 +4066,6 @@ dataverse.feature.only-update-datacite-when-needed Only contact DataCite to update a DOI after checking to see if DataCite has outdated information (for efficiency, lighter load on DataCite, especially when using file DOIs). -.. _dataverse.feature.archive-on-version-update: - -dataverse.feature.archive-on-version-update -+++++++++++++++++++++++++++++++++++++++++++ - -Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, -i.e via the Update-Current-Version publication option. Setting the flag true only works if the archiver being used supports deleting existing archival bags. - - - .. 
_:ApplicationServerSettings: Application Server Settings diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 9d5a3e8822b..d2f098338e2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -3000,7 +3000,7 @@ public String updateCurrentVersion() { // If pending or an obsolete copy exists, do nothing (nominally if a pending run succeeds and we're updating the current version here, it should be marked as obsolete - ignoring for now since updates within the time an archiving run is pending should be rare // If a failure or null, rerun archiving now. If a failure is due to an exiting copy in the repo, we'll fail again String status = updateVersion.getArchivalCopyLocationStatus(); - if((status==null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE) || (FeatureFlags.ARCHIVE_ON_VERSION_UPDATE.enabled() && archiveCommand.canDelete())){ + if((status==null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE) || (JvmSettings.BAGIT_ARCHIVE_ON_VERSION_UPDATE.lookupOptional(Boolean.class).orElse(false) && archiveCommand.canDelete())){ // Delete the record of any existing copy since it is now out of date/incorrect JsonObjectBuilder job = Json.createObjectBuilder(); job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 086ed7929aa..cf74fc62337 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -280,7 +280,7 @@ public enum JvmSettings { BAGIT_ZIP_MAX_FILE_SIZE(SCOPE_BAGIT_ZIP, "max-file-size"), BAGIT_ZIP_MAX_DATA_SIZE(SCOPE_BAGIT_ZIP, "max-data-size"), BAGIT_ZIP_HOLEY(SCOPE_BAGIT_ZIP, "holey"), - + 
BAGIT_ARCHIVE_ON_VERSION_UPDATE(SCOPE_BAGIT, "archive-on-version-update"), // STORAGE USE SETTINGS SCOPE_STORAGEUSE(PREFIX, "storageuse"), From 7c08907c59add336ba02581862b869159f9db65f Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 19 Feb 2026 13:37:40 -0500 Subject: [PATCH 85/97] update static string to include ver number --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index a8d92958560..02d77ed52fa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -167,7 +167,9 @@ public class BagGenerator { private static final String BAG_SIZE = "Bag-Size: "; private static final String PAYLOAD_OXUM = "Payload-Oxum: "; private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; - private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + + /** THIS NUMBER SHOULD CHANGE ANY TIME THE BAG CONTENTS ARE CHANGED */ + private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: 1.0"; // Implement exponential backoff with jitter static final long baseWaitTimeMs = 1000; // Start with 1 second @@ -1025,7 +1027,7 @@ private String generateInfoFile() { // Add a version number for our bag type - should be updated with any change to // the bag content/structure - info.append(DATAVERSE_BAG_VERSION + "1.0"); + info.append(DATAVERSE_BAG_VERSION); info.append(CRLF); return info.toString(); From fb7517bce13e60fa98548431008974b3cd83b12c Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 19 Feb 2026 13:38:05 -0500 Subject: [PATCH 86/97] missed change - use pending/obsolete --- .../harvard/iq/dataverse/api/Datasets.java | 49 +++++++++++-------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git 
a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 30a78458c86..1bc0ce1e380 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -1278,26 +1278,35 @@ public Response publishDataset(@Context ContainerRequestContext crc, @PathParam( DatasetVersion updateVersion = ds.getLatestVersion(); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, createDataverseRequest(user), updateVersion); if (archiveCommand != null) { - // Delete the record of any existing copy since it is now out of date/incorrect - JsonObjectBuilder job = Json.createObjectBuilder(); - job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); - updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); - datasetVersionSvc.persistArchivalCopyLocation(updateVersion); - /* - * Then try to generate and submit an archival copy. Note that running this - * command within the CuratePublishedDatasetVersionCommand was causing an error: - * "The attribute [id] of class - * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary - * key column in the database. Updates are not allowed." To avoid that, and to - * simplify reporting back to the GUI whether this optional step succeeded, I've - * pulled this out as a separate submit(). 
- */ - try { - commandEngine.submitAsync(archiveCommand); - successMsg = BundleUtil.getStringFromBundle("datasetversion.archive.inprogress"); - } catch (CommandException ex) { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); - logger.severe(ex.getMessage()); + String status = updateVersion.getArchivalCopyLocationStatus(); + if ((status == null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { + // Delete the record of any existing copy since it is now out of + // date/incorrect + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); + updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + datasetVersionSvc.persistArchivalCopyLocation(updateVersion); + /* + * Then try to generate and submit an archival copy. Note that running this + * command within the CuratePublishedDatasetVersionCommand was causing an error: + * "The attribute [id] of class + * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary + * key column in the database. Updates are not allowed." To avoid that, and to + * simplify reporting back to the GUI whether this optional step succeeded, I've + * pulled this out as a separate submit(). 
+ */ + try { + commandEngine.submitAsync(archiveCommand); + successMsg = BundleUtil.getStringFromBundle("datasetversion.archive.inprogress"); + } catch (CommandException ex) { + successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + + " - " + ex.toString(); + logger.severe(ex.getMessage()); + } + } else if (status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { + // Not automatically replacing the old archival copy as creating it is expensive + updateVersion.setArchivalStatusOnly(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); + datasetVersionSvc.persistArchivalCopyLocation(updateVersion); } } } catch (CommandException ex) { From 79e1ddc4868e2b64b9588cb93fe7a003c47ee528 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Feb 2026 13:34:23 -0500 Subject: [PATCH 87/97] fix param order per review --- .../engine/command/impl/AbstractSubmitToArchiveCommand.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index c4ec5bce736..9c244b27535 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -177,8 +177,8 @@ private void persistResult(CommandContext ctxt, DatasetVersion versionWithStatus * constructor and could be dropped from the parameter list.) * * @param version - the DatasetVersion to archive - * @param ore - * @param dataCiteXml + * @param dataCiteXml + * @param ore * @param terms * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). 
From 7eef9ca5bc76fe58a805ae7d6273ef5f2ca15c07 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 19 Feb 2026 13:44:55 -0500 Subject: [PATCH 88/97] update/fix release note --- .../12167-ore-bag-archiving-changes.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/doc/release-notes/12167-ore-bag-archiving-changes.md b/doc/release-notes/12167-ore-bag-archiving-changes.md index 12d9a80463f..e1cce68c15a 100644 --- a/doc/release-notes/12167-ore-bag-archiving-changes.md +++ b/doc/release-notes/12167-ore-bag-archiving-changes.md @@ -38,16 +38,13 @@ This release includes multiple updates to the OAI-ORE metadata export and the pr This release introduces several new settings to control archival and bagging behavior. -- **`dataverse.archive.archive-only-if-earlier-versions-are-archived`** (Default: `false`) +- `:ArchiveOnlyIfEarlierVersionsAreArchived` (Default: `false`) When set to `true`, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. -- **`dataverse.feature.archive-on-version-update`** (Default: `false`) - Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, i.e., via the Update-Current-Version publication option. Setting the flag to `true` only works if the archiver being used supports deleting existing archival bags. 
- - -### Bag Size Control And Holey Bag Support - -The following JVM options (MicroProfile Config Settings) control this feature: +The following JVM options (MicroProfile Config Settings) control bag size and holey bag support: - `dataverse.bagit.zip.holey` - `dataverse.bagit.zip.max-data-size` -- `dataverse.bagit.zip.max-file-size` \ No newline at end of file +- `dataverse.bagit.zip.max-file-size` + +- `dataverse.bagit.archive-on-version-update` (Default: `false`) + Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, i.e., via the Update-Current-Version publication option. Setting the flag to `true` only works if the archiver being used supports deleting existing archival bags. From 599cb0fdc9c0bac0dd8a998785c1411185e1b0f8 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 19 Feb 2026 13:45:44 -0500 Subject: [PATCH 89/97] 443 fix per review --- .../engine/command/impl/DuraCloudSubmitToArchiveCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index f73dbba178d..88d5bf3a33b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -219,7 +219,7 @@ public void run() { // view it as an admin) StringBuffer sb = new StringBuffer("https://"); sb.append(host); - if (!port.equals("443")) { + if (!port.equals(DEFAULT_PORT)) { sb.append(":" + port); } sb.append("/duradmin/spaces/sm/"); From 1bb3fa7959f2bf12b551dd262d642147a7078826 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Feb 2026 14:10:37 -0500 Subject: [PATCH 90/97] refactor per review --- .../impl/AbstractSubmitToArchiveCommand.java | 17 
+++++++++++++++++ .../impl/DuraCloudSubmitToArchiveCommand.java | 4 ++-- .../GoogleCloudSubmitToArchiveCommand.java | 7 +++---- .../impl/LocalSubmitToArchiveCommand.java | 8 +++----- .../command/impl/S3SubmitToArchiveCommand.java | 18 +----------------- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 9c244b27535..137d41e2c97 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -43,6 +43,7 @@ public abstract class AbstractSubmitToArchiveCommand extends AbstractCommand requestedSettings = new HashMap(); + protected String spaceName = null; protected boolean success=false; private static final Logger logger = Logger.getLogger(AbstractSubmitToArchiveCommand.class.getName()); private static final int MAX_ZIP_WAIT = 20000; @@ -302,4 +303,20 @@ public static boolean supportsDelete() { public boolean canDelete() { return supportsDelete(); } + + protected String getDataCiteFileName(String spaceName, DatasetVersion dv) { + return spaceName + "_datacite.v" + dv.getFriendlyVersionNumber(); + } + + protected String getFileName(String spaceName, DatasetVersion dv) { + return spaceName + ".v" + dv.getFriendlyVersionNumber(); + } + + protected String getSpaceName(Dataset dataset) { + if (spaceName == null) { + spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-') + .toLowerCase(); + } + return spaceName; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index 88d5bf3a33b..57a4a68a44a 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -92,8 +92,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dat * the same space. */ String spaceName = dataset.getOwner().getAlias().toLowerCase().replaceAll("[^a-z0-9-]", ".dcsafe"); - String baseFileName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-') - .toLowerCase() + "_v" + dv.getFriendlyVersionNumber(); + //This archiver doesn't use the standard spaceName, but does use it to generate the file name + String baseFileName = getFileName(getSpaceName(dataset), dv); ContentStore store; // Set a failure status that will be updated if we succeed diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 4e64fcfaabc..ba873c81d97 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -87,12 +87,11 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dat Dataset dataset = dv.getDataset(); - String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') - .replace('.', '-').toLowerCase(); + String spaceName = getSpaceName(dataset); // Check for and delete existing files for this version - String dataciteFileName = spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml"; - String bagFileName = spaceName + "/" + spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + String dataciteFileName = getDataCiteFileName(spaceName, dv) + ".xml"; + String bagFileName = spaceName + "/" + getFileName(spaceName,dv) + ".zip"; 
logger.fine("Checking for existing files in archive..."); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 2e6ddef7b54..a594ac02cfb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -61,13 +61,11 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dat Dataset dataset = dv.getDataset(); - String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-') - .toLowerCase(); + String spaceName = getSpaceName(dataset); // Define file paths - String dataciteFileName = localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() - + ".xml"; - zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; + String dataciteFileName = localPath + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; + zipName = localPath + "/" + getFileName(spaceName, dv) + ".zip"; // Check for and delete existing files for this version logger.fine("Checking for existing files in archive..."); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index faa4fa50d50..17be53a458f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -75,7 +75,7 @@ public class S3SubmitToArchiveCommand extends AbstractSubmitToArchiveCommand { private static final Config config = ConfigProvider.getConfig(); protected S3AsyncClient s3 = null; private S3TransferManager tm = null; - private String spaceName = null; + protected 
String bucketName = null; public S3SubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { @@ -288,22 +288,6 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dat } } - protected String getDataCiteFileName(String spaceName, DatasetVersion dv) { - return spaceName + "_datacite.v" + dv.getFriendlyVersionNumber(); - } - - protected String getFileName(String spaceName, DatasetVersion dv) { - return spaceName + ".v" + dv.getFriendlyVersionNumber(); - } - - protected String getSpaceName(Dataset dataset) { - if (spaceName == null) { - spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-') - .toLowerCase(); - } - return spaceName; - } - private S3AsyncClient createClient(JsonObject configObject) { // Create a builder for the S3AsyncClient From 06e48ac78c95af47ac882076a6d5b40f6fabc6b4 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Feb 2026 14:22:26 -0500 Subject: [PATCH 91/97] fix indent per review --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 8a4a0cf3f53..92bab58e8d6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -402,8 +402,8 @@ private void populateArchivalStatus(boolean force) { if(archivalCopyLocationJson ==null || force) { if(archivalCopyLocation!=null) { try { - archivalCopyLocationJson = JsonUtil.getJsonObject(archivalCopyLocation); - } catch(Exception e) { + archivalCopyLocationJson = JsonUtil.getJsonObject(archivalCopyLocation); + } catch (Exception e) { logger.warning("DatasetVersion id: " + id + "has a non-JsonObject value, parsing error: " + e.getMessage()); logger.fine(archivalCopyLocation); } From 3b72a4c2c13ddbd2fea96b728f2582f37e40593e Mon Sep 17 00:00:00 
2001 From: Jim Myers Date: Thu, 19 Feb 2026 14:22:38 -0500 Subject: [PATCH 92/97] fix javadoc per review --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 02d77ed52fa..14328b02b88 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -188,7 +188,9 @@ public class BagGenerator { * and zipping are done in parallel, using a connection pool. The required space * on disk is ~ n+1/n of the final bag size, e.g. 125% of the bag size for a * 4-way parallel zip operation. - * @param terms + * @param oremapObject - OAI-ORE Map file as a JSON object + * @param dataciteXml - DataCite XML file as a string + * @param terms - Map of schema.org/terms to their corresponding JsonLDTerm objects * * @throws Exception * @throws JsonSyntaxException From a77e0a81f8e6ab989b32624c583418f2861a3abf Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 19 Feb 2026 14:29:22 -0500 Subject: [PATCH 93/97] remove param in doc per review --- src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java b/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java index 5a3f105497d..4fa85a543d8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java +++ b/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java @@ -359,7 +359,6 @@ public <R> R submit(Command<R> aCommand) throws CommandException { * * @param <R> The return type of the command * @param aCommand The command to execute - * @param user The user executing the command * @return A Future representing the pending result * @throws CommandException if the command cannot be submitted */ From
2e1d2e536990859cbe8a9557d4100ca529023fa8 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Feb 2026 16:09:10 -0500 Subject: [PATCH 94/97] cleanup --- .../workflow/internalspi/ArchivalSubmissionWorkflowStep.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java index ba35ca0273a..aacaa585dd7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java @@ -75,7 +75,7 @@ public WorkflowStepResult run(WorkflowContext context) { JsonObject ore = oreMap.getOREMap(); // Get JSON-LD terms - Map terms = archiveCommand.getJsonLDTerms(oreMap); + Map terms = AbstractSubmitToArchiveCommand.getJsonLDTerms(oreMap); // Call the updated method with all required parameters /* From d83b7af03eb1e28132cceb1bd816f3ef5a7e0dfc Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Feb 2026 16:09:28 -0500 Subject: [PATCH 95/97] add spacename to datacite file --- .../engine/command/impl/GoogleCloudSubmitToArchiveCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index ba873c81d97..43769dbdb49 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -90,7 +90,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dat String spaceName = getSpaceName(dataset); // Check for and delete existing files for this version - String 
dataciteFileName = getDataCiteFileName(spaceName, dv) + ".xml"; + String dataciteFileName = spaceName + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; String bagFileName = spaceName + "/" + getFileName(spaceName,dv) + ".zip"; logger.fine("Checking for existing files in archive..."); From 7e732a0dd68815931a36380143fc266087265e85 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 19 Feb 2026 18:20:16 -0500 Subject: [PATCH 96/97] handle Local archiver zip name change --- doc/release-notes/12167-ore-bag-archiving-changes.md | 4 ++++ src/test/java/edu/harvard/iq/dataverse/api/BagIT.java | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/release-notes/12167-ore-bag-archiving-changes.md b/doc/release-notes/12167-ore-bag-archiving-changes.md index e1cce68c15a..a10dbdce1df 100644 --- a/doc/release-notes/12167-ore-bag-archiving-changes.md +++ b/doc/release-notes/12167-ore-bag-archiving-changes.md @@ -48,3 +48,7 @@ The following JVM options (MicroProfile Config Settings) control bag size and ho - `dataverse.bagit.archive-on-version-update` (Default: `false`) Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, i.e., via the Update-Current-Version publication option. Setting the flag to `true` only works if the archiver being used supports deleting existing archival bags. + + ### Backward Incompatibility + + The name of the archival zipped bag produced by the LocalSubmitToArchiveCommand archiver now has a '.' character before the version number to mirror the name used by other archivers, e.g.
the name will be like doi-10-5072-fk2-fosg5q.v1.0.zip rather than doi-10-5072-fk2-fosg5qv1.0.zip \ No newline at end of file diff --git a/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java b/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java index 16c44003f35..b649ad6bb95 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java @@ -87,7 +87,7 @@ public void testBagItExport() throws IOException { .replace('.', '-').toLowerCase(); // spacename: doi-10-5072-fk2-fosg5q - String pathToZip = bagitExportDir + "/" + spaceName + "v1.0" + ".zip"; + String pathToZip = bagitExportDir + "/" + spaceName + ".v1.0" + ".zip"; try { // give the bag time to generate From f80a1cd5470fd41d0c10cda67282dfb4f53e46d2 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 20 Feb 2026 11:41:34 -0500 Subject: [PATCH 97/97] use constants --- .../iq/dataverse/util/bagit/BagGenerator.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 14328b02b88..1459e2989da 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -100,7 +100,11 @@ public class BagGenerator { private static final Logger logger = Logger.getLogger(BagGenerator.class.getCanonicalName()); - + + static final String CRLF = "\r\n"; + + protected static final int MAX_RETRIES = 5; + private ParallelScatterZipCreator scatterZipCreator = null; private ScatterZipOutputStream dirs = null; @@ -326,7 +330,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { boolean first = true; for (Entry pidEntry : pidMap.entrySet()) { if (!first) { - pidStringBuffer.append("\r\n"); + pidStringBuffer.append(CRLF); } else { first = false; } @@ -341,7 +345,7 @@ public boolean 
generateBag(OutputStream outputStream) throws Exception { first = true; for (Entry sha1Entry : checksumMap.entrySet()) { if (!first) { - sha1StringBuffer.append("\r\n"); + sha1StringBuffer.append(CRLF); } else { first = false; } @@ -784,7 +788,7 @@ private boolean addToZip(long fileSize) { // Method to append to fetch file content private void addToFetchFile(String url, long size, String filename) { // Format: URL size filename - fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\r\n"); + fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append(CRLF); } // Method to write fetch file to bag (call this before finalizing the bag) @@ -910,8 +914,6 @@ public void writeTo(ZipArchiveOutputStream zipArchiveOutputStream) logger.fine("Files written"); } - static final String CRLF = "\r\n"; - private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); @@ -1303,7 +1305,7 @@ public InputStream get() { try { URI uri = new URI(uriString); int tries = 0; - while (tries < 5) { + while (tries < MAX_RETRIES) { logger.finest("Get # " + tries + " for " + uriString); HttpGet getFile = createNewGetRequest(uri, null); @@ -1357,29 +1359,29 @@ public void close() throws IOException { } catch (InterruptedException ie) { logger.log(Level.SEVERE, "InterruptedException during retry delay for file: " + uriString, ie); Thread.currentThread().interrupt(); // Restore interrupt status - tries += 5; // Skip remaining attempts + tries += MAX_RETRIES; // Skip remaining attempts } } } catch (ClientProtocolException e) { - tries += 5; + tries += MAX_RETRIES; logger.log(Level.SEVERE, "ClientProtocolException when retrieving file: " + uriString + " (attempt " + tries + ")", e); } catch (SocketTimeoutException e) { // Specific handling for timeout exceptions tries++; - logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString 
+ " (attempt " + tries + " of 5) - Request exceeded timeout", e); - if (tries == 5) { + logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString + " (attempt " + tries + " of " + MAX_RETRIES + ") - Request exceeded timeout", e); + if (tries == MAX_RETRIES) { logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries due to timeouts: " + uriString, e); } } catch (InterruptedIOException e) { // Catches interruptions during I/O operations - tries += 5; + tries += MAX_RETRIES; logger.log(Level.SEVERE, "InterruptedIOException when retrieving file: " + uriString + " - Operation was interrupted", e); Thread.currentThread().interrupt(); // Restore interrupt status } catch (IOException e) { // Retry if this is a potentially temporary error such as a timeout tries++; - logger.log(Level.WARNING, "IOException when retrieving file: " + uriString + " (attempt " + tries + " of 5)", e); - if (tries == 5) { + logger.log(Level.WARNING, "IOException when retrieving file: " + uriString + " (attempt " + tries + " of " + MAX_RETRIES+ ")", e); + if (tries == MAX_RETRIES) { logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries: " + uriString, e); } }