diff --git a/doc/sphinx-guides/source/admin/integrations.rst b/doc/sphinx-guides/source/admin/integrations.rst index 5ee6372d56d..8d3f53981e0 100644 --- a/doc/sphinx-guides/source/admin/integrations.rst +++ b/doc/sphinx-guides/source/admin/integrations.rst @@ -12,7 +12,7 @@ Getting Data In A variety of integrations are oriented toward making it easier for your researchers to deposit data into your Dataverse installation. GitHub -+++++++ +++++++ Dataverse integration with GitHub is implemented via a Dataverse Uploader GitHub Action. It is a reusable, composite workflow for uploading a git repository or subdirectory into a dataset on a target Dataverse installation. The action is customizable, allowing users to choose to replace a dataset, add to the dataset, publish it or leave it as a draft version on Dataverse. The action provides some metadata to the dataset, such as the origin GitHub repository, and it preserves the directory tree structure. @@ -157,12 +157,14 @@ Archivematica Sponsored by the `Ontario Council of University Libraries (OCUL) `_, this technical integration enables users of Archivematica to select datasets from connected Dataverse installations and process them for long-term access and digital preservation. For more information and list of known issues, please refer to Artefactual's `release notes `_, `integration documentation `_, and the `project wiki `_. -DuraCloud/Chronopolis -+++++++++++++++++++++ +.. _rda-bagit-archiving: + +RDA BagIt (BagPack) Archiving ++++++++++++++++++++++++++++++ -A Dataverse installation can be configured to submit a copy of published Datasets, packaged as `Research Data Alliance conformant `_ zipped `BagIt `_ bags to the `Chronopolis `_ via `DuraCloud `_ +A Dataverse installation can be configured to submit a copy of published Datasets, packaged as `Research Data Alliance conformant `_ zipped `BagIt `_ bags to the `Chronopolis `_ via `DuraCloud `_, to a local file system, or to `Google Cloud Storage `_. 
-For details on how to configure this integration, look for "DuraCloud/Chronopolis" in the :doc:`/installation/config` section of the Installation Guide. +For details on how to configure this integration, see :ref:`BagIt Export` in the :doc:`/installation/config` section of the Installation Guide. Future Integrations ------------------- diff --git a/doc/sphinx-guides/source/developers/workflows.rst b/doc/sphinx-guides/source/developers/workflows.rst index c982edc08bb..df63bf239fe 100644 --- a/doc/sphinx-guides/source/developers/workflows.rst +++ b/doc/sphinx-guides/source/developers/workflows.rst @@ -178,9 +178,9 @@ Available variables are: archiver ++++++++ -A step that sends an archival copy of a Dataset Version to a configured archiver, e.g. the DuraCloud interface of Chronopolis. See the `DuraCloud/Chronopolis Integration documentation `_ for further detail. +A step that sends an archival copy of a Dataset Version to a configured archiver, e.g. the DuraCloud interface of Chronopolis. See :ref:`rda-bagit-archiving` for further detail. -Note - the example step includes two settings required for any archiver and three (DuraCloud*) that are specific to DuraCloud. +Note - the example step includes two settings required for any archiver, three (DuraCloud*) that are specific to DuraCloud, and the optional BagGeneratorThreads setting that controls parallelism when creating the Bag. .. 
code:: json @@ -196,7 +196,8 @@ Note - the example step includes two settings required for any archiver and thre ":ArchiverSettings": "string", ":DuraCloudHost":"string", ":DuraCloudPort":"string", - ":DuraCloudContext":"string" + ":DuraCloudContext":"string", + ":BagGeneratorThreads":"string" } } diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 7ed9fe1327d..f890f5312ff 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -932,7 +932,7 @@ The minimal configuration to support an archiver integration involves adding a m \:ArchiverSettings - the archiver class can access required settings including existing Dataverse installation settings and dynamically defined ones specific to the class. This setting is a comma-separated list of those settings. For example\: -``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":DuraCloudHost, :DuraCloudPort, :DuraCloudContext"`` +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":DuraCloudHost, :DuraCloudPort, :DuraCloudContext, :BagGeneratorThreads"`` The DPN archiver defines three custom settings, one of which is required (the others have defaults): @@ -942,6 +942,12 @@ The DPN archiver defines three custom settings, one of which is required (the ot :DuraCloudPort and :DuraCloudContext are also defined if you are not using the defaults ("443" and "duracloud" respectively). (Note\: these settings are only in effect if they are listed in the \:ArchiverSettings. Otherwise, they will not be passed to the DuraCloud Archiver class.) +It also can use one setting that is common to all Archivers: :BagGeneratorThreads + +``curl http://localhost:8080/api/admin/settings/:BagGeneratorThreads -X PUT -d '8'`` + +By default, the Bag generator zips two datafiles at a time when creating the Bag. This setting can be used to lower that to 1, i.e. 
to decrease system load, or to increase it, e.g. to 4 or 8, to speed processing of many small files. + Archivers may require JVM options as well. For the Chronopolis archiver, the username and password associated with your organization's Chronopolis/DuraCloud account should be configured in Payara: ``./asadmin create-jvm-options '-Dduracloud.username=YOUR_USERNAME_HERE'`` @@ -963,9 +969,9 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam \:ArchiverSettings - the archiver class can access required settings including existing Dataverse installation settings and dynamically defined ones specific to the class. This setting is a comma-separated list of those settings. For example\: -``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath"`` +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath, :BagGeneratorThreads"`` -:BagItLocalPath is the file path that you've set in :ArchiverSettings. +:BagItLocalPath is the file path that you've set in :ArchiverSettings. See the DuraCloud Configuration section for a description of :BagGeneratorThreads. .. _Google Cloud Configuration: @@ -976,9 +982,9 @@ The Google Cloud Archiver can send Dataverse Project Bags to a bucket in Google' ``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` -``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject"`` +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject, :BagGeneratorThreads"`` -The Google Cloud Archiver defines two custom settings, both are required. The credentials for your account, in the form of a json key file, must also be obtained and stored locally (see below): +The Google Cloud Archiver defines two custom settings, both are required. 
It can also use the :BagGeneratorThreads setting as described in the DuraCloud Configuration section above. The credentials for your account, in the form of a json key file, must also be obtained and stored locally (see below): In order to use the Google Cloud Archiver, you must have a Google account. You will need to create a project and bucket within that account and provide those values in the settings: @@ -2400,6 +2406,13 @@ For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting: ``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` +:BagGeneratorThreads +++++++++++++++++++++ + +An archiver setting shared by several implementations (e.g. DuraCloud, Google, and Local) that can make Bag generation use fewer or more threads in zipping datafiles than the default of 2. + +``curl http://localhost:8080/api/admin/settings/:BagGeneratorThreads -X PUT -d '8'`` + :DuraCloudHost ++++++++++++++ :DuraCloudPort @@ -2415,7 +2428,7 @@ These three settings define the host, port, and context used by the DuraCloudSub This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration` section above. 
:GoogleCloudBucket -++++++++++++++++++ +++++++++++++++++++ :GoogleCloudProject +++++++++++++++++++ diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 77ea680598f..4fa0961d134 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -11,6 +11,7 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; import java.util.Date; @@ -24,6 +25,7 @@ public abstract class AbstractSubmitToArchiveCommand extends AbstractCommand requestedSettings = new HashMap(); private static final Logger logger = Logger.getLogger(AbstractSubmitToArchiveCommand.class.getName()); + private static final int DEFAULT_THREADS = 2; public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version.getDataset()); @@ -67,6 +69,18 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { */ abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, Map requestedSetttings); + protected int getNumberOfBagGeneratorThreads() { + if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) { + try { + return Integer.valueOf(requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS)); + } catch (NumberFormatException nfe) { + logger.warning("Can't parse the value of setting " + BagGenerator.BAG_GENERATOR_THREADS + + " as an integer - using default:" + DEFAULT_THREADS); + } + } + return DEFAULT_THREADS; + } + 
@Override public String describe() { return super.describe() + "DatasetVersion: [" + version.getId() + " (v" diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index 468e99f24c1..f30183663e6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -42,6 +42,7 @@ public class DuraCloudSubmitToArchiveCommand extends AbstractSubmitToArchiveComm private static final String DURACLOUD_HOST = ":DuraCloudHost"; private static final String DURACLOUD_CONTEXT = ":DuraCloudContext"; + public DuraCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version); } @@ -128,6 +129,7 @@ public void run() { try (PipedOutputStream out = new PipedOutputStream(in)){ // Generate bag BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setNumConnections(getNumberOfBagGeneratorThreads()); bagger.setAuthenticationKey(token.getTokenString()); bagger.generateBag(out); } catch (Exception e) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index cb729a9807a..af4c960c2d6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -121,6 +121,7 @@ public void run() { try (PipedOutputStream out = new PipedOutputStream(in)) { // Generate bag BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setNumConnections(getNumberOfBagGeneratorThreads()); 
bagger.setAuthenticationKey(token.getTokenString()); bagger.generateBag(out); } catch (Exception e) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 64e635c7d3d..b336d9a77f9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -58,6 +58,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), dataciteXml, StandardCharsets.UTF_8); BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setNumConnections(getNumberOfBagGeneratorThreads()); bagger.setAuthenticationKey(token.getTokenString()); zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; bagger.generateBag(new FileOutputStream(zipName + ".partial")); diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 958e61f33e6..56676e3d00a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -115,6 +115,7 @@ public class BagGenerator { private boolean usetemp = false; private int numConnections = 8; + public static final String BAG_GENERATOR_THREADS = ":BagGeneratorThreads"; private OREMap oremap; @@ -1080,4 +1081,9 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } + public void setNumConnections(int numConnections) { + this.numConnections = numConnections; + logger.fine("BagGenerator will use " + numConnections + " threads"); + } + } \ No newline at end of file