From aa407c302fadc4b8426d977a61110011c5afd663 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Mon, 17 Sep 2018 16:13:30 +0200 Subject: [PATCH 01/14] Add IntelliJ IDEA files to .gitignore to avoid tracking by mistake. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 87c2918d902..e1915ef7ddc 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,7 @@ conf/docker-aio/dv/install/dvinstall.zip conf/docker-aio/testdata/ scripts/installer/default.config *.pem + +# do not track IntelliJ IDEA files +.idea +**/*.iml From c2375b2a422b56b9de39dcc498259386263e77b7 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Wed, 19 Sep 2018 09:51:23 +0200 Subject: [PATCH 02/14] IDEA 2018.2 fails to build the project because of the uninitialized Maven property 'compilerArgument'. The property name was passed as argument string to the compiler, which of course failed. Adding an empty default fixes this. --- pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pom.xml b/pom.xml index c2e4f8a82a2..501ccbf7e1e 100644 --- a/pom.xml +++ b/pom.xml @@ -13,6 +13,8 @@ ${project.build.directory}/endorsed UTF-8 -Xdoclint:none + + 1.11.172 UTC en From 53090518dc4eed9f01c1033f4ec63f3c517d9a9a Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Thu, 20 Sep 2018 16:41:39 +0200 Subject: [PATCH 03/14] Make S3AccessIO actually testable by making private methods default visible and adding a constructor to inject a mocked AWS S3 Client --- .../iq/dataverse/dataaccess/S3AccessIO.java | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 530f7ee4a17..b28cf85984c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -43,6 +43,8 @@ import java.util.logging.Logger; import 
org.apache.commons.io.IOUtils; +import javax.validation.constraints.NotNull; + /** * * @author Matthew A Dunlap @@ -77,6 +79,12 @@ public S3AccessIO(T dvObject, DataAccessRequest req) { e); } } + + public S3AccessIO(T dvObject, DataAccessRequest req, @NotNull AmazonS3 s3client) { + super(dvObject, req); + this.setIsLocalFile(false); + this.s3 = s3client; + } public static String S3_IDENTIFIER_PREFIX = "s3"; @@ -630,7 +638,7 @@ public InputStream getAuxFileAsInputStream(String auxItemTag) throws IOException } } - private String getDestinationKey(String auxItemTag) throws IOException { + String getDestinationKey(String auxItemTag) throws IOException { if (dvObject instanceof DataFile) { return getMainFileKey() + "." + auxItemTag; } else if (dvObject instanceof Dataset) { @@ -643,7 +651,16 @@ private String getDestinationKey(String auxItemTag) throws IOException { } } - private String getMainFileKey() throws IOException { + /** + * TODO: this function is not side effect free (sets instance variables key and bucketName). + * Is this good or bad? Need to ask @landreev + * + * Extract the file key from a file stored on S3. 
+ * Follows template: "owner authority name"/"owner identifier"/"storage identifier without bucketname and protocol" + * @return Main File Key + * @throws IOException + */ + String getMainFileKey() throws IOException { if (key == null) { String baseKey = this.getDataFile().getOwner().getAuthorityForFileStorage() + "/" + this.getDataFile().getOwner().getIdentifierForFileStorage(); String storageIdentifier = dvObject.getStorageIdentifier(); @@ -723,7 +740,7 @@ public String generateTemporaryS3Url() throws IOException { } } - private int getUrlExpirationMinutes() { + int getUrlExpirationMinutes() { String optionValue = System.getProperty("dataverse.files.s3-url-expiration-minutes"); if (optionValue != null) { Integer num; From fd004fcd60570f16e8e78e650f93f68eeade3ed2 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Thu, 20 Sep 2018 16:43:10 +0200 Subject: [PATCH 04/14] Add basic unit test infrastructure for S3AccessIO including Mockito Jupiter support via Maven. --- pom.xml | 9 ++- .../dataverse/dataaccess/S3AccessIOTest.java | 57 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java diff --git a/pom.xml b/pom.xml index 501ccbf7e1e..0dccecd0708 100644 --- a/pom.xml +++ b/pom.xml @@ -29,6 +29,7 @@ 5.3.1 5.3.1 1.3.1 + 2.22.0 @@ -385,7 +386,13 @@ org.mockito mockito-core - 2.22.0 + ${mockito.version} + test + + + org.mockito + mockito-junit-jupiter + ${mockito.version} test diff --git a/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java b/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java new file mode 100644 index 00000000000..ab1db8869f2 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java @@ -0,0 +1,57 @@ +/* + * Copyright 2018 Forschungszentrum Jülich GmbH + * SPDX-License-Identifier: Apache 2.0 + */ +package edu.harvard.iq.dataverse.dataaccess; + +import 
com.amazonaws.services.s3.AmazonS3; +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.api.UtilIT; +import edu.harvard.iq.dataverse.mocks.MocksFactory; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.mockito.junit.jupiter.MockitoSettings; +import org.mockito.quality.Strictness; + +import java.io.IOException; + +@ExtendWith(MockitoExtension.class) +@MockitoSettings(strictness = Strictness.STRICT_STUBS) +public class S3AccessIOTest { + + @Mock + private AmazonS3 s3client; + + private S3AccessIO dataSetAccess; + private S3AccessIO dataFileAccess; + private Dataset dataSet; + private DataFile dataFile; + private String dataFileId; + + @BeforeEach + public void setup() throws IOException { + dataFile = MocksFactory.makeDataFile(); + dataSet = MocksFactory.makeDataset(); + dataFile.setOwner(dataSet); + dataFileId = UtilIT.getRandomIdentifier(); + dataFile.setStorageIdentifier("s3://bucket:"+dataFileId); + dataSetAccess = new S3AccessIO<>(dataSet, null, s3client); + dataFileAccess = new S3AccessIO<>(dataFile, null, s3client); + } + + /* + createTempFile + getStorageLocation + getFileSystemPath + exists? + getWriteChannel + getOutputStream + getDestinationKey + getMainFileKey + getUrlExpirationMinutes + */ + +} From 0e8547f3f1cd1519a7a10728ce0dead44784dfb5 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Thu, 20 Sep 2018 16:46:45 +0200 Subject: [PATCH 05/14] Add some very minor unit tests to S3AccessIO. This should be enhanced, as the class could use some refactoring and more documentation. 
--- .../dataverse/dataaccess/S3AccessIOTest.java | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java b/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java index ab1db8869f2..4ce821a5fee 100644 --- a/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java @@ -10,12 +10,17 @@ import edu.harvard.iq.dataverse.api.UtilIT; import edu.harvard.iq.dataverse.mocks.MocksFactory; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import static org.junit.jupiter.api.Assertions.*; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoSettings; import org.mockito.quality.Strictness; +import static org.mockito.Mockito.*; +import static org.mockito.BDDMockito.*; +import java.io.FileNotFoundException; import java.io.IOException; @ExtendWith(MockitoExtension.class) @@ -50,8 +55,69 @@ public void setup() throws IOException { getWriteChannel getOutputStream getDestinationKey + + DONE + --------------------- getMainFileKey getUrlExpirationMinutes */ + @Test + void keyNull_getMainFileKey() throws IOException { + // given + String authOwner = dataSet.getAuthority(); + String idOwner = dataSet.getIdentifier(); + + // when + String key = dataFileAccess.getMainFileKey(); + + // then + assertEquals(authOwner+"/"+idOwner+"/"+dataFileId, key); + } + + @Test + void keyNullstorageIdNullOrEmpty_getMainFileKey() throws IOException { + // given + dataFile.setStorageIdentifier(null); + // when & then + assertThrows(FileNotFoundException.class, () -> {dataFileAccess.getMainFileKey(); }); + + // given + dataFile.setStorageIdentifier(""); + // when & then + assertThrows(FileNotFoundException.class, () -> {dataFileAccess.getMainFileKey(); }); + } + + @Test + void 
keyNullstorageIdNull_getMainFileKey() throws IOException { + // given + dataFile.setStorageIdentifier("invalid://abcd"); + // when & then + assertThrows(IOException.class, () -> {dataFileAccess.getMainFileKey(); }); + } + + @Test + void default_getUrlExpirationMinutes() { + // given + System.clearProperty("dataverse.files.s3-url-expiration-minutes"); + // when & then + assertEquals(60, dataFileAccess.getUrlExpirationMinutes()); + } + + @Test + void validSetting_getUrlExpirationMinutes() { + // given + System.setProperty("dataverse.files.s3-url-expiration-minutes", "120"); + // when & then + assertEquals(120, dataFileAccess.getUrlExpirationMinutes()); + } + + @Test + void invalidSetting_getUrlExpirationMinutes() { + // given + System.setProperty("dataverse.files.s3-url-expiration-minutes", "NaN"); + // when & then + assertEquals(60, dataFileAccess.getUrlExpirationMinutes()); + } + } From 13a9980a522ac22cf785f39d114a978a12f328ed Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Fri, 21 Sep 2018 14:33:13 +0200 Subject: [PATCH 06/14] Fixes #4690. Enable custom endpoint URL with new system properties using a custom EndpointConfiguration. Will default to previous standard configuration. 
--- .../iq/dataverse/dataaccess/S3AccessIO.java | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index b28cf85984c..83b1885f778 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -3,6 +3,7 @@ import com.amazonaws.AmazonClientException; import com.amazonaws.HttpMethod; import com.amazonaws.SdkClientException; +import com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; import com.amazonaws.services.s3.model.ObjectMetadata; @@ -71,12 +72,20 @@ public S3AccessIO(T dvObject) { public S3AccessIO(T dvObject, DataAccessRequest req) { super(dvObject, req); this.setIsLocalFile(false); + try { - s3 = AmazonS3ClientBuilder.standard().defaultClient(); + // get a standard client, using the standard way of configuring the credentials, etc. + AmazonS3ClientBuilder s3CB = AmazonS3ClientBuilder.standard(); + // if the admin has set a system property (see below) we use this endpoint URL instead of the standard ones. + if (!s3url.isEmpty()) { + s3CB.setEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(s3url, s3region)); + } + // let's build the client :-) + this.s3 = s3CB.build(); } catch (Exception e) { throw new AmazonClientException( - "Cannot instantiate a S3 client using; check your AWS credentials and region", - e); + "Cannot instantiate a S3 client using; check your AWS credentials and region", + e); } } @@ -89,6 +98,16 @@ public S3AccessIO(T dvObject, DataAccessRequest req, @NotNull AmazonS3 s3client) public static String S3_IDENTIFIER_PREFIX = "s3"; private AmazonS3 s3 = null; + /** + * Pass in a URL pointing to your S3 compatible storage. 
+ * For possible values see https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/client/builder/AwsClientBuilder.EndpointConfiguration.html + */ + private String s3url = System.getProperty("dataverse.files.s3-url", ""); + /** + * Pass in a region to use for SigV4 signing of requests. + * Defaults to "dataverse" as it is not relevant for custom S3 implementations. + */ + private String s3region = System.getProperty("dataverse.files.s3-region", "dataverse"); private String bucketName = System.getProperty("dataverse.files.s3-bucket-name"); private String key; From 7dac187c56e9847b094fb1e73ac5925ca0976623 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Fri, 21 Sep 2018 19:15:31 +0200 Subject: [PATCH 07/14] Refactoring S3 configuration documentation and include advice how to use custom S3 endpoints. --- .../source/installation/config.rst | 149 ++++++++++++++---- 1 file changed, 117 insertions(+), 32 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 145adeeb34a..00b2d862b65 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -262,59 +262,130 @@ if your installation's :ref:`:PublicInstall` setting is true, or: You can configure this redirect properly in your cloud environment to generate a temporary URL for access to the Swift objects for computing. -Amazon S3 Storage -+++++++++++++++++ +Amazon S3 Storage (or compatible) ++++++++++++++++++++++++++++++++++ -For institutions and organizations looking to use Amazon's S3 cloud storage for their installation, this can be set up manually through creation of the credentials and config files or automatically via the AWS console commands. +For institutions and organizations looking to use some kind of S3-based object storage for files uploaded to Dataverse, +this is entirely possible. 
You can either use the services offered by Amazon or use some other, even on-site S3-compatible +storage (like Minio, Ceph RADOS S3 Gateway and many more). -You'll need an AWS account with an associated S3 bucket for your installation to use. From the S3 management console (e.g. ``_), you can poke around and get familiar with your bucket. We recommend using IAM (Identity and Access Management) to create a user with full S3 access and nothing more, for security reasons. See ``_ for more info on this process. +First: setup accounts and access credentials +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Make note of the bucket's name and the region its data is hosted in. Dataverse and the AWS SDK make use of "AWS credentials profile file" and "AWS config profile file" located in ``~/.aws/`` where ``~`` is the home directory of the user you run Glassfish as. This file can be generated via either of two methods described below. It's also possible to use IAM Roles rather than the credentials file. Please note that in this case you will need anyway the config file to specify the region. +Dataverse and the AWS SDK make use of "AWS credentials profile file" and "AWS config profile file" located in +``~/.aws/`` where ``~`` is the home directory of the user you run Glassfish as. This file can be generated via either +of two methods described below: -Set Up credentials File Manually -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +1. manually through creation of the credentials and config files or +2. automatically via the AWS console commands. -To create the ``credentials`` file manually, you will need to generate a key/secret key. The first step is to log onto your AWS web console (e.g. ``_). If you have created a user in AWS IAM, you can click on that user and generate the keys needed for Dataverse. +Preparation when using Amazon's S3 service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Once you have acquired the keys, they need to be added to the ``credentials`` file. 
The format for credentials is as follows: +You'll need an AWS account with an associated S3 bucket for your installation to use. From the S3 management console +(e.g. ``_), you can poke around and get familiar with your bucket. -| ``[default]`` -| ``aws_access_key_id = `` -| ``aws_secret_access_key = `` +**Make note** of the **bucket's name** and the **region** its data is hosted in. -You must also specify the AWS region in the ``config`` file, for example: +To **create a user** with full S3 access and nothing more for security reasons, we recommend using IAM +(Identity and Access Management). See `IAM User Guide `_ +for more info on this process. -| ``[default]`` -| ``region = us-east-1`` +**Generate the user keys** needed for Dataverse afterwards by clicking on the created user. +(You can skip this step when running on EC2, see below.) -Place these two files in a folder named ``.aws`` under the home directory for the user running your Dataverse Glassfish instance. (From the `AWS Command Line Interface Documentation `_: "In order to separate credentials from less sensitive options, region and output format are stored in a separate file named config in the same folder") +.. TIP:: + If you are hosting Dataverse on an AWS EC2 instance alongside storage in S3, it is possible to use IAM Roles instead + of the credentials file (the file at ``~/.aws/credentials`` mentioned below). Please note that you will still need the + ``~/.aws/config`` file to specify the region. For more information on this option, see + http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html -Set Up Access Configuration Via Command Line Tools -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Preparation when using custom S3-compatible service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Begin by installing the CLI tool `pip `_ to install the `AWS command line interface `_ if you don't have it. 
+We assume you have your S3-compatible custom storage in place, up and running, ready for service. -First, we'll get our access keys set up. If you already have your access keys configured, skip this step. From the command line, run: +Please make note of the following details: -``pip install awscli`` +- | Endpoint URL. Consult the documentation of your service how to find that. + | Example: https://play.minio.io:9000 +- | Region. Optional, but some services might use it. Consult your service documentation! + | Example: *us-east-1* +- | Access key ID and secret access key. Usually you can generate access keys within the user profile of your service. + | Example: + | ID: *3AM3UQ867SPQQA43P2F* + | Key: *zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG* +- | Bucket name. Dataverse will fail opening and uploading files on S3 if you don't create one. + | Example: *dataverse* -``aws configure`` -You'll be prompted to enter your Access Key ID and secret key, which should be issued to your AWS account. The subsequent config steps after the access keys are up to you. For reference, the keys will be stored in ``~/.aws/credentials``, and your AWS access region in ``~/.aws/config``. +Reported working S3-compatible storage +###################################### -Using an IAM Role with EC2 -^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you are hosting Dataverse on an AWS EC2 instance alongside storage in S3, it is possible to use IAM Roles instead of the credentials file (the file at ``~/.aws/credentials`` mentioned above). Please note that you will still need the ``~/.aws/config`` file to specify the region. For more information on this option, see http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html +- None yet :-( -Configure Dataverse to Use AWS/S3 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. HINT:: + If you are successfully using an S3 storage implementation not yet listed above, please feel free to + `open an issue at Github `_ and describe your setup. 
+ We will be glad to add it here. -With your access to your bucket in place, we'll want to navigate to ``/usr/local/glassfish4/glassfish/bin/`` and execute the following ``asadmin`` commands to set up the proper JVM options. Recall that out of the box, Dataverse is configured to use local file storage. You'll need to delete the existing storage driver before setting the new one. -``./asadmin $ASADMIN_OPTS delete-jvm-options "\-Ddataverse.files.storage-driver-id=file"`` +Manually set up credentials file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To create the ``~/.aws/credentials`` file manually, you will need to generate a key/secret key (see above). Once you have +acquired the keys, they need to be added to the ``credentials`` file. The format for credentials is as follows: + +:: + + [default] + aws_access_key_id = + aws_secret_access_key = + +While using Amazon's service, you must also specify the AWS region in the ``~/.aws/config`` file, for example: + +:: + + [default] + region = us-east-1 -``./asadmin $ASADMIN_OPTS create-jvm-options "\-Ddataverse.files.storage-driver-id=s3"`` +Place these two files in a folder named ``.aws`` under the home directory for the user running your Dataverse Glassfish +instance. (From the `AWS Command Line Interface Documentation `_: +"In order to separate credentials from less sensitive options, region and output format are stored in a separate file +named config in the same folder") + +Console commands set up access configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Begin by installing the CLI tool `pip `_ to install the +`AWS command line interface `_ if you don't have it. + +First, we'll get our access keys set up. If you already have your access keys configured, skip this step. +From the command line, run: + +- ``pip install awscli`` +- ``aws configure`` + +You'll be prompted to enter your Access Key ID and secret key, which should be issued to your AWS account. 
+The subsequent config steps after the access keys are up to you. For reference, the keys will be stored in +``~/.aws/credentials``, and your AWS access region in ``~/.aws/config``. + +.. TIP:: + When using a custom S3 URL endpoint, you need to add it to every ``aws`` call: ``aws --endpoint-url s3 ...`` + (may omit it while configuring). + +Second: configure Dataverse to use S3 storage +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +With access to your bucket in place, we'll want to navigate to ``/usr/local/glassfish4/glassfish/bin/`` +and execute the following ``asadmin`` commands to set up the proper JVM options. Recall that out of the box, Dataverse +is configured to use local file storage. You'll need to delete the existing storage driver before setting the new one. + +:: + + ./asadmin $ASADMIN_OPTS delete-jvm-options "-Ddataverse.files.storage-driver-id=file" + ./asadmin $ASADMIN_OPTS create-jvm-options "-Ddataverse.files.storage-driver-id=s3" Then, we'll need to identify which S3 bucket we're using. Replace ``your_bucket_name`` with, of course, your bucket: @@ -326,10 +397,24 @@ Optionally, you can have users download files from S3 directly rather than havin If you enable ``dataverse.files.s3-download-redirect`` as described above, note that the S3 URLs expire after an hour by default but you can configure the expiration time using the ``dataverse.files.s3-url-expiration-minutes`` JVM option. Here's an example of setting the expiration time to 120 minutes: -``./asadmin create-jvm-options "-D dataverse.files.s3-url-expiration-minutes=120"`` +``./asadmin create-jvm-options "-Ddataverse.files.s3-url-expiration-minutes=120"`` Lastly, go ahead and restart your glassfish server. With Dataverse deployed and the site online, you should be able to upload datasets and data files and see the corresponding files in your S3 bucket. Within a bucket, the folder structure emulates that found in local file storage. 
+S3 storage options +^^^^^^^^^^^^^^^^^^ + +========================================= ================== ================================================================== ============= +System Property Value Description Default value +========================================= ================== ================================================================== ============= +dataverse.files.storage-driver-id s3 Enable S3 storage driver. ``file`` +dataverse.files.s3-bucket-name The bucket name. See above. (none) +dataverse.files.s3-download-redirect ``true``/``false`` Enable direct download or proxy through Dataverse. ``false`` +dataverse.files.s3-url-expiration-minutes If direct downloads: time until links expire. Optional. 60 +dataverse.files.s3-url Use custom S3 endpoint. Needs URL either with or without protocol. (none) +dataverse.files.s3-region Only used when using custom endpoint. Optional. ``dataverse`` +========================================= ================== ================================================================== ============= + .. _Branding Your Installation: Branding Your Installation From 39af9b0c8bbdabf580366af088c647aa2814cccd Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Mon, 24 Sep 2018 10:45:49 +0200 Subject: [PATCH 08/14] Add option to enable path style access for S3AccessIO AWS client, used with many custom S3 services like Minio. --- doc/sphinx-guides/source/installation/config.rst | 1 + .../edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 00b2d862b65..514a0dad038 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -413,6 +413,7 @@ dataverse.files.s3-download-redirect ``true``/``false`` Enable direct dow dataverse.files.s3-url-expiration-minutes If direct downloads: time until links expire. 
Optional. 60 dataverse.files.s3-url Use custom S3 endpoint. Needs URL either with or without protocol. (none) dataverse.files.s3-region Only used when using custom endpoint. Optional. ``dataverse`` +dataverse.files.s3-path-style-access ``true``/``false`` Use path style buckets instead of subdomains. Optional. ``false`` ========================================= ================== ================================================================== ============= .. _Branding Your Installation: diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 83b1885f778..588eb466e43 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -80,6 +80,8 @@ public S3AccessIO(T dvObject, DataAccessRequest req) { if (!s3url.isEmpty()) { s3CB.setEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(s3url, s3region)); } + // some custom S3 implementations require "PathStyleAccess" as they use a path, not a subdomain. default = false + s3CB.withPathStyleAccessEnabled(s3pathStyleAccess); // let's build the client :-) this.s3 = s3CB.build(); } catch (Exception e) { @@ -108,6 +110,11 @@ public S3AccessIO(T dvObject, DataAccessRequest req, @NotNull AmazonS3 s3client) * Defaults to "dataverse" as it is not relevant for custom S3 implementations. */ private String s3region = System.getProperty("dataverse.files.s3-region", "dataverse"); + /** + * Pass in a boolean value if path style access should be used within the S3 client. + * Anything but case-insensitive "true" will lead to value of false, which is default value, too. 
+ */ + private boolean s3pathStyleAccess = Boolean.parseBoolean(System.getProperty("dataverse.files.s3-path-style-access", "false")); private String bucketName = System.getProperty("dataverse.files.s3-bucket-name"); private String key; From e8e589091afccd407513f8e3c9e1dd2d1a8d4aab Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Mon, 24 Sep 2018 10:46:52 +0200 Subject: [PATCH 09/14] Report working Minio configuration for S3 storage --- doc/sphinx-guides/source/installation/config.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 514a0dad038..9183ae3cb58 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -322,8 +322,9 @@ Please make note of the following details: Reported working S3-compatible storage ###################################### - -- None yet :-( +`Minio v2018-09-12 `_ + Set ``dataverse.files.s3-path-style-access=true``, as Minio works path-based. + Works pretty smooth, easy to setup. .. HINT:: If you are successfully using an S3 storage implementation not yet listed above, please feel free to From 48d1876d16a93a98293646e8cc3f0e3b294efb0f Mon Sep 17 00:00:00 2001 From: Derek Murphy Date: Thu, 27 Sep 2018 13:33:30 -0400 Subject: [PATCH 10/14] Documentation review - edits [#4690] Made some edits to structure and content to match the rest of the page. 
--- .../source/installation/config.rst | 68 +++++++++---------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 9183ae3cb58..fb1ae7f671a 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -262,25 +262,23 @@ if your installation's :ref:`:PublicInstall` setting is true, or: You can configure this redirect properly in your cloud environment to generate a temporary URL for access to the Swift objects for computing. -Amazon S3 Storage (or compatible) +Amazon S3 Storage (or Compatible) +++++++++++++++++++++++++++++++++ -For institutions and organizations looking to use some kind of S3-based object storage for files uploaded to Dataverse, -this is entirely possible. You can either use the services offered by Amazon or use some other, even on-site S3-compatible -storage (like Minio, Ceph RADOS S3 Gateway and many more). +For institutions and organizations looking to use Amazon's S3 cloud storage for their installation, this can be set up manually through creation of the credentials and config files or automatically via the AWS console commands. Alternatively, you may instead use other, even on-site S3-compatible storage (like Minio, Ceph RADOS S3 Gateway and many more). -First: setup accounts and access credentials -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +First: Set Up Accounts and Access Credentials +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Dataverse and the AWS SDK make use of "AWS credentials profile file" and "AWS config profile file" located in +Dataverse and the AWS SDK make use of the "AWS credentials profile file" and "AWS config profile file" located in ``~/.aws/`` where ``~`` is the home directory of the user you run Glassfish as. This file can be generated via either of two methods described below: -1. 
manually through creation of the credentials and config files or -2. automatically via the AWS console commands. +1. Manually through creation of the credentials and config files or +2. Automatically via the AWS console commands. -Preparation when using Amazon's S3 service -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Preparation When Using Amazon's S3 Service +########################################## You'll need an AWS account with an associated S3 bucket for your installation to use. From the S3 management console (e.g. ``_), you can poke around and get familiar with your bucket. @@ -300,40 +298,39 @@ for more info on this process. ``~/.aws/config`` file to specify the region. For more information on this option, see http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html -Preparation when using custom S3-compatible service -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Preparation When Using Custom S3-Compatible Service +################################################### We assume you have your S3-compatible custom storage in place, up and running, ready for service. Please make note of the following details: -- | Endpoint URL. Consult the documentation of your service how to find that. - | Example: https://play.minio.io:9000 -- | Region. Optional, but some services might use it. Consult your service documentation! - | Example: *us-east-1* -- | Access key ID and secret access key. Usually you can generate access keys within the user profile of your service. - | Example: - | ID: *3AM3UQ867SPQQA43P2F* - | Key: *zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG* -- | Bucket name. Dataverse will fail opening and uploading files on S3 if you don't create one. - | Example: *dataverse* +- **Endpoint URL** - consult the documentation of your service on how to find it. + - Example: https://play.minio.io:9000 +- **Region:** Optional, but some services might use it. Consult your service documentation. 
+ - Example: *us-east-1* +- **Access key ID and secret access key:** Usually you can generate access keys within the user profile of your service. + - Example: + - ID: *3AM3UQ867SPQQA43P2F* + - Key: *zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG* +- **Bucket name:** Dataverse will fail opening and uploading files on S3 if you don't create one. + - Example: *dataverse* -Reported working S3-compatible storage +Reported Working S3-Compatible Storage ###################################### `Minio v2018-09-12 `_ Set ``dataverse.files.s3-path-style-access=true``, as Minio works path-based. Works pretty smooth, easy to setup. -.. HINT:: - If you are successfully using an S3 storage implementation not yet listed above, please feel free to + **HINT:** If you are successfully using an S3 storage implementation not yet listed above, please feel free to `open an issue at Github `_ and describe your setup. We will be glad to add it here. -Manually set up credentials file -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Manually Set Up Credentials File +################################ To create the ``~/.aws/credentials`` file manually, you will need to generate a key/secret key (see above). Once you have acquired the keys, they need to be added to the ``credentials`` file. The format for credentials is as follows: @@ -356,8 +353,8 @@ instance. (From the `AWS Command Line Interface Documentation `_ to install the `AWS command line interface `_ if you don't have it. @@ -372,11 +369,10 @@ You'll be prompted to enter your Access Key ID and secret key, which should be i The subsequent config steps after the access keys are up to you. For reference, the keys will be stored in ``~/.aws/credentials``, and your AWS access region in ``~/.aws/config``. -.. TIP:: - When using a custom S3 URL endpoint, you need to add it to every ``aws`` call: ``aws --endpoint-url s3 ...`` - (may omit it while configuring). 
+**TIP:** When using a custom S3 URL endpoint, you need to add it to every ``aws`` call: ``aws --endpoint-url s3 ...`` + (you may omit it while configuring). -Second: configure Dataverse to use S3 storage +Second: Configure Dataverse to use S3 Storage ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ With access to your bucket in place, we'll want to navigate to ``/usr/local/glassfish4/glassfish/bin/`` @@ -402,8 +398,8 @@ If you enable ``dataverse.files.s3-download-redirect`` as described above, note Lastly, go ahead and restart your glassfish server. With Dataverse deployed and the site online, you should be able to upload datasets and data files and see the corresponding files in your S3 bucket. Within a bucket, the folder structure emulates that found in local file storage. -S3 storage options -^^^^^^^^^^^^^^^^^^ +S3 Storage Options +################## ========================================= ================== ================================================================== ============= System Property Value Description Default value From 448c919b8d0afd4579281d755c593d7f294a7313 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Fri, 28 Sep 2018 22:13:27 +0200 Subject: [PATCH 11/14] Rename custom endpoint configuration properties as requested by @pdurbin --- doc/sphinx-guides/source/installation/config.rst | 8 ++++++-- .../edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index fb1ae7f671a..24d30701635 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -396,6 +396,10 @@ If you enable ``dataverse.files.s3-download-redirect`` as described above, note ``./asadmin create-jvm-options "-Ddataverse.files.s3-url-expiration-minutes=120"`` +In case you would like to configure Dataverse to use a custom S3 service instead of 
Amazon S3 services, please +add the options for the custom URL and region as documented below. Please check above whether your desired combination has +already been tested and which other options were set for a successful integration. + Lastly, go ahead and restart your glassfish server. With Dataverse deployed and the site online, you should be able to upload datasets and data files and see the corresponding files in your S3 bucket. Within a bucket, the folder structure emulates that found in local file storage. S3 Storage Options @@ -408,8 +412,8 @@ dataverse.files.storage-driver-id s3 Enable S3 storage dataverse.files.s3-bucket-name The bucket name. See above. (none) dataverse.files.s3-download-redirect ``true``/``false`` Enable direct download or proxy through Dataverse. ``false`` dataverse.files.s3-url-expiration-minutes If direct downloads: time until links expire. Optional. 60 -dataverse.files.s3-url Use custom S3 endpoint. Needs URL either with or without protocol. (none) -dataverse.files.s3-region Only used when using custom endpoint. Optional. ``dataverse`` +dataverse.files.s3-custom-endpoint-url Use custom S3 endpoint. Needs URL either with or without protocol. (none) +dataverse.files.s3-custom-endpoint-region Only used when using custom endpoint. Optional. ``dataverse`` dataverse.files.s3-path-style-access ``true``/``false`` Use path style buckets instead of subdomains. Optional.
``false`` ========================================= ================== ================================================================== ============= diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 588eb466e43..90af53d7a0e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -77,8 +77,8 @@ public S3AccessIO(T dvObject, DataAccessRequest req) { // get a standard client, using the standard way of configuration the credentials, etc. AmazonS3ClientBuilder s3CB = AmazonS3ClientBuilder.standard(); // if the admin has set a system property (see below) we use this endpoint URL instead of the standard ones. - if (!s3url.isEmpty()) { - s3CB.setEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(s3url, s3region)); + if (!s3CEUrl.isEmpty()) { + s3CB.setEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(s3CEUrl, s3CERegion)); } // some custom S3 implementations require "PathStyleAccess" as they us a path, not a subdomain. default = false s3CB.withPathStyleAccessEnabled(s3pathStyleAccess); @@ -104,12 +104,12 @@ public S3AccessIO(T dvObject, DataAccessRequest req, @NotNull AmazonS3 s3client) * Pass in a URL pointing to your S3 compatible storage. * For possible values see https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/client/builder/AwsClientBuilder.EndpointConfiguration.html */ - private String s3url = System.getProperty("dataverse.files.s3-url", ""); + private String s3CEUrl = System.getProperty("dataverse.files.s3-custom-endpoint-url", ""); /** * Pass in a region to use for SigV4 signing of requests. * Defaults to "dataverse" as it is not relevant for custom S3 implementations. 
*/ - private String s3region = System.getProperty("dataverse.files.s3-region", "dataverse"); + private String s3CERegion = System.getProperty("dataverse.files.s3-custom-endpoint-region", "dataverse"); /** * Pass in a boolean value if path style access should be used within the S3 client. * Anything but case-insensitive "true" will lead to value of false, which is default value, too. From 96060ba3222d552f87903fa669967b450bfaf0cb Mon Sep 17 00:00:00 2001 From: Derek Murphy Date: Fri, 28 Sep 2018 16:13:58 -0400 Subject: [PATCH 12/14] Documentation updates [#4690] Changed first paragraph, fixed list formatting. --- .../source/installation/config.rst | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 24d30701635..eb4add6bfe5 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -265,7 +265,11 @@ You can configure this redirect properly in your cloud environment to generate a Amazon S3 Storage (or Compatible) +++++++++++++++++++++++++++++++++ -For institutions and organizations looking to use Amazon's S3 cloud storage for their installation, this can be set up manually through creation of the credentials and config files or automatically via the AWS console commands. Alternatively, you may instead use other, even on-site S3-compatible storage (like Minio, Ceph RADOS S3 Gateway and many more). +For institutions and organizations looking to use some kind of S3-based object storage for files uploaded to Dataverse, +this is entirely possible. You can either use Amazon Web Services or use some other, even on-site S3-compatible +storage (like Minio, Ceph RADOS S3 Gateway and many more). + +**Note:** The Dataverse Team is most familiar with AWS S3, and can provide support on its usage with Dataverse. 
Thanks to community contributions, the application's architecture also supports non-AWS S3 providers. The Dataverse Team can provide only very limited support for these other providers. We recommend reaching out to the wider Dataverse community if you have questions. First: Set Up Accounts and Access Credentials ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -306,15 +310,24 @@ We assume you have your S3-compatible custom storage in place, up and running, r Please make note of the following details: - **Endpoint URL** - consult the documentation of your service on how to find it. - - Example: https://play.minio.io:9000 + + * Example: https://play.minio.io:9000 + - **Region:** Optional, but some services might use it. Consult your service documentation. - - Example: *us-east-1* + + * Example: *us-east-1* + - **Access key ID and secret access key:** Usually you can generate access keys within the user profile of your service. - - Example: - - ID: *3AM3UQ867SPQQA43P2F* - - Key: *zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG* + + * Example: + + - ID: *3AM3UQ867SPQQA43P2F* + + - Key: *zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG* + - **Bucket name:** Dataverse will fail opening and uploading files on S3 if you don't create one. - - Example: *dataverse* + + * Example: *dataverse* Reported Working S3-Compatible Storage From b7b5d8a2a69b09897a6370a5f5cec1cb48730289 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Fri, 28 Sep 2018 22:25:35 +0200 Subject: [PATCH 13/14] Add a comment about Minio demo service used for QA and other testing purposes in the docs.
--- doc/sphinx-guides/source/installation/config.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index eb4add6bfe5..0a494500105 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -334,12 +334,14 @@ Reported Working S3-Compatible Storage ###################################### `Minio v2018-09-12 `_ - Set ``dataverse.files.s3-path-style-access=true``, as Minio works path-based. - Works pretty smooth, easy to setup. + Set ``dataverse.files.s3-path-style-access=true``, as Minio works path-based. Works pretty smooth, easy to setup. + **Can be used for quick testing, too:** just use the example values above. Uses the public (read: unsecure and + possibly slow) http://play.minio.io:9000 service. - **HINT:** If you are successfully using an S3 storage implementation not yet listed above, please feel free to - `open an issue at Github `_ and describe your setup. - We will be glad to add it here. + +**HINT:** If you are successfully using an S3 storage implementation not yet listed above, please feel free to +`open an issue at Github `_ and describe your setup. +We will be glad to add it here. 
Manually Set Up Credentials File From d9146b654633348666615500461e0ba273b17abe Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 9 Oct 2018 16:19:43 -0400 Subject: [PATCH 14/14] small tweaks #4690 --- doc/sphinx-guides/source/installation/config.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 0a494500105..a19536228b1 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -321,7 +321,7 @@ Please make note of the following details: * Example: - - ID: *3AM3UQ867SPQQA43P2F* + - ID: *Q3AM3UQ867SPQQA43P2F* - Key: *zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG* @@ -336,7 +336,7 @@ Reported Working S3-Compatible Storage `Minio v2018-09-12 `_ Set ``dataverse.files.s3-path-style-access=true``, as Minio works path-based. Works pretty smooth, easy to setup. **Can be used for quick testing, too:** just use the example values above. Uses the public (read: unsecure and - possibly slow) http://play.minio.io:9000 service. + possibly slow) https://play.minio.io:9000 service. **HINT:** If you are successfully using an S3 storage implementation not yet listed above, please feel free to @@ -421,7 +421,7 @@ S3 Storage Options ################## ========================================= ================== ================================================================== ============= -System Property Value Description Default value +JVM Option Value Description Default value ========================================= ================== ================================================================== ============= dataverse.files.storage-driver-id s3 Enable S3 storage driver. ``file`` dataverse.files.s3-bucket-name The bucket name. See above. (none)