diff --git a/.gitignore b/.gitignore index 87c2918d902..e1915ef7ddc 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,7 @@ conf/docker-aio/dv/install/dvinstall.zip conf/docker-aio/testdata/ scripts/installer/default.config *.pem + +# do not track IntelliJ IDEA files +.idea +**/*.iml diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 145adeeb34a..a19536228b1 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -262,59 +262,142 @@ if your installation's :ref:`:PublicInstall` setting is true, or: You can configure this redirect properly in your cloud environment to generate a temporary URL for access to the Swift objects for computing. -Amazon S3 Storage -+++++++++++++++++ +Amazon S3 Storage (or Compatible) ++++++++++++++++++++++++++++++++++ -For institutions and organizations looking to use Amazon's S3 cloud storage for their installation, this can be set up manually through creation of the credentials and config files or automatically via the AWS console commands. +For institutions and organizations looking to use some kind of S3-based object storage for files uploaded to Dataverse, +this is entirely possible. You can either use Amazon Web Services or use some other, even on-site S3-compatible +storage (like Minio, Ceph RADOS S3 Gateway and many more). -You'll need an AWS account with an associated S3 bucket for your installation to use. From the S3 management console (e.g. ``_), you can poke around and get familiar with your bucket. We recommend using IAM (Identity and Access Management) to create a user with full S3 access and nothing more, for security reasons. See ``_ for more info on this process. +**Note:** The Dataverse Team is most familiar with AWS S3, and can provide support on its usage with Dataverse. Thanks to community contributions, the application's architecture also allows non-AWS S3 providers. 
The Dataverse Team can provide very limited support on these other providers. We recommend reaching out to the wider Dataverse community if you have questions. -Make note of the bucket's name and the region its data is hosted in. Dataverse and the AWS SDK make use of "AWS credentials profile file" and "AWS config profile file" located in ``~/.aws/`` where ``~`` is the home directory of the user you run Glassfish as. This file can be generated via either of two methods described below. It's also possible to use IAM Roles rather than the credentials file. Please note that in this case you will need anyway the config file to specify the region. +First: Set Up Accounts and Access Credentials +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Set Up credentials File Manually -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Dataverse and the AWS SDK make use of the "AWS credentials profile file" and "AWS config profile file" located in +``~/.aws/`` where ``~`` is the home directory of the user you run Glassfish as. This file can be generated via either +of two methods described below: -To create the ``credentials`` file manually, you will need to generate a key/secret key. The first step is to log onto your AWS web console (e.g. ``_). If you have created a user in AWS IAM, you can click on that user and generate the keys needed for Dataverse. +1. Manually through creation of the credentials and config files or +2. Automatically via the AWS console commands. -Once you have acquired the keys, they need to be added to the ``credentials`` file. The format for credentials is as follows: +Preparation When Using Amazon's S3 Service +########################################## -| ``[default]`` -| ``aws_access_key_id = `` -| ``aws_secret_access_key = `` +You'll need an AWS account with an associated S3 bucket for your installation to use. From the S3 management console +(e.g. ``_), you can poke around and get familiar with your bucket. 
-You must also specify the AWS region in the ``config`` file, for example: +**Make note** of the **bucket's name** and the **region** its data is hosted in. -| ``[default]`` -| ``region = us-east-1`` +To **create a user** with full S3 access and nothing more for security reasons, we recommend using IAM +(Identity and Access Management). See `IAM User Guide `_ +for more info on this process. -Place these two files in a folder named ``.aws`` under the home directory for the user running your Dataverse Glassfish instance. (From the `AWS Command Line Interface Documentation `_: "In order to separate credentials from less sensitive options, region and output format are stored in a separate file named config in the same folder") +**Generate the user keys** needed for Dataverse afterwards by clicking on the created user. +(You can skip this step when running on EC2, see below.) -Set Up Access Configuration Via Command Line Tools -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. TIP:: + If you are hosting Dataverse on an AWS EC2 instance alongside storage in S3, it is possible to use IAM Roles instead + of the credentials file (the file at ``~/.aws/credentials`` mentioned below). Please note that you will still need the + ``~/.aws/config`` file to specify the region. For more information on this option, see + http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html -Begin by installing the CLI tool `pip `_ to install the `AWS command line interface `_ if you don't have it. +Preparation When Using Custom S3-Compatible Service +################################################### -First, we'll get our access keys set up. If you already have your access keys configured, skip this step. From the command line, run: +We assume you have your S3-compatible custom storage in place, up and running, ready for service. 
-``pip install awscli`` +Please make note of the following details: -``aws configure`` +- **Endpoint URL** - consult the documentation of your service on how to find it. -You'll be prompted to enter your Access Key ID and secret key, which should be issued to your AWS account. The subsequent config steps after the access keys are up to you. For reference, the keys will be stored in ``~/.aws/credentials``, and your AWS access region in ``~/.aws/config``. + * Example: https://play.minio.io:9000 + +- **Region:** Optional, but some services might use it. Consult your service documentation. -Using an IAM Role with EC2 -^^^^^^^^^^^^^^^^^^^^^^^^^^ + * Example: *us-east-1* + +- **Access key ID and secret access key:** Usually you can generate access keys within the user profile of your service. -If you are hosting Dataverse on an AWS EC2 instance alongside storage in S3, it is possible to use IAM Roles instead of the credentials file (the file at ``~/.aws/credentials`` mentioned above). Please note that you will still need the ``~/.aws/config`` file to specify the region. For more information on this option, see http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html + * Example: + + - ID: *Q3AM3UQ867SPQQA43P2F* + + - Key: *zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG* + +- **Bucket name:** Dataverse will fail opening and uploading files on S3 if you don't create one. -Configure Dataverse to Use AWS/S3 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + * Example: *dataverse* -With your access to your bucket in place, we'll want to navigate to ``/usr/local/glassfish4/glassfish/bin/`` and execute the following ``asadmin`` commands to set up the proper JVM options. Recall that out of the box, Dataverse is configured to use local file storage. You'll need to delete the existing storage driver before setting the new one. 
-``./asadmin $ASADMIN_OPTS delete-jvm-options "\-Ddataverse.files.storage-driver-id=file"`` +Reported Working S3-Compatible Storage +###################################### + +`Minio v2018-09-12 `_ + Set ``dataverse.files.s3-path-style-access=true``, as Minio works path-based. Works pretty smooth, easy to setup. + **Can be used for quick testing, too:** just use the example values above. Uses the public (read: unsecure and + possibly slow) https://play.minio.io:9000 service. + + +**HINT:** If you are successfully using an S3 storage implementation not yet listed above, please feel free to +`open an issue at Github `_ and describe your setup. +We will be glad to add it here. + + +Manually Set Up Credentials File +################################ + +To create the ``~/.aws/credentials`` file manually, you will need to generate a key/secret key (see above). Once you have +acquired the keys, they need to be added to the ``credentials`` file. The format for credentials is as follows: + +:: + + [default] + aws_access_key_id = + aws_secret_access_key = + +While using Amazon's service, you must also specify the AWS region in the ``~/.aws/config`` file, for example: + +:: -``./asadmin $ASADMIN_OPTS create-jvm-options "\-Ddataverse.files.storage-driver-id=s3"`` + [default] + region = us-east-1 + +Place these two files in a folder named ``.aws`` under the home directory for the user running your Dataverse Glassfish +instance. (From the `AWS Command Line Interface Documentation `_: +"In order to separate credentials from less sensitive options, region and output format are stored in a separate file +named config in the same folder") + +Console Commands to Set Up Access Configuration +############################################### + +Begin by installing the CLI tool `pip `_ to install the +`AWS command line interface `_ if you don't have it. + +First, we'll get our access keys set up. If you already have your access keys configured, skip this step. 
+From the command line, run: + +- ``pip install awscli`` +- ``aws configure`` + +You'll be prompted to enter your Access Key ID and secret key, which should be issued to your AWS account. +The subsequent config steps after the access keys are up to you. For reference, the keys will be stored in +``~/.aws/credentials``, and your AWS access region in ``~/.aws/config``. + +**TIP:** When using a custom S3 URL endpoint, you need to add it to every ``aws`` call: ``aws --endpoint-url s3 ...`` + (you may omit it while configuring). + +Second: Configure Dataverse to use S3 Storage +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +With access to your bucket in place, we'll want to navigate to ``/usr/local/glassfish4/glassfish/bin/`` +and execute the following ``asadmin`` commands to set up the proper JVM options. Recall that out of the box, Dataverse +is configured to use local file storage. You'll need to delete the existing storage driver before setting the new one. + +:: + + ./asadmin $ASADMIN_OPTS delete-jvm-options "-Ddataverse.files.storage-driver-id=file" + ./asadmin $ASADMIN_OPTS create-jvm-options "-Ddataverse.files.storage-driver-id=s3" Then, we'll need to identify which S3 bucket we're using. Replace ``your_bucket_name`` with, of course, your bucket: @@ -326,10 +409,29 @@ Optionally, you can have users download files from S3 directly rather than havin If you enable ``dataverse.files.s3-download-redirect`` as described above, note that the S3 URLs expire after an hour by default but you can configure the expiration time using the ``dataverse.files.s3-url-expiration-minutes`` JVM option. 
Here's an example of setting the expiration time to 120 minutes: -``./asadmin create-jvm-options "-D dataverse.files.s3-url-expiration-minutes=120"`` +``./asadmin create-jvm-options "-Ddataverse.files.s3-url-expiration-minutes=120"`` + +In case you would like to configure Dataverse to use a custom S3 service instead of Amazon S3 services, please +add the options for the custom URL and region as documented below. Please read above if your desired combination has +been tested already and what other options have been set for a successful integration. Lastly, go ahead and restart your glassfish server. With Dataverse deployed and the site online, you should be able to upload datasets and data files and see the corresponding files in your S3 bucket. Within a bucket, the folder structure emulates that found in local file storage. +S3 Storage Options +################## + +========================================= ================== ================================================================== ============= +JVM Option Value Description Default value +========================================= ================== ================================================================== ============= +dataverse.files.storage-driver-id s3 Enable S3 storage driver. ``file`` +dataverse.files.s3-bucket-name The bucket name. See above. (none) +dataverse.files.s3-download-redirect ``true``/``false`` Enable direct download or proxy through Dataverse. ``false`` +dataverse.files.s3-url-expiration-minutes If direct downloads: time until links expire. Optional. 60 +dataverse.files.s3-custom-endpoint-url Use custom S3 endpoint. Needs URL either with or without protocol. (none) +dataverse.files.s3-custom-endpoint-region Only used when using custom endpoint. Optional. ``dataverse`` +dataverse.files.s3-path-style-access ``true``/``false`` Use path style buckets instead of subdomains. Optional. 
``false`` +========================================= ================== ================================================================== ============= + .. _Branding Your Installation: Branding Your Installation diff --git a/pom.xml b/pom.xml index c2e4f8a82a2..0dccecd0708 100644 --- a/pom.xml +++ b/pom.xml @@ -13,6 +13,8 @@ ${project.build.directory}/endorsed UTF-8 -Xdoclint:none + + 1.11.172 UTC en @@ -27,6 +29,7 @@ 5.3.1 5.3.1 1.3.1 + 2.22.0 @@ -383,7 +386,13 @@ org.mockito mockito-core - 2.22.0 + ${mockito.version} + test + + + org.mockito + mockito-junit-jupiter + ${mockito.version} test diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 530f7ee4a17..90af53d7a0e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -3,6 +3,7 @@ import com.amazonaws.AmazonClientException; import com.amazonaws.HttpMethod; import com.amazonaws.SdkClientException; +import com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; import com.amazonaws.services.s3.model.ObjectMetadata; @@ -43,6 +44,8 @@ import java.util.logging.Logger; import org.apache.commons.io.IOUtils; +import javax.validation.constraints.NotNull; + /** * * @author Matthew A Dunlap @@ -69,18 +72,49 @@ public S3AccessIO(T dvObject) { public S3AccessIO(T dvObject, DataAccessRequest req) { super(dvObject, req); this.setIsLocalFile(false); + try { - s3 = AmazonS3ClientBuilder.standard().defaultClient(); + // get a standard client, using the standard way of configuration the credentials, etc. + AmazonS3ClientBuilder s3CB = AmazonS3ClientBuilder.standard(); + // if the admin has set a system property (see below) we use this endpoint URL instead of the standard ones. 
+ if (!s3CEUrl.isEmpty()) { + s3CB.setEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(s3CEUrl, s3CERegion)); + } + // some custom S3 implementations require "PathStyleAccess" as they us a path, not a subdomain. default = false + s3CB.withPathStyleAccessEnabled(s3pathStyleAccess); + // let's build the client :-) + this.s3 = s3CB.build(); } catch (Exception e) { throw new AmazonClientException( - "Cannot instantiate a S3 client using; check your AWS credentials and region", - e); + "Cannot instantiate a S3 client using; check your AWS credentials and region", + e); } } + + public S3AccessIO(T dvObject, DataAccessRequest req, @NotNull AmazonS3 s3client) { + super(dvObject, req); + this.setIsLocalFile(false); + this.s3 = s3client; + } public static String S3_IDENTIFIER_PREFIX = "s3"; private AmazonS3 s3 = null; + /** + * Pass in a URL pointing to your S3 compatible storage. + * For possible values see https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/client/builder/AwsClientBuilder.EndpointConfiguration.html + */ + private String s3CEUrl = System.getProperty("dataverse.files.s3-custom-endpoint-url", ""); + /** + * Pass in a region to use for SigV4 signing of requests. + * Defaults to "dataverse" as it is not relevant for custom S3 implementations. + */ + private String s3CERegion = System.getProperty("dataverse.files.s3-custom-endpoint-region", "dataverse"); + /** + * Pass in a boolean value if path style access should be used within the S3 client. + * Anything but case-insensitive "true" will lead to value of false, which is default value, too. 
+ */ + private boolean s3pathStyleAccess = Boolean.parseBoolean(System.getProperty("dataverse.files.s3-path-style-access", "false")); private String bucketName = System.getProperty("dataverse.files.s3-bucket-name"); private String key; @@ -630,7 +664,7 @@ public InputStream getAuxFileAsInputStream(String auxItemTag) throws IOException } } - private String getDestinationKey(String auxItemTag) throws IOException { + String getDestinationKey(String auxItemTag) throws IOException { if (dvObject instanceof DataFile) { return getMainFileKey() + "." + auxItemTag; } else if (dvObject instanceof Dataset) { @@ -643,7 +677,16 @@ private String getDestinationKey(String auxItemTag) throws IOException { } } - private String getMainFileKey() throws IOException { + /** + * TODO: this function is not side effect free (sets instance variables key and bucketName). + * Is this good or bad? Need to ask @landreev + * + * Extract the file key from a file stored on S3. + * Follows template: "owner authority name"/"owner identifier"/"storage identifier without bucketname and protocol" + * @return Main File Key + * @throws IOException + */ + String getMainFileKey() throws IOException { if (key == null) { String baseKey = this.getDataFile().getOwner().getAuthorityForFileStorage() + "/" + this.getDataFile().getOwner().getIdentifierForFileStorage(); String storageIdentifier = dvObject.getStorageIdentifier(); @@ -723,7 +766,7 @@ public String generateTemporaryS3Url() throws IOException { } } - private int getUrlExpirationMinutes() { + int getUrlExpirationMinutes() { String optionValue = System.getProperty("dataverse.files.s3-url-expiration-minutes"); if (optionValue != null) { Integer num; diff --git a/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java b/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java new file mode 100644 index 00000000000..4ce821a5fee --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIOTest.java @@ -0,0 
/*
 * Copyright 2018 Forschungszentrum Jülich GmbH
 * SPDX-License-Identifier: Apache-2.0
 */
package edu.harvard.iq.dataverse.dataaccess;

import com.amazonaws.services.s3.AmazonS3;
import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.api.UtilIT;
import edu.harvard.iq.dataverse.mocks.MocksFactory;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import static org.junit.jupiter.api.Assertions.*;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.mockito.junit.jupiter.MockitoSettings;
import org.mockito.quality.Strictness;

import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Unit tests for {@link S3AccessIO}, exercising the S3-key and URL-expiration
 * logic with a mocked {@link AmazonS3} client, so no real S3 connection or
 * credentials are required.
 */
@ExtendWith(MockitoExtension.class)
@MockitoSettings(strictness = Strictness.STRICT_STUBS)
public class S3AccessIOTest {

    /** JVM option read by S3AccessIO.getUrlExpirationMinutes(). */
    private static final String URL_EXPIRATION_PROPERTY = "dataverse.files.s3-url-expiration-minutes";

    @Mock
    private AmazonS3 s3client;

    private S3AccessIO<Dataset> dataSetAccess;
    private S3AccessIO<DataFile> dataFileAccess;
    private Dataset dataSet;
    private DataFile dataFile;
    private String dataFileId;

    // Original value of the expiration system property, captured before each
    // test so tearDown() can restore it and avoid leaking global state into
    // other test classes in the same JVM.
    private String savedExpirationSetting;

    @BeforeEach
    public void setup() throws IOException {
        savedExpirationSetting = System.getProperty(URL_EXPIRATION_PROPERTY);

        dataFile = MocksFactory.makeDataFile();
        dataSet = MocksFactory.makeDataset();
        dataFile.setOwner(dataSet);
        dataFileId = UtilIT.getRandomIdentifier();
        // Storage identifier template: "s3://<bucket>:<file id>"
        dataFile.setStorageIdentifier("s3://bucket:" + dataFileId);
        dataSetAccess = new S3AccessIO<>(dataSet, null, s3client);
        dataFileAccess = new S3AccessIO<>(dataFile, null, s3client);
    }

    @AfterEach
    public void tearDown() {
        // Restore the system property exactly as it was before the test ran.
        if (savedExpirationSetting == null) {
            System.clearProperty(URL_EXPIRATION_PROPERTY);
        } else {
            System.setProperty(URL_EXPIRATION_PROPERTY, savedExpirationSetting);
        }
    }

    /*
    Still to be covered:
    ---------------------
    createTempFile
    getStorageLocation
    getFileSystemPath
    exists?
    getWriteChannel
    getOutputStream
    getDestinationKey

    DONE
    ---------------------
    getMainFileKey
    getUrlExpirationMinutes
    */

    @Test
    void keyNull_getMainFileKey() throws IOException {
        // given: the key follows "<owner authority>/<owner identifier>/<file id>"
        String authOwner = dataSet.getAuthority();
        String idOwner = dataSet.getIdentifier();

        // when
        String key = dataFileAccess.getMainFileKey();

        // then
        assertEquals(authOwner + "/" + idOwner + "/" + dataFileId, key);
    }

    @Test
    void keyNullstorageIdNullOrEmpty_getMainFileKey() throws IOException {
        // given: a missing storage identifier
        dataFile.setStorageIdentifier(null);
        // when & then
        assertThrows(FileNotFoundException.class, () -> { dataFileAccess.getMainFileKey(); });

        // given: an empty storage identifier
        dataFile.setStorageIdentifier("");
        // when & then
        assertThrows(FileNotFoundException.class, () -> { dataFileAccess.getMainFileKey(); });
    }

    @Test
    void keyNullstorageIdNull_getMainFileKey() throws IOException {
        // given: a storage identifier with a non-S3 protocol
        dataFile.setStorageIdentifier("invalid://abcd");
        // when & then
        assertThrows(IOException.class, () -> { dataFileAccess.getMainFileKey(); });
    }

    @Test
    void default_getUrlExpirationMinutes() {
        // given: no property set, the documented default of 60 minutes applies
        System.clearProperty(URL_EXPIRATION_PROPERTY);
        // when & then
        assertEquals(60, dataFileAccess.getUrlExpirationMinutes());
    }

    @Test
    void validSetting_getUrlExpirationMinutes() {
        // given: a valid numeric override
        System.setProperty(URL_EXPIRATION_PROPERTY, "120");
        // when & then
        assertEquals(120, dataFileAccess.getUrlExpirationMinutes());
    }

    @Test
    void invalidSetting_getUrlExpirationMinutes() {
        // given: a non-numeric value, which must fall back to the default
        System.setProperty(URL_EXPIRATION_PROPERTY, "NaN");
        // when & then
        assertEquals(60, dataFileAccess.getUrlExpirationMinutes());
    }

}