From 42b695871c4faab715d96de6413ef6dfb296b57a Mon Sep 17 00:00:00 2001 From: Eryk Kulikowski Date: Fri, 20 Oct 2023 18:19:27 +0200 Subject: [PATCH 1/7] disable s3 tagging JVM option --- .../10022_upload_redirect_without_tagging.md | 1 + .../source/installation/config.rst | 8 +++ .../iq/dataverse/dataaccess/S3AccessIO.java | 11 ++- .../iq/dataverse/settings/JvmSettings.java | 2 + src/main/webapp/resources/js/fileupload.js | 70 ++++++++++--------- 5 files changed, 57 insertions(+), 35 deletions(-) create mode 100644 doc/release-notes/10022_upload_redirect_without_tagging.md diff --git a/doc/release-notes/10022_upload_redirect_without_tagging.md b/doc/release-notes/10022_upload_redirect_without_tagging.md new file mode 100644 index 00000000000..379c7c8f1e0 --- /dev/null +++ b/doc/release-notes/10022_upload_redirect_without_tagging.md @@ -0,0 +1 @@ +If your S3 store does not support tagging and gives an error when redirecting uploads, from now on, you can disable the tagging by using the ``dataverse.files..disable-tagging`` JVM option. Disabling the tagging can result in leftover files that are not used by your Dataverse instance and should be removed to preserve the storage space. To clean up the leftover files, you can use the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. 
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index ce8876b012c..2b5fd75977c 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -792,6 +792,13 @@ Larger installations may want to increase the number of open S3 connections allo ``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"`` +By default, when redirecting an upload to the S3 storage, the Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup when the file is not added to the dataset after upload (e.g., when the user cancels the operation). +If your S3 store does not support tagging and gives an error when redirecting uploads, you can disable that tag by using the ``dataverse.files..disable-tagging`` JVM option. For example: + +``./asadmin create-jvm-options "-Ddataverse.files..disable-tagging=true"`` + +Disabling the ``temp`` tag can result in leftover files that are not used by your Dataverse instance and should be removed to preserve the storage space. To clean up the leftover files, you can use the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. + In case you would like to configure Dataverse to use a custom S3 service instead of Amazon S3 services, please add the options for the custom URL and region as documented below. Please read above if your desired combination has been tested already and what other options have been set for a successful integration. @@ -825,6 +832,7 @@ List of S3 Storage Options dataverse.files..payload-signing ``true``/``false`` Enable payload signing. Optional ``false`` dataverse.files..chunked-encoding ``true``/``false`` Disable chunked encoding. 
Optional ``true`` dataverse.files..connection-pool-size The maximum number of open connections to the S3 server ``256`` + dataverse.files..disable-tagging ``true``/``false`` Do not place the ``temp`` tag when redirecting the upload to the S3 server ``false`` =========================================== ================== =================================================================================== ============= .. table:: diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 822ada0b83e..c904651e81f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -39,6 +39,7 @@ import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.datavariable.DataVariable; +import edu.harvard.iq.dataverse.settings.JvmSettings; import edu.harvard.iq.dataverse.util.FileUtil; import opennlp.tools.util.StringUtil; @@ -985,7 +986,10 @@ private String generateTemporaryS3UploadUrl(String key, Date expiration) throws GeneratePresignedUrlRequest generatePresignedUrlRequest = new GeneratePresignedUrlRequest(bucketName, key).withMethod(HttpMethod.PUT).withExpiration(expiration); //Require user to add this header to indicate a temporary file - generatePresignedUrlRequest.putCustomRequestHeader(Headers.S3_TAGGING, "dv-state=temp"); + final boolean taggingDisabled = JvmSettings.DISABLE_S3_TAGGING.lookupOptional(Boolean.class, this.driverId).orElse(false); + if (!taggingDisabled) { + generatePresignedUrlRequest.putCustomRequestHeader(Headers.S3_TAGGING, "dv-state=temp"); + } URL presignedUrl; try { @@ -1034,7 +1038,10 @@ public JsonObjectBuilder generateTemporaryS3UploadUrls(String globalId, String s } else { JsonObjectBuilder urls = Json.createObjectBuilder(); InitiateMultipartUploadRequest initiationRequest = new 
InitiateMultipartUploadRequest(bucketName, key); - initiationRequest.putCustomRequestHeader(Headers.S3_TAGGING, "dv-state=temp"); + final boolean taggingDisabled = JvmSettings.DISABLE_S3_TAGGING.lookupOptional(Boolean.class, this.driverId).orElse(false); + if (!taggingDisabled) { + initiationRequest.putCustomRequestHeader(Headers.S3_TAGGING, "dv-state=temp"); + } InitiateMultipartUploadResult initiationResponse = s3.initiateMultipartUpload(initiationRequest); String uploadId = initiationResponse.getUploadId(); for (int i = 1; i <= (fileSize / minPartSize) + (fileSize % minPartSize > 0 ? 1 : 0); i++) { diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index cc3272413c7..794ffd5e0af 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -50,6 +50,8 @@ public enum JvmSettings { UPLOADS_DIRECTORY(SCOPE_FILES, "uploads"), DOCROOT_DIRECTORY(SCOPE_FILES, "docroot"), GUESTBOOK_AT_REQUEST(SCOPE_FILES, "guestbook-at-request"), + SCOPE_DRIVER(SCOPE_FILES), + DISABLE_S3_TAGGING(SCOPE_DRIVER, "disable-tagging"), // SOLR INDEX SETTINGS SCOPE_SOLR(PREFIX, "solr"), diff --git a/src/main/webapp/resources/js/fileupload.js b/src/main/webapp/resources/js/fileupload.js index 08d6956b62c..03ec82f214b 100644 --- a/src/main/webapp/resources/js/fileupload.js +++ b/src/main/webapp/resources/js/fileupload.js @@ -192,41 +192,45 @@ var fileUpload = class fileUploadClass { progBar.html(''); progBar.append($('').attr('class', 'ui-progressbar ui-widget ui-widget-content ui-corner-all')); if(this.urls.hasOwnProperty("url")) { - $.ajax({ - url: this.urls.url, - headers: { "x-amz-tagging": "dv-state=temp" }, - type: 'PUT', - data: this.file, - context:this, - cache: false, - processData: false, - success: function() { - //ToDo - cancelling abandons the file. 
It is marked as temp so can be cleaned up later, but would be good to remove now (requires either sending a presigned delete URL or adding a callback to delete only a temp file - if(!cancelled) { - this.reportUpload(); - } - }, - error: function(jqXHR, textStatus, errorThrown) { - console.log('Failure: ' + jqXHR.status); - console.log('Failure: ' + errorThrown); - uploadFailure(jqXHR, thisFile); - }, - xhr: function() { - var myXhr = $.ajaxSettings.xhr(); - if (myXhr.upload) { - myXhr.upload.addEventListener('progress', function(e) { - if (e.lengthComputable) { - var doublelength = 2 * e.total; - progBar.children('progress').attr({ - value: e.loaded, - max: doublelength - }); - } - }); + const url = this.urls.url; + const request = { + url: url, + type: 'PUT', + data: this.file, + context:this, + cache: false, + processData: false, + success: function() { + //ToDo - cancelling abandons the file. It is marked as temp so can be cleaned up later, but would be good to remove now (requires either sending a presigned delete URL or adding a callback to delete only a temp file + if(!cancelled) { + this.reportUpload(); + } + }, + error: function(jqXHR, textStatus, errorThrown) { + console.log('Failure: ' + jqXHR.status); + console.log('Failure: ' + errorThrown); + uploadFailure(jqXHR, thisFile); + }, + xhr: function() { + var myXhr = $.ajaxSettings.xhr(); + if (myXhr.upload) { + myXhr.upload.addEventListener('progress', function(e) { + if (e.lengthComputable) { + var doublelength = 2 * e.total; + progBar.children('progress').attr({ + value: e.loaded, + max: doublelength + }); + } + }); + } + return myXhr; } - return myXhr; + }; + if (url.includes("x-amz-tagging")) { + request.headers = { "x-amz-tagging": "dv-state=temp" }; } - }); + $.ajax(request); } else { var loaded=[]; this.etags=[]; From b471eb0f671610bb24f9219456eecd8f482a49ae Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 4 Apr 2024 13:02:13 -0400 Subject: [PATCH 2/7] Wordsmithing tag guidance --- 
doc/sphinx-guides/source/installation/config.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 12c7cc2f6f8..691b5ee12d7 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -792,12 +792,12 @@ Larger installations may want to increase the number of open S3 connections allo ``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"`` -By default, when redirecting an upload to the S3 storage, the Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup when the file is not added to the dataset after upload (e.g., when the user cancels the operation). +By default, when redirecting an upload to the S3 storage, the Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup if the file is not added to the dataset after upload (e.g., if the user cancels the operation). If your S3 store does not support tagging and gives an error when redirecting uploads, you can disable that tag by using the ``dataverse.files..disable-tagging`` JVM option. For example: ``./asadmin create-jvm-options "-Ddataverse.files..disable-tagging=true"`` -Disabling the ``temp`` tag can result in leftover files that are not used by your Dataverse instance and should be removed to preserve the storage space. To clean up the leftover files, you can use the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. +Disabling the ``temp`` tag makes it harder to identify abandoned files that are not used by your Dataverse instance (i.e. one cannot search for the temp tag in a delete script). These should still be removed to avoid wasting storage space. 
To clean up these files and any other leftover files, regardless of whether the temp tag is applied, you can use the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. In case you would like to configure Dataverse to use a custom S3 service instead of Amazon S3 services, please add the options for the custom URL and region as documented below. Please read above if your desired combination has From 9ef355b02ea1db7facf6b1f95e79d38f7ec972b3 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 4 Apr 2024 13:03:20 -0400 Subject: [PATCH 3/7] Update config.rst --- doc/sphinx-guides/source/installation/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 691b5ee12d7..7c83172f0b2 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -792,7 +792,7 @@ Larger installations may want to increase the number of open S3 connections allo ``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"`` -By default, when redirecting an upload to the S3 storage, the Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup if the file is not added to the dataset after upload (e.g., if the user cancels the operation). +By default, when redirecting an upload to the S3 storage, Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup if the file is not added to the dataset after upload (e.g., if the user cancels the operation). If your S3 store does not support tagging and gives an error when redirecting uploads, you can disable that tag by using the ``dataverse.files..disable-tagging`` JVM option. 
For example: ``./asadmin create-jvm-options "-Ddataverse.files..disable-tagging=true"`` From 596b5925c0675fe019f6fccf7e063759a5fcc232 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 4 Apr 2024 13:08:45 -0400 Subject: [PATCH 4/7] Update 10022_upload_redirect_without_tagging.md --- doc/release-notes/10022_upload_redirect_without_tagging.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/release-notes/10022_upload_redirect_without_tagging.md b/doc/release-notes/10022_upload_redirect_without_tagging.md index 379c7c8f1e0..107b05f2e2a 100644 --- a/doc/release-notes/10022_upload_redirect_without_tagging.md +++ b/doc/release-notes/10022_upload_redirect_without_tagging.md @@ -1 +1 @@ -If your S3 store does not support tagging and gives an error when redirecting uploads, from now on, you can disable the tagging by using the ``dataverse.files..disable-tagging`` JVM option. Disabling the tagging can result in leftover files that are not used by your Dataverse instance and should be removed to preserve the storage space. To clean up the leftover files, you can use the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. +If your S3 store does not support tagging and gives an error when redirecting uploads, you can disable the tagging by using the ``dataverse.files..disable-tagging`` JVM option. Disabling the tagging makes it harder to identify abandoned files (created in cases where the user does not complete the upload operation) with an external script but they can still be removed using the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. 
From be2b7838c21dccc2043125acb6568185968e7bc6 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 4 Apr 2024 13:12:13 -0400 Subject: [PATCH 5/7] Update JvmSettings.java --- .../java/edu/harvard/iq/dataverse/settings/JvmSettings.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 794ffd5e0af..0a5662b0e98 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -50,6 +50,8 @@ public enum JvmSettings { UPLOADS_DIRECTORY(SCOPE_FILES, "uploads"), DOCROOT_DIRECTORY(SCOPE_FILES, "docroot"), GUESTBOOK_AT_REQUEST(SCOPE_FILES, "guestbook-at-request"), + + //STORAGE DRIVER SETTINGS SCOPE_DRIVER(SCOPE_FILES), DISABLE_S3_TAGGING(SCOPE_DRIVER, "disable-tagging"), From 22e5b1aa2a224d2bd138f7c025118d37b0a645b1 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 16 Apr 2024 16:54:47 -0400 Subject: [PATCH 6/7] tweak docs and release note #10022 --- .../10022_upload_redirect_without_tagging.md | 6 +++++- doc/sphinx-guides/source/api/native-api.rst | 2 +- .../source/developers/big-data-support.rst | 7 ++++++- doc/sphinx-guides/source/installation/config.rst | 10 ++++++---- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/release-notes/10022_upload_redirect_without_tagging.md b/doc/release-notes/10022_upload_redirect_without_tagging.md index 107b05f2e2a..7ff17f08f4c 100644 --- a/doc/release-notes/10022_upload_redirect_without_tagging.md +++ b/doc/release-notes/10022_upload_redirect_without_tagging.md @@ -1 +1,5 @@ -If your S3 store does not support tagging and gives an error when redirecting uploads, you can disable the tagging by using the ``dataverse.files..disable-tagging`` JVM option. 
Disabling the tagging makes it harder to identify abandoned files (created in cases where the user does not complete the upload operation) with an external script but they can still be removed using the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. +If your S3 store does not support tagging and gives an error if you configure direct uploads, you can disable the tagging by using the ``dataverse.files..disable-tagging`` JVM option. For more details see https://dataverse-guide--10029.org.readthedocs.build/en/10029/developers/big-data-support.html#s3-tags #10022 and #10029. + +## New config options + +- dataverse.files..disable-tagging diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index 8e2a9d7b886..848f031178a 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -2013,7 +2013,7 @@ The fully expanded example above (without environment variables) looks like this .. _cleanup-storage-api: -Cleanup storage of a Dataset +Cleanup Storage of a Dataset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is an experimental feature and should be tested on your system before using it in production. diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 8d891e63317..4aba7881c1f 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -81,7 +81,12 @@ with the contents of the file cors.json as follows: Alternatively, you can enable CORS using the AWS S3 web interface, using json-encoded rules as in the example above. 
-Since the direct upload mechanism creates the final file rather than an intermediate temporary file, user actions, such as neither saving or canceling an upload session before closing the browser page, can leave an abandoned file in the store. The direct upload mechanism attempts to use S3 Tags to aid in identifying/removing such files. Upon upload, files are given a "dv-state":"temp" tag which is removed when the dataset changes are saved and the new file(s) are added in the Dataverse installation. Note that not all S3 implementations support Tags: Minio does not. WIth such stores, direct upload works, but Tags are not used. +.. _s3-tags: + +S3 Tags and Direct Upload +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since the direct upload mechanism creates the final file rather than an intermediate temporary file, user actions, such as neither saving or canceling an upload session before closing the browser page, can leave an abandoned file in the store. The direct upload mechanism attempts to use S3 tags to aid in identifying/removing such files. Upon upload, files are given a "dv-state":"temp" tag which is removed when the dataset changes are saved and new files are added in the Dataverse installation. Note that not all S3 implementations support tags. Minio, for example, does not. With such stores, direct upload may not work and you might need to disable tagging. For details, look for ``dataverse.files..disable-tagging`` under :ref:`list-of-s3-storage-options` in the Installation Guide. 
Trusted Remote Storage with the ``remote`` Store Type ----------------------------------------------------- diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index d32c65ef931..ae27a9727da 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -1189,12 +1189,12 @@ Larger installations may want to increase the number of open S3 connections allo ``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"`` -By default, when redirecting an upload to the S3 storage, Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup if the file is not added to the dataset after upload (e.g., if the user cancels the operation). -If your S3 store does not support tagging and gives an error when redirecting uploads, you can disable that tag by using the ``dataverse.files..disable-tagging`` JVM option. For example: +By default, when direct upload to an S3 store is configured, Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup in case the file is not added to the dataset after upload (e.g., if the user cancels the operation). (See :ref:`s3-tags`.) +If your S3 store does not support tagging and gives an error when direct upload is configured, you can disable the tagging by using the ``dataverse.files..disable-tagging`` JVM option. For example: ``./asadmin create-jvm-options "-Ddataverse.files..disable-tagging=true"`` -Disabling the ``temp`` tag makes it harder to identify abandoned files that are not used by your Dataverse instance (i.e. one cannot search for the temp tag in a delete script). These should still be removed to avoid wasting storage space. 
To clean up these files and any other leftover files, regardless of whether the temp tag is applied, you can use the [Cleanup Storage of a Dataset](https://guides.dataverse.org/en/5.13/api/native-api.html#cleanup-storage-of-a-dataset) API endpoint. +Disabling the ``temp`` tag makes it harder to identify abandoned files that are not used by your Dataverse instance (i.e. one cannot search for the ``temp`` tag in a delete script). These should still be removed to avoid wasting storage space. To clean up these files and any other leftover files, regardless of whether the ``temp`` tag is applied, you can use the :ref:`cleanup-storage-api` API endpoint. In case you would like to configure Dataverse to use a custom S3 service instead of Amazon S3 services, please add the options for the custom URL and region as documented below. Please read above if your desired combination has @@ -1202,6 +1202,8 @@ been tested already and what other options have been set for a successful integr Lastly, go ahead and restart your Payara server. With Dataverse deployed and the site online, you should be able to upload datasets and data files and see the corresponding files in your S3 bucket. Within a bucket, the folder structure emulates that found in local file storage. +.. _list-of-s3-storage-options: + List of S3 Storage Options ########################## @@ -1229,7 +1231,7 @@ List of S3 Storage Options dataverse.files..payload-signing ``true``/``false`` Enable payload signing. Optional ``false`` dataverse.files..chunked-encoding ``true``/``false`` Disable chunked encoding. Optional ``true`` dataverse.files..connection-pool-size The maximum number of open connections to the S3 server ``256`` - dataverse.files..disable-tagging ``true``/``false`` Do not place the ``temp`` tag when redirecting the upload to the S3 server ``false`` + dataverse.files..disable-tagging ``true``/``false`` Do not place the ``temp`` tag when redirecting the upload to the S3 server. 
``false`` =========================================== ================== =================================================================================== ============= .. table:: From e473d53bf73b2579f531c7d06d6c29b81bd8568c Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 17 Apr 2024 14:53:33 -0400 Subject: [PATCH 7/7] explain how to avoid error when tags are disabled #10022 --- .../source/developers/big-data-support.rst | 4 ++-- .../source/developers/s3-direct-upload-api.rst | 6 ++++++ doc/sphinx-guides/source/installation/config.rst | 12 +++++++++++- .../java/edu/harvard/iq/dataverse/api/UtilIT.java | 15 +++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 4aba7881c1f..087cdb6303e 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -81,12 +81,12 @@ with the contents of the file cors.json as follows: Alternatively, you can enable CORS using the AWS S3 web interface, using json-encoded rules as in the example above. -.. _s3-tags: +.. _s3-tags-and-direct-upload: S3 Tags and Direct Upload ~~~~~~~~~~~~~~~~~~~~~~~~~ -Since the direct upload mechanism creates the final file rather than an intermediate temporary file, user actions, such as neither saving or canceling an upload session before closing the browser page, can leave an abandoned file in the store. The direct upload mechanism attempts to use S3 tags to aid in identifying/removing such files. Upon upload, files are given a "dv-state":"temp" tag which is removed when the dataset changes are saved and new files are added in the Dataverse installation. Note that not all S3 implementations support tags. Minio, for example, does not. With such stores, direct upload may not work and you might need to disable tagging. 
For details, look for ``dataverse.files..disable-tagging`` under :ref:`list-of-s3-storage-options` in the Installation Guide. +Since the direct upload mechanism creates the final file rather than an intermediate temporary file, user actions, such as neither saving nor canceling an upload session before closing the browser page, can leave an abandoned file in the store. The direct upload mechanism attempts to use S3 tags to aid in identifying/removing such files. Upon upload, files are given a "dv-state":"temp" tag which is removed when the dataset changes are saved and new files are added in the Dataverse installation. Note that not all S3 implementations support tags. Minio, for example, does not. With such stores, direct upload may not work and you might need to disable tagging. For details, see :ref:`s3-tagging` in the Installation Guide. Trusted Remote Storage with the ``remote`` Store Type ----------------------------------------------------- diff --git a/doc/sphinx-guides/source/developers/s3-direct-upload-api.rst b/doc/sphinx-guides/source/developers/s3-direct-upload-api.rst index 0040c1fd3f0..33b8e434e6e 100644 --- a/doc/sphinx-guides/source/developers/s3-direct-upload-api.rst +++ b/doc/sphinx-guides/source/developers/s3-direct-upload-api.rst @@ -79,6 +79,12 @@ In the single part case, only one call to the supplied URL is required: curl -i -H 'x-amz-tagging:dv-state=temp' -X PUT -T "" +Or, if you have disabled S3 tagging (see :ref:`s3-tagging`), you should omit the header like this: + +.. code-block:: bash + + curl -i -X PUT -T "" + Note that without the ``-i`` flag, you should not expect any output from the command above. With the ``-i`` flag, you should expect to see a "200 OK" response. In the multipart case, the client must send each part and collect the 'eTag' responses from the server. 
The calls for this are the same as the one for the single part case except that each call should send a slice of the total file, with the last part containing the remaining bytes. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index ae27a9727da..75ae760aa4a 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -1189,13 +1189,23 @@ Larger installations may want to increase the number of open S3 connections allo ``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"`` -By default, when direct upload to an S3 store is configured, Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup in case the file is not added to the dataset after upload (e.g., if the user cancels the operation). (See :ref:`s3-tags`.) +.. _s3-tagging: + +S3 Tagging +########## + +By default, when direct upload to an S3 store is configured, Dataverse will place a ``temp`` tag on the file being uploaded for an easier cleanup in case the file is not added to the dataset after upload (e.g., if the user cancels the operation). (See :ref:`s3-tags-and-direct-upload`.) If your S3 store does not support tagging and gives an error when direct upload is configured, you can disable the tagging by using the ``dataverse.files..disable-tagging`` JVM option. For example: ``./asadmin create-jvm-options "-Ddataverse.files..disable-tagging=true"`` Disabling the ``temp`` tag makes it harder to identify abandoned files that are not used by your Dataverse instance (i.e. one cannot search for the ``temp`` tag in a delete script). These should still be removed to avoid wasting storage space. To clean up these files and any other leftover files, regardless of whether the ``temp`` tag is applied, you can use the :ref:`cleanup-storage-api` API endpoint. 
+Note that if you disable tagging, you should omit the ``x-amz-tagging:dv-state=temp`` header when using the :doc:`/developers/s3-direct-upload-api`, as noted in that section. + +Finalizing S3 Configuration +########################### + In case you would like to configure Dataverse to use a custom S3 service instead of Amazon S3 services, please add the options for the custom URL and region as documented below. Please read above if your desired combination has been tested already and what other options have been set for a successful integration. diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java index 03f41fc409d..ccbbe8bd619 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java @@ -2512,6 +2512,21 @@ static Response getUploadUrls(String idOrPersistentIdOfDataset, long sizeInBytes return requestSpecification.get("/api/datasets/" + idInPath + "/uploadurls?size=" + sizeInBytes + optionalQueryParam); } + /** + * If you set dataverse.files.localstack1.disable-tagging=true you will see + * an error like below. + * + * To avoid it, don't send the x-amz-tagging header. + */ + /* + + AccessDenied + There were headers present in the request which were not signed + 25ff2bb0-13c7-420e-8ae6-3d92677e4bd9 + 9Gjjt1m+cjU4OPvX9O9/8RuvnG41MRb/18Oux2o5H5MY7ISNTlXN+Dz9IG62/ILVxhAGI0qyPfg= + x-amz-tagging + + */ static Response uploadFileDirect(String url, InputStream inputStream) { return given() .header("x-amz-tagging", "dv-state=temp")