From 8093dc11b316eef6cf592fb2ab8c768b41163be7 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 22 Jul 2020 18:04:38 -0400 Subject: [PATCH 1/7] add tsv to recognized extensions for mimetype determination --- .../propertyFiles/MimeTypeDetectionByFileExtension.properties | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties b/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties index 91191d2b588..e5dd147aeed 100644 --- a/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties +++ b/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties @@ -28,5 +28,6 @@ shp=application/shp shx=application/shx smcl=application/x-stata-smcl swc=application/x-swc +tsv=text/tsv xz=application/x-xz xlsx=application/vnd.openxmlformats-officedocument.spreadsheetml.sheet From ba33cd1227d5128820e08fdd317254f144183747 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 23 Jul 2020 13:17:34 -0400 Subject: [PATCH 2/7] enable direct upload/ingest of text/tsv when no mimetype is assigned by the browser --- src/main/java/META-INF/mime.types | 2 +- .../harvard/iq/dataverse/util/FileUtil.java | 25 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/main/java/META-INF/mime.types b/src/main/java/META-INF/mime.types index c9a18ae2d06..7e11a630e78 100644 --- a/src/main/java/META-INF/mime.types +++ b/src/main/java/META-INF/mime.types @@ -7,7 +7,7 @@ text/comma-separated-values csv CSV text/plain txt TXT text/xml xml XML # Common statistical data formats -text/tab-separated-values tab TAB tsv TSV +text/tsv tab TAB tsv TSV text/x-fixed-field dat DAT asc ASC application/x-rlang-transport Rdata RData rdata RDATA type/x-r-syntax r R diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 59dca5bb2da..38722c50a16 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -517,17 +517,20 @@ public static String determineFileType(File f, String fileName) throws IOExcepti } public static String determineFileTypeByExtension(String fileName) { - String mimetypesFileTypeMapResult = MIME_TYPE_MAP.getContentType(fileName); - logger.fine("MimetypesFileTypeMap type by extension, for " + fileName + ": " + mimetypesFileTypeMapResult); - if (mimetypesFileTypeMapResult != null) { - if ("application/octet-stream".equals(mimetypesFileTypeMapResult)) { - return lookupFileTypeFromPropertiesFile(fileName); - } else { - return mimetypesFileTypeMapResult; - } - } else { - return null; - } + + String fileType = MIME_TYPE_MAP.getContentType(fileName); + logger.fine("MimetypesFileTypeMap type by extension, for " + fileName + ": " + fileType); + if (fileType != null) { + if ("application/octet-stream".equals(fileType)) { + fileType = lookupFileTypeFromPropertiesFile(fileName); + } + } + //Do a check for statistical files if we have plain text or the alt tsv mimetype (which won't be ingested) +// String fileExtension = getFileExtension(fileName); +// if (fileType != null && (fileType.startsWith("text/plain")|| fileType.startsWith(MIME_TYPE_TSV_ALT)) && STATISTICAL_FILE_EXTENSION.containsKey(fileExtension)) { +// fileType = STATISTICAL_FILE_EXTENSION.get(fileExtension); +// } + return fileType; } public static String lookupFileTypeFromPropertiesFile(String fileName) { From 4d930df0a5b5203a8a40ebccc3b16cc94daaedea Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 22 Jul 2020 18:05:36 -0400 Subject: [PATCH 3/7] use mimetype determination by extension for direct upload --- .../java/edu/harvard/iq/dataverse/util/FileUtil.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 38722c50a16..f90b7f58da8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -1094,8 +1094,13 @@ public static List createDataFiles(DatasetVersion version, InputStream } } else { - //Remote file, trust supplier - finalType = suppliedContentType; + if(suppliedContentType==FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT) { + finalType=determineFileTypeByExtension(fileName); + logger.fine("Determined type: " + finalType); + } else { + //Remote file, trust supplier + finalType = suppliedContentType; + } } // Finally, if none of the special cases above were applicable (or // if we were unable to unpack an uploaded file, etc.), we'll just From 8ae270b9a04924ac8fd0b1b2371b9e96e38800ff Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 23 Jul 2020 14:06:16 -0400 Subject: [PATCH 4/7] remove unnecessary changes --- .../harvard/iq/dataverse/util/FileUtil.java | 25 ++++++++----------- ...imeTypeDetectionByFileExtension.properties | 1 - 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index f90b7f58da8..80cc68feb5f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -517,20 +517,17 @@ public static String determineFileType(File f, String fileName) throws IOExcepti } public static String determineFileTypeByExtension(String fileName) { - - String fileType = MIME_TYPE_MAP.getContentType(fileName); - logger.fine("MimetypesFileTypeMap type by extension, for " + fileName + ": " + fileType); - if (fileType != null) { - if ("application/octet-stream".equals(fileType)) { - fileType = lookupFileTypeFromPropertiesFile(fileName); - } - } - //Do a check for statistical files if we have plain text or the alt tsv mimetype (which won't be ingested) -// String fileExtension = getFileExtension(fileName); -// if (fileType != null && (fileType.startsWith("text/plain")|| fileType.startsWith(MIME_TYPE_TSV_ALT)) && STATISTICAL_FILE_EXTENSION.containsKey(fileExtension)) { -// fileType = STATISTICAL_FILE_EXTENSION.get(fileExtension); -// } - return fileType; + String mimetypesFileTypeMapResult = MIME_TYPE_MAP.getContentType(fileName); + logger.fine("MimetypesFileTypeMap type by extension, for " + fileName + ": " + mimetypesFileTypeMapResult); + if (mimetypesFileTypeMapResult != null) { + if ("application/octet-stream".equals(mimetypesFileTypeMapResult)) { + return lookupFileTypeFromPropertiesFile(fileName); + } else { + return mimetypesFileTypeMapResult; + } + } else { + return null; + } } public static String lookupFileTypeFromPropertiesFile(String fileName) { diff --git a/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties b/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties index e5dd147aeed..91191d2b588 100644 --- a/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties +++ b/src/main/java/propertyFiles/MimeTypeDetectionByFileExtension.properties @@ -28,6 +28,5 @@ shp=application/shp shx=application/shx smcl=application/x-stata-smcl swc=application/x-swc -tsv=text/tsv xz=application/x-xz xlsx=application/vnd.openxmlformats-officedocument.spreadsheetml.sheet From 22572a59f05199f68b0b333a1ebe6c8ff08a7689 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 7 Aug 2020 10:03:03 -0400 Subject: [PATCH 5/7] avoid a null/blank finalType --- .../edu/harvard/iq/dataverse/util/FileUtil.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 80cc68feb5f..ec280ce28a8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -83,6 +83,8 @@ import org.apache.commons.io.FilenameUtils; import com.amazonaws.AmazonServiceException; +import com.nimbusds.oauth2.sdk.util.StringUtils; + import edu.harvard.iq.dataverse.dataaccess.DataAccessOption; import edu.harvard.iq.dataverse.dataaccess.StorageIO; import java.util.Arrays; @@ -1091,12 +1093,14 @@ public static List createDataFiles(DatasetVersion version, InputStream } } else { + //Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied + finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType; if(suppliedContentType==FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT) { - finalType=determineFileTypeByExtension(fileName); - logger.fine("Determined type: " + finalType); - } else { - //Remote file, trust supplier - finalType = suppliedContentType; + String type=determineFileTypeByExtension(fileName); + if(!StringUtils.isBlank(type)) { + finalType=type; + logger.fine("Determined type: " + finalType); + } } } // Finally, if none of the special cases above were applicable (or From 5715f0e780245c161bdbc52d4078c9b62f3ddd64 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 12 Aug 2020 16:46:48 -0400 Subject: [PATCH 6/7] still check if suppliedType was blank --- src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index e5ccfc64d4a..080a4712529 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -1095,7 +1095,7 @@ public static List createDataFiles(DatasetVersion version, InputStream } else { //Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType; - if(suppliedContentType==FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT) { + if(finalType==FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT) { String type=determineFileTypeByExtension(fileName); if(!StringUtils.isBlank(type)) { finalType=type; From b3be92f4ae977d7883abf1a1b1fed151daa9d7f0 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 12 Aug 2020 17:47:55 -0400 Subject: [PATCH 7/7] Use same rules for replacing mimetypes in direct upload as in normal upload. This will replace generic types as in this issue as well as replace text/tab-separated-value mimetypes so that tsv files get ingested and will stop the browser supplied mimetypes for r, stata, etc. in the same way that normal upload does. --- .../harvard/iq/dataverse/util/FileUtil.java | 90 ++++++++++--------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 080a4712529..b8bd7a6a929 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -155,6 +155,7 @@ public class FileUtil implements java.io.Serializable { public static final String SAVED_ORIGINAL_FILENAME_EXTENSION = "orig"; + //Todo - this is the same as MIME_TYPE_TSV_ALT public static final String MIME_TYPE_INGESTED_FILE = "text/tab-separated-values"; // File type "thumbnail classes" tags: @@ -760,39 +761,8 @@ public static List createDataFiles(DatasetVersion version, InputStream recognizedType = determineFileType(tempFile.toFile(), fileName); logger.fine("File utility recognized the file as " + recognizedType); if (recognizedType != null && !recognizedType.equals("")) { - // is it any better than the type that was supplied to us, - // if any? - // This is not as trivial a task as one might expect... - // We may need a list of "good" mime types, that should always - // be chosen over other choices available. Maybe it should - // even be a weighed list... as in, "application/foo" should - // be chosen over "application/foo-with-bells-and-whistles". - - // For now the logic will be as follows: - // - // 1. If the contentType supplied (by the browser, most likely) - // is some form of "unknown", we always discard it in favor of - // whatever our own utilities have determined; - // 2. We should NEVER trust the browser when it comes to the - // following "ingestable" types: Stata, SPSS, R; - // 2a. We are willing to TRUST the browser when it comes to - // the CSV and XSLX ingestable types. - // 3. We should ALWAYS trust our utilities when it comes to - // ingestable types. - - if (suppliedContentType == null - || suppliedContentType.equals("") - || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_DEFAULT) - || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_BINARY) - || (canIngestAsTabular(suppliedContentType) - && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV) - && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV_ALT) - && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_XLSX)) - || canIngestAsTabular(recognizedType) - || recognizedType.equals("application/fits-gzipped") - || recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE) - || recognizedType.equals(MIME_TYPE_ZIP)) { - finalType = recognizedType; + if(useRecognizedType(suppliedContentType, recognizedType)) { + finalType=recognizedType; } } @@ -1093,14 +1063,15 @@ public static List createDataFiles(DatasetVersion version, InputStream } } else { - //Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied - finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType; - if(finalType==FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT) { - String type=determineFileTypeByExtension(fileName); - if(!StringUtils.isBlank(type)) { - finalType=type; - logger.fine("Determined type: " + finalType); - } + // Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied + finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType; + String type = determineFileTypeByExtension(fileName); + if (!StringUtils.isBlank(type)) { + //Use rules for deciding when to trust browser supplied type + if (useRecognizedType(finalType, type)) { + finalType = type; + } + logger.fine("Supplied type: " + suppliedContentType + ", finalType: " + finalType); } } // Finally, if none of the special cases above were applicable (or @@ -1135,7 +1106,42 @@ public static List createDataFiles(DatasetVersion version, InputStream } // end createDataFiles - private static File saveInputStreamInTempFile(InputStream inputStream, Long fileSizeLimit) + private static boolean useRecognizedType(String suppliedContentType, String recognizedType) { + // is it any better than the type that was supplied to us, + // if any? + // This is not as trivial a task as one might expect... + // We may need a list of "good" mime types, that should always + // be chosen over other choices available. Maybe it should + // even be a weighed list... as in, "application/foo" should + // be chosen over "application/foo-with-bells-and-whistles". + + // For now the logic will be as follows: + // + // 1. If the contentType supplied (by the browser, most likely) + // is some form of "unknown", we always discard it in favor of + // whatever our own utilities have determined; + // 2. We should NEVER trust the browser when it comes to the + // following "ingestable" types: Stata, SPSS, R; + // 2a. We are willing to TRUST the browser when it comes to + // the CSV and XSLX ingestable types. + // 3. We should ALWAYS trust our utilities when it comes to + // ingestable types. + if (suppliedContentType == null || suppliedContentType.equals("") + || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_DEFAULT) + || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_BINARY) + || (canIngestAsTabular(suppliedContentType) + && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV) + && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV_ALT) + && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_XLSX)) + || canIngestAsTabular(recognizedType) || recognizedType.equals("application/fits-gzipped") + || recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE) + || recognizedType.equals(MIME_TYPE_ZIP)) { + return true; + } + return false; + } + + private static File saveInputStreamInTempFile(InputStream inputStream, Long fileSizeLimit) throws IOException, FileExceedsMaxSizeException { Path tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload");