diff --git a/src/main/java/META-INF/mime.types b/src/main/java/META-INF/mime.types index c9a18ae2d06..7e11a630e78 100644 --- a/src/main/java/META-INF/mime.types +++ b/src/main/java/META-INF/mime.types @@ -7,7 +7,7 @@ text/comma-separated-values csv CSV text/plain txt TXT text/xml xml XML # Common statistical data formats -text/tab-separated-values tab TAB tsv TSV +text/tsv tab TAB tsv TSV text/x-fixed-field dat DAT asc ASC application/x-rlang-transport Rdata RData rdata RDATA type/x-r-syntax r R diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 608566bcae1..b8bd7a6a929 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -83,6 +83,8 @@ import org.apache.commons.io.FilenameUtils; import com.amazonaws.AmazonServiceException; +import com.nimbusds.oauth2.sdk.util.StringUtils; + import edu.harvard.iq.dataverse.dataaccess.DataAccessOption; import edu.harvard.iq.dataverse.dataaccess.StorageIO; import java.util.Arrays; @@ -153,6 +155,7 @@ public class FileUtil implements java.io.Serializable { public static final String SAVED_ORIGINAL_FILENAME_EXTENSION = "orig"; + //Todo - this is the same as MIME_TYPE_TSV_ALT public static final String MIME_TYPE_INGESTED_FILE = "text/tab-separated-values"; // File type "thumbnail classes" tags: @@ -758,39 +761,8 @@ public static List createDataFiles(DatasetVersion version, InputStream recognizedType = determineFileType(tempFile.toFile(), fileName); logger.fine("File utility recognized the file as " + recognizedType); if (recognizedType != null && !recognizedType.equals("")) { - // is it any better than the type that was supplied to us, - // if any? - // This is not as trivial a task as one might expect... - // We may need a list of "good" mime types, that should always - // be chosen over other choices available. Maybe it should - // even be a weighed list... as in, "application/foo" should - // be chosen over "application/foo-with-bells-and-whistles". - - // For now the logic will be as follows: - // - // 1. If the contentType supplied (by the browser, most likely) - // is some form of "unknown", we always discard it in favor of - // whatever our own utilities have determined; - // 2. We should NEVER trust the browser when it comes to the - // following "ingestable" types: Stata, SPSS, R; - // 2a. We are willing to TRUST the browser when it comes to - // the CSV and XSLX ingestable types. - // 3. We should ALWAYS trust our utilities when it comes to - // ingestable types. - - if (suppliedContentType == null - || suppliedContentType.equals("") - || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_DEFAULT) - || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_BINARY) - || (canIngestAsTabular(suppliedContentType) - && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV) - && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV_ALT) - && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_XLSX)) - || canIngestAsTabular(recognizedType) - || recognizedType.equals("application/fits-gzipped") - || recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE) - || recognizedType.equals(MIME_TYPE_ZIP)) { - finalType = recognizedType; + if(useRecognizedType(suppliedContentType, recognizedType)) { + finalType=recognizedType; } } @@ -1091,8 +1063,16 @@ public static List createDataFiles(DatasetVersion version, InputStream } } else { - //Remote file, trust supplier - finalType = suppliedContentType; + // Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied + finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType; + String type = determineFileTypeByExtension(fileName); + if (!StringUtils.isBlank(type)) { + //Use rules for deciding when to trust browser supplied type + if (useRecognizedType(finalType, type)) { + finalType = type; + } + logger.fine("Supplied type: " + suppliedContentType + ", finalType: " + finalType); + } } // Finally, if none of the special cases above were applicable (or // if we were unable to unpack an uploaded file, etc.), we'll just @@ -1126,7 +1106,42 @@ public static List createDataFiles(DatasetVersion version, InputStream } // end createDataFiles - private static File saveInputStreamInTempFile(InputStream inputStream, Long fileSizeLimit) + private static boolean useRecognizedType(String suppliedContentType, String recognizedType) { + // is it any better than the type that was supplied to us, + // if any? + // This is not as trivial a task as one might expect... + // We may need a list of "good" mime types, that should always + // be chosen over other choices available. Maybe it should + // even be a weighed list... as in, "application/foo" should + // be chosen over "application/foo-with-bells-and-whistles". + + // For now the logic will be as follows: + // + // 1. If the contentType supplied (by the browser, most likely) + // is some form of "unknown", we always discard it in favor of + // whatever our own utilities have determined; + // 2. We should NEVER trust the browser when it comes to the + // following "ingestable" types: Stata, SPSS, R; + // 2a. We are willing to TRUST the browser when it comes to + // the CSV and XSLX ingestable types. + // 3. We should ALWAYS trust our utilities when it comes to + // ingestable types. + if (suppliedContentType == null || suppliedContentType.equals("") + || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_DEFAULT) + || suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_BINARY) + || (canIngestAsTabular(suppliedContentType) + && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV) + && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV_ALT) + && !suppliedContentType.equalsIgnoreCase(MIME_TYPE_XLSX)) + || canIngestAsTabular(recognizedType) || recognizedType.equals("application/fits-gzipped") + || recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE) + || recognizedType.equals(MIME_TYPE_ZIP)) { + return true; + } + return false; + } + + private static File saveInputStreamInTempFile(InputStream inputStream, Long fileSizeLimit) throws IOException, FileExceedsMaxSizeException { Path tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload");