Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/main/java/META-INF/mime.types
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ text/comma-separated-values csv CSV
text/plain txt TXT
text/xml xml XML
# Common statistical data formats
text/tab-separated-values tab TAB tsv TSV
text/tsv tab TAB tsv TSV
text/x-fixed-field dat DAT asc ASC
application/x-rlang-transport Rdata RData rdata RDATA
type/x-r-syntax r R
Expand Down
87 changes: 51 additions & 36 deletions src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
import org.apache.commons.io.FilenameUtils;

import com.amazonaws.AmazonServiceException;
import com.nimbusds.oauth2.sdk.util.StringUtils;

import edu.harvard.iq.dataverse.dataaccess.DataAccessOption;
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
import java.util.Arrays;
Expand Down Expand Up @@ -153,6 +155,7 @@ public class FileUtil implements java.io.Serializable {

public static final String SAVED_ORIGINAL_FILENAME_EXTENSION = "orig";

//Todo - this is the same as MIME_TYPE_TSV_ALT
public static final String MIME_TYPE_INGESTED_FILE = "text/tab-separated-values";

// File type "thumbnail classes" tags:
Expand Down Expand Up @@ -758,39 +761,8 @@ public static List<DataFile> createDataFiles(DatasetVersion version, InputStream
recognizedType = determineFileType(tempFile.toFile(), fileName);
logger.fine("File utility recognized the file as " + recognizedType);
if (recognizedType != null && !recognizedType.equals("")) {
// is it any better than the type that was supplied to us,
// if any?
// This is not as trivial a task as one might expect...
// We may need a list of "good" mime types, that should always
// be chosen over other choices available. Maybe it should
// even be a weighed list... as in, "application/foo" should
// be chosen over "application/foo-with-bells-and-whistles".

// For now the logic will be as follows:
//
// 1. If the contentType supplied (by the browser, most likely)
// is some form of "unknown", we always discard it in favor of
// whatever our own utilities have determined;
// 2. We should NEVER trust the browser when it comes to the
// following "ingestable" types: Stata, SPSS, R;
// 2a. We are willing to TRUST the browser when it comes to
// the CSV and XSLX ingestable types.
// 3. We should ALWAYS trust our utilities when it comes to
// ingestable types.

if (suppliedContentType == null
|| suppliedContentType.equals("")
|| suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_DEFAULT)
|| suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_BINARY)
|| (canIngestAsTabular(suppliedContentType)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV_ALT)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_XLSX))
|| canIngestAsTabular(recognizedType)
|| recognizedType.equals("application/fits-gzipped")
|| recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)
|| recognizedType.equals(MIME_TYPE_ZIP)) {
finalType = recognizedType;
if(useRecognizedType(suppliedContentType, recognizedType)) {
finalType=recognizedType;
}
}

Expand Down Expand Up @@ -1091,8 +1063,16 @@ public static List<DataFile> createDataFiles(DatasetVersion version, InputStream

}
} else {
//Remote file, trust supplier
finalType = suppliedContentType;
// Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied
finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType;
String type = determineFileTypeByExtension(fileName);
if (!StringUtils.isBlank(type)) {
//Use rules for deciding when to trust browser supplied type
if (useRecognizedType(finalType, type)) {
finalType = type;
}
logger.fine("Supplied type: " + suppliedContentType + ", finalType: " + finalType);
}
}
// Finally, if none of the special cases above were applicable (or
// if we were unable to unpack an uploaded file, etc.), we'll just
Expand Down Expand Up @@ -1126,7 +1106,42 @@ public static List<DataFile> createDataFiles(DatasetVersion version, InputStream
} // end createDataFiles


private static File saveInputStreamInTempFile(InputStream inputStream, Long fileSizeLimit)
private static boolean useRecognizedType(String suppliedContentType, String recognizedType) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just moved and made to return a boolean, no logic changes.

// is it any better than the type that was supplied to us,
// if any?
// This is not as trivial a task as one might expect...
// We may need a list of "good" mime types, that should always
// be chosen over other choices available. Maybe it should
// even be a weighed list... as in, "application/foo" should
// be chosen over "application/foo-with-bells-and-whistles".

// For now the logic will be as follows:
//
// 1. If the contentType supplied (by the browser, most likely)
// is some form of "unknown", we always discard it in favor of
// whatever our own utilities have determined;
// 2. We should NEVER trust the browser when it comes to the
// following "ingestable" types: Stata, SPSS, R;
// 2a. We are willing to TRUST the browser when it comes to
// the CSV and XSLX ingestable types.
// 3. We should ALWAYS trust our utilities when it comes to
// ingestable types.
if (suppliedContentType == null || suppliedContentType.equals("")
|| suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_DEFAULT)
|| suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_BINARY)
|| (canIngestAsTabular(suppliedContentType)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV_ALT)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_XLSX))
|| canIngestAsTabular(recognizedType) || recognizedType.equals("application/fits-gzipped")
|| recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)
|| recognizedType.equals(MIME_TYPE_ZIP)) {
return true;
}
return false;
}

private static File saveInputStreamInTempFile(InputStream inputStream, Long fileSizeLimit)
throws IOException, FileExceedsMaxSizeException {
Path tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload");

Expand Down