diff --git a/doc/release-notes/9117-file-type-detection.md b/doc/release-notes/9117-file-type-detection.md new file mode 100644 index 00000000000..462eaace8ed --- /dev/null +++ b/doc/release-notes/9117-file-type-detection.md @@ -0,0 +1,5 @@ +NetCDF and HDF5 files are now detected based on their content rather than just their file extension. + +Both "classic" NetCDF 3 files and more modern NetCDF 4 files are detected based on content. + +Detection for HDF4 files is only done through the file extension ".hdf", as before. diff --git a/modules/dataverse-parent/pom.xml b/modules/dataverse-parent/pom.xml index c1ba693da1b..e36a78b11be 100644 --- a/modules/dataverse-parent/pom.xml +++ b/modules/dataverse-parent/pom.xml @@ -299,6 +299,11 @@ true + + unidata-all + Unidata All + https://artifacts.unidata.ucar.edu/repository/unidata-all/ + dvn.private Local repository for hosting jars not available from network repositories. diff --git a/pom.xml b/pom.xml index c6459cfc55c..8b6f98c5896 100644 --- a/pom.xml +++ b/pom.xml @@ -25,6 +25,7 @@ 0.8.7 5.2.1 2.4.1 + 5.5.3 org.junit.jupiter diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 339de904f9e..257bc166ea0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -108,6 +108,8 @@ import java.util.Arrays; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; +import ucar.nc2.NetcdfFile; +import ucar.nc2.NetcdfFiles; /** * a 4.0 implementation of the DVN FileUtil; @@ -467,6 +469,11 @@ public static String determineFileType(File f, String fileName) throws IOExcepti fileType = "application/fits"; } } + + // step 3: Check if NetCDF or HDF5 + if (fileType == null) { + fileType = checkNetcdfOrHdf5(f); + } // step 3: check the mime type of this file with Jhove if (fileType == null){ @@ -669,6 +676,43 @@ private static boolean 
isGraphMLFile(File file) { return isGraphML; } + /** + * Detects NetCDF (including NetCDF-4) and HDF5 files from their content. + * @param file the file to inspect; may be null + * @return "application/netcdf", "application/x-hdf5", or null if not detected + */ + public static String checkNetcdfOrHdf5(File file) { + if (file == null) { + return null; + } + try (NetcdfFile netcdfFile = NetcdfFiles.open(file.getAbsolutePath())) { + if (netcdfFile == null) { + // Can't open as a NetCDF or HDF5 file. + return null; + } + String type = netcdfFile.getFileTypeId(); + if (type == null) { + return null; + } + switch (type) { + case "NetCDF": + case "NetCDF-4": + return "application/netcdf"; + case "HDF5": + return "application/x-hdf5"; + default: + return null; + } + } catch (IOException ex) { + // Could not be opened as NetCDF or HDF5. HDF4 files also end up here, + // so they are not detected by content. For HDF4 we've seen: + // "ucar.nc2.internal.iosp.hdf4.H4header makeDimension WARNING: + // **dimension length=0 ..." followed by + // "java.lang.IllegalArgumentException: Dimension length =0 must be > 0" + return null; + } + } + // from MD5Checksum.java public static String calculateChecksum(String datafile, ChecksumType checksumType) { diff --git a/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java index 6ed3755e049..1d481f18cf5 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java @@ -373,4 +373,40 @@ public void testIsThumbnailSupported() throws Exception { assertFalse(FileUtil.isThumbnailSupported(filewBogusContentType)); } } + + @Test + public void testNetcdfFile() throws IOException { + // We got madis-raob.nc from https://www.unidata.ucar.edu/software/netcdf/examples/files.html + // and named it "madis-raob" with no file extension for this test. 
+ String path = "src/test/resources/netcdf/"; + String pathAndFile = path + "madis-raob"; + File file = new File(pathAndFile); + String contentType = FileUtil.determineFileType(file, pathAndFile); + assertEquals("application/netcdf", contentType); + } + + @Test + public void testHdf5File() throws IOException { + // We got vlen_string_dset.h5 from https://github.com/h5py/h5py/blob/3.7.0/h5py/tests/data_files/vlen_string_dset.h5 + // and named it "vlen_string_dset" with no file extension for this test. + String path = "src/test/resources/hdf/hdf5/"; + String pathAndFile = path + "vlen_string_dset"; + File file = new File(pathAndFile); + String contentType = FileUtil.determineFileType(file, pathAndFile); + assertEquals("application/x-hdf5", contentType); + } + + @Test + public void testHdf4File() throws IOException { + // We got test.hdf from https://people.sc.fsu.edu/~jburkardt/data/hdf/hdf.html + // and named it "hdf4test" with no file extension for this test. + // HDF4 is the old format, the previous generation before HDF5. + // We can't detect it based on its content. 
+ String path = "src/test/resources/hdf/hdf4/"; + String pathAndFile = path + "hdf4test"; + File file = new File(pathAndFile); + String contentType = FileUtil.determineFileType(file, pathAndFile); + assertEquals("application/octet-stream", contentType); + } + } diff --git a/src/test/resources/hdf/hdf4/hdf4test b/src/test/resources/hdf/hdf4/hdf4test new file mode 100644 index 00000000000..4674fdde194 Binary files /dev/null and b/src/test/resources/hdf/hdf4/hdf4test differ diff --git a/src/test/resources/hdf/hdf5/vlen_string_dset b/src/test/resources/hdf/hdf5/vlen_string_dset new file mode 100644 index 00000000000..dd20547f8e9 Binary files /dev/null and b/src/test/resources/hdf/hdf5/vlen_string_dset differ diff --git a/src/test/resources/netcdf/madis-raob b/src/test/resources/netcdf/madis-raob new file mode 100644 index 00000000000..d0cae0d077d Binary files /dev/null and b/src/test/resources/netcdf/madis-raob differ