diff --git a/doc/release-notes/9117-file-type-detection.md b/doc/release-notes/9117-file-type-detection.md
new file mode 100644
index 00000000000..462eaace8ed
--- /dev/null
+++ b/doc/release-notes/9117-file-type-detection.md
@@ -0,0 +1,5 @@
+NetCDF and HDF5 files are now detected based on their content rather than just their file extension.
+
+Both "classic" NetCDF 3 files and more modern NetCDF 4 files are detected based on content.
+
+Detection of HDF4 files still relies solely on the ".hdf" file extension, as before.
diff --git a/modules/dataverse-parent/pom.xml b/modules/dataverse-parent/pom.xml
index c1ba693da1b..e36a78b11be 100644
--- a/modules/dataverse-parent/pom.xml
+++ b/modules/dataverse-parent/pom.xml
@@ -299,6 +299,11 @@
true
+
+ unidata-all
+ Unidata All
+ https://artifacts.unidata.ucar.edu/repository/unidata-all/
+
dvn.private
Local repository for hosting jars not available from network repositories.
diff --git a/pom.xml b/pom.xml
index c6459cfc55c..8b6f98c5896 100644
--- a/pom.xml
+++ b/pom.xml
@@ -25,6 +25,7 @@
0.8.7
5.2.1
2.4.1
+ 5.5.3
org.junit.jupiter
diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
index 339de904f9e..257bc166ea0 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
@@ -108,6 +108,8 @@
import java.util.Arrays;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
+import ucar.nc2.NetcdfFile;
+import ucar.nc2.NetcdfFiles;
/**
* a 4.0 implementation of the DVN FileUtil;
@@ -467,6 +469,11 @@ public static String determineFileType(File f, String fileName) throws IOExcepti
fileType = "application/fits";
}
}
+
+ // step 3: Check if NetCDF or HDF5
+ if (fileType == null) {
+ fileType = checkNetcdfOrHdf5(f);
+ }
// step 3: check the mime type of this file with Jhove
if (fileType == null){
@@ -669,6 +676,43 @@ private static boolean isGraphMLFile(File file) {
return isGraphML;
}
+    /**
+     * Tries to open the given file with {@code NetcdfFiles.open} and maps the
+     * file type id reported by the opened file to a MIME type.
+     *
+     * @param file the file to open, addressed by its absolute path
+     * @return "application/netcdf" when the type id is "NetCDF" or "NetCDF-4",
+     * "application/x-hdf5" when it is "HDF5", and null in every other case
+     * (the file could not be opened, no type id was reported, the type id is
+     * not one of the above, or an IOException was thrown)
+     */
+    public static String checkNetcdfOrHdf5(File file) {
+        String mimeType = null;
+        try (NetcdfFile opened = NetcdfFiles.open(file.getAbsolutePath())) {
+            // A null handle (can't open as NetCDF or HDF5) and a null type id
+            // are both treated as "nothing detected".
+            String typeId = (opened == null) ? null : opened.getFileTypeId();
+            if ("NetCDF".equals(typeId) || "NetCDF-4".equals(typeId)) {
+                // Classic NetCDF and NetCDF-4 share a single MIME type.
+                mimeType = "application/netcdf";
+            } else if ("HDF5".equals(typeId)) {
+                mimeType = "application/x-hdf5";
+            }
+        } catch (IOException ex) {
+            /*
+             * When an HDF4 file is passed, it won't be detected. Instead, we've
+             * seen exceptions like this:
+             *
+             * ucar.nc2.internal.iosp.hdf4.H4header makeDimension WARNING:
+             * **dimension length=0 for TagVGroup= *refno=124 tag= VG (1965)
+             * Vgroup length=28 class= Dim0.0 name= ixx using data 123
+             *
+             * java.lang.IllegalArgumentException: Dimension length =0 must be >
+             * 0
+             */
+            // Discard any type picked up before the failure (e.g. an
+            // IOException raised while closing the file).
+            mimeType = null;
+        }
+        return mimeType;
+    }
+
// from MD5Checksum.java
public static String calculateChecksum(String datafile, ChecksumType checksumType) {
diff --git a/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java
index 6ed3755e049..1d481f18cf5 100644
--- a/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java
+++ b/src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java
@@ -373,4 +373,40 @@ public void testIsThumbnailSupported() throws Exception {
assertFalse(FileUtil.isThumbnailSupported(filewBogusContentType));
}
}
+
+    /**
+     * A NetCDF file stored without a file extension is reported as
+     * "application/netcdf".
+     */
+    @Test
+    public void testNetcdfFile() throws IOException {
+        // Fixture: madis-raob.nc, downloaded from
+        // https://www.unidata.ucar.edu/software/netcdf/examples/files.html
+        // and stored as "madis-raob" (no file extension) for this test.
+        final String fixtureDir = "src/test/resources/netcdf/";
+        final String fixturePath = fixtureDir + "madis-raob";
+        String detected = FileUtil.determineFileType(new File(fixturePath), fixturePath);
+        assertEquals("application/netcdf", detected);
+    }
+
+    /**
+     * An HDF5 file stored without a file extension is reported as
+     * "application/x-hdf5".
+     */
+    @Test
+    public void testHdf5File() throws IOException {
+        // Fixture: vlen_string_dset.h5, downloaded from
+        // https://github.com/h5py/h5py/blob/3.7.0/h5py/tests/data_files/vlen_string_dset.h5
+        // and stored as "vlen_string_dset" (no file extension) for this test.
+        final String fixtureDir = "src/test/resources/hdf/hdf5/";
+        final String fixturePath = fixtureDir + "vlen_string_dset";
+        String detected = FileUtil.determineFileType(new File(fixturePath), fixturePath);
+        assertEquals("application/x-hdf5", detected);
+    }
+
+    /**
+     * An HDF4 file stored without a file extension is reported as
+     * "application/octet-stream".
+     */
+    @Test
+    public void testHdf4File() throws IOException {
+        // Fixture: test.hdf, downloaded from
+        // https://people.sc.fsu.edu/~jburkardt/data/hdf/hdf.html
+        // and stored as "hdf4test" (no file extension) for this test.
+        // HDF4 is the old format, the previous generation before HDF5.
+        // We can't detect it based on its content.
+        final String fixtureDir = "src/test/resources/hdf/hdf4/";
+        final String fixturePath = fixtureDir + "hdf4test";
+        String detected = FileUtil.determineFileType(new File(fixturePath), fixturePath);
+        assertEquals("application/octet-stream", detected);
+    }
+
}
diff --git a/src/test/resources/hdf/hdf4/hdf4test b/src/test/resources/hdf/hdf4/hdf4test
new file mode 100644
index 00000000000..4674fdde194
Binary files /dev/null and b/src/test/resources/hdf/hdf4/hdf4test differ
diff --git a/src/test/resources/hdf/hdf5/vlen_string_dset b/src/test/resources/hdf/hdf5/vlen_string_dset
new file mode 100644
index 00000000000..dd20547f8e9
Binary files /dev/null and b/src/test/resources/hdf/hdf5/vlen_string_dset differ
diff --git a/src/test/resources/netcdf/madis-raob b/src/test/resources/netcdf/madis-raob
new file mode 100644
index 00000000000..d0cae0d077d
Binary files /dev/null and b/src/test/resources/netcdf/madis-raob differ