Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/release-notes/9117-file-type-detection.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
NetCDF and HDF5 files are now detected based on their content rather than just their file extension.

Both "classic" NetCDF 3 files and more modern NetCDF 4 files are detected based on content.

Detection for HDF4 files is only done through the file extension ".hdf", as before.
5 changes: 5 additions & 0 deletions modules/dataverse-parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,11 @@
<enabled>true</enabled>
</snapshots>
</repository>
<!-- Unidata artifact repository. Presumably needed to resolve the edu.ucar:cdm-core
     (NetCDF-Java) dependency used for NetCDF/HDF5 detection; TODO confirm it is
     not available from the other configured repositories. -->
<repository>
<id>unidata-all</id>
<name>Unidata All</name>
<url>https://artifacts.unidata.ucar.edu/repository/unidata-all/</url>
</repository>
<repository>
<id>dvn.private</id>
<name>Local repository for hosting jars not available from network repositories.</name>
Expand Down
8 changes: 7 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
<jacoco.version>0.8.7</jacoco.version>
<poi.version>5.2.1</poi.version>
<tika.version>2.4.1</tika.version>
<netcdf.version>5.5.3</netcdf.version>
</properties>

<!-- Versions of dependencies used both directly and transitive are managed here.
Expand Down Expand Up @@ -493,7 +494,12 @@
<artifactId>java-json-canonicalization</artifactId>
<version>1.1</version>
</dependency>

<!-- NetCDF-Java core library; version is controlled by the netcdf.version property.
     Presumably the source of the ucar.nc2 classes that FileUtil.checkNetcdfOrHdf5
     uses for content-based NetCDF/HDF5 detection; verify against the artifact. -->
<dependency>
<groupId>edu.ucar</groupId>
<artifactId>cdm-core</artifactId>
<version>${netcdf.version}</version>
</dependency>

<!-- TESTING DEPENDENCIES -->
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
44 changes: 44 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@
import java.util.Arrays;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import ucar.nc2.NetcdfFile;
import ucar.nc2.NetcdfFiles;

/**
* a 4.0 implementation of the DVN FileUtil;
Expand Down Expand Up @@ -467,6 +469,11 @@ public static String determineFileType(File f, String fileName) throws IOExcepti
fileType = "application/fits";
}
}

// step 3: Check if NetCDF or HDF5
if (fileType == null) {
fileType = checkNetcdfOrHdf5(f);
}

// step 3: check the mime type of this file with Jhove
if (fileType == null){
Expand Down Expand Up @@ -669,6 +676,43 @@ private static boolean isGraphMLFile(File file) {
return isGraphML;
}

/**
 * Tries to identify a file as NetCDF or HDF5 by opening it with the
 * NetCDF-Java library and inspecting the file type id it reports.
 *
 * Detection is best-effort: any failure to open or parse the file is
 * treated as "not recognized" rather than as an error.
 *
 * @param file the file to inspect; {@code null} is treated as unrecognized
 * @return {@code "application/netcdf"} for type ids "NetCDF" and "NetCDF-4",
 * {@code "application/x-hdf5"} for type id "HDF5", or {@code null} when the
 * file cannot be opened or reports any other (or no) type id
 */
public static String checkNetcdfOrHdf5(File file) {
    if (file == null) {
        return null;
    }
    try (NetcdfFile netcdfFile = NetcdfFiles.open(file.getAbsolutePath())) {
        if (netcdfFile == null) {
            // Can't open as a NetCDF or HDF5 file.
            return null;
        }
        String type = netcdfFile.getFileTypeId();
        if (type == null) {
            return null;
        }
        switch (type) {
            case "NetCDF":
            case "NetCDF-4":
                // Classic NetCDF 3 and NetCDF 4 share one MIME type.
                return "application/netcdf";
            case "HDF5":
                return "application/x-hdf5";
            default:
                return null;
        }
    } catch (IOException | RuntimeException ex) {
        /*
         * When an HDF4 file is passed, it won't be detected. Instead, we've
         * seen exceptions like this:
         *
         * ucar.nc2.internal.iosp.hdf4.H4header makeDimension WARNING:
         * **dimension length=0 for TagVGroup= *refno=124 tag= VG (1965)
         * Vgroup length=28 class= Dim0.0 name= ixx using data 123
         *
         * java.lang.IllegalArgumentException: Dimension length =0 must be >
         * 0
         *
         * The failure seen above is an unchecked exception, so unchecked
         * parser errors are caught alongside IOException; a file the
         * library cannot parse must not abort file type detection.
         */
        return null;
    }
}

// from MD5Checksum.java
public static String calculateChecksum(String datafile, ChecksumType checksumType) {

Expand Down
36 changes: 36 additions & 0 deletions src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -373,4 +373,40 @@ public void testIsThumbnailSupported() throws Exception {
assertFalse(FileUtil.isThumbnailSupported(filewBogusContentType));
}
}

@Test
public void testNetcdfFile() throws IOException {
    // Fixture: madis-raob.nc, downloaded from
    // https://www.unidata.ucar.edu/software/netcdf/examples/files.html
    // and stored as "madis-raob" (file extension removed) for this test.
    final String directory = "src/test/resources/netcdf/";
    final String location = directory + "madis-raob";
    final String detected = FileUtil.determineFileType(new File(location), location);
    assertEquals("application/netcdf", detected);
}

@Test
public void testHdf5File() throws IOException {
    // Fixture: vlen_string_dset.h5, downloaded from
    // https://github.com/h5py/h5py/blob/3.7.0/h5py/tests/data_files/vlen_string_dset.h5
    // and stored as "vlen_string_dset" (file extension removed) for this test.
    final String directory = "src/test/resources/hdf/hdf5/";
    final String location = directory + "vlen_string_dset";
    final String detected = FileUtil.determineFileType(new File(location), location);
    assertEquals("application/x-hdf5", detected);
}

@Test
public void testHdf4File() throws IOException {
    // Fixture: test.hdf, downloaded from
    // https://people.sc.fsu.edu/~jburkardt/data/hdf/hdf.html
    // and stored as "hdf4test" (file extension removed) for this test.
    // HDF4 is the older format that preceded HDF5. It can't be detected
    // from its content, so without an extension the generic binary type
    // is expected.
    final String directory = "src/test/resources/hdf/hdf4/";
    final String location = directory + "hdf4test";
    final String detected = FileUtil.determineFileType(new File(location), location);
    assertEquals("application/octet-stream", detected);
}

}
Binary file added src/test/resources/hdf/hdf4/hdf4test
Binary file not shown.
Binary file added src/test/resources/hdf/hdf5/vlen_string_dset
Binary file not shown.
Binary file added src/test/resources/netcdf/madis-raob
Binary file not shown.