diff --git a/doc/release-notes/9331-extract-bounding-box.md b/doc/release-notes/9331-extract-bounding-box.md index c4ff83e40c0..dfd2c4cadb7 100644 --- a/doc/release-notes/9331-extract-bounding-box.md +++ b/doc/release-notes/9331-extract-bounding-box.md @@ -1 +1,5 @@ An attempt will be made to extract a geospatial bounding box (west, south, east, north) from NetCDF and HDF5 files and then insert these values into the geospatial metadata block, if enabled. + +The following JVM setting has been added: + +- dataverse.netcdf.geo-extract-s3-direct-upload diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 734f678ceb9..b238a7623eb 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -36,6 +36,18 @@ At present, one potential drawback for direct-upload is that files are only part ``./asadmin create-jvm-options "-Ddataverse.files..ingestsizelimit="`` +.. _s3-direct-upload-features-disabled: + +Features that are Disabled if S3 Direct Upload is Enabled +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following features are disabled when S3 direct upload is enabled. + +- Unzipping of zip files. (See :ref:`compressed-files`.) +- Extraction of metadata from FITS files. (See :ref:`fits`.) +- Creation of NcML auxiliary files (See :ref:`netcdf-and-hdf5`.) +- Extraction of a geospatial bounding box from NetCDF and HDF5 files (see :ref:`netcdf-and-hdf5`) unless :ref:`dataverse.netcdf.geo-extract-s3-direct-upload` is set to true. + .. 
_cors-s3-bucket: Allow CORS for S3 Buckets diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 34217c2e73f..ca17fe09c15 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2433,6 +2433,15 @@ If this value is not set (the default), Dataverse will not use external Exporter Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SPI_EXPORTERS_DIRECTORY``. +.. _dataverse.netcdf.geo-extract-s3-direct-upload: + +dataverse.netcdf.geo-extract-s3-direct-upload ++++++++++++++++++++++++++++++++++++++++++++++ + +This setting was added to keep S3 direct upload lightweight. When that feature is enabled and you still want NetCDF and HDF5 files to go through metadata extraction of a Geospatial Bounding Box (see :ref:`netcdf-and-hdf5`), which requires the file to be downloaded from S3 in this scenario, make this setting true. + +See also :ref:`s3-direct-upload-features-disabled`. + .. _feature-flags: Feature Flags diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index 7dfce6153dc..648d81c1076 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -329,6 +329,8 @@ You can also search for files within datasets that have been tagged as "Workflow |cw-image6| +.. _fits: + Astronomy (FITS) ---------------- @@ -360,6 +362,8 @@ NcML For NetCDF and HDF5 files, an attempt will be made to extract metadata in NcML_ (XML) format and save it as an auxiliary file. (See also :doc:`/developers/aux-file-support` in the Developer Guide.) A previewer for these NcML files is available (see :ref:`file-previews`). +Please note that only modern versions of these formats, the ones based on HDF5 such as NetCDF 4+ and HDF5 itself (rather than HDF4), will yield an NcML auxiliary file. + .. 
_NcML: https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_overview.html Geospatial Bounding Box @@ -380,9 +384,12 @@ Please note the following rules regarding these fields: - If West Longitude and East Longitude are both over 180 (outside the expected -180:180 range), 360 will be subtracted to shift the values from the 0:360 range to the expected -180:180 range. - If either West Longitude or East Longitude are less than zero but the other longitude is greater than 180 (which would imply an indeterminate domain, a lack of clarity as to whether the domain is -180:180 or 0:360), metadata will not be extracted. - If the bounding box was successfully populated, the subsequent removal of the NetCDF or HDF5 file from the dataset does not automatically remove the bounding box from the dataset metadata. You must remove the bounding box manually, if desired. +- This feature is disabled if S3 direct upload is enabled (see :ref:`s3-direct-upload-features-disabled`) unless :ref:`dataverse.netcdf.geo-extract-s3-direct-upload` has been set to true. If the bounding box was successfully populated, :ref:`geospatial-search` should be able to find it. +.. 
_compressed-files: + Compressed Files ---------------- diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index 0bbbfb3cb24..c1e82042898 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -69,6 +69,7 @@ import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav.SAVFileReaderSpi; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReader; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReaderSpi; +import edu.harvard.iq.dataverse.settings.JvmSettings; import edu.harvard.iq.dataverse.util.*; import org.apache.commons.io.IOUtils; @@ -105,6 +106,7 @@ import java.util.ListIterator; import java.util.logging.Logger; import java.util.Hashtable; +import java.util.Optional; import javax.ejb.EJB; import javax.ejb.Stateless; import javax.inject.Named; @@ -1189,53 +1191,14 @@ public boolean fileMetadataExtractable(DataFile dataFile) { // Inspired by fileMetadataExtractable, above public boolean fileMetadataExtractableFromNetcdf(DataFile dataFile, Path tempLocationPath) { - logger.fine("fileMetadataExtractableFromNetcdf dataFileIn: " + dataFile + ". tempLocationPath: " + tempLocationPath); - boolean extractable = false; - String dataFileLocation = null; - if (tempLocationPath != null) { - // This file was just uploaded and hasn't been saved to S3 or local storage. - dataFileLocation = tempLocationPath.toString(); - } else { - // This file is already on S3 or local storage. - File tempFile = null; - File localFile; - StorageIO storageIO; - try { - storageIO = dataFile.getStorageIO(); - storageIO.open(); - if (storageIO.isLocalFile()) { - localFile = storageIO.getFileSystemPath().toFile(); - dataFileLocation = localFile.getAbsolutePath(); - logger.info("fileMetadataExtractable2: file is local. 
Path: " + dataFileLocation); - } else { - // Need to create a temporary local file: - tempFile = File.createTempFile("tempFileExtractMetadataNcml", ".tmp"); - try ( ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) { - tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize()); - } - dataFileLocation = tempFile.getAbsolutePath(); - logger.info("fileMetadataExtractable2: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); - } - } catch (IOException ex) { - logger.info("fileMetadataExtractable2, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); - } - } - if (dataFileLocation != null) { - try ( NetcdfFile netcdfFile = NetcdfFiles.open(dataFileLocation)) { - logger.info("fileMetadataExtractable2: trying to open " + dataFileLocation); - if (netcdfFile != null) { - logger.info("fileMetadataExtractable2: returning true"); - extractable = true; - } else { - logger.info("NetcdfFiles.open() could not open file id " + dataFile.getId() + " (null returned)."); - } - } catch (IOException ex) { - logger.info("NetcdfFiles.open() could not open file id " + dataFile.getId() + ". Exception caught: " + ex); - } - } else { - logger.info("dataFileLocation is null for file id " + dataFile.getId() + ". Can't extract NcML."); + logger.fine("fileMetadataExtractableFromNetcdf dataFileIn: " + dataFile + ". tempLocationPath: " + tempLocationPath + ". 
contentType: " + dataFile.getContentType()); + if (dataFile.getContentType() != null + && (dataFile.getContentType().equals(FileUtil.MIME_TYPE_NETCDF) + || dataFile.getContentType().equals(FileUtil.MIME_TYPE_XNETCDF) + || dataFile.getContentType().equals(FileUtil.MIME_TYPE_HDF5))) { + return true; } - return extractable; + return false; } /* @@ -1299,37 +1262,59 @@ public boolean extractMetadata(String tempFileLocation, DataFile dataFile, Datas * extractable from all files that the NetCDF Java library can open only * some NetCDF files will have a bounding box. * - * Note that if we ever create an API endpoint for this method for files - * that are already persisted to disk or S3, we will need to use something - * like getExistingFile() from extractMetadataNcml() to pull the file down - * from S3 to a temporary file location on local disk so that it can - * (ultimately) be opened by the NetcdfFiles.open() method, which only - * operates on local files (not an input stream). What we have now is not a - * problem for S3 because the files are saved locally before the are - * uploaded to S3. It's during this time that the files are local that this - * method is run. + * Note that if we haven't yet created an API endpoint for this method for + * files that are already persisted to disk or S3, but the code should work + * to download files from S3 as necessary. */ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataFile, DatasetVersion editVersion) throws IOException { boolean ingestSuccessful = false; - InputStream tempFileInputStream = null; - if (tempFileLocation == null) { - StorageIO sio = dataFile.getStorageIO(); - sio.open(DataAccessOption.READ_ACCESS); - tempFileInputStream = sio.getInputStream(); + String dataFileLocation = null; + if (tempFileLocation != null) { + logger.fine("tempFileLocation is non null. 
Setting dataFileLocation to " + tempFileLocation); + dataFileLocation = tempFileLocation; } else { + logger.fine("tempFileLocation is null. Perhaps the file is alrady on disk or S3 direct upload is enabled."); + File tempFile = null; + File localFile; + StorageIO storageIO; try { - tempFileInputStream = new FileInputStream(new File(tempFileLocation)); - } catch (FileNotFoundException notfoundEx) { - throw new IOException("Could not open temp file " + tempFileLocation); + storageIO = dataFile.getStorageIO(); + storageIO.open(); + if (storageIO.isLocalFile()) { + localFile = storageIO.getFileSystemPath().toFile(); + dataFileLocation = localFile.getAbsolutePath(); + logger.fine("extractMetadataFromNetcdf: file is local. Path: " + dataFileLocation); + } else { + Optional allow = JvmSettings.GEO_EXTRACT_S3_DIRECT_UPLOAD.lookupOptional(Boolean.class); + if (!(allow.isPresent() && allow.get())) { + logger.fine("extractMetadataFromNetcdf: skipping because of config is set to not slow down S3 remote upload."); + return false; + } + // Need to create a temporary local file: + tempFile = File.createTempFile("tempFileExtractMetadataNetcdf", ".tmp"); + try ( ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) { + tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize()); + } + dataFileLocation = tempFile.getAbsolutePath(); + logger.fine("extractMetadataFromNetcdf: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); + } + } catch (IOException ex) { + logger.info("extractMetadataFromNetcdf, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); + return false; } } + if (dataFileLocation == null) { + logger.fine("after all that dataFileLocation is still null! 
Returning early."); + return false; + } + // Locate metadata extraction plugin for the file format by looking // it up with the Ingest Service Provider Registry: NetcdfFileMetadataExtractor extractorPlugin = new NetcdfFileMetadataExtractor(); - logger.fine("creating file from " + tempFileLocation); - File file = new File(tempFileLocation); + logger.fine("creating file from " + dataFileLocation); + File file = new File(dataFileLocation); FileMetadataIngest extractedMetadata = extractorPlugin.ingestFile(file); Map> extractedMetadataMap = extractedMetadata.getMetadataMap(); @@ -1366,9 +1351,11 @@ public boolean extractMetadataNcml(DataFile dataFile, Path tempLocationPath) { InputStream inputStream = null; String dataFileLocation = null; if (tempLocationPath != null) { + logger.fine("extractMetadataNcml: tempLocationPath is non null. Setting dataFileLocation to " + tempLocationPath); // This file was just uploaded and hasn't been saved to S3 or local storage. dataFileLocation = tempLocationPath.toString(); } else { + logger.fine("extractMetadataNcml: tempLocationPath null. Calling getExistingFile for dataFileLocation."); dataFileLocation = getExistingFile(dataFile, dataFileLocation); } if (dataFileLocation != null) { @@ -1430,7 +1417,7 @@ private boolean isNcmlFileCreated(final NetcdfFile netcdfFile, Path tempLocation } private String getExistingFile(DataFile dataFile, String dataFileLocation) { - // This file is already on S3 or local storage. + // This file is already on S3 (non direct upload) or local storage. File tempFile = null; File localFile; StorageIO storageIO; @@ -1440,7 +1427,7 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) { if (storageIO.isLocalFile()) { localFile = storageIO.getFileSystemPath().toFile(); dataFileLocation = localFile.getAbsolutePath(); - logger.fine("extractMetadataNcml: file is local. Path: " + dataFileLocation); + logger.fine("getExistingFile: file is local. 
Path: " + dataFileLocation); } else { // Need to create a temporary local file: tempFile = File.createTempFile("tempFileExtractMetadataNcml", ".tmp"); @@ -1448,10 +1435,10 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) { tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize()); } dataFileLocation = tempFile.getAbsolutePath(); - logger.fine("extractMetadataNcml: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); + logger.fine("getExistingFile: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); } } catch (IOException ex) { - logger.info("While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); + logger.fine("getExistingFile: While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); } return dataFileLocation; } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java b/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java index 66f0c25f3d7..9221a6ca679 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java @@ -56,7 +56,7 @@ public FileMetadataIngest ingestFile(File file) throws IOException { String northLatitudeFinal = geoFields.get(NORTH_LATITUDE); String southLatitudeFinal = geoFields.get(SOUTH_LATITUDE); - logger.info(getLineStringsUrl(westLongitudeFinal, southLatitudeFinal, eastLongitudeFinal, northLatitudeFinal)); + logger.fine(getLineStringsUrl(westLongitudeFinal, southLatitudeFinal, eastLongitudeFinal, northLatitudeFinal)); Map> metadataMap = new HashMap<>(); metadataMap.put(WEST_LONGITUDE, new HashSet<>()); @@ 
-102,7 +102,7 @@ private Map parseGeospatial(NetcdfFile netcdfFile) { geoFields.put(DatasetFieldConstant.northLatitude, getValue(northLatitude)); geoFields.put(DatasetFieldConstant.southLatitude, getValue(southLatitude)); - logger.info(getLineStringsUrl( + logger.fine(getLineStringsUrl( geoFields.get(DatasetFieldConstant.westLongitude), geoFields.get(DatasetFieldConstant.southLatitude), geoFields.get(DatasetFieldConstant.eastLongitude), diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index ff04a633ea7..af325bfa426 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -123,6 +123,10 @@ public enum JvmSettings { SCOPE_UI(PREFIX, "ui"), UI_ALLOW_REVIEW_INCOMPLETE(SCOPE_UI, "allow-review-for-incomplete"), UI_SHOW_VALIDITY_FILTER(SCOPE_UI, "show-validity-filter"), + + // NetCDF SETTINGS + SCOPE_NETCDF(PREFIX, "netcdf"), + GEO_EXTRACT_S3_DIRECT_UPLOAD(SCOPE_NETCDF, "geo-extract-s3-direct-upload"), ; private static final String SCOPE_SEPARATOR = "."; diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 6bb7e1d583b..7137db9ca78 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -182,6 +182,10 @@ public class FileUtil implements java.io.Serializable { //Todo - this is the same as MIME_TYPE_TSV_ALT public static final String MIME_TYPE_INGESTED_FILE = "text/tab-separated-values"; + public static final String MIME_TYPE_NETCDF = "application/netcdf"; + public static final String MIME_TYPE_XNETCDF = "application/x-netcdf"; + public static final String MIME_TYPE_HDF5 = "application/x-hdf5"; + // File type "thumbnail classes" tags: public static final String FILE_THUMBNAIL_CLASS_AUDIO = "audio";