From cf2088d000e7c475a15243222f42b19be0c76312 Mon Sep 17 00:00:00 2001
From: Philip Durbin
Date: Tue, 23 May 2023 17:15:56 -0400
Subject: [PATCH 1/6] fix S3 direct upload NPE and keep NetCDF metadata
 extraction #9601

Note that the NcML aux file is not created when S3 direct upload is enabled.
---
 .../dataverse/ingest/IngestServiceBean.java   | 68 ++++++++++++-------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
index 7cdfda8d082..9d3e7fb1161 100644
--- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
@@ -332,9 +332,6 @@ public List<DataFile> saveAndAddFilesToDataset(DatasetVersion version,
             } catch (IOException e) {
                 logger.warning("Error getting ingest limit for file: " + dataFile.getIdentifier() + " : " + e.getMessage());
             }
-            if (unattached) {
-                dataFile.setOwner(null);
-            }
             if (savedSuccess && belowLimit) {
                 // These are all brand new files, so they should all have
                 // one filemetadata total. -- L.A.
@@ -388,6 +385,9 @@ public List<DataFile> saveAndAddFilesToDataset(DatasetVersion version,
                     dataFile.setContentType(FileUtil.MIME_TYPE_TSV);
                 }
             }
+            if (unattached) {
+                dataFile.setOwner(null);
+            }
             // ... and let's delete the main temp file if it exists:
             if(tempLocationPath!=null) {
                 try {
@@ -1294,37 +1294,54 @@ public boolean extractMetadata(String tempFileLocation, DataFile dataFile, Datas
      * extractable from all files that the NetCDF Java library can open only
      * some NetCDF files will have a bounding box.
      *
-     * Note that if we ever create an API endpoint for this method for files
-     * that are already persisted to disk or S3, we will need to use something
-     * like getExistingFile() from extractMetadataNcml() to pull the file down
-     * from S3 to a temporary file location on local disk so that it can
-     * (ultimately) be opened by the NetcdfFiles.open() method, which only
-     * operates on local files (not an input stream). What we have now is not a
-     * problem for S3 because the files are saved locally before the are
-     * uploaded to S3. It's during this time that the files are local that this
-     * method is run.
+     * Note that we haven't yet created an API endpoint for this method for
+     * files that are already persisted to disk or S3, but the code should work
+     * to download files from S3 as necessary.
      */
     public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataFile, DatasetVersion editVersion) throws IOException {
         boolean ingestSuccessful = false;
 
-        InputStream tempFileInputStream = null;
-        if (tempFileLocation == null) {
-            StorageIO<DataFile> sio = dataFile.getStorageIO();
-            sio.open(DataAccessOption.READ_ACCESS);
-            tempFileInputStream = sio.getInputStream();
+        String dataFileLocation = null;
+        if (tempFileLocation != null) {
+            logger.info("tempFileLocation is non null. Setting dataFileLocation to " + tempFileLocation);
+            dataFileLocation = tempFileLocation;
         } else {
+            logger.info("tempFileLocation is null. Perhaps the file is already on disk or S3 direct upload is enabled.");
+            File tempFile = null;
+            File localFile;
+            StorageIO<DataFile> storageIO;
             try {
-                tempFileInputStream = new FileInputStream(new File(tempFileLocation));
-            } catch (FileNotFoundException notfoundEx) {
-                throw new IOException("Could not open temp file " + tempFileLocation);
+                storageIO = dataFile.getStorageIO();
+                storageIO.open();
+                if (storageIO.isLocalFile()) {
+                    localFile = storageIO.getFileSystemPath().toFile();
+                    dataFileLocation = localFile.getAbsolutePath();
+                    logger.info("extractMetadataFromNetcdf: file is local. Path: " + dataFileLocation);
+                } else {
+                    // Need to create a temporary local file:
+                    tempFile = File.createTempFile("tempFileExtractMetadataNetcdf", ".tmp");
+                    try ( ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) {
+                        tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize());
+                    }
+                    dataFileLocation = tempFile.getAbsolutePath();
+                    logger.info("extractMetadataFromNetcdf: file is on S3. Downloaded and saved to temp path: " + dataFileLocation);
+                }
+            } catch (IOException ex) {
+                logger.info("extractMetadataFromNetcdf, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex);
+                return false;
             }
         }
+        if (dataFileLocation == null) {
+            logger.fine("after all that dataFileLocation is still null! Returning early.");
+            return false;
+        }
+
         // Locate metadata extraction plugin for the file format by looking
         // it up with the Ingest Service Provider Registry:
         NetcdfFileMetadataExtractor extractorPlugin = new NetcdfFileMetadataExtractor();
-        logger.fine("creating file from " + tempFileLocation);
-        File file = new File(tempFileLocation);
+        logger.info("creating file from " + dataFileLocation);
+        File file = new File(dataFileLocation);
         FileMetadataIngest extractedMetadata = extractorPlugin.ingestFile(file);
         Map<String, Set<String>> extractedMetadataMap = extractedMetadata.getMetadataMap();
@@ -1361,9 +1378,11 @@ public boolean extractMetadataNcml(DataFile dataFile, Path tempLocationPath) {
         InputStream inputStream = null;
         String dataFileLocation = null;
         if (tempLocationPath != null) {
+            logger.info("extractMetadataNcml: tempLocationPath is non null. Setting dataFileLocation to " + tempLocationPath);
             // This file was just uploaded and hasn't been saved to S3 or local storage.
             dataFileLocation = tempLocationPath.toString();
         } else {
+            logger.info("extractMetadataNcml: tempLocationPath null. Calling getExistingFile for dataFileLocation.");
             dataFileLocation = getExistingFile(dataFile, dataFileLocation);
         }
         if (dataFileLocation != null) {
@@ -1425,7 +1444,7 @@ private boolean isNcmlFileCreated(final NetcdfFile netcdfFile, Path tempLocation
     }
 
     private String getExistingFile(DataFile dataFile, String dataFileLocation) {
-        // This file is already on S3 or local storage.
+        // This file is already on S3 (non direct upload) or local storage.
         File tempFile = null;
         File localFile;
         StorageIO<DataFile> storageIO;
@@ -1436,6 +1455,7 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) {
                 localFile = storageIO.getFileSystemPath().toFile();
                 dataFileLocation = localFile.getAbsolutePath();
                 logger.fine("extractMetadataNcml: file is local. Path: " + dataFileLocation);
Path: " + dataFileLocation); } else { // Need to create a temporary local file: tempFile = File.createTempFile("tempFileExtractMetadataNcml", ".tmp"); @@ -1444,9 +1464,11 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) { } dataFileLocation = tempFile.getAbsolutePath(); logger.fine("extractMetadataNcml: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); + logger.info("getExistingFile: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); } } catch (IOException ex) { logger.info("While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); + logger.info("getExistingFile: While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); } return dataFileLocation; } From 8f8373859236e18b97612cc3330c4614ff9303f9 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 24 May 2023 09:40:20 -0400 Subject: [PATCH 2/6] set expectations about S3 direct upload behavior #9601 --- .../source/developers/big-data-support.rst | 11 +++++++++++ doc/sphinx-guides/source/user/dataset-management.rst | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 734f678ceb9..4d409b407f7 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -36,6 +36,17 @@ At present, one potential drawback for direct-upload is that files are only part ``./asadmin create-jvm-options "-Ddataverse.files..ingestsizelimit="`` +.. _s3-direct-upload-features-disabled: + +Features that are Disabled if S3 Direct Upload is Enabled +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following features are disabled when S3 direct upload is enabled. + +- Unzipping of zip files. (See :ref:`compressed-files`.) +- Extraction of metadata from FITS files. (See :ref:`fits`.) +- Creation of NcML auxiliary files (See :ref:`netcdf-and-hdf5`.) + .. _cors-s3-bucket: Allow CORS for S3 Buckets diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index 9223768b49f..bf6e118cc21 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -329,6 +329,8 @@ You can also search for files within datasets that have been tagged as "Workflow |cw-image6| +.. _fits: + Astronomy (FITS) ---------------- @@ -374,6 +376,8 @@ Please note the following rules regarding these fields: If the bounding box was successfully populated, :ref:`geospatial-search` should be able to find it. +.. 
From 8f8373859236e18b97612cc3330c4614ff9303f9 Mon Sep 17 00:00:00 2001
From: Philip Durbin
Date: Wed, 24 May 2023 09:40:20 -0400
Subject: [PATCH 2/6] set expectations about S3 direct upload behavior #9601

---
 .../source/developers/big-data-support.rst           | 11 +++++++++++
 doc/sphinx-guides/source/user/dataset-management.rst |  4 ++++
 2 files changed, 15 insertions(+)

diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst
index 734f678ceb9..4d409b407f7 100644
--- a/doc/sphinx-guides/source/developers/big-data-support.rst
+++ b/doc/sphinx-guides/source/developers/big-data-support.rst
@@ -36,6 +36,17 @@ At present, one potential drawback for direct-upload is that files are only part
 
 ``./asadmin create-jvm-options "-Ddataverse.files..ingestsizelimit="``
 
+.. _s3-direct-upload-features-disabled:
+
+Features that are Disabled if S3 Direct Upload is Enabled
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following features are disabled when S3 direct upload is enabled.
+
+- Unzipping of zip files. (See :ref:`compressed-files`.)
+- Extraction of metadata from FITS files. (See :ref:`fits`.)
+- Creation of NcML auxiliary files. (See :ref:`netcdf-and-hdf5`.)
+
 .. _cors-s3-bucket:
 
 Allow CORS for S3 Buckets
diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst
index 9223768b49f..bf6e118cc21 100755
--- a/doc/sphinx-guides/source/user/dataset-management.rst
+++ b/doc/sphinx-guides/source/user/dataset-management.rst
@@ -329,6 +329,8 @@ You can also search for files within datasets that have been tagged as "Workflow
 
 |cw-image6|
 
+.. _fits:
+
 Astronomy (FITS)
 ----------------
 
@@ -374,6 +376,8 @@ Please note the following rules regarding these fields:
 
 If the bounding box was successfully populated, :ref:`geospatial-search` should be able to find it.
 
+.. _compressed-files:
+
 Compressed Files
 ----------------

From 505e8f236c903f9955917dda576bcae4e269426f Mon Sep 17 00:00:00 2001
From: Philip Durbin
Date: Wed, 24 May 2023 16:59:32 -0400
Subject: [PATCH 3/6] switch to mime type check #9601

---
 .../dataverse/ingest/IngestServiceBean.java   | 52 +++----------------
 .../harvard/iq/dataverse/util/FileUtil.java   |  4 ++
 2 files changed, 11 insertions(+), 45 deletions(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
index 9d3e7fb1161..560843a7e71 100644
--- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
@@ -1185,52 +1185,14 @@ public boolean fileMetadataExtractable(DataFile dataFile) {
 
     // Inspired by fileMetadataExtractable, above
     public boolean fileMetadataExtractableFromNetcdf(DataFile dataFile, Path tempLocationPath) {
         logger.fine("fileMetadataExtractableFromNetcdf dataFileIn: " + dataFile + ". tempLocationPath: " + tempLocationPath);
-        boolean extractable = false;
-        String dataFileLocation = null;
-        if (tempLocationPath != null) {
-            // This file was just uploaded and hasn't been saved to S3 or local storage.
-            dataFileLocation = tempLocationPath.toString();
-        } else {
-            // This file is already on S3 or local storage.
-            File tempFile = null;
-            File localFile;
-            StorageIO<DataFile> storageIO;
-            try {
-                storageIO = dataFile.getStorageIO();
-                storageIO.open();
-                if (storageIO.isLocalFile()) {
-                    localFile = storageIO.getFileSystemPath().toFile();
-                    dataFileLocation = localFile.getAbsolutePath();
-                    logger.info("fileMetadataExtractable2: file is local. Path: " + dataFileLocation);
-                } else {
-                    // Need to create a temporary local file:
-                    tempFile = File.createTempFile("tempFileExtractMetadataNcml", ".tmp");
-                    try ( ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) {
-                        tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize());
-                    }
-                    dataFileLocation = tempFile.getAbsolutePath();
-                    logger.info("fileMetadataExtractable2: file is on S3. Downloaded and saved to temp path: " + dataFileLocation);
-                }
-            } catch (IOException ex) {
-                logger.info("fileMetadataExtractable2, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex);
-            }
-        }
-        if (dataFileLocation != null) {
-            try ( NetcdfFile netcdfFile = NetcdfFiles.open(dataFileLocation)) {
-                logger.info("fileMetadataExtractable2: trying to open " + dataFileLocation);
-                if (netcdfFile != null) {
-                    logger.info("fileMetadataExtractable2: returning true");
-                    extractable = true;
-                } else {
-                    logger.info("NetcdfFiles.open() could not open file id " + dataFile.getId() + " (null returned).");
-                }
-            } catch (IOException ex) {
-                logger.info("NetcdfFiles.open() could not open file id " + dataFile.getId() + ". Exception caught: " + ex);
-            }
-        } else {
-            logger.info("dataFileLocation is null for file id " + dataFile.getId() + ". Can't extract NcML.");
contentType: " + dataFile.getContentType()); + if (dataFile.getContentType() != null + && (dataFile.getContentType().equals(FileUtil.MIME_TYPE_NETCDF) + || dataFile.getContentType().equals(FileUtil.MIME_TYPE_XNETCDF) + || dataFile.getContentType().equals(FileUtil.MIME_TYPE_HDF5))) { + return true; } - return extractable; + return false; } /* diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 6bb7e1d583b..7137db9ca78 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -182,6 +182,10 @@ public class FileUtil implements java.io.Serializable { //Todo - this is the same as MIME_TYPE_TSV_ALT public static final String MIME_TYPE_INGESTED_FILE = "text/tab-separated-values"; + public static final String MIME_TYPE_NETCDF = "application/netcdf"; + public static final String MIME_TYPE_XNETCDF = "application/x-netcdf"; + public static final String MIME_TYPE_HDF5 = "application/x-hdf5"; + // File type "thumbnail classes" tags: public static final String FILE_THUMBNAIL_CLASS_AUDIO = "audio"; From 18076ede66b95907f2b7996b7c8a834d9b169a94 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 25 May 2023 15:52:04 -0400 Subject: [PATCH 4/6] add dataverse.netcdf.geo-extract-s3-direct-upload config #9601 By default, keep S3 direct upload fast. Don't download NetCDF or HDF5 files to try to pull geospatial metadata out of them when S3 direct upload is configured. If you really want this, add this setting and make it true. --- doc/release-notes/9331-extract-bounding-box.md | 4 ++++ doc/sphinx-guides/source/developers/big-data-support.rst | 1 + doc/sphinx-guides/source/installation/config.rst | 8 ++++++++ doc/sphinx-guides/source/user/dataset-management.rst | 1 + .../harvard/iq/dataverse/ingest/IngestServiceBean.java | 7 +++++++ .../edu/harvard/iq/dataverse/settings/JvmSettings.java | 4 ++++ 6 files changed, 25 insertions(+) diff --git a/doc/release-notes/9331-extract-bounding-box.md b/doc/release-notes/9331-extract-bounding-box.md index c4ff83e40c0..dfd2c4cadb7 100644 --- a/doc/release-notes/9331-extract-bounding-box.md +++ b/doc/release-notes/9331-extract-bounding-box.md @@ -1 +1,5 @@ An attempt will be made to extract a geospatial bounding box (west, south, east, north) from NetCDF and HDF5 files and then insert these values into the geospatial metadata block, if enabled. + +The following JVM setting has been added: + +- dataverse.netcdf.geo-extract-s3-direct-upload diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 4d409b407f7..b238a7623eb 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -46,6 +46,7 @@ The following features are disabled when S3 direct upload is enabled. - Unzipping of zip files. (See :ref:`compressed-files`.) - Extraction of metadata from FITS files. (See :ref:`fits`.) - Creation of NcML auxiliary files (See :ref:`netcdf-and-hdf5`.) +- Extraction of a geospatial bounding box from NetCDF and HDF5 files (see :ref:`netcdf-and-hdf5`) unless :ref:`dataverse.netcdf.geo-extract-s3-direct-upload` is set to true. .. 
 .. _cors-s3-bucket:
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
index f8aef8c59da..b2f9d6b5150 100644
--- a/doc/sphinx-guides/source/installation/config.rst
+++ b/doc/sphinx-guides/source/installation/config.rst
@@ -2419,6 +2419,14 @@ Defaults to ``false``.
 
 Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_UI_SHOW_VALIDITY_FILTER``. Will accept ``[tT][rR][uU][eE]|1|[oO][nN]`` as "true" expressions.
 
+.. _dataverse.netcdf.geo-extract-s3-direct-upload:
+
+dataverse.netcdf.geo-extract-s3-direct-upload
++++++++++++++++++++++++++++++++++++++++++++++
+
+This setting exists to keep S3 direct upload lightweight. When S3 direct upload is enabled and you still want a geospatial bounding box to be extracted from NetCDF and HDF5 files (see :ref:`netcdf-and-hdf5`), which in this scenario requires downloading the file from S3, set this to ``true``.
+
+See also :ref:`s3-direct-upload-features-disabled`.
 
 .. _feature-flags:

diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst
index bf6e118cc21..a4f6251f11f 100755
--- a/doc/sphinx-guides/source/user/dataset-management.rst
+++ b/doc/sphinx-guides/source/user/dataset-management.rst
@@ -373,6 +373,7 @@ Please note the following rules regarding these fields:
 - If West Longitude and East Longitude are both over 180 (outside the expected -180:180 range), 360 will be subtracted to shift the values from the 0:360 range to the expected -180:180 range.
 - If either West Longitude or East Longitude are less than zero but the other longitude is greater than 180 (which would imply an indeterminate domain, a lack of clarity of if the domain is -180:180 or 0:360), metadata will be not be extracted.
 - If the bounding box was successfully populated, the subsequent removal of the NetCDF or HDF5 file from the dataset does not automatically remove the bounding box from the dataset metadata. You must remove the bounding box manually, if desired.
+- This feature is disabled if S3 direct upload is enabled (see :ref:`s3-direct-upload-features-disabled`) unless :ref:`dataverse.netcdf.geo-extract-s3-direct-upload` has been set to true.
 
 If the bounding box was successfully populated, :ref:`geospatial-search` should be able to find it.
diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
index 560843a7e71..c9e2cb3115d 100644
--- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
@@ -69,6 +69,7 @@
 import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav.SAVFileReaderSpi;
 import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReader;
 import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReaderSpi;
+import edu.harvard.iq.dataverse.settings.JvmSettings;
 import edu.harvard.iq.dataverse.util.*;
 
 import org.apache.commons.io.IOUtils;
@@ -105,6 +106,7 @@
 import java.util.ListIterator;
 import java.util.logging.Logger;
 import java.util.Hashtable;
+import java.util.Optional;
 import javax.ejb.EJB;
 import javax.ejb.Stateless;
 import javax.inject.Named;
@@ -1280,6 +1282,11 @@ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataF
                     dataFileLocation = localFile.getAbsolutePath();
                     logger.info("extractMetadataFromNetcdf: file is local. Path: " + dataFileLocation);
                 } else {
+                    Optional<Boolean> allow = JvmSettings.GEO_EXTRACT_S3_DIRECT_UPLOAD.lookupOptional(Boolean.class);
+                    if (!(allow.isPresent() && allow.get())) {
+                        logger.info("extractMetadataFromNetcdf: skipping because the config is set to not slow down S3 direct upload.");
+                        return false;
+                    }
                     // Need to create a temporary local file:
                     tempFile = File.createTempFile("tempFileExtractMetadataNetcdf", ".tmp");
                     try ( ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) {
diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java
index 1f2f84bc256..62da27671b9 100644
--- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java
+++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java
@@ -118,6 +118,10 @@ public enum JvmSettings {
     SCOPE_UI(PREFIX, "ui"),
     UI_ALLOW_REVIEW_INCOMPLETE(SCOPE_UI, "allow-review-for-incomplete"),
     UI_SHOW_VALIDITY_FILTER(SCOPE_UI, "show-validity-filter"),
+
+    // NetCDF SETTINGS
+    SCOPE_NETCDF(PREFIX, "netcdf"),
+    GEO_EXTRACT_S3_DIRECT_UPLOAD(SCOPE_NETCDF, "geo-extract-s3-direct-upload"),
     ;
 
     private static final String SCOPE_SEPARATOR = ".";
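Note on PATCH 4/6: once applied, the new option can presumably be enabled with
the same asadmin JVM-option pattern the guides use for other settings (a
sketch; the exact invocation depends on your app server setup):

    ./asadmin create-jvm-options "-Ddataverse.netcdf.geo-extract-s3-direct-upload=true"

Since JvmSettings values are read through MicroProfile Config, the equivalent
environment variable should be DATAVERSE_NETCDF_GEO_EXTRACT_S3_DIRECT_UPLOAD=true,
assuming the same mapping convention as the other settings documented in
config.rst.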
tempLocationPath: " + tempLocationPath); - logger.info("fileMetadataExtractableFromNetcdf dataFileIn: " + dataFile + ". tempLocationPath: " + tempLocationPath + ". contentType: " + dataFile.getContentType()); + logger.fine("fileMetadataExtractableFromNetcdf dataFileIn: " + dataFile + ". tempLocationPath: " + tempLocationPath + ". contentType: " + dataFile.getContentType()); if (dataFile.getContentType() != null && (dataFile.getContentType().equals(FileUtil.MIME_TYPE_NETCDF) || dataFile.getContentType().equals(FileUtil.MIME_TYPE_XNETCDF) @@ -1267,10 +1266,10 @@ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataF String dataFileLocation = null; if (tempFileLocation != null) { - logger.info("tempFileLocation is non null. Setting dataFileLocation to " + tempFileLocation); + logger.fine("tempFileLocation is non null. Setting dataFileLocation to " + tempFileLocation); dataFileLocation = tempFileLocation; } else { - logger.info("tempFileLocation is null. Perhaps the file is alrady on disk or S3 direct upload is enabled."); + logger.fine("tempFileLocation is null. Perhaps the file is alrady on disk or S3 direct upload is enabled."); File tempFile = null; File localFile; StorageIO storageIO; @@ -1280,11 +1279,11 @@ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataF if (storageIO.isLocalFile()) { localFile = storageIO.getFileSystemPath().toFile(); dataFileLocation = localFile.getAbsolutePath(); - logger.info("extractMetadataFromNetcdf: file is local. Path: " + dataFileLocation); + logger.fine("extractMetadataFromNetcdf: file is local. Path: " + dataFileLocation); } else { Optional allow = JvmSettings.GEO_EXTRACT_S3_DIRECT_UPLOAD.lookupOptional(Boolean.class); if (!(allow.isPresent() && allow.get())) { - logger.info("extractMetadataFromNetcdf: skipping because of config is set to not slow down S3 remote upload."); + logger.fine("extractMetadataFromNetcdf: skipping because of config is set to not slow down S3 remote upload."); return false; } // Need to create a temporary local file: @@ -1293,7 +1292,7 @@ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataF tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize()); } dataFileLocation = tempFile.getAbsolutePath(); - logger.info("extractMetadataFromNetcdf: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); + logger.fine("extractMetadataFromNetcdf: file is on S3. Downloaded and saved to temp path: " + dataFileLocation); } } catch (IOException ex) { logger.info("extractMetadataFromNetcdf, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); @@ -1309,7 +1308,7 @@ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataF // Locate metadata extraction plugin for the file format by looking // it up with the Ingest Service Provider Registry: NetcdfFileMetadataExtractor extractorPlugin = new NetcdfFileMetadataExtractor(); - logger.info("creating file from " + dataFileLocation); + logger.fine("creating file from " + dataFileLocation); File file = new File(dataFileLocation); FileMetadataIngest extractedMetadata = extractorPlugin.ingestFile(file); Map> extractedMetadataMap = extractedMetadata.getMetadataMap(); @@ -1347,11 +1346,11 @@ public boolean extractMetadataNcml(DataFile dataFile, Path tempLocationPath) { InputStream inputStream = null; String dataFileLocation = null; if (tempLocationPath != null) { - logger.info("extractMetadataNcml: tempLocationPath is non null. 
-            logger.info("extractMetadataNcml: tempLocationPath is non null. Setting dataFileLocation to " + tempLocationPath);
+            logger.fine("extractMetadataNcml: tempLocationPath is non null. Setting dataFileLocation to " + tempLocationPath);
             // This file was just uploaded and hasn't been saved to S3 or local storage.
             dataFileLocation = tempLocationPath.toString();
         } else {
-            logger.info("extractMetadataNcml: tempLocationPath null. Calling getExistingFile for dataFileLocation.");
+            logger.fine("extractMetadataNcml: tempLocationPath null. Calling getExistingFile for dataFileLocation.");
             dataFileLocation = getExistingFile(dataFile, dataFileLocation);
         }
         if (dataFileLocation != null) {
@@ -1423,8 +1422,7 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) {
             if (storageIO.isLocalFile()) {
                 localFile = storageIO.getFileSystemPath().toFile();
                 dataFileLocation = localFile.getAbsolutePath();
-                logger.fine("extractMetadataNcml: file is local. Path: " + dataFileLocation);
-                logger.info("getExistingFile: file is local. Path: " + dataFileLocation);
+                logger.fine("getExistingFile: file is local. Path: " + dataFileLocation);
             } else {
                 // Need to create a temporary local file:
                 tempFile = File.createTempFile("tempFileExtractMetadataNcml", ".tmp");
@@ -1432,12 +1430,10 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) {
                     tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize());
                 }
                 dataFileLocation = tempFile.getAbsolutePath();
-                logger.fine("extractMetadataNcml: file is on S3. Downloaded and saved to temp path: " + dataFileLocation);
-                logger.info("getExistingFile: file is on S3. Downloaded and saved to temp path: " + dataFileLocation);
+                logger.fine("getExistingFile: file is on S3. Downloaded and saved to temp path: " + dataFileLocation);
             }
         } catch (IOException ex) {
-            logger.info("While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex);
-            logger.info("getExistingFile: While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex);
Exception: " + ex); } return dataFileLocation; } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java b/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java index 66f0c25f3d7..9221a6ca679 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/metadataextraction/impl/plugins/netcdf/NetcdfFileMetadataExtractor.java @@ -56,7 +56,7 @@ public FileMetadataIngest ingestFile(File file) throws IOException { String northLatitudeFinal = geoFields.get(NORTH_LATITUDE); String southLatitudeFinal = geoFields.get(SOUTH_LATITUDE); - logger.info(getLineStringsUrl(westLongitudeFinal, southLatitudeFinal, eastLongitudeFinal, northLatitudeFinal)); + logger.fine(getLineStringsUrl(westLongitudeFinal, southLatitudeFinal, eastLongitudeFinal, northLatitudeFinal)); Map> metadataMap = new HashMap<>(); metadataMap.put(WEST_LONGITUDE, new HashSet<>()); @@ -102,7 +102,7 @@ private Map parseGeospatial(NetcdfFile netcdfFile) { geoFields.put(DatasetFieldConstant.northLatitude, getValue(northLatitude)); geoFields.put(DatasetFieldConstant.southLatitude, getValue(southLatitude)); - logger.info(getLineStringsUrl( + logger.fine(getLineStringsUrl( geoFields.get(DatasetFieldConstant.westLongitude), geoFields.get(DatasetFieldConstant.southLatitude), geoFields.get(DatasetFieldConstant.eastLongitude), From 2d6564aba1c44381ffd141c86c9de4ba82b5781d Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 26 May 2023 11:10:49 -0400 Subject: [PATCH 6/6] set expectations about NcML files (modern only) #9153 #9601 --- doc/sphinx-guides/source/user/dataset-management.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index a4f6251f11f..f530e825a19 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -353,6 +353,8 @@ NcML For NetCDF and HDF5 files, an attempt will be made to extract metadata in NcML_ (XML) format and save it as an auxiliary file. (See also :doc:`/developers/aux-file-support` in the Developer Guide.) A previewer for these NcML files is available (see :ref:`file-previews`). +Please note that only modern versions of these formats, the ones based on HDF5 such as NetCDF 4+ and HDF5 itself (rather than HDF4), will yield an NcML auxiliary file. + .. _NcML: https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_overview.html Geospatial Bounding Box