From aaf7d9a4ad53f4723d52d84a986a27cc156a25f2 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 14 Jan 2020 12:52:34 -0500 Subject: [PATCH 1/2] disables ingest of filetype text/tab-separated-values --- .../edu/harvard/iq/dataverse/ingest/IngestServiceBean.java | 4 ++-- src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index eb1b2b658d5..b2b91da93f9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -428,7 +428,7 @@ public String startIngestJobs(List dataFiles, AuthenticatedUser user) // refresh the copy of the DataFile: dataFile = fileService.find(dataFile.getId()); - long ingestSizeLimit = -1; + long ingestSizeLimit = 0; try { ingestSizeLimit = systemConfig.getTabularIngestSizeLimit(getTabDataReaderByMimeType(dataFile.getContentType()).getFormatName()); } catch (IOException ioex) { @@ -1068,7 +1068,7 @@ public static TabularDataFileReader getTabDataReaderByMimeType(String mimeType) ingestPlugin = new RDATAFileReader(new RDATAFileReaderSpi()); } else if (mimeType.equals(FileUtil.MIME_TYPE_CSV) || mimeType.equals(FileUtil.MIME_TYPE_CSV_ALT)) { ingestPlugin = new CSVFileReader(new CSVFileReaderSpi(), ','); - } else if (mimeType.equals(FileUtil.MIME_TYPE_TSV) || mimeType.equals(FileUtil.MIME_TYPE_TSV_ALT)) { + } else if (mimeType.equals(FileUtil.MIME_TYPE_TSV) /*|| mimeType.equals(FileUtil.MIME_TYPE_TSV_ALT)*/) { ingestPlugin = new CSVFileReader(new CSVFileReaderSpi(), '\t'); } else if (mimeType.equals(FileUtil.MIME_TYPE_XLSX)) { ingestPlugin = new XLSXFileReader(new XLSXFileReaderSpi()); diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 45ce6949127..832043530fc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -1246,7 +1246,7 @@ public static boolean canIngestAsTabular(String mimeType) { case MIME_TYPE_CSV: case MIME_TYPE_CSV_ALT: case MIME_TYPE_TSV: - case MIME_TYPE_TSV_ALT: + //case MIME_TYPE_TSV_ALT: case MIME_TYPE_XLSX: case MIME_TYPE_SPSS_SAV: case MIME_TYPE_SPSS_POR: From 4898066810b82ff98414579fb68866f287eb4647 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 14 Jan 2020 17:04:03 -0500 Subject: [PATCH 2/2] (a few more lines of code to protect against attempted ingests of already ingested files) --- .../dataverse/ingest/IngestServiceBean.java | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index b2b91da93f9..ccc9468f89f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -325,6 +325,10 @@ from a local InputStream (or a readChannel) into the } else { logger.fine("Failed to extract indexable metadata from file " + fileName); } + } else if (FileUtil.MIME_TYPE_INGESTED_FILE.equals(dataFile.getContentType())) { + // Make sure no *uningested* tab-delimited files are saved with the type "text/tab-separated-values"! + // "text/tsv" should be used instead: + dataFile.setContentType(FileUtil.MIME_TYPE_TSV); } // temp dbug line //System.out.println("ADDING FILE: " + fileName + "; for dataset: " + dataset.getGlobalId()); @@ -423,11 +427,11 @@ public String startIngestJobs(List dataFiles, AuthenticatedUser user) List scheduledFiles = new ArrayList<>(); for (DataFile dataFile : dataFiles) { + // refresh the copy of the DataFile: + dataFile = fileService.find(dataFile.getId()); + if (dataFile.isIngestScheduled()) { - // refresh the copy of the DataFile: - dataFile = fileService.find(dataFile.getId()); - long ingestSizeLimit = 0; try { ingestSizeLimit = systemConfig.getTabularIngestSizeLimit(getTabDataReaderByMimeType(dataFile.getContentType()).getFormatName()); @@ -731,6 +735,15 @@ public boolean ingestAsTabular(Long datafile_id) { boolean ingestSuccessful = false; boolean forceTypeCheck = false; + // Never attempt to ingest a file that's already ingested! + if (dataFile.isTabularData()) { + FileUtil.createIngestFailureReport(dataFile, "Repeated ingest attempted on a tabular data file! (status flag was: "+dataFile.getIngestStatus()); + dataFile.setIngestDone(); + dataFile = fileService.save(dataFile); + logger.warning("Repeated ingest attempted on a tabular data file (datafile id "+datafile_id+"); exiting."); + return false; + } + IngestRequest ingestRequest = dataFile.getIngestRequest(); if (ingestRequest != null) { forceTypeCheck = ingestRequest.isForceTypeCheck();