From 99c606676b359f80f397e84dcc0adf2aecd2d207 Mon Sep 17 00:00:00 2001 From: mlm483 <128052931+mlm483@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:27:58 -0400 Subject: [PATCH 1/5] [BI-1900] - duplicate column name handling --- .../breedinginsight/utilities/FileUtil.java | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/breedinginsight/utilities/FileUtil.java b/src/main/java/org/breedinginsight/utilities/FileUtil.java index 37f401f76..00cd0abc0 100644 --- a/src/main/java/org/breedinginsight/utilities/FileUtil.java +++ b/src/main/java/org/breedinginsight/utilities/FileUtil.java @@ -18,6 +18,9 @@ package org.breedinginsight.utilities; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.io.input.BOMInputStream; import org.apache.poi.EncryptedDocumentException; import org.apache.poi.ss.usermodel.*; import org.breedinginsight.services.parsers.ParsingException; @@ -31,6 +34,7 @@ import java.io.*; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.util.*; @@ -70,7 +74,14 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR try { columns = new HashMap<>(); headerRow = sheet.getRow(headerRowIndex); - headerRow.forEach(cell -> columns.put(formatter.formatCellValue(cell), new ArrayList<>())); + // Throw an exception if duplicate header values are found. + for (Cell cell: headerRow) { + if (columns.containsKey(formatter.formatCellValue(cell))) { + // Duplicate column header. + throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES); + } + columns.put(formatter.formatCellValue(cell), new ArrayList<>()); + } for (int i = headerRowIndex + 1; i <= sheet.getLastRowNum(); i++) { Row row = sheet.getRow(i); Iterator cellIterator = row.cellIterator(); @@ -94,7 +105,12 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR } } } - } catch (Exception e) { + } + catch (ParsingException e) { + log.error(e.toString()); + throw e; + } + catch (Exception e) { log.error(e.toString()); throw new ParsingException(ParsingExceptionType.ERROR_READING_FILE); } @@ -131,6 +147,17 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR public static Table parseTableFromCsv(InputStream inputStream) throws ParsingException { //TODO: See if this has the windows BOM issue try { + Reader reader = new InputStreamReader(new BOMInputStream(inputStream), StandardCharsets.UTF_8); + CSVParser parser = new CSVParser(reader, CSVFormat.EXCEL); + HashSet columnNames = new HashSet<>(); + for (String columnName: parser.getRecords().get(0)) { + log.debug(columnName); + if (columnNames.contains(columnName)) { + log.debug("Duplicate column header name: " + columnName); + throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES); + } + columnNames.add(columnName); + } //Jackson used downstream messily converts LOCAL_DATE/LOCAL_DATETIME, so need to interpret date input as strings //Note that if another type is needed later this is what needs to be updated ArrayList acceptedTypes = new ArrayList<>(Arrays.asList(ColumnType.STRING, ColumnType.INTEGER, ColumnType.DOUBLE, ColumnType.FLOAT)); @@ -141,7 +168,7 @@ public static Table parseTableFromCsv(InputStream inputStream) throws ParsingExc .separator(',') ); return removeNullColumns(removeNullRows(df)); - } catch (IOException e) { + } catch (Exception e) { log.error(e.getMessage()); throw new ParsingException(ParsingExceptionType.ERROR_READING_FILE); } From 87971ec9913140ad347bfb076a2a424b48bd33fe Mon Sep 17 00:00:00 2001 From: mlm483 <128052931+mlm483@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:03:57 -0400 Subject: [PATCH 2/5] [BI-1900] - improved file validation --- .../breedinginsight/utilities/FileUtil.java | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/breedinginsight/utilities/FileUtil.java b/src/main/java/org/breedinginsight/utilities/FileUtil.java index 00cd0abc0..2d898577f 100644 --- a/src/main/java/org/breedinginsight/utilities/FileUtil.java +++ b/src/main/java/org/breedinginsight/utilities/FileUtil.java @@ -147,27 +147,39 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR public static Table parseTableFromCsv(InputStream inputStream) throws ParsingException { //TODO: See if this has the windows BOM issue try { - Reader reader = new InputStreamReader(new BOMInputStream(inputStream), StandardCharsets.UTF_8); - CSVParser parser = new CSVParser(reader, CSVFormat.EXCEL); - HashSet columnNames = new HashSet<>(); - for (String columnName: parser.getRecords().get(0)) { - log.debug(columnName); - if (columnNames.contains(columnName)) { - log.debug("Duplicate column header name: " + columnName); - throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES); + // Read inputStream into a String. + String input = new String(inputStream.readAllBytes()); + + // Check for duplicate (non-blank) column names, could also do other validations. + try (CSVParser parser = CSVParser.parse(input, CSVFormat.EXCEL);) { + HashSet columnNames = new HashSet<>(); + for (String columnName: parser.getRecords().get(0)) { + if (columnName.isBlank()) { + log.debug("Skipping blank column header."); + } else { + log.debug("Column name: " + columnName); + if (columnNames.contains(columnName)) { + log.debug("Duplicate column header name: " + columnName); + throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES); + } + columnNames.add(columnName); + } } - columnNames.add(columnName); } + + // Convert to Table. //Jackson used downstream messily converts LOCAL_DATE/LOCAL_DATETIME, so need to interpret date input as strings //Note that if another type is needed later this is what needs to be updated ArrayList acceptedTypes = new ArrayList<>(Arrays.asList(ColumnType.STRING, ColumnType.INTEGER, ColumnType.DOUBLE, ColumnType.FLOAT)); Table df = Table.read().usingOptions( CsvReadOptions - .builder(inputStream) + .builderFromString(input) .columnTypesToDetect(acceptedTypes) .separator(',') ); return removeNullColumns(removeNullRows(df)); + } catch (ParsingException e) { + throw e; } catch (Exception e) { log.error(e.getMessage()); throw new ParsingException(ParsingExceptionType.ERROR_READING_FILE); From 7cbff94848dbadb6c6bf4a48a8d9b6d9891896d1 Mon Sep 17 00:00:00 2001 From: mlm483 <128052931+mlm483@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:29:49 -0400 Subject: [PATCH 3/5] [BI-1900] - ensure tests pass --- src/main/java/org/breedinginsight/utilities/FileUtil.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/breedinginsight/utilities/FileUtil.java b/src/main/java/org/breedinginsight/utilities/FileUtil.java index 2d898577f..2c3cd2f8c 100644 --- a/src/main/java/org/breedinginsight/utilities/FileUtil.java +++ b/src/main/java/org/breedinginsight/utilities/FileUtil.java @@ -74,10 +74,10 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR try { columns = new HashMap<>(); headerRow = sheet.getRow(headerRowIndex); - // Throw an exception if duplicate header values are found. + // Build column map, throw if duplicate (non-blank) column header values are found. for (Cell cell: headerRow) { - if (columns.containsKey(formatter.formatCellValue(cell))) { - // Duplicate column header. + if (columns.containsKey(formatter.formatCellValue(cell)) && !formatter.formatCellValue(cell).isBlank()) { + // Duplicate (non-blank) column header. throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES); } columns.put(formatter.formatCellValue(cell), new ArrayList<>()); @@ -148,7 +148,7 @@ public static Table parseTableFromCsv(InputStream inputStream) throws ParsingExc //TODO: See if this has the windows BOM issue try { // Read inputStream into a String. - String input = new String(inputStream.readAllBytes()); + String input = new String(new BOMInputStream(inputStream).readAllBytes()); // Check for duplicate (non-blank) column names, could also do other validations. try (CSVParser parser = CSVParser.parse(input, CSVFormat.EXCEL);) { From c0d5ec53dfd9d8f42bb6306019d2999baa41926f Mon Sep 17 00:00:00 2001 From: mlm483 <128052931+mlm483@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:35:56 -0400 Subject: [PATCH 4/5] [BI-1900] - explicitly drop BOMs --- src/main/java/org/breedinginsight/utilities/FileUtil.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/breedinginsight/utilities/FileUtil.java b/src/main/java/org/breedinginsight/utilities/FileUtil.java index 2c3cd2f8c..626f783b7 100644 --- a/src/main/java/org/breedinginsight/utilities/FileUtil.java +++ b/src/main/java/org/breedinginsight/utilities/FileUtil.java @@ -147,8 +147,8 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR public static Table parseTableFromCsv(InputStream inputStream) throws ParsingException { //TODO: See if this has the windows BOM issue try { - // Read inputStream into a String. - String input = new String(new BOMInputStream(inputStream).readAllBytes()); + // Read inputStream into a String, exclude any Byte Order Marks. + String input = new String(new BOMInputStream(inputStream, false).readAllBytes()); // Check for duplicate (non-blank) column names, could also do other validations. try (CSVParser parser = CSVParser.parse(input, CSVFormat.EXCEL);) { From 6952b158532271f7dca5c50c026704671e8df000 Mon Sep 17 00:00:00 2001 From: mlm483 <128052931+mlm483@users.noreply.github.com> Date: Thu, 5 Oct 2023 14:01:21 -0400 Subject: [PATCH 5/5] improved comments --- src/main/java/org/breedinginsight/utilities/FileUtil.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/org/breedinginsight/utilities/FileUtil.java b/src/main/java/org/breedinginsight/utilities/FileUtil.java index 626f783b7..09a2962fa 100644 --- a/src/main/java/org/breedinginsight/utilities/FileUtil.java +++ b/src/main/java/org/breedinginsight/utilities/FileUtil.java @@ -77,7 +77,7 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR // Build column map, throw if duplicate (non-blank) column header values are found. for (Cell cell: headerRow) { if (columns.containsKey(formatter.formatCellValue(cell)) && !formatter.formatCellValue(cell).isBlank()) { - // Duplicate (non-blank) column header. + // Duplicate (non-blank) column header found. throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES); } columns.put(formatter.formatCellValue(cell), new ArrayList<>()); @@ -145,7 +145,6 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR } public static Table parseTableFromCsv(InputStream inputStream) throws ParsingException { - //TODO: See if this has the windows BOM issue try { // Read inputStream into a String, exclude any Byte Order Marks. String input = new String(new BOMInputStream(inputStream, false).readAllBytes());