diff --git a/src/main/java/org/breedinginsight/services/parsers/ParsingExceptionType.java b/src/main/java/org/breedinginsight/services/parsers/ParsingExceptionType.java index fcae51935..31e7356d2 100644 --- a/src/main/java/org/breedinginsight/services/parsers/ParsingExceptionType.java +++ b/src/main/java/org/breedinginsight/services/parsers/ParsingExceptionType.java @@ -20,11 +20,12 @@ public enum ParsingExceptionType { - MISSING_COLUMN_NAMES("Missing column names row"), + MISSING_COLUMN_NAMES_ROW("Missing column names row"), COLUMN_NAME_NOT_STRING("Column name must be string cell"), DUPLICATE_COLUMN_NAMES("Found duplicate column names"), MISSING_EXPECTED_COLUMNS("Missing expected columns"), ERROR_READING_FILE("Error reading file"), + MISSING_COLUMN_NAME("Missing one or more column names"), MISSING_SHEET( String.format("Worksheet titled '%s' is missing", FileUtil.EXCEL_DATA_SHEET_NAME) ), EMPTY_ROW("Empty row"), INVALID_TRAIT_STATUS("Invalid trait status value"), diff --git a/src/main/java/org/breedinginsight/services/parsers/excel/ExcelParser.java b/src/main/java/org/breedinginsight/services/parsers/excel/ExcelParser.java index c51913693..bee8dae82 100644 --- a/src/main/java/org/breedinginsight/services/parsers/excel/ExcelParser.java +++ b/src/main/java/org/breedinginsight/services/parsers/excel/ExcelParser.java @@ -46,7 +46,7 @@ public static List parse(Sheet sheet, Set columns) throws P Row columnNames = sheet.getRow(EXCEL_COLUMN_NAMES_ROW); if (columnNames == null) { - throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAMES); + throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAMES_ROW); } Map indexColNameMap = new HashMap<>(); diff --git a/src/main/java/org/breedinginsight/utilities/FileUtil.java b/src/main/java/org/breedinginsight/utilities/FileUtil.java index e04a332fc..37f401f76 100644 --- a/src/main/java/org/breedinginsight/utilities/FileUtil.java +++ b/src/main/java/org/breedinginsight/utilities/FileUtil.java @@ -25,13 +25,13 @@ import tech.tablesaw.api.ColumnType; import tech.tablesaw.api.StringColumn; import tech.tablesaw.api.Table; +import tech.tablesaw.columns.Column; import tech.tablesaw.io.csv.CsvReadOptions; import tech.tablesaw.io.json.JsonReadOptions; import java.io.*; import java.math.BigDecimal; import java.util.*; -import java.util.stream.Collectors; @Slf4j @@ -106,6 +106,19 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR while (headerIterator.hasNext()) { Cell cell = headerIterator.next(); StringColumn column = StringColumn.create(formatter.formatCellValue(cell), columns.get(formatter.formatCellValue(cell))); + // Drop columns with no data, throw exception if column has data but no header. + if (cell.getCellType() == CellType.BLANK) + { + // If data in column with no header, throw parsing exception, user likely wants to add header. + for (String value : column.asList()) { + if (!value.isBlank()) + { + throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAME); + } + } + // Silently drop columns with neither headers nor data, user likely doesn't know they exist. + continue; + } if (!colNames.add(column.name())) { throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES); } @@ -127,7 +140,7 @@ public static Table parseTableFromCsv(InputStream inputStream) throws ParsingExc .columnTypesToDetect(acceptedTypes) .separator(',') ); - return removeNullRows(df); + return removeNullColumns(removeNullRows(df)); } catch (IOException e) { log.error(e.getMessage()); throw new ParsingException(ParsingExceptionType.ERROR_READING_FILE); @@ -152,10 +165,11 @@ public static Table removeNullRows(Table table) { List allNullRows = new ArrayList<>(); // Find all null rows table.stream().forEach(row -> { - Boolean allNull = true; + boolean allNull = true; for (String columnName: row.columnNames()) { if (row.getObject(columnName) != null && !row.getObject(columnName).toString().isEmpty()) { allNull = false; + break; } } if (allNull) { @@ -168,4 +182,30 @@ public static Table removeNullRows(Table table) { } return table; } + + /** Removes columns with an empty or null header and no data from a table. */ + public static Table removeNullColumns(Table table) throws ParsingException { + ArrayList columnsToRemove = new ArrayList<>(); + int columnIndex = 0; + for (Column column : table.columns()) { + // Empty/null column headers are replaced with a placeholder by tablesaw, e.g. "C23" for the 23rd column. + // See https://github.com/jtablesaw/tablesaw/blob/42ca803e1a5fff1d4a01f5a3deabc38ced783125/core/src/main/java/tech/tablesaw/io/FileReader.java#L101. + String placeholderName = String.format("C%d", columnIndex); + if (column.name().equals(placeholderName)) { + if (column.countMissing() == column.size()) { + // Silently drop columns with neither headers nor data, user likely doesn't know they exist. + columnsToRemove.add(column); + } + else { + // If data in column with no header, throw parsing exception, user likely wants to add header. + throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAME); + } + } + ++columnIndex; + } + + table.removeColumns(columnsToRemove.toArray(Column[]::new)); + + return table; + } } diff --git a/src/test/java/org/breedinginsight/services/parsers/TraitFileParserUnitTest.java b/src/test/java/org/breedinginsight/services/parsers/TraitFileParserUnitTest.java index 06f8a9e85..17bd289b9 100644 --- a/src/test/java/org/breedinginsight/services/parsers/TraitFileParserUnitTest.java +++ b/src/test/java/org/breedinginsight/services/parsers/TraitFileParserUnitTest.java @@ -81,7 +81,7 @@ void parseCsvEmptyFile() { InputStream inputStream = new FileInputStream(file); ParsingException e = assertThrows(ParsingException.class, () -> parser.parseCsv(inputStream), "expected parsing exception"); - assertEquals(ParsingExceptionType.MISSING_COLUMN_NAMES, e.getType(), "Wrong type"); + assertEquals(ParsingExceptionType.MISSING_COLUMN_NAMES_ROW, e.getType(), "Wrong type"); } @Test diff --git a/src/test/java/org/breedinginsight/utilities/response/FileUtilUnitTest.java b/src/test/java/org/breedinginsight/utilities/response/FileUtilUnitTest.java index 14d00a25c..fbf630d8c 100644 --- a/src/test/java/org/breedinginsight/utilities/response/FileUtilUnitTest.java +++ b/src/test/java/org/breedinginsight/utilities/response/FileUtilUnitTest.java @@ -41,7 +41,6 @@ void parseExcelRemoveAllNullRows() { assertEquals(2, resultTable.rowCount(), "Wrong number of rows were parsed"); } - @Test @SneakyThrows void parseCsvNoRemoveSomeNullRows() { @@ -60,6 +59,26 @@ void parseExcelNoRemoveSomeNullRows() { assertEquals(5, resultTable.rowCount(), "Wrong number of rows were parsed"); } + @Test + @SneakyThrows + void parseCsvRemoveAllNullColumns() { + // Columns with no header and no data should be silently dropped. + File file = new File("src/test/resources/files/fileutil/file_all_null_columns.csv"); + InputStream inputStream = new FileInputStream(file); + Table resultTable = FileUtil.parseTableFromCsv(inputStream); + assertEquals(21, resultTable.columnCount(), "Wrong number of columns were parsed"); + } + + @Test + @SneakyThrows + void parseExcelRemoveAllNullColumns() { + // Columns with no header and no data should be silently dropped. + File file = new File("src/test/resources/files/fileutil/file_all_null_columns.xls"); + InputStream inputStream = new FileInputStream(file); + Table resultTable = FileUtil.parseTableFromExcel(inputStream, 0); + assertEquals(21, resultTable.columnCount(), "Wrong number of columns were parsed"); + } + @Test @SneakyThrows void writeExcelCheckColumns() { diff --git a/src/test/resources/files/fileutil/file_all_null_columns.csv b/src/test/resources/files/fileutil/file_all_null_columns.csv new file mode 100644 index 000000000..ceb3dbef3 --- /dev/null +++ b/src/test/resources/files/fileutil/file_all_null_columns.csv @@ -0,0 +1,4 @@ +Germplasm Name,Germplasm GID,Test (T) or Check (C ),Exp Title,Exp Description,Exp Unit,Exp Type,Env,Env Location,Env Year,Exp Unit ID,Exp Replicate #,Exp Block #,Row,Column,Treatment Factors,ObsUnitID,Color,INCAUDPS,INCW10SUM,INCW6SUM,,, +BRG,453,C,KRSP22-3,INSV phenotyping trial BRG x Eruption RIL population and parents,Plot,Disease resistance screening,"Salinas, CA 2022","Salinas, CA",2022,89,1,1,,,,,Green,120,6,0,,, +Eruption,454,C,KRSP22-3,INSV phenotyping trial BRG x Eruption RIL population and parents,Plot,Disease resistance screening,"Salinas, CA 2022","Salinas, CA",2022,90,1,1,,,,,Medium red,110,4,0,,, +BxE16-001,455,,KRSP22-3,INSV phenotyping trial BRG x Eruption RIL population and parents,Plot,Disease resistance screening,"Salinas, CA 2022","Salinas, CA",2022,91,1,1,,,,,Dark red,110,6,0,,, \ No newline at end of file diff --git a/src/test/resources/files/fileutil/file_all_null_columns.xls b/src/test/resources/files/fileutil/file_all_null_columns.xls new file mode 100644 index 000000000..7c76e4754 Binary files /dev/null and b/src/test/resources/files/fileutil/file_all_null_columns.xls differ