Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@

public enum ParsingExceptionType {

MISSING_COLUMN_NAMES("Missing column names row"),
MISSING_COLUMN_NAMES_ROW("Missing column names row"),
COLUMN_NAME_NOT_STRING("Column name must be string cell"),
DUPLICATE_COLUMN_NAMES("Found duplicate column names"),
MISSING_EXPECTED_COLUMNS("Missing expected columns"),
ERROR_READING_FILE("Error reading file"),
MISSING_COLUMN_NAME("Missing one or more column names"),
MISSING_SHEET( String.format("Worksheet titled '%s' is missing", FileUtil.EXCEL_DATA_SHEET_NAME) ),
EMPTY_ROW("Empty row"),
INVALID_TRAIT_STATUS("Invalid trait status value"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public static List<ExcelRecord> parse(Sheet sheet, Set<String> columns) throws P
Row columnNames = sheet.getRow(EXCEL_COLUMN_NAMES_ROW);

if (columnNames == null) {
throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAMES);
throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAMES_ROW);
}

Map<Integer, String> indexColNameMap = new HashMap<>();
Expand Down
46 changes: 43 additions & 3 deletions src/main/java/org/breedinginsight/utilities/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.columns.Column;
import tech.tablesaw.io.csv.CsvReadOptions;
import tech.tablesaw.io.json.JsonReadOptions;

import java.io.*;
import java.math.BigDecimal;
import java.util.*;
import java.util.stream.Collectors;


@Slf4j
Expand Down Expand Up @@ -106,6 +106,19 @@ public static Table parseTableFromExcel(InputStream inputStream, Integer headerR
while (headerIterator.hasNext()) {
Cell cell = headerIterator.next();
StringColumn column = StringColumn.create(formatter.formatCellValue(cell), columns.get(formatter.formatCellValue(cell)));
// Drop columns with no data, throw exception if column has data but no header.
if (cell.getCellType() == CellType.BLANK)
{
// If data in column with no header, throw parsing exception, user likely wants to add header.
for (String value : column.asList()) {
if (!value.isBlank())
{
throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAME);
}
}
// Silently drop columns with neither headers nor data, user likely doesn't know they exist.
continue;
}
if (!colNames.add(column.name())) {
throw new ParsingException(ParsingExceptionType.DUPLICATE_COLUMN_NAMES);
}
Expand All @@ -127,7 +140,7 @@ public static Table parseTableFromCsv(InputStream inputStream) throws ParsingExc
.columnTypesToDetect(acceptedTypes)
.separator(',')
);
return removeNullRows(df);
return removeNullColumns(removeNullRows(df));
} catch (IOException e) {
log.error(e.getMessage());
throw new ParsingException(ParsingExceptionType.ERROR_READING_FILE);
Expand All @@ -152,10 +165,11 @@ public static Table removeNullRows(Table table) {
List<Integer> allNullRows = new ArrayList<>();
// Find all null rows
table.stream().forEach(row -> {
Boolean allNull = true;
boolean allNull = true;
for (String columnName: row.columnNames()) {
if (row.getObject(columnName) != null && !row.getObject(columnName).toString().isEmpty()) {
allNull = false;
break;
}
}
if (allNull) {
Expand All @@ -168,4 +182,30 @@ public static Table removeNullRows(Table table) {
}
return table;
}

/**
 * Removes columns that have neither a header nor any data from a table.
 *
 * <p>A column is treated as header-less when its name equals the placeholder
 * {@code "C" + index}, where {@code index} is the column's zero-based position in the table.
 * A genuine header that happens to equal that placeholder at the same position cannot be
 * told apart from a missing header and is handled the same way.
 *
 * @param table the table to clean up; it is modified in place
 * @return the same {@code table} instance, with the empty header-less columns removed
 * @throws ParsingException of type {@code MISSING_COLUMN_NAME} if a header-less column
 *         contains at least one non-missing value
 */
public static Table removeNullColumns(Table table) throws ParsingException {
    List<Column<?>> columnsToRemove = new ArrayList<>();
    int columnIndex = 0;
    for (Column<?> column : table.columns()) {
        // Empty/null column headers are replaced with a placeholder by tablesaw, built from the
        // zero-based column index, e.g. "C22" for the 23rd column.
        // See https://github.com/jtablesaw/tablesaw/blob/42ca803e1a5fff1d4a01f5a3deabc38ced783125/core/src/main/java/tech/tablesaw/io/FileReader.java#L101.
        String placeholderName = String.format("C%d", columnIndex);
        if (column.name().equals(placeholderName)) {
            if (column.countMissing() == column.size()) {
                // Silently drop columns with neither headers nor data, user likely doesn't know they exist.
                columnsToRemove.add(column);
            } else {
                // If data in column with no header, throw parsing exception, user likely wants to add header.
                throw new ParsingException(ParsingExceptionType.MISSING_COLUMN_NAME);
            }
        }
        ++columnIndex;
    }

    // Removal is deferred until after the loop so the column list is not modified while iterating.
    table.removeColumns(columnsToRemove.toArray(Column[]::new));

    return table;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void parseCsvEmptyFile() {
InputStream inputStream = new FileInputStream(file);

ParsingException e = assertThrows(ParsingException.class, () -> parser.parseCsv(inputStream), "expected parsing exception");
assertEquals(ParsingExceptionType.MISSING_COLUMN_NAMES, e.getType(), "Wrong type");
assertEquals(ParsingExceptionType.MISSING_COLUMN_NAMES_ROW, e.getType(), "Wrong type");
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ void parseExcelRemoveAllNullRows() {
assertEquals(2, resultTable.rowCount(), "Wrong number of rows were parsed");
}


@Test
@SneakyThrows
void parseCsvNoRemoveSomeNullRows() {
Expand All @@ -60,6 +59,26 @@ void parseExcelNoRemoveSomeNullRows() {
assertEquals(5, resultTable.rowCount(), "Wrong number of rows were parsed");
}

/**
 * Verifies that CSV parsing silently drops columns that have no header and no data.
 * The fixture has 21 named columns followed by three trailing columns that are empty
 * in both the header row and every data row.
 */
@Test
@SneakyThrows
void parseCsvRemoveAllNullColumns() {
    // Columns with no header and no data should be silently dropped.
    File file = new File("src/test/resources/files/fileutil/file_all_null_columns.csv");
    // try-with-resources so the file handle is released even if parsing or the assertion fails.
    try (InputStream inputStream = new FileInputStream(file)) {
        Table resultTable = FileUtil.parseTableFromCsv(inputStream);
        assertEquals(21, resultTable.columnCount(), "Wrong number of columns were parsed");
    }
}

/**
 * Verifies that Excel parsing silently drops columns that have no header and no data.
 * NOTE(review): the .xls fixture is binary and presumably mirrors the CSV fixture
 * (21 named columns plus trailing empty ones) — confirm against the file.
 */
@Test
@SneakyThrows
void parseExcelRemoveAllNullColumns() {
    // Columns with no header and no data should be silently dropped.
    File file = new File("src/test/resources/files/fileutil/file_all_null_columns.xls");
    // try-with-resources so the file handle is released even if parsing or the assertion fails.
    try (InputStream inputStream = new FileInputStream(file)) {
        // Second argument is the header row index; the column names are on the first row.
        Table resultTable = FileUtil.parseTableFromExcel(inputStream, 0);
        assertEquals(21, resultTable.columnCount(), "Wrong number of columns were parsed");
    }
}

@Test
@SneakyThrows
void writeExcelCheckColumns() {
Expand Down
4 changes: 4 additions & 0 deletions src/test/resources/files/fileutil/file_all_null_columns.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Germplasm Name,Germplasm GID,Test (T) or Check (C ),Exp Title,Exp Description,Exp Unit,Exp Type,Env,Env Location,Env Year,Exp Unit ID,Exp Replicate #,Exp Block #,Row,Column,Treatment Factors,ObsUnitID,Color,INCAUDPS,INCW10SUM,INCW6SUM,,,
BRG,453,C,KRSP22-3,INSV phenotyping trial BRG x Eruption RIL population and parents,Plot,Disease resistance screening,"Salinas, CA 2022","Salinas, CA",2022,89,1,1,,,,,Green,120,6,0,,,
Eruption,454,C,KRSP22-3,INSV phenotyping trial BRG x Eruption RIL population and parents,Plot,Disease resistance screening,"Salinas, CA 2022","Salinas, CA",2022,90,1,1,,,,,Medium red,110,4,0,,,
BxE16-001,455,,KRSP22-3,INSV phenotyping trial BRG x Eruption RIL population and parents,Plot,Disease resistance screening,"Salinas, CA 2022","Salinas, CA",2022,91,1,1,,,,,Dark red,110,6,0,,,
Binary file not shown.