diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java index 655a9f93092..130576fcbc0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java @@ -25,27 +25,20 @@ import java.text.*; import java.util.logging.*; import java.util.*; -import java.security.NoSuchAlgorithmException; - -import javax.inject.Inject; import edu.harvard.iq.dataverse.DataTable; import edu.harvard.iq.dataverse.datavariable.DataVariable; -import edu.harvard.iq.dataverse.ingest.plugin.spi.*; import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi; import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; -import java.math.BigDecimal; import java.math.MathContext; import java.math.RoundingMode; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; - -import org.apache.commons.lang.RandomStringUtils; -import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.csv.CSVFormat; import org.apache.commons.lang.StringUtils; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVPrinter; +import org.apache.commons.csv.CSVRecord; /** * Dataverse 4.0 implementation of TabularDataFileReader for the @@ -59,18 +52,22 @@ public class CSVFileReader extends TabularDataFileReader { private static final Logger dbglog = Logger.getLogger(CSVFileReader.class.getPackage().getName()); - private static final int DIGITS_OF_PRECISION_DOUBLE = 15; + private static final int DIGITS_OF_PRECISION_DOUBLE = 15; private static final String FORMAT_IEEE754 = "%+#." 
+ DIGITS_OF_PRECISION_DOUBLE + "e"; private MathContext doubleMathContext; - private char delimiterChar = ','; - + private CSVFormat inFormat = CSVFormat.EXCEL.withHeader(); + private CSVFormat outFormat = CSVFormat.TDF; + private Set firstNumCharSet = new HashSet<>(); + // DATE FORMATS - private static SimpleDateFormat[] DATE_FORMATS = new SimpleDateFormat[] { - new SimpleDateFormat("yyyy-MM-dd") + private static SimpleDateFormat[] DATE_FORMATS = new SimpleDateFormat[]{ + new SimpleDateFormat("yyyy-MM-dd"), //new SimpleDateFormat("yyyy/MM/dd"), + //new SimpleDateFormat("MM/dd/yyyy"), + //new SimpleDateFormat("MM-dd-yyyy"), }; - + // TIME FORMATS - private static SimpleDateFormat[] TIME_FORMATS = new SimpleDateFormat[] { + private static SimpleDateFormat[] TIME_FORMATS = new SimpleDateFormat[]{ // Date-time up to seconds with timezone, e.g. 2013-04-08 13:14:23 -0500 new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z"), // Date-time up to seconds and no timezone, e.g. 2013-04-08 13:14:23 @@ -83,20 +80,23 @@ public CSVFileReader(TabularDataFileReaderSpi originator) { private void init() throws IOException { doubleMathContext = new MathContext(DIGITS_OF_PRECISION_DOUBLE, RoundingMode.HALF_EVEN); + firstNumCharSet.addAll(Arrays.asList(new Character[]{'+', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'})); } - + /** * Reads a CSV file, converts it into a dataverse DataTable. * * @param stream a BufferedInputStream. - * @param ignored * @return an TabularDataIngest object * @throws java.io.IOException if a reading error occurs. 
*/ @Override public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { init(); - + + if (stream == null) { + throw new IOException("Stream can't be null."); + } TabularDataIngest ingesteddata = new TabularDataIngest(); DataTable dataTable = new DataTable(); @@ -105,13 +105,13 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws File tabFileDestination = File.createTempFile("data-", ".tab"); PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath()); - int lineCount = readFile(localBufferedReader, dataTable, tabFileWriter); - - dbglog.fine("CSV ingest: found "+lineCount+" data cases/observations."); - dbglog.fine("Tab file produced: "+tabFileDestination.getAbsolutePath()); - + int lineCount = readFile(localBufferedReader, dataTable, tabFileWriter); + + dbglog.fine("CSV ingest: found " + lineCount + " data cases/observations."); + dbglog.fine("Tab file produced: " + tabFileDestination.getAbsolutePath()); + dataTable.setUnf("UNF:6:NOTCALCULATED"); - + ingesteddata.setTabDelimitedFile(tabFileDestination); ingesteddata.setDataTable(dataTable); return ingesteddata; @@ -119,312 +119,211 @@ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws } public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException { - - String line; - String[] valueTokens; - - int lineCounter = 0; - - // Read first line: - - line = csvReader.readLine(); - line = line.replaceFirst("[\r\n]*$", ""); - valueTokens = line.split("" + delimiterChar, -2); - - if (valueTokens == null || valueTokens.length < 1) { - throw new IOException("Failed to read first, variable name line of the CSV file."); - } - int variableCount = valueTokens.length; - - // Create variables: - - List variableList = new ArrayList(); - - for (int i = 0; i < variableCount; i++) { - String varName = valueTokens[i]; - - if (varName == null || varName.equals("")) { + List 
variableList = new ArrayList<>(); + CSVParser parser = new CSVParser(csvReader, inFormat); + dbglog.fine("Headers: " + parser.getHeaderMap()); + Map headers = parser.getHeaderMap(); + for (String varName : headers.keySet()) { + + if (varName == null || varName.isEmpty()) { // TODO: // Add a sensible variable name validation algorithm. // -- L.A. 4.0 alpha 1 - throw new IOException ("Invalid variable names in the first line! - First line of a CSV file must contain a comma-separated list of the names of the variables."); + throw new IOException("Invalid variable names in the first line! - First line of a CSV file must contain a comma-separated list of the names of the variables."); } - + DataVariable dv = new DataVariable(); dv.setName(varName); dv.setLabel(varName); - dv.setInvalidRanges(new ArrayList()); - dv.setSummaryStatistics(new ArrayList()); + dv.setInvalidRanges(new ArrayList<>()); + dv.setSummaryStatistics(new ArrayList<>()); dv.setUnf("UNF:6:NOTCALCULATED"); - dv.setCategories(new ArrayList()); + dv.setCategories(new ArrayList<>()); variableList.add(dv); dv.setTypeCharacter(); dv.setIntervalDiscrete(); - dv.setFileOrder(i); + dv.setFileOrder(headers.get(varName)); dv.setDataTable(dataTable); } - - dataTable.setVarQuantity(new Long(variableCount)); + + dataTable.setVarQuantity(new Long(variableList.size())); dataTable.setDataVariables(variableList); - - boolean[] isNumericVariable = new boolean[variableCount]; - boolean[] isIntegerVariable = new boolean[variableCount]; - boolean[] isTimeVariable = new boolean[variableCount]; - boolean[] isDateVariable = new boolean[variableCount]; - - for (int i = 0; i < variableCount; i++) { + + boolean[] isNumericVariable = new boolean[headers.size()]; + boolean[] isIntegerVariable = new boolean[headers.size()]; + boolean[] isTimeVariable = new boolean[headers.size()]; + boolean[] isDateVariable = new boolean[headers.size()]; + + for (int i = 0; i < headers.size(); i++) { // OK, let's assume that every variable is 
numeric; // but we'll go through the file and examine every value; the // moment we find a value that's not a legit numeric one, we'll // assume that it is in fact a String. - isNumericVariable[i] = true; + isNumericVariable[i] = true; isIntegerVariable[i] = true; - isDateVariable[i] = true; - isTimeVariable[i] = true; + isDateVariable[i] = true; + isTimeVariable[i] = true; } // First, "learning" pass. // (we'll save the incoming stream in another temp file:) - - SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[variableCount]; - SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[variableCount]; + SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()]; + SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()]; - File firstPassTempFile = File.createTempFile("firstpass-", ".tab"); - PrintWriter firstPassWriter = new PrintWriter(firstPassTempFile.getAbsolutePath()); - - - while ((line = csvReader.readLine()) != null) { - // chop the line: - line = line.replaceFirst("[\r\n]*$", ""); - valueTokens = line.split("" + delimiterChar, -2); - - if (valueTokens == null) { - throw new IOException("Failed to read line " + (lineCounter + 1) + " of the Data file."); - } - int tokenCount = valueTokens.length; - - if (tokenCount > variableCount) { - - // we'll make another attempt to parse the fields - there could be commas - // inside character strings. The only way to disambiguate this situation - // we are going to support, for now, is to allow commas inside tokens - // wrapped in double quotes. We may potentially add other mechanisms, - // such as allowing to specify a custom string wrapper character (something other - // than the double quote), or maybe recognizing escaped commas ("\,") as - // non-separating ones. - // -- L.A. 
4.0.2 - - valueTokens = null; - valueTokens = new String[variableCount]; - - int tokenStart = 0; - boolean quotedStringMode = false; - boolean potentialDoubleDoubleQuote = false; - tokenCount = 0; - - for (int i = 0; i < line.length(); i++) { - if (tokenCount > variableCount) { - throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " of the data file contains more than " - + variableCount + " comma-delimited values."); - } - - char c = line.charAt(i); - - if (tokenStart == i && c == '"') { - quotedStringMode = true; - } else if (c == ',' && !quotedStringMode) { - valueTokens[tokenCount] = line.substring(tokenStart, i); - tokenCount++; - tokenStart = i+1; - } else if (i == line.length() - 1) { - valueTokens[tokenCount] = line.substring(tokenStart, line.length()); - tokenCount++; - } else if (quotedStringMode && c == '"') { - quotedStringMode = false; - //unless this is a double double quote in the middle of a quoted - // string; apparently a standard notation for encoding double - // quotes inside quoted strings (??) - potentialDoubleDoubleQuote = true; - } else if (potentialDoubleDoubleQuote && c == '"') { - // OK, that was a "double double" quote. - // going back into the quoted mode: - quotedStringMode = true; - potentialDoubleDoubleQuote = false; - // TODO: figure out what we do with such double double quote - // sequences in the final tab file. Do we want to convert - // them back to a "single double" quote? - // -- L.A. 
4.0.2/4.1 - } - + try (CSVPrinter csvFilePrinter = new CSVPrinter( + new FileWriter(firstPassTempFile.getAbsolutePath()), outFormat)) { + // Write the header line + csvFilePrinter.printRecord(headers.keySet()); + for (CSVRecord record : parser.getRecords()) { + // Checks if #records = #columns in header + if (!record.isConsistent()) { + throw new IOException("Reading mismatch, line " + (parser.getCurrentLineNumber() + 1) + + " of the Data file: " + headers.size() + + " delimited values expected, " + record.size() + " found."); } - } - - //dbglog.info("Number of CSV tokens in the line number " + lineCounter + " : "+tokenCount); - - // final token count check: - - if (tokenCount != variableCount) { - - throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " of the Data file: " - + variableCount + " delimited values expected, " + tokenCount + " found."); - } - for (int i = 0; i < variableCount; i++) { - if (isNumericVariable[i]) { - // If we haven't given up on the "numeric" status of this - // variable, let's perform some tests on it, and see if - // this value is still a parsable number: - if (valueTokens[i] != null && (!valueTokens[i].equals(""))) { - - boolean isNumeric = false; - boolean isInteger = false; - - if (valueTokens[i].equalsIgnoreCase("NaN") - || valueTokens[i].equalsIgnoreCase("NA") - || valueTokens[i].equalsIgnoreCase("Inf") - || valueTokens[i].equalsIgnoreCase("+Inf") - || valueTokens[i].equalsIgnoreCase("-Inf") - || valueTokens[i].equalsIgnoreCase("null")) { - isNumeric = true; - } else { - try { - Double testDoubleValue = new Double(valueTokens[i]); - isNumeric = true; - } catch (NumberFormatException ex) { - // the token failed to parse as a double number; - // so we'll have to assume it's just a string variable. 
- } - } - - if (!isNumeric) { - isNumericVariable[i] = false; - } else if (isIntegerVariable[i]) { - try { - Integer testIntegerValue = new Integer(valueTokens[i]); - isInteger = true; - } catch (NumberFormatException ex) { - // the token failed to parse as an integer number; - // we'll assume it's a non-integere numeric... - } - if (!isInteger) { - isIntegerVariable[i] = false; + for (int i = 0; i < headers.size(); i++) { + String varString = record.get(i); + isIntegerVariable[i] = isIntegerVariable[i] + && varString != null + && !varString.isEmpty() + && (varString.equals("null") + || firstNumCharSet.contains(varString.charAt(0)) + && StringUtils.isNumeric(varString.substring(1))); + if (isNumericVariable[i]) { + // If we haven't given up on the "numeric" status of this + // variable, let's perform some tests on it, and see if + // this value is still a parsable number: + if (varString != null && !varString.isEmpty()) { + + boolean isNumeric = false; + boolean isInteger = false; + + if (varString.equalsIgnoreCase("NaN") + || varString.equalsIgnoreCase("NA") + || varString.equalsIgnoreCase("Inf") + || varString.equalsIgnoreCase("+Inf") + || varString.equalsIgnoreCase("-Inf") + || varString.equalsIgnoreCase("null")) { + continue; + } else { + try { + Double testDoubleValue = new Double(varString); + continue; + } catch (NumberFormatException ex) { + // the token failed to parse as a double number; + // so we'll have to assume it's just a string variable. 
+ } } + isNumericVariable[i] = false; } } - } - - // And if we have concluded that this is not a numeric column, - // let's see if we can parse the string token as a date or - // a date-time value: - - if (!isNumericVariable[i]) { - - Date dateResult = null; - - if (isTimeVariable[i]) { - if (valueTokens[i] != null && (!valueTokens[i].equals(""))) { - boolean isTime = false; - - if (selectedDateTimeFormat[i] != null) { - dbglog.fine("will try selected format " + selectedDateTimeFormat[i].toPattern()); - ParsePosition pos = new ParsePosition(0); - dateResult = selectedDateTimeFormat[i].parse(valueTokens[i], pos); - - if (dateResult == null) { - dbglog.fine(selectedDateTimeFormat[i].toPattern() + ": null result."); - } else if (pos.getIndex() != valueTokens[i].length()) { - dbglog.fine(selectedDateTimeFormat[i].toPattern() + ": didn't parse to the end - bad time zone?"); - } else { - // OK, successfully parsed a value! - isTime = true; - dbglog.fine(selectedDateTimeFormat[i].toPattern() + " worked!"); - } - } else { - for (SimpleDateFormat format : TIME_FORMATS) { - dbglog.fine("will try format " + format.toPattern()); + + // And if we have concluded that this is not a numeric column, + // let's see if we can parse the string token as a date or + // a date-time value: + if (!isNumericVariable[i]) { + + Date dateResult = null; + + if (isTimeVariable[i]) { + if (varString != null && !varString.isEmpty()) { + boolean isTime = false; + + if (selectedDateTimeFormat[i] != null) { + dbglog.fine("will try selected format " + selectedDateTimeFormat[i].toPattern()); ParsePosition pos = new ParsePosition(0); - dateResult = format.parse(valueTokens[i], pos); + dateResult = selectedDateTimeFormat[i].parse(varString, pos); + if (dateResult == null) { - dbglog.fine(format.toPattern() + ": null result."); - continue; + dbglog.fine(selectedDateTimeFormat[i].toPattern() + ": null result."); + } else if (pos.getIndex() != varString.length()) { + 
dbglog.fine(selectedDateTimeFormat[i].toPattern() + ": didn't parse to the end - bad time zone?"); + } else { + // OK, successfully parsed a value! + isTime = true; + dbglog.fine(selectedDateTimeFormat[i].toPattern() + " worked!"); } - if (pos.getIndex() != valueTokens[i].length()) { - dbglog.fine(format.toPattern() + ": didn't parse to the end - bad time zone?"); - continue; + } else { + for (SimpleDateFormat format : TIME_FORMATS) { + dbglog.fine("will try format " + format.toPattern()); + ParsePosition pos = new ParsePosition(0); + dateResult = format.parse(varString, pos); + if (dateResult == null) { + dbglog.fine(format.toPattern() + ": null result."); + continue; + } + if (pos.getIndex() != varString.length()) { + dbglog.fine(format.toPattern() + ": didn't parse to the end - bad time zone?"); + continue; + } + // OK, successfully parsed a value! + isTime = true; + dbglog.fine(format.toPattern() + " worked!"); + selectedDateTimeFormat[i] = format; + break; } - // OK, successfully parsed a value! - isTime = true; - dbglog.fine(format.toPattern() + " worked!"); - selectedDateTimeFormat[i] = format; - break; } - } - if (!isTime) { - isTimeVariable[i] = false; - // OK, the token didn't parse as a time value; - // But we will still try to parse it as a date, below. - // unless of course we have already decided that this column - // is NOT a date. - } else { - // And if it is a time value, we are going to assume it's - // NOT a date. - isDateVariable[i] = false; + if (!isTime) { + isTimeVariable[i] = false; + // OK, the token didn't parse as a time value; + // But we will still try to parse it as a date, below. + // unless of course we have already decided that this column + // is NOT a date. + } else { + // And if it is a time value, we are going to assume it's + // NOT a date. 
+ isDateVariable[i] = false; + } } } - } - if (isDateVariable[i]) { - if (valueTokens[i] != null && (!valueTokens[i].equals(""))) { - boolean isDate = false; - - // TODO: - // Strictly speaking, we should be doing the same thing - // here as with the time formats above; select the - // first one that works, then insist that all the - // other values in this column match it... but we - // only have one, as of now, so it should be ok. - // -- L.A. 4.0 beta - - for (SimpleDateFormat format : DATE_FORMATS) { - // Strict parsing - it will throw an - // exception if it doesn't parse! - format.setLenient(false); - dbglog.fine("will try format " + format.toPattern()); - try { - dateResult = format.parse(valueTokens[i]); - dbglog.fine("format " + format.toPattern() + " worked!"); - isDate = true; - selectedDateFormat[i] = format; - break; - } catch (ParseException ex) { - //Do nothing - dbglog.fine("format " + format.toPattern() + " didn't work."); + if (isDateVariable[i]) { + if (varString != null && !varString.isEmpty()) { + boolean isDate = false; + + // TODO: + // Strictly speaking, we should be doing the same thing + // here as with the time formats above; select the + // first one that works, then insist that all the + // other values in this column match it... but we + // only have one, as of now, so it should be ok. + // -- L.A. 4.0 beta + for (SimpleDateFormat format : DATE_FORMATS) { + // Strict parsing - it will throw an + // exception if it doesn't parse! 
+ format.setLenient(false); + dbglog.fine("will try format " + format.toPattern()); + try { + format.parse(record.get(i)); + dbglog.fine("format " + format.toPattern() + " worked!"); + isDate = true; + selectedDateFormat[i] = format; + break; + } catch (ParseException ex) { + //Do nothing + dbglog.fine("format " + format.toPattern() + " didn't work."); + } } + isDateVariable[i] = isDate; } - if (!isDate) { - isDateVariable[i] = false; - } } } } + + csvFilePrinter.printRecord(record); } - - firstPassWriter.println(line); - lineCounter++; } - - firstPassWriter.close(); + dataTable.setCaseQuantity(parser.getCurrentLineNumber()); + parser.close(); csvReader.close(); - dataTable.setCaseQuantity(new Long(lineCounter)); - // Re-type the variables that we've determined are numerics: - - for (int i = 0; i < variableCount; i++) { + for (int i = 0; i < headers.size(); i++) { if (isNumericVariable[i]) { dataTable.getDataVariables().get(i).setTypeNumeric(); @@ -444,207 +343,147 @@ public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter f dataTable.getDataVariables().get(i).setFormatCategory("time"); } } - - // Second, final pass. - - // Re-open the saved file and reset the line counter: - - BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile)); - lineCounter = 0; - String[] caseRow = new String[variableCount]; - - - while ((line = secondPassReader.readLine()) != null) { - // chop the line: - line = line.replaceFirst("[\r\n]*$", ""); - valueTokens = line.split("" + delimiterChar, -2); - - if (valueTokens == null) { - throw new IOException("Failed to read line " + (lineCounter + 1) + " during the second pass."); - } - - int tokenCount = valueTokens.length; - - if (tokenCount > variableCount) { - - // again, similar check for quote-encased strings that contain - // commas inside them. - // -- L.A. 
4.0.2 - - valueTokens = null; - valueTokens = new String[variableCount]; - - int tokenStart = 0; - boolean quotedStringMode = false; - boolean potentialDoubleDoubleQuote = false; - tokenCount = 0; - - for (int i = 0; i < line.length(); i++) { - if (tokenCount > variableCount) { - throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " of the data file contains more than " - + variableCount + " comma-delimited values."); - } - - char c = line.charAt(i); - - if (tokenStart == i && c == '"') { - quotedStringMode = true; - } else if (c == ',' && !quotedStringMode) { - valueTokens[tokenCount] = line.substring(tokenStart, i); - tokenCount++; - tokenStart = i+1; - } else if (i == line.length() - 1) { - valueTokens[tokenCount] = line.substring(tokenStart, line.length()); - tokenCount++; - } else if (quotedStringMode && c == '"') { - quotedStringMode = false; - potentialDoubleDoubleQuote = true; - } else if (potentialDoubleDoubleQuote && c == '"') { - quotedStringMode = true; - potentialDoubleDoubleQuote = false; - } - + // Second, final pass. + try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) { + dbglog.info("Tmp File: " + firstPassTempFile); + parser = new CSVParser(secondPassReader, outFormat.withHeader()); + String[] caseRow = new String[headers.size()]; + + finalOut.println(StringUtils.join(headers.keySet().toArray(new String[0]), "\t")); + for (CSVRecord record : parser) { + if (!record.isConsistent()) { + throw new IOException("Reading mismatch, line " + (parser.getCurrentLineNumber() + 1) + + " of the Data file: " + headers.size() + + " delimited values expected, " + record.size() + " found."); } - } - - // TODO: - // isolate CSV parsing into its own method/class, to avoid - // code duplication in the 2 passes, above; - // do not save the result of the 1st pass - simply reopen the - // original file (?). - // -- L.A. 
4.0.2/4.1 - - if (tokenCount != variableCount) { - throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " during the second pass: " - + variableCount + " delimited values expected, " + valueTokens.length + " found."); - } - - for (int i = 0; i < variableCount; i++) { - if (isNumericVariable[i]) { - if (valueTokens[i] == null || valueTokens[i].equalsIgnoreCase("") || valueTokens[i].equalsIgnoreCase("NA")) { - // Missing value - represented as an empty string in - // the final tab file - caseRow[i] = ""; - } else if (valueTokens[i].equalsIgnoreCase("NaN")) { - // "Not a Number" special value: - caseRow[i] = "NaN"; - } else if (valueTokens[i].equalsIgnoreCase("Inf") - || valueTokens[i].equalsIgnoreCase("+Inf")) { - // Positive infinity: - caseRow[i] = "Inf"; - } else if (valueTokens[i].equalsIgnoreCase("-Inf")) { - // Negative infinity: - caseRow[i] = "-Inf"; - } else if (valueTokens[i].equalsIgnoreCase("null")) { - // By request from Gus - "NULL" is recognized as a - // numeric zero: - if (isIntegerVariable[i]) { - caseRow[i] = "0"; + + // TODO: + // isolate CSV parsing into its own method/class, to avoid + // code duplication in the 2 passes, above; + // do not save the result of the 1st pass - simply reopen the + // original file (?). + // -- L.A. 
4.0.2/4.1 + for (int i = 0; i < headers.size(); i++) { + String varString = record.get(i); + if (isNumericVariable[i]) { + if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) { + // Missing value - represented as an empty string in + // the final tab file + caseRow[i] = ""; + } else if (varString.equalsIgnoreCase("NaN")) { + // "Not a Number" special value: + caseRow[i] = "NaN"; + } else if (varString.equalsIgnoreCase("Inf") + || varString.equalsIgnoreCase("+Inf")) { + // Positive infinity: + caseRow[i] = "Inf"; + } else if (varString.equalsIgnoreCase("-Inf")) { + // Negative infinity: + caseRow[i] = "-Inf"; + } else if (varString.equalsIgnoreCase("null")) { + // By request from Gus - "NULL" is recognized as a + // numeric zero: + caseRow[i] = isIntegerVariable[i] ? "0" : "0.0"; } else { - caseRow[i] = "0.0"; + /* No re-formatting is done on any other numeric values. + * We'll save them as they were, for archival purposes. + * The alternative solution - formatting in sci. notation + * is commented-out below. + */ + caseRow[i] = varString; + /* + if (isIntegerVariable[i]) { + try { + Integer testIntegerValue = new Integer(valueTokens[i]); + caseRow[i] = testIntegerValue.toString(); + } catch (NumberFormatException ex) { + throw new IOException ("Failed to parse a value recognized as an integer in the first pass! 
(?)"); + } + } else { + try { + Double testDoubleValue = new Double(valueTokens[i]); + if (testDoubleValue.equals(0.0)) { + caseRow[i] = "0.0"; + } else { + // One possible implementation: + // + // Round our fractional values to 15 digits + // (minimum number of digits of precision guaranteed by + // type Double) and format the resulting representations + // in a IEEE 754-like "scientific notation" - for ex., + // 753.24 will be encoded as 7.5324e2 + BigDecimal testBigDecimal = new BigDecimal(valueTokens[i], doubleMathContext); + // an experiment - what's gonna happen if we just + // use the string representation of the bigdecimal object + // above? + //caseRow[i] = testBigDecimal.toString(); + = + caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal); + + // Strip meaningless zeros and extra + signs: + caseRow[i] = caseRow[i].replaceFirst("00*e", "e"); + caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e"); + caseRow[i] = caseRow[i].replaceFirst("e\\+00", ""); + caseRow[i] = caseRow[i].replaceFirst("^\\+", ""); + } + } catch (NumberFormatException ex) { + throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)"); + } + } + */ } - } else { - /* No re-formatting is done on any other numeric values. - * We'll save them as they were, for archival purposes. - * The alternative solution - formatting in sci. notation - * is commented-out below. - */ - caseRow[i] = valueTokens[i]; - /* - if (isIntegerVariable[i]) { - try { - Integer testIntegerValue = new Integer(valueTokens[i]); - caseRow[i] = testIntegerValue.toString(); - } catch (NumberFormatException ex) { - throw new IOException ("Failed to parse a value recognized as an integer in the first pass! (?)"); - } + } else if (isTimeVariable[i] || isDateVariable[i]) { + // Time and Dates are stored NOT quoted (don't ask). 
+ if (varString != null) { + String charToken = varString; + // Dealing with quotes: + // remove the leading and trailing quotes, if present: + charToken = charToken.replaceFirst("^\"*", ""); + charToken = charToken.replaceFirst("\"*$", ""); + caseRow[i] = charToken; } else { - try { - Double testDoubleValue = new Double(valueTokens[i]); - if (testDoubleValue.equals(0.0)) { - caseRow[i] = "0.0"; - } else { - // One possible implementation: - // - // Round our fractional values to 15 digits - // (minimum number of digits of precision guaranteed by - // type Double) and format the resulting representations - // in a IEEE 754-like "scientific notation" - for ex., - // 753.24 will be encoded as 7.5324e2 - BigDecimal testBigDecimal = new BigDecimal(valueTokens[i], doubleMathContext); - // an experiment - what's gonna happen if we just - // use the string representation of the bigdecimal object - // above? - //caseRow[i] = testBigDecimal.toString(); -= - caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal); - - // Strip meaningless zeros and extra + signs: - caseRow[i] = caseRow[i].replaceFirst("00*e", "e"); - caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e"); - caseRow[i] = caseRow[i].replaceFirst("e\\+00", ""); - caseRow[i] = caseRow[i].replaceFirst("^\\+", ""); - } - - } catch (NumberFormatException ex) { - throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)"); - } + caseRow[i] = ""; } - */ - } - } else if (isTimeVariable[i] || isDateVariable[i]) { - // Time and Dates are stored NOT quoted (don't ask). 
- if (valueTokens[i] != null) { - String charToken = valueTokens[i]; - // Dealing with quotes: - // remove the leading and trailing quotes, if present: - charToken = charToken.replaceFirst("^\"*", ""); - charToken = charToken.replaceFirst("\"*$", ""); - caseRow[i] = charToken; - } else { - caseRow[i] = ""; - } - } else { - // Treat as a String: - // Strings are stored in tab files quoted; - // Missing values are stored as tab-delimited nothing - - // i.e., an empty string between two tabs (or one tab and - // the new line); - // Empty strings stored as "" (quoted empty string). - // For the purposes of this CSV ingest reader, we are going - // to assume that all the empty strings in the file are - // indeed empty strings, and NOT missing values: - if (valueTokens[i] != null) { - String charToken = valueTokens[i]; - // Dealing with quotes: - // remove the leading and trailing quotes, if present: - charToken = charToken.replaceFirst("^\"", ""); - charToken = charToken.replaceFirst("\"$", ""); - // escape the remaining ones: - charToken = charToken.replace("\"", "\\\""); - // final pair of quotes: - charToken = "\"" + charToken + "\""; - caseRow[i] = charToken; } else { - caseRow[i] = "\"\""; + // Treat as a String: + // Strings are stored in tab files quoted; + // Missing values are stored as tab-delimited nothing - + // i.e., an empty string between two tabs (or one tab and + // the new line); + // Empty strings stored as "" (quoted empty string). 
+ // For the purposes of this CSV ingest reader, we are going + // to assume that all the empty strings in the file are + // indeed empty strings, and NOT missing values: + if (record.get(i) != null) { + String charToken = varString; + // Dealing with quotes: + // remove the leading and trailing quotes, if present: + charToken = charToken.replaceFirst("^\"", ""); + charToken = charToken.replaceFirst("\"$", ""); + // escape the remaining ones: + charToken = charToken.replace("\"", "\\\""); + // final pair of quotes: + charToken = "\"" + charToken + "\""; + caseRow[i] = charToken; + } else { + caseRow[i] = "\"\""; + } } } + dbglog.fine("CaseRow: " + Arrays.toString(caseRow)); + finalOut.println(StringUtils.join(caseRow, "\t")); } - - finalOut.println(StringUtils.join(caseRow, "\t")); - lineCounter++; - - } - - secondPassReader.close(); finalOut.close(); - - if (dataTable.getCaseQuantity().intValue() != lineCounter) { - throw new IOException("Mismatch between line counts in first and final passes!"); + long linecount = parser.getCurrentLineNumber(); + parser.close(); + if (dataTable.getCaseQuantity().intValue() != linecount) { + throw new IOException("Mismatch between line counts in first and final passes!, " + + dataTable.getCaseQuantity().intValue() + " found on first count, but " + + linecount + " found on second."); } - - return lineCounter; + new File(firstPassTempFile.getAbsolutePath()).delete(); + return (int) linecount; } } diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/BrokenCSV.csv b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/BrokenCSV.csv new file mode 100644 index 00000000000..4b84b5d601a --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/BrokenCSV.csv @@ -0,0 +1,4 @@ +1,2,3,4,5,6 +1,3,4,5,6,7 +"1,2",3,4,5,6,4 +3,1,3,4 diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReaderTest.java 
b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReaderTest.java
new file mode 100644
index 00000000000..531cf6c184d
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReaderTest.java
@@ -0,0 +1,109 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.csv;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests for {@link CSVFileReader}: a deliberately awkward but well-formed CSV
+ * must be converted into the expected tab-delimited rows, while a null stream
+ * or a row with the wrong number of values must be rejected with an
+ * {@link IOException} carrying a specific message.
+ *
+ * @author oscardssmith
+ */
+public class CSVFileReaderTest {
+
+    /** Directory holding the CSV fixtures, relative to the working directory. */
+    private static final String FIXTURE_DIR
+            = "src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/";
+
+    /**
+     * Tests CSVFileReader with a hellish CSV containing everything nasty its
+     * author could think of to throw at it.
+     *
+     * @throws IOException if the fixture or the generated tab file can't be
+     * read; letting it propagate keeps the full stack trace in the report.
+     */
+    @Test
+    public void testRead() throws IOException {
+        String testFile = FIXTURE_DIR + "InjestCSV.csv";
+        // Field separators are written as \t escapes so they stay visible and
+        // survive tools that rewrite whitespace.
+        String[] expResult = {
+            "ints\tStrings\tTimes\tNot quite Times\tDates\tNot quite Dates\tNumbers\tNot quite Ints\tNot quite Numbers\t"
+            + "Column that hates you and is so long that things might start breaking because we previously had a 255 "
+            + "character limit on length for things even when that might not be completely justified. Wow this is an "
+            + "increadibly long header name. Who made this? Oh, that's right, I did.",
+            "-199\t\"hello\"\t2013-04-08 13:14:23\t2013-04-08 13:14:23\t2017-06-20\t\"2017/06/20\"\t0.0\t1\t\"2\"\t\"823478788778713\"",
+            "2\t\"Sdfwer\"\t2013-04-08 13:14:23\t2013-04-08 13:14:23\t2017-06-20\t\"1100/06/20\"\tInf\t2\t\"NaN\"\t\",1,2,3\"",
+            "0\t\"cjlajfo.\"\t2013-04-08 13:14:23\t2013-04-08 13:14:23\t2017-06-20\t\"3000/06/20\"\t-Inf\t3\t\"inf\"\t\"\\casdf\"",
+            "-1\t\"Mywer\"\t2013-04-08 13:14:23\t2013-04-08 13:14:23\t2017-06-20\t\"06-20-2011\"\t3.141592653\t4\t\"4.8\"\t\" \\\" \"",
+            "266128\t\"Sf\"\t2013-04-08 13:14:23\t2013-04-08 13:14:23\t2017-06-20\t\"06-20-1917\"\t0\t5\t\"Inf+11\"\t\"\"",
+            "0\t\"werxc\"\t2013-04-08 13:14:23\t2013-04-08 13:14:23\t2017-06-20\t\"03/03/1817\"\t123\t6.000001\t\"11-2\"\t\"adf\\0\\na\\td",
+            "sf\"",
+            "-2389\t\"Dfjl\"\t2013-04-08 13:14:23\t2013-04-08 13:14:72\t2017-06-20\t\"2017-03-12\"\tNaN\t2\t\"nap\"\t\"💩⌛👩🏻■\""
+        };
+
+        try (BufferedInputStream stream = new BufferedInputStream(
+                new FileInputStream(testFile))) {
+            CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi());
+            // NOTE(review): FileReader decodes with the platform default charset
+            // and the fixture contains non-ASCII text - confirm the encoding the
+            // reader uses when writing the tab file.
+            try (BufferedReader result = new BufferedReader(
+                    new FileReader(instance.read(stream, null).getTabDelimitedFile()))) {
+                for (int i = 0; i < expResult.length; i++) {
+                    assertEquals("Line " + (i + 1) + " of the tab-delimited file",
+                            expResult[i], result.readLine());
+                }
+            }
+        }
+    }
+
+    /**
+     * Tests CSVFileReader with a null CSV.
+     */
+    @Test
+    public void testNullStream() {
+        try {
+            new CSVFileReader(new CSVFileReaderSpi()).read(null, null);
+            fail("IOException not thrown on null csv");
+        } catch (IOException ex) {
+            String expMessage = "Stream can't be null.";
+            assertEquals(expMessage, ex.getMessage());
+        }
+    }
+
+    /**
+     * Tests CSVFileReader with a CSV whose last row has fewer values than the
+     * header row.
+     *
+     * @throws IOException if the fixture itself can't be opened or closed.
+     */
+    @Test
+    public void testBrokenCSV() throws IOException {
+        String brokenFile = FIXTURE_DIR + "BrokenCSV.csv";
+        try (BufferedInputStream stream = new BufferedInputStream(
+                new FileInputStream(brokenFile))) {
+            // Inner try: only an IOException from read() is the expected one.
+            try {
+                new CSVFileReader(new CSVFileReaderSpi()).read(stream, null);
+                fail("IOException was not thrown when columns do not align.");
+            } catch (IOException ex) {
+                String expMessage = "Reading mismatch, line 5 of the Data file: 6 delimited values expected, 4 found.";
+                assertEquals(expMessage, ex.getMessage());
+            }
+        }
+    }
+}
diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/InjestCSV.csv b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/InjestCSV.csv
new file mode 100644
index 00000000000..d67a34644a5
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/InjestCSV.csv
@@ -0,0 +1,9 @@
+ints,Strings,Times,Not quite Times,Dates,Not quite Dates,Numbers,Not quite Ints,Not quite Numbers,"Column that hates you and is so long that things might start breaking because we previously had a 255 character limit on length for things even when that might not be completely justified. Wow this is an increadibly long header name. Who made this? Oh, that's right, I did."
+-199,hello,2013-04-08 13:14:23,2013-04-08 13:14:23,2017-06-20,2017/06/20,null,1,2,823478788778713 +2,Sdfwer,2013-04-08 13:14:23,2013-04-08 13:14:23,2017-06-20,1100/06/20,INF,2,NaN,",1,2,3" +0,cjlajfo.,2013-04-08 13:14:23,2013-04-08 13:14:23,2017-06-20,3000/06/20,-inf,3,inf,\casdf +-1,Mywer,2013-04-08 13:14:23,2013-04-08 13:14:23,2017-06-20,06-20-2011,3.141592653,4,4.8," "" " +266128,Sf,2013-04-08 13:14:23,2013-04-08 13:14:23,2017-06-20,06-20-1917,0,5,Inf+11, +null,werxc,2013-04-08 13:14:23,2013-04-08 13:14:23,2017-06-20,03/03/1817,123,6.000001,11-2,"""adf\0\na\td +sf""" +-2389,Dfjl,2013-04-08 13:14:23,2013-04-08 13:14:72,2017-06-20,2017-03-12,nan,2,nap,💩⌛👩🏻■