From ead7b168421eb62dbbc8a3e3e85eb1531d597261 Mon Sep 17 00:00:00 2001 From: Dylan Phelan Date: Mon, 6 Dec 2021 09:35:14 -0500 Subject: [PATCH 1/5] can read from CSV's with empty lines and empty values --- src/modules/CSVFileModule.js | 2 ++ test/modules/CSVFileModule.test.js | 21 ++++++++++++++++++- .../fixtures/example-csv-empty-line.csv | 4 ++++ .../fixtures/example-csv-empty-values.csv | 4 ++++ 4 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 test/modules/fixtures/example-csv-empty-line.csv create mode 100644 test/modules/fixtures/example-csv-empty-values.csv diff --git a/src/modules/CSVFileModule.js b/src/modules/CSVFileModule.js index d098fb32..41eed7f7 100644 --- a/src/modules/CSVFileModule.js +++ b/src/modules/CSVFileModule.js @@ -11,6 +11,8 @@ class CSVFileModule { const parsedData = parse(fs.readFileSync(csvFilePath), { columns: (header) => header.map((column) => stringNormalizer(column)), bom: true, + skip_empty_lines: true, + skip_records_with_empty_values: true, }); this.filePath = csvFilePath; diff --git a/test/modules/CSVFileModule.test.js b/test/modules/CSVFileModule.test.js index 7e62b287..91773f8c 100644 --- a/test/modules/CSVFileModule.test.js +++ b/test/modules/CSVFileModule.test.js @@ -4,7 +4,6 @@ const exampleResponse = require('./fixtures/csv-response.json'); const INVALID_MRN = 'INVALID MRN'; const csvFileModule = new CSVFileModule(path.join(__dirname, './fixtures/example-csv.csv')); -const csvFileModuleWithBOMs = new CSVFileModule(path.join(__dirname, './fixtures/example-csv-bom.csv')); describe('CSVFileModule', () => { @@ -15,10 +14,30 @@ describe('CSVFileModule', () => { }); test('Reads data from CSV with a Byte Order Mark', async () => { + const csvFileModuleWithBOMs = new CSVFileModule( + path.join(__dirname, './fixtures/example-csv-empty-values.csv'), + ); + const data = await csvFileModuleWithBOMs.get('mrn', 'example-mrn-1'); expect(data).toEqual(exampleResponse); }); + test('Reads data from CSV with Empty Values', 
async () => { + const csvFileModuleWithEmptyValues = new CSVFileModule( + path.join(__dirname, './fixtures/example-csv-empty-values.csv'), + ); + const data = await csvFileModuleWithEmptyValues.get('mrn', 'example-mrn-1'); + expect(data).toEqual(exampleResponse); + }); + + test('Reads data from CSV with Empty Lines', async () => { + const csvFileModuleWithEmptyLines = new CSVFileModule( + path.join(__dirname, './fixtures/example-csv-empty-line.csv'), + ); + const data = await csvFileModuleWithEmptyLines.get('mrn', 'example-mrn-1'); + expect(data).toEqual(exampleResponse); + }); + test('Returns multiple rows', async () => { const data = await csvFileModule.get('mrn', 'example-mrn-2'); expect(data).toHaveLength(2); diff --git a/test/modules/fixtures/example-csv-empty-line.csv b/test/modules/fixtures/example-csv-empty-line.csv new file mode 100644 index 00000000..d2613bfe --- /dev/null +++ b/test/modules/fixtures/example-csv-empty-line.csv @@ -0,0 +1,4 @@ +mrn,trialSubjectID,enrollmentStatus,trialResearchID,trialStatus,dateRecorded +example-mrn-1,subjectId-1,status-1,researchId-1,trialStatus-1,2020-01-10 + +example-mrn-2,subjectId-3,status-3,researchId-3,trialStatus-3,2020-06-10 \ No newline at end of file diff --git a/test/modules/fixtures/example-csv-empty-values.csv b/test/modules/fixtures/example-csv-empty-values.csv new file mode 100644 index 00000000..a2869857 --- /dev/null +++ b/test/modules/fixtures/example-csv-empty-values.csv @@ -0,0 +1,4 @@ +mrn,trialSubjectID,enrollmentStatus,trialResearchID,trialStatus,dateRecorded +example-mrn-1,subjectId-1,status-1,researchId-1,trialStatus-1,2020-01-10 +, , , \t\t\t,,\t +example-mrn-2,subjectId-3,status-3,researchId-3,trialStatus-3,2020-06-10 From dee841115f394146dd848627b1090d7795b65960 Mon Sep 17 00:00:00 2001 From: Dylan Phelan Date: Mon, 6 Dec 2021 09:35:25 -0500 Subject: [PATCH 2/5] added comments for what the options do --- src/modules/CSVFileModule.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff 
--git a/src/modules/CSVFileModule.js b/src/modules/CSVFileModule.js index 41eed7f7..cae6d86a 100644 --- a/src/modules/CSVFileModule.js +++ b/src/modules/CSVFileModule.js @@ -10,12 +10,15 @@ class CSVFileModule { // Parse then normalize the data const parsedData = parse(fs.readFileSync(csvFilePath), { columns: (header) => header.map((column) => stringNormalizer(column)), + // https://csv.js.org/parse/options/bom/ bom: true, + // https://csv.js.org/parse/options/skip_empty_lines/ skip_empty_lines: true, + // NOTE: This will skip any records with empty values, not just skip the empty values themselves + // https://csv.js.org/parse/options/skip_records_with_empty_values/ skip_records_with_empty_values: true, }); this.filePath = csvFilePath; - this.data = normalizeEmptyValues(parsedData, unalterableColumns); } From 605bda5fe417e56c29321817634971c05dcb4a79 Mon Sep 17 00:00:00 2001 From: Dylan Phelan Date: Wed, 8 Dec 2021 16:48:07 -0500 Subject: [PATCH 3/5] fixed the flag; new tests --- src/modules/CSVFileModule.js | 4 +++- test/modules/CSVFileModule.test.js | 6 ++++++ test/modules/fixtures/example-csv-empty-values.csv | 6 ++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/modules/CSVFileModule.js b/src/modules/CSVFileModule.js index cae6d86a..cced85f6 100644 --- a/src/modules/CSVFileModule.js +++ b/src/modules/CSVFileModule.js @@ -15,9 +15,11 @@ class CSVFileModule { // https://csv.js.org/parse/options/skip_empty_lines/ skip_empty_lines: true, // NOTE: This will skip any records with empty values, not just skip the empty values themselves + // NOTE-2: The name of the flag changed from v4 (what we use) to v5 (what is documented) // https://csv.js.org/parse/options/skip_records_with_empty_values/ - skip_records_with_empty_values: true, + skip_lines_with_empty_values: true, }); + this.filePath = csvFilePath; this.data = normalizeEmptyValues(parsedData, unalterableColumns); } diff --git a/test/modules/CSVFileModule.test.js 
b/test/modules/CSVFileModule.test.js index 91773f8c..8fe84b4d 100644 --- a/test/modules/CSVFileModule.test.js +++ b/test/modules/CSVFileModule.test.js @@ -23,11 +23,17 @@ describe('CSVFileModule', () => { }); test('Reads data from CSV with Empty Values', async () => { + // Five row file, with three rows of empty values + // Should be just two rows of data after ingestion const csvFileModuleWithEmptyValues = new CSVFileModule( path.join(__dirname, './fixtures/example-csv-empty-values.csv'), ); const data = await csvFileModuleWithEmptyValues.get('mrn', 'example-mrn-1'); expect(data).toEqual(exampleResponse); + const data2 = await csvFileModuleWithEmptyValues.get('mrn', 'example-mrn-not-ignored'); + expect(data2).toHaveLength(1); + // Should be just two rows of data after ingestion + expect(csvFileModuleWithEmptyValues.data).toHaveLength(2); }); test('Reads data from CSV with Empty Lines', async () => { diff --git a/test/modules/fixtures/example-csv-empty-values.csv b/test/modules/fixtures/example-csv-empty-values.csv index a2869857..761b3678 100644 --- a/test/modules/fixtures/example-csv-empty-values.csv +++ b/test/modules/fixtures/example-csv-empty-values.csv @@ -1,4 +1,6 @@ mrn,trialSubjectID,enrollmentStatus,trialResearchID,trialStatus,dateRecorded example-mrn-1,subjectId-1,status-1,researchId-1,trialStatus-1,2020-01-10 -, , , \t\t\t,,\t -example-mrn-2,subjectId-3,status-3,researchId-3,trialStatus-3,2020-06-10 +, , , , , +, , , ,, +,,, , , +example-mrn-not-ignored,,,,, \ No newline at end of file From 7a7c387426ebc075523cc4b23883e785d91f1623 Mon Sep 17 00:00:00 2001 From: Dylan Phelan Date: Fri, 10 Dec 2021 09:49:28 -0500 Subject: [PATCH 4/5] single point of truth for csv parsing --- src/helpers/appUtils.js | 7 ++----- src/helpers/csvParsingUtils.js | 21 +++++++++++++++++++++ src/modules/CSVFileModule.js | 16 ++-------------- src/modules/CSVURLModule.js | 8 ++------ 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/helpers/appUtils.js 
b/src/helpers/appUtils.js index f5438cad..7ab38a6c 100644 --- a/src/helpers/appUtils.js +++ b/src/helpers/appUtils.js @@ -1,6 +1,6 @@ const fs = require('fs'); const path = require('path'); -const parse = require('csv-parse/lib/sync'); +const { csvParse } = require('./csvParsingUtils'); /** * Parses a provided CSV with MRN column into string array of IDs @@ -11,10 +11,7 @@ const parse = require('csv-parse/lib/sync'); function parsePatientIds(pathToCSV) { // Parse CSV for list of patient IDs const patientIdsCsvPath = path.resolve(pathToCSV); - const patientIds = parse(fs.readFileSync(patientIdsCsvPath, 'utf8'), { - columns: (header) => header.map((column) => column.toLowerCase()), - bom: true, - }).map((row) => { + const patientIds = csvParse(fs.readFileSync(patientIdsCsvPath, 'utf8')).map((row) => { if (!row.mrn) { throw new Error(`${pathToCSV} has no "mrn" column`); } diff --git a/src/helpers/csvParsingUtils.js b/src/helpers/csvParsingUtils.js index 8670e5f2..ca3b9f0c 100644 --- a/src/helpers/csvParsingUtils.js +++ b/src/helpers/csvParsingUtils.js @@ -1,3 +1,4 @@ +const parse = require('csv-parse/lib/sync'); const logger = require('./logger'); // The standard string normalizer function @@ -38,7 +39,27 @@ function normalizeEmptyValues(data, unalterableColumns = []) { return newData; } +// Default options for CSV parsing +const DEFAULT_OPTIONS = { + columns: (header) => header.map((column) => stringNormalizer(column)), + // https://csv.js.org/parse/options/bom/ + bom: true, + // https://csv.js.org/parse/options/skip_empty_lines/ + skip_empty_lines: true, + // NOTE: This skips whole records whose values are ALL empty (or only whitespace); records with just some empty values are kept + // NOTE-2: The name of the flag changed from v4 (what we use) to v5 (what is documented) + // https://csv.js.org/parse/options/skip_records_with_empty_values/ + skip_lines_with_empty_values: true, +}; + +// Common utility for parsing CSV files +function csvParse(csvData, options = {}) { + return parse(csvData, {
...DEFAULT_OPTIONS, ...options }); +} + + module.exports = { stringNormalizer, normalizeEmptyValues, + csvParse, }; diff --git a/src/modules/CSVFileModule.js b/src/modules/CSVFileModule.js index cced85f6..66f181d6 100644 --- a/src/modules/CSVFileModule.js +++ b/src/modules/CSVFileModule.js @@ -1,25 +1,13 @@ const fs = require('fs'); const moment = require('moment'); -const parse = require('csv-parse/lib/sync'); const logger = require('../helpers/logger'); const { validateCSV } = require('../helpers/csvValidator'); -const { stringNormalizer, normalizeEmptyValues } = require('../helpers/csvParsingUtils'); +const { csvParse, stringNormalizer, normalizeEmptyValues } = require('../helpers/csvParsingUtils'); class CSVFileModule { constructor(csvFilePath, unalterableColumns) { // Parse then normalize the data - const parsedData = parse(fs.readFileSync(csvFilePath), { - columns: (header) => header.map((column) => stringNormalizer(column)), - // https://csv.js.org/parse/options/bom/ - bom: true, - // https://csv.js.org/parse/options/skip_empty_lines/ - skip_empty_lines: true, - // NOTE: This will skip any records with empty values, not just skip the empty values themselves - // NOTE-2: The name of the flag changed from v4 (what we use) to v5 (what is documented) - // https://csv.js.org/parse/options/skip_records_with_empty_values/ - skip_lines_with_empty_values: true, - }); - + const parsedData = csvParse(fs.readFileSync(csvFilePath)); this.filePath = csvFilePath; this.data = normalizeEmptyValues(parsedData, unalterableColumns); } diff --git a/src/modules/CSVURLModule.js b/src/modules/CSVURLModule.js index 8beba716..00afb6aa 100644 --- a/src/modules/CSVURLModule.js +++ b/src/modules/CSVURLModule.js @@ -1,9 +1,8 @@ const axios = require('axios'); const moment = require('moment'); -const parse = require('csv-parse/lib/sync'); const logger = require('../helpers/logger'); const { validateCSV } = require('../helpers/csvValidator'); -const { stringNormalizer, normalizeEmptyValues 
} = require('../helpers/csvParsingUtils'); +const { csvParse, stringNormalizer, normalizeEmptyValues } = require('../helpers/csvParsingUtils'); class CSVURLModule { constructor(url, unalterableColumns) { @@ -25,10 +24,7 @@ class CSVURLModule { }); logger.debug('Web request successful'); // Parse then normalize the data - const parsedData = parse(csvData, { - columns: (header) => header.map((column) => stringNormalizer(column)), - bom: true, - }); + const parsedData = csvParse(csvData); logger.debug('CSV Data parsing successful'); this.data = normalizeEmptyValues(parsedData, this.unalterableColumns); } From cbdaaa9984428fadc1bb32632092313302eea935 Mon Sep 17 00:00:00 2001 From: Dylan Phelan Date: Fri, 10 Dec 2021 10:38:48 -0500 Subject: [PATCH 5/5] Fixed accidental use of empty-values.csv in bom test --- test/modules/CSVFileModule.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/modules/CSVFileModule.test.js b/test/modules/CSVFileModule.test.js index 8fe84b4d..248f7363 100644 --- a/test/modules/CSVFileModule.test.js +++ b/test/modules/CSVFileModule.test.js @@ -15,7 +15,7 @@ describe('CSVFileModule', () => { test('Reads data from CSV with a Byte Order Mark', async () => { const csvFileModuleWithBOMs = new CSVFileModule( - path.join(__dirname, './fixtures/example-csv-empty-values.csv'), + path.join(__dirname, './fixtures/example-csv-bom.csv'), ); const data = await csvFileModuleWithBOMs.get('mrn', 'example-mrn-1');