diff --git a/.changeset/rotten-avocados-play.md b/.changeset/rotten-avocados-play.md new file mode 100644 index 000000000..4ca7b0798 --- /dev/null +++ b/.changeset/rotten-avocados-play.md @@ -0,0 +1,6 @@ +--- +'@flatfile/plugin-delimiter-extractor': patch +'@flatfile/plugin-xlsx-extractor': patch +--- + +Update to documentation diff --git a/plugins/delimiter-extractor/ref/csv.txt b/plugins/delimiter-extractor/ref/test-empty-lines.txt similarity index 94% rename from plugins/delimiter-extractor/ref/csv.txt rename to plugins/delimiter-extractor/ref/test-empty-lines.txt index 871f047ec..179df7855 100644 --- a/plugins/delimiter-extractor/ref/csv.txt +++ b/plugins/delimiter-extractor/ref/test-empty-lines.txt @@ -1,6 +1,6 @@ header1,header2,header3 column1,column2,column3 -,, + , , ,, column4,column5,column6 column7,column8,column9 diff --git a/plugins/delimiter-extractor/src/parser.spec.ts b/plugins/delimiter-extractor/src/parser.spec.ts index b09a50190..6f4c5b0c6 100644 --- a/plugins/delimiter-extractor/src/parser.spec.ts +++ b/plugins/delimiter-extractor/src/parser.spec.ts @@ -11,8 +11,8 @@ describe('parser', () => { path.join(__dirname, '../ref/test-complex.pound') ) - const csvBuffer: Buffer = fs.readFileSync( - path.join(__dirname, '../ref/csv.txt') + const emptyLinesBuffer: Buffer = fs.readFileSync( + path.join(__dirname, '../ref/test-empty-lines.txt') ) test('colon to WorkbookCapture', async () => { @@ -139,12 +139,41 @@ describe('parser', () => { parseBuffer(colonBasicBuffer, { delimiter: '#' }) ) }) - test('skip empty lines', async () => { - const parsedBuffer = await parseBuffer(csvBuffer, { + test('skip empty lines: true', async () => { + const parsedBuffer = await parseBuffer(emptyLinesBuffer, { delimiter: ',', skipEmptyLines: true, }) const data = parsedBuffer.Sheet1.data + expect(data).toEqual([ + { + header1: { value: 'column1' }, + header2: { value: 'column2' }, + header3: { value: 'column3' }, + }, + { + header1: { value: ' ' }, + header2: { value: ' ' }, + header3: { value: ' ' }, + }, + { + header1: { value: 'column4' }, + header2: { value: 'column5' }, + header3: { value: 'column6' }, + }, + { + header1: { value: 'column7' }, + header2: { value: 'column8' }, + header3: { value: 'column9' }, + }, + ]) + }) + test('skip empty lines: greedy', async () => { + const parsedBuffer = await parseBuffer(emptyLinesBuffer, { + delimiter: ',', + skipEmptyLines: 'greedy', + }) + const data = parsedBuffer.Sheet1.data expect(data).toEqual([ { header1: { value: 'column1' }, @@ -163,8 +192,8 @@ describe('parser', () => { }, ]) }) - test("don'tskip empty lines", async () => { - const parsedBuffer = await parseBuffer(csvBuffer, { + test('skip empty lines: false', async () => { + const parsedBuffer = await parseBuffer(emptyLinesBuffer, { delimiter: ',', skipEmptyLines: false, }) @@ -176,9 +205,9 @@ describe('parser', () => { header3: { value: 'column3' }, }, { - header1: { value: '' }, - header2: { value: '' }, - header3: { value: '' }, + header1: { value: ' ' }, + header2: { value: ' ' }, + header3: { value: ' ' }, }, { header1: { value: '' }, diff --git a/plugins/delimiter-extractor/src/parser.ts b/plugins/delimiter-extractor/src/parser.ts index 206f00017..57f6812ce 100644 --- a/plugins/delimiter-extractor/src/parser.ts +++ b/plugins/delimiter-extractor/src/parser.ts @@ -17,9 +17,7 @@ export async function parseBuffer( try { const skipEmptyLines = options?.headerSelectionEnabled ? false - : options?.skipEmptyLines === false - ? false - : 'greedy' + : options?.skipEmptyLines ?? false const fileContents = buffer.toString('utf8') const results: ParseResult> = Papa.parse( fileContents, @@ -76,12 +74,22 @@ export async function parseBuffer( const headers = prependNonUniqueHeaderColumns(columnHeaders) - const data: Flatfile.RecordData[] = rows.map((row) => { - const mappedRow = mapKeys(row, (key) => headers[key]) - return mapValues(mappedRow, (value) => ({ - value: transform(value), - })) as Flatfile.RecordData - }) + const data: Flatfile.RecordData[] = rows + .filter((row) => { + if (!skipEmptyLines) return true + const isEmpty = Object.values(row).every( + skipEmptyLines === 'greedy' + ? isNullOrWhitespace + : (value) => value === '' + ) + return !isEmpty + }) + .map((row) => { + const mappedRow = mapKeys(row, (key) => headers[key]) + return mapValues(mappedRow, (value) => ({ + value: transform(value), + })) as Flatfile.RecordData + }) let metadata: { rowHeaders: number[] } | null diff --git a/plugins/xlsx-extractor/README.md b/plugins/xlsx-extractor/README.md index b611e6d2f..2b2524f3f 100644 --- a/plugins/xlsx-extractor/README.md +++ b/plugins/xlsx-extractor/README.md @@ -51,6 +51,10 @@ detecting headers in the file. By default, the first 10 rows are scanned for the row with the most non-empty cells. +#### `skipEmptyLines` - `default: "false"` - `boolean` - (optional) +The `skipEmptyLines` parameter allows you to specify if empty lines should be +skipped. By default, empty lines are included. + #### `debug` - `default: "false"` - `boolean` - (optional) The `debug` parameter lets you toggle on/off helpful debugging messages for development purposes. diff --git a/plugins/xlsx-extractor/src/index.ts b/plugins/xlsx-extractor/src/index.ts index 5746cfd07..986d0f26f 100644 --- a/plugins/xlsx-extractor/src/index.ts +++ b/plugins/xlsx-extractor/src/index.ts @@ -11,6 +11,7 @@ import { parseBuffer } from './parser' * @property {number} chunkSize - the size of chunk to process when inserting records. * @property {number} parallel - the quantity of parallel process when inserting records. * @property {GetHeadersOptions} headerDetectionOptions - the options for header detection. + * @property {boolean} skipEmptyLines - if true, skip empty lines; if false, include empty lines. * @property {boolean} debug - if true, display helpful console logs. */ export interface ExcelExtractorOptions {