diff --git a/flatfilers/sandbox/src/index.ts b/flatfilers/sandbox/src/index.ts index 7a170cad0..8ed867929 100644 --- a/flatfilers/sandbox/src/index.ts +++ b/flatfilers/sandbox/src/index.ts @@ -1,46 +1,6 @@ import type { FlatfileListener } from '@flatfile/listener' -import { summarize } from '@flatfile/plugin-enrich-summarize' -import { configureSpace } from '@flatfile/plugin-space-configure' +import { MarkdownExtractor } from '@flatfile/plugin-markdown-extractor' export default async function (listener: FlatfileListener) { - listener.use( - summarize({ - sheetSlug: 'summarization', - contentField: 'content', - summaryField: 'summary', - keyPhrasesField: 'keyPhrases', - }) - ) - listener.use( - configureSpace({ - workbooks: [ - { - name: 'Sandbox', - sheets: [ - { - name: 'Summarization', - slug: 'summarization', - fields: [ - { - key: 'content', - type: 'string', - label: 'Content', - }, - { - key: 'summary', - type: 'string', - label: 'Summary', - }, - { - key: 'keyPhrases', - type: 'string', - label: 'Key Phrases', - }, - ], - }, - ], - }, - ], - }) - ) + listener.use(MarkdownExtractor()) } diff --git a/package-lock.json b/package-lock.json index 9b0855032..c79d62e16 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3185,6 +3185,10 @@ "resolved": "plugins/json-extractor", "link": true }, + "node_modules/@flatfile/plugin-markdown-extractor": { + "resolved": "plugins/markdown-extractor", + "link": true + }, "node_modules/@flatfile/plugin-pdf-extractor": { "resolved": "plugins/pdf-extractor", "link": true @@ -20185,6 +20189,7 @@ "version": "0.20.2", "resolved": "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz", "integrity": "sha512-+nKZ39+nvK7Qq6i0PvWWRA4j/EkfWOtkP/YhMtupm+lJIiHxUrgTr1CcKv1nBk1rHtkRRQ3O2+Ih/q/sA+FXZA==", + "license": "Apache-2.0", "bin": { "xlsx": "bin/xlsx.njs" }, @@ -20517,6 +20522,20 @@ "@flatfile/listener": "^1.1.0" } }, + "plugins/markdown-extractor": { + "name": "@flatfile/plugin-markdown-extractor", + "version": "0.0.1", + 
"license": "ISC", + "dependencies": { + "@flatfile/util-extractor": "^2.1.2" + }, + "devDependencies": { + "@flatfile/rollup-config": "0.1.1" + }, + "engines": { + "node": ">= 16" + } + }, "plugins/merge-connection": { "name": "@flatfile/plugin-connect-via-merge", "version": "0.4.1", diff --git a/plugins/markdown-extractor/README.md b/plugins/markdown-extractor/README.md new file mode 100644 index 000000000..f9c5281b8 --- /dev/null +++ b/plugins/markdown-extractor/README.md @@ -0,0 +1,69 @@ + + +The `@flatfile/plugin-markdown-extractor` plugin parses Markdown files and extracts tabular data, creating sheets in Flatfile for each table found. + +**Event Type:** +`listener.on('file:created')` + +**Supported file types:** +`.md` + + + +> When embedding Flatfile, this plugin should be deployed in a server-side listener. [Learn more](/docs/orchestration/listeners#listener-types) + +## Parameters + + + +#### `options.maxTables` - `default: Infinity` - `number` - (optional) +The `maxTables` parameter allows you to limit the number of tables extracted from a single Markdown file. + +#### `options.errorHandling` - `default: "lenient"` - `"strict" | "lenient"` - (optional) +The `errorHandling` parameter determines how the plugin handles parsing errors. In 'strict' mode, it throws errors, while in 'lenient' mode, it logs warnings and skips problematic tables. + +#### `options.debug` - `default: false` - `boolean` - (optional) +The `debug` parameter enables additional logging for troubleshooting. + +## Usage + +Listen for a Markdown file to be uploaded to Flatfile. The platform will then extract the file automatically. Once complete, the file will be ready for import in the Files area. 
+ +```bash Install +npm i @flatfile/plugin-markdown-extractor +``` + +```js import +import { MarkdownExtractor } from "@flatfile/plugin-markdown-extractor"; +``` + +```js listener.js +listener.use(MarkdownExtractor()); +``` + +### Full Example + +In this example, the `MarkdownExtractor` is initialized with custom options, and then registered as middleware with the Flatfile listener. When a Markdown file is uploaded, the plugin will extract the tabular data and process it using the extractor's parser. + +```javascript +import { MarkdownExtractor } from "@flatfile/plugin-markdown-extractor"; + +export default async function (listener) { + // Define optional options for the extractor + const options = { + maxTables: 5, + errorHandling: 'strict', + debug: true + }; + + // Initialize the Markdown extractor + const markdownExtractor = MarkdownExtractor(options); + + // Register the extractor as a middleware for the Flatfile listener + listener.use(markdownExtractor); + + // When a Markdown file is uploaded, the tabular data will be extracted and processed using the extractor's parser. +} +``` + +This plugin will create a new sheet for each table found in the Markdown file, with the table headers becoming field names and the rows becoming records. 
diff --git a/plugins/markdown-extractor/jest.config.js b/plugins/markdown-extractor/jest.config.js new file mode 100644 index 000000000..e6d7ca40b --- /dev/null +++ b/plugins/markdown-extractor/jest.config.js @@ -0,0 +1,16 @@ +module.exports = { + testEnvironment: 'node', + + transform: { + '^.+\\.tsx?$': 'ts-jest', + }, + setupFiles: ['../../test/dotenv-config.js'], + setupFilesAfterEnv: [ + '../../test/betterConsoleLog.js', + '../../test/unit.cleanup.js', + ], + testTimeout: 60_000, + globalSetup: '../../test/setup-global.js', + forceExit: true, + passWithNoTests: true, +} diff --git a/plugins/markdown-extractor/package.json b/plugins/markdown-extractor/package.json new file mode 100644 index 000000000..c109d2f1e --- /dev/null +++ b/plugins/markdown-extractor/package.json @@ -0,0 +1,62 @@ +{ + "name": "@flatfile/plugin-markdown-extractor", + "version": "0.0.1", + "url": "https://github.com/FlatFilers/flatfile-plugins/tree/main/plugins/markdown-extractor", + "description": "A plugin for parsing markdown files in Flatfile.", + "registryMetadata": { + "category": "extractors" + }, + "engines": { + "node": ">= 16" + }, + "type": "module", + "browser": { + "./dist/index.cjs": "./dist/index.browser.cjs", + "./dist/index.mjs": "./dist/index.browser.mjs" + }, + "exports": { + "types": "./dist/index.d.ts", + "node": { + "import": "./dist/index.mjs", + "require": "./dist/index.cjs" + }, + "browser": { + "require": "./dist/index.browser.cjs", + "import": "./dist/index.browser.mjs" + }, + "default": "./dist/index.mjs" + }, + "main": "./dist/index.cjs", + "module": "./dist/index.mjs", + "source": "./src/index.ts", + "types": "./dist/index.d.ts", + "files": [ + "dist/**" + ], + "scripts": { + "build": "rollup -c", + "build:watch": "rollup -c --watch", + "build:prod": "NODE_ENV=production rollup -c", + "check": "tsc ./**/*.ts --noEmit --esModuleInterop", + "test": "jest src/*.spec.ts --detectOpenHandles", + "test:unit": "jest src/*.spec.ts 
--testPathIgnorePatterns=.*\\.e2e\\.spec\\.ts$ --detectOpenHandles", + "test:e2e": "jest src/*.e2e.spec.ts --detectOpenHandles" + }, + "keywords": [ + "flatfile-plugins", + "category-extractors" + ], + "author": "FlatFilers", + "repository": { + "type": "git", + "url": "https://github.com/FlatFilers/flatfile-plugins.git", + "directory": "plugins/markdown-extractor" + }, + "license": "ISC", + "dependencies": { + "@flatfile/util-extractor": "^2.1.2" + }, + "devDependencies": { + "@flatfile/rollup-config": "0.1.1" + } +} diff --git a/plugins/markdown-extractor/rollup.config.mjs b/plugins/markdown-extractor/rollup.config.mjs new file mode 100644 index 000000000..fafa813c6 --- /dev/null +++ b/plugins/markdown-extractor/rollup.config.mjs @@ -0,0 +1,5 @@ +import { buildConfig } from '@flatfile/rollup-config' + +const config = buildConfig({}) + +export default config diff --git a/plugins/markdown-extractor/samples/complex_table.md b/plugins/markdown-extractor/samples/complex_table.md new file mode 100644 index 000000000..3d34d8f01 --- /dev/null +++ b/plugins/markdown-extractor/samples/complex_table.md @@ -0,0 +1,21 @@ +# Complex Table Example + +This Markdown file contains a more complex table with various data types and potential parsing challenges. 
+ +| Product | Price | Stock | Last Updated | Features | On Sale | +|---------|-------|-------|--------------|----------|--------| +| Laptop | $999.99 | 50 | 2023-05-01 | 15" screen, 16GB RAM | true | +| Smartphone | $599.99 | 100 | 2023-05-02 | 6.5" display, 128GB storage | false | +| Tablet | $399.99 | 75 | 2023-05-03 | 10" screen, 64GB storage | true | +| Headphones | $149.99 | 200 | 2023-05-04 | Noise-cancelling, Bluetooth 5.0 | false | +| Smart Watch | $249.99 | 30 | 2023-05-05 | Heart rate monitor, GPS | true | +| External SSD | $89.99 | 150 | 2023-05-06 | 1TB, USB 3.1 | false | + +This table includes: +- Currency values +- Integers +- Dates +- Booleans +- Strings with commas + +It should test the parser's ability to handle various data types and potential edge cases. \ No newline at end of file diff --git a/plugins/markdown-extractor/samples/lenient_tables.md b/plugins/markdown-extractor/samples/lenient_tables.md new file mode 100644 index 000000000..29214e77d --- /dev/null +++ b/plugins/markdown-extractor/samples/lenient_tables.md @@ -0,0 +1,28 @@ +# Lenient Tables Example + +This Markdown file contains multiple tables with mismatched column counts. + +## Table 1: Employees + +| ID | Name | Department | +|----|------|------------| +| 1 | John Doe | HR | +| 2 | Jane Smith | +| 3 | Mike Johnson | Finance | + +## Table 2: Projects + +| Project Name | Start Date | End Date | +|--------------|------------|----------| +| Website Redesign | 2023-01-01 | 2023-06-30 | +| Mobile App | 2023-03-15 | 2023-12-31 | extra column | + +## Table 3: Budget + +| Category | Q1 | Q2 | Q3 | Q4 | +|----------|----|----|----|----|----| +| Marketing | $10,000 | $15,000 | $20,000 | $25,000 | +| R&D | $50,000 | $60,000 | $70,000 | $80,000 | extra column | +| Operations | $100,000 | $110,000 | $120,000 | $130,000 | + +End of the file. 
\ No newline at end of file diff --git a/plugins/markdown-extractor/samples/multiple_tables.md b/plugins/markdown-extractor/samples/multiple_tables.md new file mode 100644 index 000000000..1e9c618cd --- /dev/null +++ b/plugins/markdown-extractor/samples/multiple_tables.md @@ -0,0 +1,28 @@ +# Multiple Tables Example + +This Markdown file contains multiple tables. + +## Table 1: Employees + +| ID | Name | Department | +|----|------|------------| +| 1 | John Doe | HR | +| 2 | Jane Smith | IT | +| 3 | Mike Johnson | Finance | + +## Table 2: Projects + +| Project Name | Start Date | End Date | +|--------------|------------|----------| +| Website Redesign | 2023-01-01 | 2023-06-30 | +| Mobile App | 2023-03-15 | 2023-12-31 | + +## Table 3: Budget + +| Category | Q1 | Q2 | Q3 | Q4 | +|----------|----|----|----|----| +| Marketing | $10,000 | $15,000 | $20,000 | $25,000 | +| R&D | $50,000 | $60,000 | $70,000 | $80,000 | +| Operations | $100,000 | $110,000 | $120,000 | $130,000 | + +End of the file. \ No newline at end of file diff --git a/plugins/markdown-extractor/samples/simple_table.md b/plugins/markdown-extractor/samples/simple_table.md new file mode 100644 index 000000000..09f265d9c --- /dev/null +++ b/plugins/markdown-extractor/samples/simple_table.md @@ -0,0 +1,11 @@ +# Simple Table Example + +This is a simple Markdown file with a single table. + +| Name | Age | City | +|------|-----|------| +| John | 30 | New York | +| Alice | 25 | London | +| Bob | 35 | Paris | + +End of the file. 
\ No newline at end of file diff --git a/plugins/markdown-extractor/src/index.ts b/plugins/markdown-extractor/src/index.ts new file mode 100644 index 000000000..5d4393fb5 --- /dev/null +++ b/plugins/markdown-extractor/src/index.ts @@ -0,0 +1,14 @@ +import { Extractor } from '@flatfile/util-extractor' +import { parseBuffer } from './parser' + +export interface MarkdownExtractorOptions { + maxTables?: number + errorHandling?: 'strict' | 'lenient' + debug?: boolean +} + +export const MarkdownExtractor = (options: MarkdownExtractorOptions = {}) => { + return Extractor('.md', 'markdown', parseBuffer, options) +} + +export const markdownParser = parseBuffer diff --git a/plugins/markdown-extractor/src/parser.spec.ts b/plugins/markdown-extractor/src/parser.spec.ts new file mode 100644 index 000000000..44a9fd7a7 --- /dev/null +++ b/plugins/markdown-extractor/src/parser.spec.ts @@ -0,0 +1,59 @@ +import * as fs from 'fs' +import * as path from 'path' +import { parseBuffer } from './parser' +import { MarkdownExtractorOptions } from './index' + +describe('Markdown Extractor Parser', () => { + const testCases = [ + { file: 'simple_table.md', expectedTables: 1 }, + { file: 'multiple_tables.md', expectedTables: 3 }, + { file: 'complex_table.md', expectedTables: 1 }, + ] + + testCases.forEach(({ file, expectedTables }) => { + test(`should correctly parse ${file}`, () => { + const filePath = path.join(__dirname, '..', 'samples', file) + const buffer = fs.readFileSync(filePath) + const options: MarkdownExtractorOptions = { + maxTables: Infinity, + } + + const result = parseBuffer(buffer, options) + + expect(Object.keys(result).length).toBe(expectedTables) + + // Additional checks for each table + Object.values(result).forEach((table) => { + expect(table.headers).toBeDefined() + expect(table.headers.length).toBeGreaterThan(0) + expect(table.data).toBeDefined() + expect(table.data.length).toBeGreaterThan(0) + }) + }) + }) + + test('should respect maxTables option', () => { + const 
filePath = path.join(__dirname, '..', 'samples', 'multiple_tables.md') + const buffer = fs.readFileSync(filePath) + const options: MarkdownExtractorOptions = { + maxTables: 2, + } + + const result = parseBuffer(buffer, options) + + expect(Object.keys(result).length).toBe(2) + }) + + test('should handle errorHandling option', () => { + const filePath = path.join(__dirname, '..', 'samples', 'lenient_tables.md') + const buffer = fs.readFileSync(filePath) + + const strictOptions: MarkdownExtractorOptions = { errorHandling: 'strict' } + expect(() => parseBuffer(buffer, strictOptions)).toThrow() + + const lenientOptions: MarkdownExtractorOptions = { + errorHandling: 'lenient', + } + expect(() => parseBuffer(buffer, lenientOptions)).not.toThrow() + }) +}) diff --git a/plugins/markdown-extractor/src/parser.ts b/plugins/markdown-extractor/src/parser.ts new file mode 100644 index 000000000..a72d2c60f --- /dev/null +++ b/plugins/markdown-extractor/src/parser.ts @@ -0,0 +1,132 @@ +import type { SheetCapture, WorkbookCapture } from '@flatfile/util-extractor' +import type { MarkdownExtractorOptions } from '.' 
/** One table lifted out of a Markdown document: column names plus string cells. */
type MarkdownTable = { headers: string[]; rows: string[][] }

/**
 * Matches a (trimmed) delimiter row such as `|---|:--:|` or `| --- | ---: |`.
 * Whitespace around the dashes is allowed so that conventionally formatted
 * tables are recognised as well as compact ones.
 */
const TABLE_SEPARATOR_ROW = /^\|(?:\s*:?-+:?\s*\|)+$/

/**
 * Converts a Markdown file into a workbook capture with one sheet per table.
 *
 * Sheets are named `Table_1`, `Table_2`, ... in document order. Every record
 * carries one `{ value }` entry per header; cells missing from a short row are
 * stored as empty strings.
 *
 * @param buffer - Raw file contents, decoded as UTF-8.
 * @param options - Extraction options; see {@link extractTablesFromMarkdown}.
 * @returns A map of sheet name to captured headers and records.
 * @throws Error when `options.errorHandling` is `'strict'` and a table is malformed.
 */
export function parseBuffer(
  buffer: Buffer,
  options: MarkdownExtractorOptions = {}
): WorkbookCapture {
  const tables = extractTablesFromMarkdown(buffer.toString('utf-8'), options)
  const sheets: Record<string, SheetCapture> = {}

  tables.forEach((table, tableIndex) => {
    sheets[`Table_${tableIndex + 1}`] = {
      headers: table.headers,
      data: table.rows.map((row) => {
        const record: Record<string, { value: string }> = {}
        // Iterate headers (not cells) so a record can never gain a key that
        // is not a declared header, and never lacks one that is.
        table.headers.forEach((header, columnIndex) => {
          record[header] = { value: row[columnIndex] ?? '' }
        })
        return record
      }),
    }
  })

  return sheets
}

/**
 * Scans Markdown text line by line and returns every pipe table it finds.
 *
 * A table is a row line (starts with `|`), immediately followed by a delimiter
 * row, followed by at least one more row line. The table ends at the first
 * line that is not a row, so a blank line always separates two tables, and a
 * final row without a trailing newline is still captured.
 *
 * Cells are kept positionally: an empty cell stays an empty string in its own
 * column. Rows whose cells are all empty are discarded. Columns whose header
 * cell is blank have no name to key on and are omitted from the result.
 *
 * @param content - Markdown source text.
 * @param options - `maxTables` caps how many tables are returned (unset means
 *   no cap; `0` returns none). `errorHandling: 'strict'` rethrows any table
 *   error and rejects rows whose cell count differs from the header's; any
 *   other value logs a warning, skips the offending table, and pads or
 *   truncates rows to the header width. `debug` enables console tracing.
 * @returns Tables in document order; every row has exactly `headers.length` cells.
 * @throws Error in strict mode when a table has no named header or a row has
 *   the wrong number of cells.
 */
export function extractTablesFromMarkdown(
  content: string,
  options: MarkdownExtractorOptions = {}
): Array<{ headers: string[]; rows: string[][] }> {
  const tables: MarkdownTable[] = []
  const strict = options.errorHandling === 'strict'
  // `??` rather than a truthiness test so an explicit limit of 0 is honoured.
  const maxTables = options.maxTables ?? Infinity
  const lines = content.split(/\r\n|\r|\n/)

  if (options.debug) {
    console.log('Content to parse:', content)
  }

  let cursor = 0
  while (cursor + 1 < lines.length) {
    const startsTable =
      isTableRow(lines[cursor]) &&
      TABLE_SEPARATOR_ROW.test(lines[cursor + 1].trim())
    if (!startsTable) {
      cursor += 1
      continue
    }

    // Consume the contiguous run of row lines that follows the delimiter row.
    let end = cursor + 2
    while (end < lines.length && isTableRow(lines[end])) {
      end += 1
    }
    if (end === cursor + 2) {
      // Header and delimiter with no data rows: not treated as a table.
      cursor = end
      continue
    }

    if (tables.length >= maxTables) {
      if (options.debug) {
        console.log(
          `Max tables (${options.maxTables}) reached. Stopping extraction.`
        )
      }
      break
    }

    const headerLine = lines[cursor]
    const bodyLines = lines.slice(cursor + 2, end)
    // Advance before parsing so a skipped (lenient) table is not rescanned.
    cursor = end

    try {
      if (options.debug) {
        console.log(
          'Found potential table:',
          [headerLine, ...bodyLines].join('\n')
        )
      }

      const table = buildTable(headerLine, bodyLines, strict)

      if (options.debug) {
        console.log('Parsed headers:', table.headers)
        console.log('Normalized data rows:', table.rows)
      }

      tables.push(table)

      if (options.debug) {
        console.log(
          'Successfully added table. Current table count:',
          tables.length
        )
      }
    } catch (error: unknown) {
      if (strict) {
        throw error
      }
      const message = error instanceof Error ? error.message : String(error)
      console.warn('Error parsing table:', message)
    }
  }

  if (options.debug) {
    console.log(`Extraction complete. Found ${tables.length} tables.`)
  }

  return tables
}

/** True when the line, ignoring surrounding whitespace, starts with `|` and has further content. */
function isTableRow(line: string): boolean {
  const trimmed = line.trim()
  return trimmed.length > 1 && trimmed.startsWith('|')
}

/** Splits a row line into trimmed cells, keeping empty cells in position. */
function splitTableRow(line: string): string[] {
  let inner = line.trim()
  // Only the outer pipes are structural; stripping them first means an empty
  // first or last cell is not confused with the row's border.
  if (inner.startsWith('|')) {
    inner = inner.slice(1)
  }
  if (inner.endsWith('|')) {
    inner = inner.slice(0, -1)
  }
  return inner.split('|').map((cell) => cell.trim())
}

/** Builds one table from its header line and data lines, aligning every row to the named header columns. */
function buildTable(
  headerLine: string,
  bodyLines: string[],
  strict: boolean
): MarkdownTable {
  const headerCells = splitTableRow(headerLine)
  const namedColumns = headerCells
    .map((name, index) => ({ name, index }))
    .filter((column) => column.name !== '')

  if (namedColumns.length === 0) {
    throw new Error('No headers found in table')
  }

  const rows: string[][] = []
  for (const line of bodyLines) {
    const cells = splitTableRow(line)
    if (cells.every((cell) => cell === '')) {
      continue // Filter out empty rows
    }
    if (strict && cells.length !== headerCells.length) {
      throw new Error('Data row length does not match header row length')
    }
    // Pick each named column by position: short rows are padded with '',
    // cells beyond the header width are dropped.
    rows.push(namedColumns.map(({ index }) => cells[index] ?? ''))
  }

  return { headers: namedColumns.map((column) => column.name), rows }
}