diff --git a/extract/html-table/README.md b/extract/html-table/README.md
new file mode 100644
index 000000000..fa99e2e36
--- /dev/null
+++ b/extract/html-table/README.md
@@ -0,0 +1,61 @@
+
+
+# @flatfile/plugin-extract-html-table
+
+This plugin provides HTML table extraction capabilities for Flatfile. It parses HTML files and extracts structured data from tables, handling complex layouts and nested tables.
+
+**Event Type:** `listener.on('file:created')`
+
+**Supported File Types:** `.html`
+
+
+
+## Features
+
+- Extracts table structure, including headers and cell data
+- Handles complex table layouts; every table in the document, nested or not, is extracted as its own sheet (`Table_1`, `Table_2`, …)
+- Handles colspan and rowspan attributes (configurable)
+- Supports nested tables up to a configurable depth
+- Converts extracted data into a structured format
+- Provides error handling for malformed HTML or table structures
+- Debug mode for detailed logging
+
+## Parameters
+
+#### `options` - `object` - (optional)
+
+- `handleColspan` - `boolean` - (optional): When true, a cell's value is repeated in every column its `colspan` covers. Default is true.
+- `handleRowspan` - `boolean` - (optional): When true, a cell's value is repeated in every row its `rowspan` covers. Default is true.
+- `maxDepth` - `number` - (optional): Maximum depth for nested tables. Default is 3.
+- `debug` - `boolean` - (optional): Enables debug logging. Default is false.
+
+## API Calls
+
+- `api.files.download`
+- `api.files.update`
+
+## Usage
+
+**install**
+```bash
+npm install @flatfile/plugin-extract-html-table
+```
+
+**import**
+```javascript
+import { HTMLTableExtractor } from '@flatfile/plugin-extract-html-table';
+```
+
+**listener.js**
+```javascript
+const listener = new FlatfileListener();
+
+listener.use(
+ HTMLTableExtractor({
+ handleColspan: true,
+ handleRowspan: true,
+ maxDepth: 3,
+ debug: false
+ })
+);
+```
diff --git a/extract/html-table/jest.config.js b/extract/html-table/jest.config.js
new file mode 100644
index 000000000..e6d7ca40b
--- /dev/null
+++ b/extract/html-table/jest.config.js
@@ -0,0 +1,16 @@
+module.exports = {
+ testEnvironment: 'node',
+
+ transform: {
+ '^.+\\.tsx?$': 'ts-jest', // run .ts/.tsx files through ts-jest
+ },
+ setupFiles: ['../../test/dotenv-config.js'],
+ setupFilesAfterEnv: [
+ '../../test/betterConsoleLog.js',
+ '../../test/unit.cleanup.js',
+ ],
+ testTimeout: 60_000, // per-test timeout in milliseconds (60 s)
+ globalSetup: '../../test/setup-global.js',
+ forceExit: true, // make Jest exit once the run finishes, even with open handles
+ passWithNoTests: true, // an empty test run counts as success
+}
diff --git a/extract/html-table/package.json b/extract/html-table/package.json
new file mode 100644
index 000000000..51a9f238f
--- /dev/null
+++ b/extract/html-table/package.json
@@ -0,0 +1,55 @@
+{
+ "name": "@flatfile/plugin-extract-html-table",
+ "version": "1.0.0",
+ "description": "A Flatfile plugin for extracting table data from HTML files",
+ "main": "./dist/index.cjs",
+ "module": "./dist/index.mjs",
+ "types": "./dist/index.d.ts",
+ "browser": {
+ "./dist/index.cjs": "./dist/index.browser.cjs",
+ "./dist/index.mjs": "./dist/index.browser.mjs"
+ },
+ "exports": {
+ "types": "./dist/index.d.ts",
+ "node": {
+ "import": "./dist/index.mjs",
+ "require": "./dist/index.cjs"
+ },
+ "browser": {
+ "require": "./dist/index.browser.cjs",
+ "import": "./dist/index.browser.mjs"
+ },
+ "default": "./dist/index.mjs"
+ },
+ "source": "./src/index.ts",
+ "files": ["dist/**"],
+ "scripts": {
+ "build": "rollup -c",
+ "build:watch": "rollup -c --watch",
+ "build:prod": "NODE_ENV=production rollup -c",
+ "check": "tsc ./**/*.ts --noEmit --esModuleInterop",
+ "test": "jest src/*.spec.ts --detectOpenHandles",
+ "test:unit": "jest src/*.spec.ts --testPathIgnorePatterns=.*\\.e2e\\.spec\\.ts$ --detectOpenHandles",
+ "test:e2e": "jest src/*.e2e.spec.ts --detectOpenHandles"
+ },
+ "keywords": ["flatfile", "flatfile-plugins", "category-extractors", "html", "table-extractor"],
+ "author": "Flatfile",
+ "license": "ISC",
+ "dependencies": {
+ "@flatfile/util-extractor": "^2.1.5",
+ "node-html-parser": "^6.1.13"
+ },
+ "devDependencies": {
+ "@flatfile/rollup-config": "^0.1.1"
+ },
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/FlatFilers/flatfile-plugins.git",
+ "directory": "extract/html-table"
+ },
+ "browserslist": [
+ "> 0.5%",
+ "last 2 versions",
+ "not dead"
+ ]
+}
\ No newline at end of file
diff --git a/extract/html-table/rollup.config.mjs b/extract/html-table/rollup.config.mjs
new file mode 100644
index 000000000..1e95e60a0
--- /dev/null
+++ b/extract/html-table/rollup.config.mjs
@@ -0,0 +1,5 @@
+import { buildConfig } from '@flatfile/rollup-config';
+
+const config = buildConfig({}); // empty options object: no package-specific overrides are passed
+
+export default config;
\ No newline at end of file
diff --git a/extract/html-table/samples/complex_table.html b/extract/html-table/samples/complex_table.html
new file mode 100644
index 000000000..63b3fdde9
--- /dev/null
+++ b/extract/html-table/samples/complex_table.html
@@ -0,0 +1,51 @@
+
+
+
+
+ Complex Table Example
+
+
+ Quarterly Sales Report
+
+
+ | Product Category |
+ Quarterly Sales |
+ Total |
+
+
+ | Q1 |
+ Q2 |
+ Q3 |
+ Q4 |
+
+
+ | Electronics |
+ $50,000 |
+ $65,000 |
+ $70,000 |
+ $90,000 |
+ $275,000 |
+
+
+ | Clothing |
+ $35,000 |
+ $40,000 |
+ $55,000 |
+ $80,000 |
+ $210,000 |
+
+
+ | Home & Garden |
+ $25,000 |
+ $30,000 |
+ $40,000 |
+ $45,000 |
+ $140,000 |
+
+
+ | Total Sales |
+ $625,000 |
+
+
+
+
\ No newline at end of file
diff --git a/extract/html-table/samples/multiple_tables.html b/extract/html-table/samples/multiple_tables.html
new file mode 100644
index 000000000..cde5a2841
--- /dev/null
+++ b/extract/html-table/samples/multiple_tables.html
@@ -0,0 +1,78 @@
+
+
+
+
+ Multiple Tables Example
+
+
+ Company Data
+
+ Employee Information
+
+
+ | Name |
+ Position |
+ Department |
+
+
+ | John Doe |
+ Software Engineer |
+ IT |
+
+
+ | Jane Smith |
+ Marketing Specialist |
+ Marketing |
+
+
+
+ Department Budget
+
+
+ | Department |
+ Budget |
+ Expenses |
+
+
+ | IT |
+ $500,000 |
+
+
+
+ | Category |
+ Amount |
+
+
+ | Hardware |
+ $200,000 |
+
+
+ | Software |
+ $150,000 |
+
+
+ |
+
+
+ | Marketing |
+ $300,000 |
+
+
+
+ | Category |
+ Amount |
+
+
+ | Advertising |
+ $150,000 |
+
+
+ | Events |
+ $100,000 |
+
+
+ |
+
+
+
+
\ No newline at end of file
diff --git a/extract/html-table/samples/simple_table.html b/extract/html-table/samples/simple_table.html
new file mode 100644
index 000000000..0d263de2c
--- /dev/null
+++ b/extract/html-table/samples/simple_table.html
@@ -0,0 +1,32 @@
+
+
+
+
+ Simple Table Example
+
+
+ Employee Information
+
+
+ | Name |
+ Position |
+ Department |
+
+
+ | John Doe |
+ Software Engineer |
+ IT |
+
+
+ | Jane Smith |
+ Marketing Specialist |
+ Marketing |
+
+
+ | Bob Johnson |
+ HR Manager |
+ Human Resources |
+
+
+
+
\ No newline at end of file
diff --git a/extract/html-table/src/index.ts b/extract/html-table/src/index.ts
new file mode 100644
index 000000000..e02f2a50c
--- /dev/null
+++ b/extract/html-table/src/index.ts
@@ -0,0 +1,15 @@
+import { Extractor } from '@flatfile/util-extractor'
+import { parseBuffer } from './parser'
+
+// Re-export the parser's options type so the two declarations cannot drift apart.
+import type { HTMLTableExtractorOptions } from './parser'
+export type { HTMLTableExtractorOptions }
+/**
+ * Builds the plugin by handing `parseBuffer` to `Extractor` under the `'html'` / `'html-tables'` identifiers.
+ * @param options - Table extraction options, forwarded unchanged to `Extractor`.
+ */
+export const HTMLTableExtractor = (options: HTMLTableExtractorOptions = {}) => {
+  return Extractor('html', 'html-tables', parseBuffer, options)
+}
+/** The buffer parser itself, exported for use without the plugin wrapper. */
+export const htmlTableParser = parseBuffer
diff --git a/extract/html-table/src/parser.ts b/extract/html-table/src/parser.ts
new file mode 100644
index 000000000..8b53d2012
--- /dev/null
+++ b/extract/html-table/src/parser.ts
@@ -0,0 +1,154 @@
+import type { SheetCapture, WorkbookCapture } from '@flatfile/util-extractor'
+import { parse } from 'node-html-parser'
+
+export interface HTMLTableExtractorOptions {
+ handleColspan?: boolean
+ handleRowspan?: boolean
+ maxDepth?: number
+ debug?: boolean
+}
+
+/**
+ * Converts an HTML document into a workbook capture with one sheet per table.
+ * Sheets are named `Table_<n>` in document order; each row becomes a record
+ * keyed by header text, and cells beyond the last header are dropped.
+ * @param buffer - Raw file contents, decoded as UTF-8.
+ * @param options - Extractor options; omitted fields fall back to the defaults set below.
+ * @returns A map of sheet name to captured headers and records.
+ */
+export function parseBuffer(
+  buffer: Buffer,
+  options: HTMLTableExtractorOptions = {}
+): WorkbookCapture {
+  // Resolve defaults up front: a bare truthiness check would read an omitted flag as false.
+  const settings: HTMLTableExtractorOptions = {
+    ...options,
+    handleColspan: options.handleColspan ?? true,
+    handleRowspan: options.handleRowspan ?? true,
+    maxDepth: options.maxDepth ?? 3,
+    debug: options.debug ?? false,
+  }
+  if (settings.debug) console.log('Parsing buffer...')
+  const tables = extractTablesFromHTML(buffer.toString('utf-8'), settings)
+  const sheets: Record<string, SheetCapture> = {}
+  tables.forEach((table, tableIndex) => {
+    const sheetName = `Table_${tableIndex + 1}`
+    const data = table.rows.map((row, rowIndex) => {
+      const record: Record<string, { value: string }> = {}
+      row.forEach((cell, cellIndex) => {
+        if (cellIndex < table.headers.length) {
+          record[table.headers[cellIndex]] = { value: cell }
+        } else if (settings.debug) { // report the row's own position, not the table's
+          console.warn(`Row ${rowIndex + 1} has more cells than headers. Ignoring extra cell:`, cell)
+        }
+      })
+      return record
+    })
+    sheets[sheetName] = { headers: table.headers, data }
+    if (settings.debug) {
+      console.log(`Created sheet: ${sheetName}`)
+      console.log('Headers:', table.headers)
+      console.log('Row count:', data.length)
+    }
+  })
+  if (settings.debug) console.log('Parsing complete. Sheets created:', Object.keys(sheets).length)
+  return sheets
+}
+
+/** Element type returned by `parse`, derived here so no extra import is needed. */
+type TableNode = ReturnType<typeof parse>
+
+// Span limits from the HTML spec; clamping stops hostile attributes exhausting memory.
+const MAX_COLSPAN = 1000
+const MAX_ROWSPAN = 65534
+
+/** Returns the closest ancestor of `node` with the given upper-case tag name, or null. */
+function nearestAncestor(node: TableNode, tagName: string): TableNode | null {
+  for (let current = node.parentNode; current; current = current.parentNode) {
+    if ((current.tagName || '').toUpperCase() === tagName) return current
+  }
+  return null
+}
+
+/** Counts the `<table>` elements enclosing `table` (0 for a top-level table). */
+function nestingDepth(table: TableNode): number {
+  const outer = nearestAncestor(table, 'TABLE')
+  return outer ? 1 + nestingDepth(outer) : 0
+}
+
+/** Reads a span attribute as an integer clamped to [1, max]; missing or invalid values give 1. */
+function readSpan(cell: TableNode, attribute: string, max: number): number {
+  const parsed = parseInt(cell.getAttribute(attribute) || '1', 10)
+  return parsed > 1 ? Math.min(parsed, max) : 1
+}
+
+/**
+ * Extracts every `<table>` in the document as header texts plus data rows.
+ *
+ * Only `th`, `tr` and `td` elements owned directly by a table are read, so a
+ * nested table does not leak cells into its container; it is still returned as
+ * a separate entry unless more than `maxDepth` tables enclose it. Rowspans are
+ * resolved from the cell elements while rows are built, because the attribute
+ * is no longer available once a cell has been reduced to its text.
+ *
+ * @param content - HTML source text.
+ * @param options - `handleColspan` / `handleRowspan` apply unless set to false;
+ *   `maxDepth` defaults to 3; `debug` enables console logging.
+ * @returns One `{ headers, rows }` entry per extracted table, in document order.
+ */
+function extractTablesFromHTML(
+  content: string,
+  options: HTMLTableExtractorOptions
+): Array<{ headers: string[]; rows: string[][] }> {
+  const debug = Boolean(options.debug)
+  const expandColspan = options.handleColspan !== false
+  const expandRowspan = options.handleRowspan !== false
+  const maxDepth = options.maxDepth ?? 3
+  const tables: Array<{ headers: string[]; rows: string[][] }> = []
+  if (debug) console.log('Content to parse:', content)
+
+  parse(content).querySelectorAll('table').forEach((table, tableIndex) => {
+    if (nestingDepth(table) > maxDepth) return
+    if (debug) console.log(`Processing table ${tableIndex + 1}`)
+    const ownHeaders = table.querySelectorAll('th').filter((th) => nearestAncestor(th, 'TABLE') === table)
+    const headers = ownHeaders.map((th) => th.text.trim())
+    if (debug) console.log('Extracted headers:', headers)
+
+    // Column index -> value a rowspan cell still owes to the rows below it.
+    const carried = new Map<number, { value: string; remaining: number }>()
+    const rows: string[][] = []
+    const ownRows = table.querySelectorAll('tr').filter((row) => nearestAncestor(row, 'TABLE') === table)
+
+    ownRows.forEach((row, rowIndex) => {
+      const cells = row.querySelectorAll('td').filter((cell) => nearestAncestor(cell, 'TR') === row)
+      // Rows without data cells (header rows, empty rows) produce no record.
+      if (cells.length === 0) return
+      const rowData: string[] = []
+      // Emits the carried value for each consecutive column a cell above still spans.
+      const fillCarried = () => {
+        for (let span = carried.get(rowData.length); span; span = carried.get(rowData.length)) {
+          span.remaining -= 1
+          if (span.remaining === 0) carried.delete(rowData.length)
+          rowData.push(span.value)
+        }
+      }
+      cells.forEach((cell, cellIndex) => {
+        fillCarried()
+        const cellData = cell.text.trim()
+        const colspan = expandColspan ? readSpan(cell, 'colspan', MAX_COLSPAN) : 1
+        const rowspan = expandRowspan ? readSpan(cell, 'rowspan', MAX_ROWSPAN) : 1
+        for (let i = 0; i < colspan; i++) {
+          if (rowspan > 1) carried.set(rowData.length, { value: cellData, remaining: rowspan - 1 })
+          rowData.push(cellData)
+        }
+        if (debug) console.log(`Cell ${cellIndex + 1} in row ${rowIndex + 1}:`, cellData)
+      })
+      fillCarried()
+      rows.push(rowData)
+    })
+    tables.push({ headers, rows })
+    if (debug) console.log(`Extracted table ${tableIndex + 1}:`, { headers, rows })
+  })
+  if (debug) console.log(`Extraction complete. Found ${tables.length} tables.`)
+  return tables
+}
diff --git a/flatfilers/sandbox/src/index.ts b/flatfilers/sandbox/src/index.ts
index 12639a9ff..fadd7fb1f 100644
--- a/flatfilers/sandbox/src/index.ts
+++ b/flatfilers/sandbox/src/index.ts
@@ -1,117 +1,6 @@
import type { FlatfileListener } from '@flatfile/listener'
-import { rssImport } from '@flatfile/plugin-import-rss'
-import { configureSpace } from '@flatfile/plugin-space-configure'
-import { MarkdownExtractor } from '@flatfile/plugin-markdown-extractor'
+import { HTMLTableExtractor } from '@flatfile/plugin-extract-html-table'
export default async function (listener: FlatfileListener) {
- listener.use(
- rssImport([
- {
- sheetSlug: 'rss-feed-1',
- rssFeedUrl: 'http://rss.cnn.com/rss/money_topstories.rss',
- },
- {
- sheetSlug: 'rss-feed-2',
- rssFeedUrl: 'http://rss.cnn.com/rss/money_news_companies.rss',
- },
- ])
- )
- listener.use(
- configureSpace({
- workbooks: [
- {
- name: 'Sandbox',
- sheets: [
- {
- name: 'RSS Feed One',
- slug: 'rss-feed-1',
- fields: [
- {
- key: 'title',
- type: 'string',
- label: 'Title',
- },
- {
- key: 'link',
- type: 'string',
- label: 'Link',
- },
- {
- key: 'pubDate',
- type: 'string',
- label: 'Pub Date',
- },
- {
- key: 'content',
- type: 'string',
- label: 'Content',
- },
- {
- key: 'guid',
- type: 'string',
- label: 'GUID',
- },
- ],
- actions: [
- {
- operation: 'importRSSFeed',
- label: 'Import RSS Feed',
- description: 'Import data from an RSS feed into the workbook',
- primary: true,
- icon: 'rss_feed',
- tooltip: 'Click to import data from an RSS feed',
- mode: 'foreground',
- },
- ],
- },
- {
- name: 'RSS Feed Two',
- slug: 'rss-feed-2',
- fields: [
- {
- key: 'title',
- type: 'string',
- label: 'Title',
- },
- {
- key: 'link',
- type: 'string',
- label: 'Link',
- },
- {
- key: 'pubDate',
- type: 'string',
- label: 'Pub Date',
- },
- {
- key: 'content',
- type: 'string',
- label: 'Content',
- },
- {
- key: 'guid',
- type: 'string',
- label: 'GUID',
- },
- ],
- actions: [
- {
- operation: 'importRSSFeed',
- label: 'Import RSS Feed',
- description: 'Import data from an RSS feed into the workbook',
- primary: true,
- icon: 'rss_feed',
- tooltip: 'Click to import data from an RSS feed',
- mode: 'foreground',
- },
- ],
- },
- ],
- },
- ],
- })
- )
-
- listener.use(MarkdownExtractor())
-
+ listener.use(HTMLTableExtractor())
}
diff --git a/package-lock.json b/package-lock.json
index a028e002d..69c06be83 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -12,6 +12,7 @@
"bundlers/*",
"convert/*",
"enrich/*",
+ "extract/*",
"flatfilers/*",
"import/*",
"plugins/*",
@@ -116,6 +117,31 @@
"@flatfile/listener": "^1.1.0"
}
},
+ "extract/html-table": {
+ "name": "@flatfile/plugin-extract-html-table",
+ "version": "1.0.0",
+ "license": "ISC",
+ "dependencies": {
+ "@flatfile/util-extractor": "^2.1.5",
+ "node-html-parser": "^6.1.13"
+ },
+ "devDependencies": {
+ "@flatfile/rollup-config": "^0.1.1"
+ }
+ },
+ "flatfilers/playground": {
+ "name": "@private/playground",
+ "version": "0.0.0",
+ "license": "ISC",
+ "dependencies": {
+ "@flatfile/api": "^1.9.19",
+ "@flatfile/listener": "^1.1.0",
+ "modern-async": "^2.0.0"
+ },
+ "devDependencies": {
+ "flatfile": "3.8.0"
+ }
+ },
"flatfilers/sandbox": {
"name": "@private/sandbox",
"version": "0.0.0",
@@ -3197,6 +3223,10 @@
"resolved": "plugins/export-workbook",
"link": true
},
+ "node_modules/@flatfile/plugin-extract-html-table": {
+ "resolved": "extract/html-table",
+ "link": true
+ },
"node_modules/@flatfile/plugin-foreign-db-extractor": {
"resolved": "plugins/foreign-db-extractor",
"link": true
@@ -6215,6 +6245,10 @@
"node": ">=14"
}
},
+ "node_modules/@private/playground": {
+ "resolved": "flatfilers/playground",
+ "link": true
+ },
"node_modules/@private/sandbox": {
"resolved": "flatfilers/sandbox",
"link": true
@@ -8974,7 +9008,6 @@
},
"node_modules/boolbase": {
"version": "1.0.0",
- "dev": true,
"license": "ISC"
},
"node_modules/brace-expansion": {
@@ -9840,10 +9873,7 @@
},
"node_modules/css-select": {
"version": "5.1.0",
- "dev": true,
"license": "BSD-2-Clause",
- "optional": true,
- "peer": true,
"dependencies": {
"boolbase": "^1.0.0",
"css-what": "^6.1.0",
@@ -9857,10 +9887,7 @@
},
"node_modules/css-select/node_modules/dom-serializer": {
"version": "2.0.0",
- "dev": true,
"license": "MIT",
- "optional": true,
- "peer": true,
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
@@ -9872,10 +9899,7 @@
},
"node_modules/css-select/node_modules/domhandler": {
"version": "5.0.3",
- "dev": true,
"license": "BSD-2-Clause",
- "optional": true,
- "peer": true,
"dependencies": {
"domelementtype": "^2.3.0"
},
@@ -9888,10 +9912,7 @@
},
"node_modules/css-select/node_modules/domutils": {
"version": "3.1.0",
- "dev": true,
"license": "BSD-2-Clause",
- "optional": true,
- "peer": true,
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
@@ -9903,10 +9924,7 @@
},
"node_modules/css-select/node_modules/entities": {
"version": "4.5.0",
- "dev": true,
"license": "BSD-2-Clause",
- "optional": true,
- "peer": true,
"engines": {
"node": ">=0.12"
},
@@ -9930,7 +9948,6 @@
},
"node_modules/css-what": {
"version": "6.1.0",
- "dev": true,
"license": "BSD-2-Clause",
"engines": {
"node": ">= 6"
@@ -10348,7 +10365,6 @@
},
"node_modules/domelementtype": {
"version": "2.3.0",
- "dev": true,
"funding": [
{
"type": "github",
@@ -11925,6 +11941,14 @@
"node": ">= 0.4"
}
},
+ "node_modules/he": {
+ "version": "1.2.0",
+ "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz",
+ "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==",
+ "bin": {
+ "he": "bin/he"
+ }
+ },
"node_modules/hermes-estree": {
"version": "0.19.1",
"license": "MIT",
@@ -16000,6 +16024,15 @@
"node": ">=8"
}
},
+ "node_modules/node-html-parser": {
+ "version": "6.1.13",
+ "resolved": "https://registry.npmjs.org/node-html-parser/-/node-html-parser-6.1.13.tgz",
+ "integrity": "sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg==",
+ "dependencies": {
+ "css-select": "^5.1.0",
+ "he": "1.2.0"
+ }
+ },
"node_modules/node-int64": {
"version": "0.4.0",
"license": "MIT"
@@ -16066,7 +16099,6 @@
},
"node_modules/nth-check": {
"version": "2.1.1",
- "dev": true,
"license": "BSD-2-Clause",
"dependencies": {
"boolbase": "^1.0.0"
diff --git a/package.json b/package.json
index 87e35555b..a21d368f5 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
"bundlers/*",
"convert/*",
"enrich/*",
+ "extract/*",
"flatfilers/*",
"import/*",
"plugins/*",
diff --git a/plugins/job-handler/src/job.handler.spec.ts b/plugins/job-handler/src/job.handler.e2e.spec.ts
similarity index 100%
rename from plugins/job-handler/src/job.handler.spec.ts
rename to plugins/job-handler/src/job.handler.e2e.spec.ts
diff --git a/plugins/webhook-egress/src/webhook.egress.spec.ts b/plugins/webhook-egress/src/webhook.egress.e2e.spec.ts
similarity index 100%
rename from plugins/webhook-egress/src/webhook.egress.spec.ts
rename to plugins/webhook-egress/src/webhook.egress.e2e.spec.ts