From 3158a1ee4965e10c46ce5e46739170dd8ea5fdef Mon Sep 17 00:00:00 2001 From: Carl Brugger Date: Mon, 30 Sep 2024 19:19:35 -0500 Subject: [PATCH 1/3] feat: html table extractor --- extract/html-table/README.md | 61 +++++++ extract/html-table/package.json | 53 ++++++ extract/html-table/rollup.config.mjs | 5 + extract/html-table/samples/complex_table.html | 51 ++++++ .../html-table/samples/multiple_tables.html | 78 +++++++++ extract/html-table/samples/simple_table.html | 32 ++++ extract/html-table/src/index.ts | 15 ++ extract/html-table/src/parser.ts | 154 ++++++++++++++++++ flatfilers/sandbox/src/index.ts | 115 +------------ package-lock.json | 70 +++++--- package.json | 1 + 11 files changed, 503 insertions(+), 132 deletions(-) create mode 100644 extract/html-table/README.md create mode 100644 extract/html-table/package.json create mode 100644 extract/html-table/rollup.config.mjs create mode 100644 extract/html-table/samples/complex_table.html create mode 100644 extract/html-table/samples/multiple_tables.html create mode 100644 extract/html-table/samples/simple_table.html create mode 100644 extract/html-table/src/index.ts create mode 100644 extract/html-table/src/parser.ts diff --git a/extract/html-table/README.md b/extract/html-table/README.md new file mode 100644 index 000000000..fa99e2e36 --- /dev/null +++ b/extract/html-table/README.md @@ -0,0 +1,61 @@ + + +# @flatfile/plugin-extract-html-table + +This plugin provides HTML table extraction capabilities for Flatfile. It parses HTML files and extracts structured data from tables, handling complex layouts and nested tables. 
+ +**Event Type:** `listener.on('file:created')` + +**Supported File Types:** `.html` + + + +## Features + +- Extracts table structure, including headers and cell data +- Handles nested tables and complex table layouts +- Handles colspan and rowspan attributes (configurable) +- Supports nested tables up to a configurable depth +- Converts extracted data into a structured format +- Provides error handling for malformed HTML or table structures +- Debug mode for detailed logging + +## Parameters + +#### `options` - `object` - (optional) + +- `handleColspan` - `boolean` - (optional): Determines how to handle colspan. Default is true. +- `handleRowspan` - `boolean` - (optional): Determines how to handle rowspan. Default is true. +- `maxDepth` - `number` - (optional): Maximum depth for nested tables. Default is 3. +- `debug` - `boolean` - (optional): Enables debug logging. Default is false. + +## API Calls + +- `api.files.download` +- `api.files.update` + +## Usage + +**install** +```bash +npm install @flatfile/plugin-extract-html-table +``` + +**import** +```javascript +import { HTMLTableExtractor } from '@flatfile/plugin-extract-html-table'; +``` + +**listener.js** +```javascript +const listener = new FlatfileListener(); + +listener.use( + HTMLTableExtractor({ + handleColspan: true, + handleRowspan: true, + maxDepth: 3, + debug: false + }) +); +``` diff --git a/extract/html-table/package.json b/extract/html-table/package.json new file mode 100644 index 000000000..e93d01203 --- /dev/null +++ b/extract/html-table/package.json @@ -0,0 +1,53 @@ +{ + "name": "@flatfile/plugin-extract-html-table", + "version": "1.0.0", + "description": "A Flatfile plugin for extracting table data from HTML files", + "main": "./dist/index.cjs", + "module": "./dist/index.mjs", + "types": "./dist/index.d.ts", + "browser": { + "./dist/index.cjs": "./dist/index.browser.cjs", + "./dist/index.mjs": "./dist/index.browser.mjs" + }, + "exports": { + "types": "./dist/index.d.ts", + "node": { + 
"import": "./dist/index.mjs", + "require": "./dist/index.cjs" + }, + "browser": { + "require": "./dist/index.browser.cjs", + "import": "./dist/index.browser.mjs" + }, + "default": "./dist/index.mjs" + }, + "source": "./src/index.ts", + "files": ["dist/**"], + "scripts": { + "build": "rollup -c", + "build:watch": "rollup -c --watch", + "build:prod": "NODE_ENV=production rollup -c", + "check": "tsc ./**/*.ts --noEmit --esModuleInterop", + "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" + }, + "keywords": ["flatfile", "flatfile-plugins", "category-extractors", "html", "table-extractor"], + "author": "Flatfile", + "license": "ISC", + "dependencies": { + "@flatfile/util-extractor": "^2.1.5", + "node-html-parser": "^6.1.13" + }, + "devDependencies": { + "@flatfile/rollup-config": "^0.1.1" + }, + "repository": { + "type": "git", + "url": "https://github.com/FlatFilers/flatfile-plugins.git", + "directory": "extract/html-table" + }, + "browserslist": [ + "> 0.5%", + "last 2 versions", + "not dead" + ] +} \ No newline at end of file diff --git a/extract/html-table/rollup.config.mjs b/extract/html-table/rollup.config.mjs new file mode 100644 index 000000000..1e95e60a0 --- /dev/null +++ b/extract/html-table/rollup.config.mjs @@ -0,0 +1,5 @@ +import { buildConfig } from '@flatfile/rollup-config'; + +const config = buildConfig({}); + +export default config; \ No newline at end of file diff --git a/extract/html-table/samples/complex_table.html b/extract/html-table/samples/complex_table.html new file mode 100644 index 000000000..63b3fdde9 --- /dev/null +++ b/extract/html-table/samples/complex_table.html @@ -0,0 +1,51 @@ + + + + + Complex Table Example + + +

Quarterly Sales Report

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Product CategoryQuarterly SalesTotal
Q1Q2Q3Q4
Electronics$50,000$65,000$70,000$90,000$275,000
Clothing$35,000$40,000$55,000$80,000$210,000
Home & Garden$25,000$30,000$40,000$45,000$140,000
Total Sales$625,000
+ + \ No newline at end of file diff --git a/extract/html-table/samples/multiple_tables.html b/extract/html-table/samples/multiple_tables.html new file mode 100644 index 000000000..cde5a2841 --- /dev/null +++ b/extract/html-table/samples/multiple_tables.html @@ -0,0 +1,78 @@ + + + + + Multiple Tables Example + + +

Company Data

+ +

Employee Information

+ + + + + + + + + + + + + + + + +
NamePositionDepartment
John DoeSoftware EngineerIT
Jane SmithMarketing SpecialistMarketing
+ +

Department Budget

+ + + + + + + + + + + + + + + + +
DepartmentBudgetExpenses
IT$500,000 + + + + + + + + + + + + + +
CategoryAmount
Hardware$200,000
Software$150,000
+
Marketing$300,000 + + + + + + + + + + + + + +
CategoryAmount
Advertising$150,000
Events$100,000
+
+ + \ No newline at end of file diff --git a/extract/html-table/samples/simple_table.html b/extract/html-table/samples/simple_table.html new file mode 100644 index 000000000..0d263de2c --- /dev/null +++ b/extract/html-table/samples/simple_table.html @@ -0,0 +1,32 @@ + + + + + Simple Table Example + + +

Employee Information

+ + + + + + + + + + + + + + + + + + + + + +
NamePositionDepartment
John DoeSoftware EngineerIT
Jane SmithMarketing SpecialistMarketing
Bob JohnsonHR ManagerHuman Resources
+ + \ No newline at end of file diff --git a/extract/html-table/src/index.ts b/extract/html-table/src/index.ts new file mode 100644 index 000000000..e02f2a50c --- /dev/null +++ b/extract/html-table/src/index.ts @@ -0,0 +1,15 @@ +import { Extractor } from '@flatfile/util-extractor' +import { parseBuffer } from './parser' + +export interface HTMLTableExtractorOptions { + handleColspan?: boolean + handleRowspan?: boolean + maxDepth?: number + debug?: boolean +} + +export const HTMLTableExtractor = (options: HTMLTableExtractorOptions = {}) => { + return Extractor('html', 'html-tables', parseBuffer, options) +} + +export const htmlTableParser = parseBuffer
diff --git a/extract/html-table/src/parser.ts b/extract/html-table/src/parser.ts
new file mode 100644
index 000000000..8b53d2012
--- /dev/null
+++ b/extract/html-table/src/parser.ts
@@ -0,0 +1,281 @@
+import type { SheetCapture, WorkbookCapture } from '@flatfile/util-extractor'
+import type { HTMLElement } from 'node-html-parser'
+import { parse } from 'node-html-parser'
+
+/** Options accepted by `parseBuffer`; every field is optional. */
+export interface HTMLTableExtractorOptions {
+  /** Repeat a cell's text across each column it spans. Default: true. */
+  handleColspan?: boolean
+  /** Repeat a cell's text down each row it spans. Default: true. */
+  handleRowspan?: boolean
+  /** Deepest nesting level to extract; top-level tables are 1. Default: 3. */
+  maxDepth?: number
+  /** Log progress to the console. Default: false. */
+  debug?: boolean
+}
+
+/** Header names and data rows taken from one table element. */
+interface ExtractedTable {
+  headers: string[]
+  rows: string[][]
+}
+
+/** One table row after span expansion. */
+interface GridRow {
+  cells: string[]
+  // True when every cell written in this row's markup is a `th`.
+  isHeader: boolean
+}
+
+/** A rowspan cell that still has to be repeated in the rows below it. */
+interface CarriedCell {
+  value: string
+  remaining: number
+}
+
+const DEFAULT_MAX_DEPTH = 3
+// The HTML standard clamps span attributes to these values; enforcing them
+// keeps a hostile `colspan="999999999"` from exhausting memory.
+const MAX_COLSPAN = 1000
+const MAX_ROWSPAN = 65534
+
+/**
+ * Converts an HTML document into one sheet per extracted table.
+ *
+ * Sheets are named `Table_1`, `Table_2`, ... in document order. Each data row
+ * becomes a record keyed by header name; cells beyond the last header are
+ * dropped (and reported when `debug` is on).
+ *
+ * @param buffer - Raw file contents, decoded as UTF-8.
+ * @param options - Extraction options; omitted fields use their defaults.
+ * @returns A map from sheet name to its headers and records.
+ */
+export function parseBuffer(
+  buffer: Buffer,
+  options: HTMLTableExtractorOptions = {}
+): WorkbookCapture {
+  const { debug = false } = options
+  const tables = extractTablesFromHTML(buffer.toString('utf-8'), options)
+  const sheets: Record<string, SheetCapture> = {}
+
+  tables.forEach((table, tableIndex) => {
+    const sheetName = `Table_${tableIndex + 1}`
+    const data = table.rows.map((row, rowIndex) => {
+      const record: Record<string, { value: string }> = {}
+      row.forEach((cell, cellIndex) => {
+        if (cellIndex < table.headers.length) {
+          record[table.headers[cellIndex]] = { value: cell }
+        } else if (debug) {
+          console.warn(
+            `${sheetName} row ${rowIndex + 1} has more cells than headers. Ignoring extra cell:`,
+            cell
+          )
+        }
+      })
+      return record
+    })
+    sheets[sheetName] = { headers: table.headers, data }
+
+    if (debug) {
+      console.log(`Created sheet: ${sheetName}`)
+      console.log('Headers:', table.headers)
+      console.log('Row count:', data.length)
+    }
+  })
+
+  if (debug) {
+    console.log('Parsing complete. Sheets created:', Object.keys(sheets).length)
+  }
+
+  return sheets
+}
+
+/**
+ * Finds every table in `content` (nested ones included, down to `maxDepth`)
+ * and reduces each to a header list plus rows of trimmed cell text.
+ *
+ * The header list is the last of the leading rows made up solely of `th`
+ * cells, so a two-row spanning header resolves to its most specific labels.
+ * Tables without such a row get generated `Column_N` names rather than
+ * losing their data.
+ */
+function extractTablesFromHTML(
+  content: string,
+  options: HTMLTableExtractorOptions
+): ExtractedTable[] {
+  const {
+    handleColspan = true,
+    handleRowspan = true,
+    maxDepth = DEFAULT_MAX_DEPTH,
+    debug = false,
+  } = options
+  const tables: ExtractedTable[] = []
+
+  parse(content)
+    .querySelectorAll('table')
+    .forEach((table, tableIndex) => {
+      if (nestingDepth(table) > maxDepth) {
+        if (debug) {
+          console.log(`Skipping table ${tableIndex + 1}: nested too deeply`)
+        }
+        return
+      }
+
+      const grid = buildGrid(table, handleColspan, handleRowspan)
+      let headerRowCount = 0
+      while (headerRowCount < grid.length && grid[headerRowCount].isHeader) {
+        headerRowCount += 1
+      }
+
+      const rows = grid.slice(headerRowCount).map((row) => row.cells)
+      const widestRow = rows.reduce((w, row) => Math.max(w, row.length), 0)
+      const headers = uniqueHeaders(
+        headerRowCount > 0 ? grid[headerRowCount - 1].cells : [],
+        widestRow
+      )
+      tables.push({ headers, rows })
+
+      if (debug) {
+        console.log(`Extracted table ${tableIndex + 1}:`, { headers, rows })
+      }
+    })
+
+  if (debug) {
+    console.log(`Extraction complete. Found ${tables.length} tables.`)
+  }
+
+  return tables
+}
+
+/**
+ * Lays the table's own rows out on a grid, repeating spanned cells so that
+ * every value lands in the column it visually occupies. Rows and cells that
+ * belong to a nested table are left for that table's own pass.
+ */
+function buildGrid(
+  table: HTMLElement,
+  handleColspan: boolean,
+  handleRowspan: boolean
+): GridRow[] {
+  const carried = new Map<number, CarriedCell>()
+  const grid: GridRow[] = []
+  const ownRows = table
+    .querySelectorAll('tr')
+    .filter((row) => closestTable(row) === table)
+
+  for (const row of ownRows) {
+    const cellElements = row
+      .querySelectorAll('td, th')
+      .filter((cell) => cell.parentNode === row)
+    if (cellElements.length === 0 && carried.size === 0) {
+      continue
+    }
+
+    const cells: string[] = []
+    // Copies down every carried cell that starts at the next free column.
+    const placeCarried = () => {
+      for (
+        let span = carried.get(cells.length);
+        span;
+        span = carried.get(cells.length)
+      ) {
+        span.remaining -= 1
+        if (span.remaining === 0) {
+          carried.delete(cells.length)
+        }
+        cells.push(span.value)
+      }
+    }
+
+    for (const cell of cellElements) {
+      placeCarried()
+      const value = cell.text.trim()
+      const colspan = handleColspan ? readSpan(cell, 'colspan', MAX_COLSPAN) : 1
+      const rowspan = handleRowspan ? readSpan(cell, 'rowspan', MAX_ROWSPAN) : 1
+      for (let i = 0; i < colspan; i++) {
+        if (rowspan > 1) {
+          carried.set(cells.length, { value, remaining: rowspan - 1 })
+        } else {
+          // Overlapping spans are malformed; the cell in the markup wins.
+          carried.delete(cells.length)
+        }
+        cells.push(value)
+      }
+    }
+
+    // Spans to the right of a short row still occupy this row: pad the gap so
+    // they are consumed here instead of shifting into a later row.
+    const trailing = Array.from(carried.keys())
+      .filter((column) => column >= cells.length)
+      .sort((a, b) => a - b)
+    for (const column of trailing) {
+      while (cells.length < column) {
+        cells.push('')
+      }
+      placeCarried()
+    }
+
+    grid.push({
+      cells,
+      isHeader:
+        cellElements.length > 0 &&
+        cellElements.every((cell) => isTag(cell, 'th')),
+    })
+  }
+
+  return grid
+}
+
+/** Case-insensitive tag-name test that tolerates the tagless root node. */
+function isTag(node: HTMLElement, name: string): boolean {
+  return !!node.tagName && node.tagName.toLowerCase() === name
+}
+
+/** Returns the nearest enclosing table element, or null when there is none. */
+function closestTable(element: HTMLElement): HTMLElement | null {
+  let node = element.parentNode
+  while (node && !isTag(node, 'table')) {
+    node = node.parentNode
+  }
+  return node || null
+}
+
+/** Nesting level of a table: 1 at the top level, 2 inside another table, ... */
+function nestingDepth(table: HTMLElement): number {
+  let depth = 1
+  for (let outer = closestTable(table); outer; outer = closestTable(outer)) {
+    depth += 1
+  }
+  return depth
+}
+
+/**
+ * Reads a `colspan`/`rowspan` attribute as an integer in [1, max]. Missing,
+ * non-numeric, zero and negative values all count as 1 so that a bad
+ * attribute can never make a cell disappear.
+ */
+function readSpan(cell: HTMLElement, attribute: string, max: number): number {
+  const parsed = parseInt(cell.getAttribute(attribute) || '', 10)
+  return parsed >= 1 ? Math.min(parsed, max) : 1
+}
+
+/**
+ * Produces non-empty, unique header names. Blank names (or a missing header
+ * row, in which case `width` names are generated) become `Column_N`;
+ * repeated names gain a numeric suffix so later columns don't overwrite
+ * earlier ones in the record.
+ */
+function uniqueHeaders(rawHeaders: string[], width: number): string[] {
+  const source =
+    rawHeaders.length > 0 ? rawHeaders : Array.from({ length: width }, () => '')
+  const used = new Set<string>()
+  return source.map((raw, index) => {
+    const base = raw || `Column_${index + 1}`
+    let name = base
+    for (let suffix = 2; used.has(name); suffix++) {
+      name = `${base}_${suffix}`
+    }
+    used.add(name)
+    return name
+  })
+}
diff --git a/flatfilers/sandbox/src/index.ts b/flatfilers/sandbox/src/index.ts index 12639a9ff..fadd7fb1f 100644 --- a/flatfilers/sandbox/src/index.ts +++ b/flatfilers/sandbox/src/index.ts @@ -1,117 +1,6 @@ import type { FlatfileListener } from '@flatfile/listener' -import { rssImport } from '@flatfile/plugin-import-rss' -import { configureSpace } from '@flatfile/plugin-space-configure' -import { MarkdownExtractor } from '@flatfile/plugin-markdown-extractor' +import { HTMLTableExtractor } from '@flatfile/plugin-extract-html-table' export default async function (listener: FlatfileListener) { - listener.use( - rssImport([ - { - sheetSlug: 'rss-feed-1', - rssFeedUrl: 'http://rss.cnn.com/rss/money_topstories.rss', - }, - { - sheetSlug: 'rss-feed-2', - rssFeedUrl: 'http://rss.cnn.com/rss/money_news_companies.rss', - }, - ]) - ) - listener.use( - configureSpace({ - workbooks: [ - { - name: 'Sandbox', - sheets: [ - { - name: 'RSS Feed One', - slug: 'rss-feed-1', - fields: [ - { - key: 'title', - type: 'string', - label: 'Title', - }, - { - key: 'link', - type: 'string', - label: 'Link', - }, - { - key: 'pubDate', - type: 'string', - label: 'Pub Date', - }, - { - key: 'content', - 
type: 'string', - label: 'Content', - }, - { - key: 'guid', - type: 'string', - label: 'GUID', - }, - ], - actions: [ - { - operation: 'importRSSFeed', - label: 'Import RSS Feed', - description: 'Import data from an RSS feed into the workbook', - primary: true, - icon: 'rss_feed', - tooltip: 'Click to import data from an RSS feed', - mode: 'foreground', - }, - ], - }, - { - name: 'RSS Feed Two', - slug: 'rss-feed-2', - fields: [ - { - key: 'title', - type: 'string', - label: 'Title', - }, - { - key: 'link', - type: 'string', - label: 'Link', - }, - { - key: 'pubDate', - type: 'string', - label: 'Pub Date', - }, - { - key: 'content', - type: 'string', - label: 'Content', - }, - { - key: 'guid', - type: 'string', - label: 'GUID', - }, - ], - actions: [ - { - operation: 'importRSSFeed', - label: 'Import RSS Feed', - description: 'Import data from an RSS feed into the workbook', - primary: true, - icon: 'rss_feed', - tooltip: 'Click to import data from an RSS feed', - mode: 'foreground', - }, - ], - }, - ], - }, - ], - }) - ) - - listener.use(MarkdownExtractor()) - + listener.use(HTMLTableExtractor()) } diff --git a/package-lock.json b/package-lock.json index a028e002d..69c06be83 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,6 +12,7 @@ "bundlers/*", "convert/*", "enrich/*", + "extract/*", "flatfilers/*", "import/*", "plugins/*", @@ -116,6 +117,31 @@ "@flatfile/listener": "^1.1.0" } }, + "extract/html-table": { + "name": "@flatfile/plugin-extract-html-table", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "@flatfile/util-extractor": "^2.1.5", + "node-html-parser": "^6.1.13" + }, + "devDependencies": { + "@flatfile/rollup-config": "^0.1.1" + } + }, + "flatfilers/playground": { + "name": "@private/playground", + "version": "0.0.0", + "license": "ISC", + "dependencies": { + "@flatfile/api": "^1.9.19", + "@flatfile/listener": "^1.1.0", + "modern-async": "^2.0.0" + }, + "devDependencies": { + "flatfile": "3.8.0" + } + }, "flatfilers/sandbox": 
{ "name": "@private/sandbox", "version": "0.0.0", @@ -3197,6 +3223,10 @@ "resolved": "plugins/export-workbook", "link": true }, + "node_modules/@flatfile/plugin-extract-html-table": { + "resolved": "extract/html-table", + "link": true + }, "node_modules/@flatfile/plugin-foreign-db-extractor": { "resolved": "plugins/foreign-db-extractor", "link": true @@ -6215,6 +6245,10 @@ "node": ">=14" } }, + "node_modules/@private/playground": { + "resolved": "flatfilers/playground", + "link": true + }, "node_modules/@private/sandbox": { "resolved": "flatfilers/sandbox", "link": true @@ -8974,7 +9008,6 @@ }, "node_modules/boolbase": { "version": "1.0.0", - "dev": true, "license": "ISC" }, "node_modules/brace-expansion": { @@ -9840,10 +9873,7 @@ }, "node_modules/css-select": { "version": "5.1.0", - "dev": true, "license": "BSD-2-Clause", - "optional": true, - "peer": true, "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", @@ -9857,10 +9887,7 @@ }, "node_modules/css-select/node_modules/dom-serializer": { "version": "2.0.0", - "dev": true, "license": "MIT", - "optional": true, - "peer": true, "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.2", @@ -9872,10 +9899,7 @@ }, "node_modules/css-select/node_modules/domhandler": { "version": "5.0.3", - "dev": true, "license": "BSD-2-Clause", - "optional": true, - "peer": true, "dependencies": { "domelementtype": "^2.3.0" }, @@ -9888,10 +9912,7 @@ }, "node_modules/css-select/node_modules/domutils": { "version": "3.1.0", - "dev": true, "license": "BSD-2-Clause", - "optional": true, - "peer": true, "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", @@ -9903,10 +9924,7 @@ }, "node_modules/css-select/node_modules/entities": { "version": "4.5.0", - "dev": true, "license": "BSD-2-Clause", - "optional": true, - "peer": true, "engines": { "node": ">=0.12" }, @@ -9930,7 +9948,6 @@ }, "node_modules/css-what": { "version": "6.1.0", - "dev": true, "license": "BSD-2-Clause", "engines": { "node": ">= 6" 
@@ -10348,7 +10365,6 @@ }, "node_modules/domelementtype": { "version": "2.3.0", - "dev": true, "funding": [ { "type": "github", @@ -11925,6 +11941,14 @@ "node": ">= 0.4" } }, + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", + "bin": { + "he": "bin/he" + } + }, "node_modules/hermes-estree": { "version": "0.19.1", "license": "MIT", @@ -16000,6 +16024,15 @@ "node": ">=8" } }, + "node_modules/node-html-parser": { + "version": "6.1.13", + "resolved": "https://registry.npmjs.org/node-html-parser/-/node-html-parser-6.1.13.tgz", + "integrity": "sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg==", + "dependencies": { + "css-select": "^5.1.0", + "he": "1.2.0" + } + }, "node_modules/node-int64": { "version": "0.4.0", "license": "MIT" @@ -16066,7 +16099,6 @@ }, "node_modules/nth-check": { "version": "2.1.1", - "dev": true, "license": "BSD-2-Clause", "dependencies": { "boolbase": "^1.0.0" diff --git a/package.json b/package.json index 87e35555b..a21d368f5 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "bundlers/*", "convert/*", "enrich/*", + "extract/*", "flatfilers/*", "import/*", "plugins/*", From e7d4e5748beb69f299b47f78f3a7179985c776f3 Mon Sep 17 00:00:00 2001 From: Carl Brugger Date: Wed, 9 Oct 2024 15:31:17 -0400 Subject: [PATCH 2/3] cleanup --- extract/html-table/jest.config.js | 16 ++++++++++++++++ extract/html-table/package.json | 4 +++- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 extract/html-table/jest.config.js diff --git a/extract/html-table/jest.config.js b/extract/html-table/jest.config.js new file mode 100644 index 000000000..e6d7ca40b --- /dev/null +++ b/extract/html-table/jest.config.js @@ -0,0 +1,16 @@ +module.exports = { + testEnvironment: 'node', + + transform: { + '^.+\\.tsx?$': 'ts-jest', + }, + 
setupFiles: ['../../test/dotenv-config.js'], + setupFilesAfterEnv: [ + '../../test/betterConsoleLog.js', + '../../test/unit.cleanup.js', + ], + testTimeout: 60_000, + globalSetup: '../../test/setup-global.js', + forceExit: true, + passWithNoTests: true, +} diff --git a/extract/html-table/package.json b/extract/html-table/package.json index e93d01203..51a9f238f 100644 --- a/extract/html-table/package.json +++ b/extract/html-table/package.json @@ -28,7 +28,9 @@ "build:watch": "rollup -c --watch", "build:prod": "NODE_ENV=production rollup -c", "check": "tsc ./**/*.ts --noEmit --esModuleInterop", - "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" + "test": "jest src/*.spec.ts --detectOpenHandles", + "test:unit": "jest src/*.spec.ts --testPathIgnorePatterns=.*\\.e2e\\.spec\\.ts$ --detectOpenHandles", + "test:e2e": "jest src/*.e2e.spec.ts --detectOpenHandles" }, "keywords": ["flatfile", "flatfile-plugins", "category-extractors", "html", "table-extractor"], "author": "Flatfile", From cc661125913781b52db95a796dd10109626cb0d4 Mon Sep 17 00:00:00 2001 From: Carl Brugger Date: Thu, 10 Oct 2024 08:35:38 -0400 Subject: [PATCH 3/3] move tests (unit -> e2e) --- .../src/{job.handler.spec.ts => job.handler.e2e.spec.ts} | 0 .../src/{webhook.egress.spec.ts => webhook.egress.e2e.spec.ts} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename plugins/job-handler/src/{job.handler.spec.ts => job.handler.e2e.spec.ts} (100%) rename plugins/webhook-egress/src/{webhook.egress.spec.ts => webhook.egress.e2e.spec.ts} (100%) diff --git a/plugins/job-handler/src/job.handler.spec.ts b/plugins/job-handler/src/job.handler.e2e.spec.ts similarity index 100% rename from plugins/job-handler/src/job.handler.spec.ts rename to plugins/job-handler/src/job.handler.e2e.spec.ts diff --git a/plugins/webhook-egress/src/webhook.egress.spec.ts b/plugins/webhook-egress/src/webhook.egress.e2e.spec.ts similarity index 100% rename from plugins/webhook-egress/src/webhook.egress.spec.ts 
rename to plugins/webhook-egress/src/webhook.egress.e2e.spec.ts