Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions extract/html-table/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
<!-- START_INFOCARD -->

# @flatfile/plugin-extract-html-table

This plugin provides HTML table extraction capabilities for Flatfile. It parses HTML files and extracts structured data from tables, handling complex layouts and nested tables.

**Event Type:** `listener.on('file:created')`

**Supported File Types:** `.html`

<!-- END_INFOCARD -->

## Features

- Extracts table structure, including headers and cell data
- Handles nested tables and complex table layouts
- Handles colspan and rowspan attributes (configurable)
- Supports nested tables up to a configurable depth
- Converts extracted data into a structured format
- Provides error handling for malformed HTML or table structures
- Debug mode for detailed logging

## Parameters

#### `options` - `object` - (optional)

- `handleColspan` - `boolean` - (optional): Whether to handle `colspan` attributes when extracting table cells. Default is `true`.
- `handleRowspan` - `boolean` - (optional): Whether to handle `rowspan` attributes when extracting table cells. Default is `true`.
- `maxDepth` - `number` - (optional): Maximum depth for nested tables. Default is 3.
- `debug` - `boolean` - (optional): Enables debug logging. Default is false.

## API Calls

- `api.files.download`
- `api.files.update`

## Usage

**install**
```bash
npm install @flatfile/plugin-extract-html-table
```

**import**
```javascript
import { HTMLTableExtractor } from '@flatfile/plugin-extract-html-table';
```

**listener.js**
```javascript
const listener = new FlatfileListener();

listener.use(
HTMLTableExtractor({
handleColspan: true,
handleRowspan: true,
maxDepth: 3,
debug: false
})
);
```
16 changes: 16 additions & 0 deletions extract/html-table/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Jest configuration for this package. All relative paths point two directories
// up, to a `test/` folder shared outside this package.
module.exports = {
// Run tests in a Node environment rather than a browser-like (jsdom) one.
testEnvironment: 'node',

// Compile .ts and .tsx files with ts-jest before running them.
transform: {
'^.+\\.tsx?$': 'ts-jest',
},
// Runs before the test framework is installed in each test file.
// NOTE(review): presumably loads environment variables via dotenv; confirm in the script.
setupFiles: ['../../test/dotenv-config.js'],
// Run after the test framework is installed, before each test file executes.
setupFilesAfterEnv: [
'../../test/betterConsoleLog.js',
'../../test/unit.cleanup.js',
],
// Per-test timeout in milliseconds (60 seconds).
testTimeout: 60_000,
// Module executed once before all test suites.
globalSetup: '../../test/setup-global.js',
// Force Jest to exit once the run completes, even if handles remain open.
forceExit: true,
// Do not fail the run when no test files are found.
passWithNoTests: true,
}
55 changes: 55 additions & 0 deletions extract/html-table/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"name": "@flatfile/plugin-extract-html-table",
"version": "1.0.0",
"description": "A Flatfile plugin for extracting table data from HTML files",
"main": "./dist/index.cjs",
"module": "./dist/index.mjs",
"types": "./dist/index.d.ts",
"browser": {
"./dist/index.cjs": "./dist/index.browser.cjs",
"./dist/index.mjs": "./dist/index.browser.mjs"
},
"exports": {
"types": "./dist/index.d.ts",
"node": {
"import": "./dist/index.mjs",
"require": "./dist/index.cjs"
},
"browser": {
"require": "./dist/index.browser.cjs",
"import": "./dist/index.browser.mjs"
},
"default": "./dist/index.mjs"
},
"source": "./src/index.ts",
"files": ["dist/**"],
"scripts": {
"build": "rollup -c",
"build:watch": "rollup -c --watch",
"build:prod": "NODE_ENV=production rollup -c",
"check": "tsc ./**/*.ts --noEmit --esModuleInterop",
"test": "jest src/*.spec.ts --detectOpenHandles",
"test:unit": "jest src/*.spec.ts --testPathIgnorePatterns=.*\\.e2e\\.spec\\.ts$ --detectOpenHandles",
"test:e2e": "jest src/*.e2e.spec.ts --detectOpenHandles"
},
"keywords": ["flatfile", "flatfile-plugins", "category-extractors", "html", "table-extractor"],
"author": "Flatfile",
"license": "ISC",
"dependencies": {
"@flatfile/util-extractor": "^2.1.5",
"node-html-parser": "^6.1.13"
},
"devDependencies": {
"@flatfile/rollup-config": "^0.1.1"
},
"repository": {
"type": "git",
"url": "https://github.com/FlatFilers/flatfile-plugins.git",
"directory": "extract/html-table"
},
"browserslist": [
"> 0.5%",
"last 2 versions",
"not dead"
]
}
5 changes: 5 additions & 0 deletions extract/html-table/rollup.config.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import { buildConfig } from '@flatfile/rollup-config';

// Default-export the Rollup configuration produced by `buildConfig` from
// `@flatfile/rollup-config`, called with an empty options object.
export default buildConfig({});
51 changes: 51 additions & 0 deletions extract/html-table/samples/complex_table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Complex Table Example</title>
</head>
<body>
<h1>Quarterly Sales Report</h1>
<table border="1">
<tr>
<th rowspan="2">Product Category</th>
<th colspan="4">Quarterly Sales</th>
<th rowspan="2">Total</th>
</tr>
<tr>
<th>Q1</th>
<th>Q2</th>
<th>Q3</th>
<th>Q4</th>
</tr>
<tr>
<td>Electronics</td>
<td>$50,000</td>
<td>$65,000</td>
<td>$70,000</td>
<td>$90,000</td>
<td>$275,000</td>
</tr>
<tr>
<td>Clothing</td>
<td>$35,000</td>
<td>$40,000</td>
<td>$55,000</td>
<td>$80,000</td>
<td>$210,000</td>
</tr>
<tr>
<td>Home & Garden</td>
<td>$25,000</td>
<td>$30,000</td>
<td>$40,000</td>
<td>$45,000</td>
<td>$140,000</td>
</tr>
<tr>
<td colspan="5">Total Sales</td>
<td>$625,000</td>
</tr>
</table>
</body>
</html>
78 changes: 78 additions & 0 deletions extract/html-table/samples/multiple_tables.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Multiple Tables Example</title>
</head>
<body>
<h1>Company Data</h1>

<h2>Employee Information</h2>
<table>
<tr>
<th>Name</th>
<th>Position</th>
<th>Department</th>
</tr>
<tr>
<td>John Doe</td>
<td>Software Engineer</td>
<td>IT</td>
</tr>
<tr>
<td>Jane Smith</td>
<td>Marketing Specialist</td>
<td>Marketing</td>
</tr>
</table>

<h2>Department Budget</h2>
<table>
<tr>
<th>Department</th>
<th>Budget</th>
<th>Expenses</th>
</tr>
<tr>
<td>IT</td>
<td>$500,000</td>
<td>
<table>
<tr>
<th>Category</th>
<th>Amount</th>
</tr>
<tr>
<td>Hardware</td>
<td>$200,000</td>
</tr>
<tr>
<td>Software</td>
<td>$150,000</td>
</tr>
</table>
</td>
</tr>
<tr>
<td>Marketing</td>
<td>$300,000</td>
<td>
<table>
<tr>
<th>Category</th>
<th>Amount</th>
</tr>
<tr>
<td>Advertising</td>
<td>$150,000</td>
</tr>
<tr>
<td>Events</td>
<td>$100,000</td>
</tr>
</table>
</td>
</tr>
</table>
</body>
</html>
32 changes: 32 additions & 0 deletions extract/html-table/samples/simple_table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Simple Table Example</title>
</head>
<body>
<h1>Employee Information</h1>
<table>
<tr>
<th>Name</th>
<th>Position</th>
<th>Department</th>
</tr>
<tr>
<td>John Doe</td>
<td>Software Engineer</td>
<td>IT</td>
</tr>
<tr>
<td>Jane Smith</td>
<td>Marketing Specialist</td>
<td>Marketing</td>
</tr>
<tr>
<td>Bob Johnson</td>
<td>HR Manager</td>
<td>Human Resources</td>
</tr>
</table>
</body>
</html>
15 changes: 15 additions & 0 deletions extract/html-table/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { Extractor } from '@flatfile/util-extractor'
import { parseBuffer } from './parser'

/**
 * Options accepted by `HTMLTableExtractor`. Every field is optional.
 *
 * Descriptions and defaults below follow the package README; the parser that
 * consumes these options lives in `./parser`, so confirm the defaults there.
 */
export interface HTMLTableExtractorOptions {
/** Whether `colspan` attributes are handled. The README documents a default of `true`. */
handleColspan?: boolean
/** Whether `rowspan` attributes are handled. The README documents a default of `true`. */
handleRowspan?: boolean
/** Maximum depth for nested tables. The README documents a default of `3`. */
maxDepth?: number
/** Enables debug logging. The README documents a default of `false`. */
debug?: boolean
}

/**
 * Builds the HTML table extractor plugin.
 *
 * Delegates to `Extractor` from `@flatfile/util-extractor`, passing `'html'`
 * and `'html-tables'` as its first two arguments, `parseBuffer` as the parsing
 * function, and the caller's options unchanged.
 *
 * @param options - Extraction settings; an empty object is used when omitted.
 * @returns Whatever `Extractor` returns for these arguments.
 */
export const HTMLTableExtractor = (options: HTMLTableExtractorOptions = {}) =>
  Extractor('html', 'html-tables', parseBuffer, options)

/** Re-exports `parseBuffer` from `./parser` under the public name `htmlTableParser`. */
export const htmlTableParser = parseBuffer
Loading