From 29191e47995e0c83c0ee733a4e7113a45d38f6d4 Mon Sep 17 00:00:00 2001 From: "Alex Rock (Koala)" Date: Tue, 24 Sep 2024 01:50:07 -0600 Subject: [PATCH 1/5] koala: initial commit --- validators/TextSummarizer/README.MD | 65 ++++++++++++++++ validators/TextSummarizer/metadata.json | 84 +++++++++++++++++++++ validators/TextSummarizer/package.json | 72 ++++++++++++++++++ validators/TextSummarizer/rollup.config.mjs | 44 +++++++++++ validators/TextSummarizer/src/index.ts | 83 ++++++++++++++++++++ 5 files changed, 348 insertions(+) create mode 100644 validators/TextSummarizer/README.MD create mode 100644 validators/TextSummarizer/metadata.json create mode 100644 validators/TextSummarizer/package.json create mode 100644 validators/TextSummarizer/rollup.config.mjs create mode 100644 validators/TextSummarizer/src/index.ts diff --git a/validators/TextSummarizer/README.MD b/validators/TextSummarizer/README.MD new file mode 100644 index 000000000..bfa481c97 --- /dev/null +++ b/validators/TextSummarizer/README.MD @@ -0,0 +1,65 @@ +# Flatfile Text Summarization Plugin + +This plugin for Flatfile provides automatic text summarization capabilities using natural language processing. It leverages the compromise library to generate summaries and extract key phrases from content in specified fields. + +## Features + +- Automatic summarization of text content +- Extraction of key phrases +- Configurable summary length or percentage +- Custom field mapping for content, summary, and key phrases +- Error handling for missing content + +## Installation + +To install the plugin, use npm: + +```bash +npm install @flatfile/plugin-text-summarization +``` + +## Example Usage + +```javascript +import { FlatfileListener } from "@flatfile/listener"; +import textSummarization from "@flatfile/plugin-text-summarization"; + +const listener = new FlatfileListener(); + +listener.use( + textSummarization({ + sheetSlug: "articles", + contentField: "full_text", + summaryField: "summary", + keyPhrasesField: "key_phrases", + summaryLength: 3, + autoSummarize: true + }) +); +``` + +## Configuration + +The plugin accepts a configuration object with the following properties: + +- `sheetSlug` (required): The slug of the sheet to apply the summarization to +- `contentField` (required): The field containing the full text content +- `summaryField` (required): The field where the generated summary will be stored +- `keyPhrasesField` (required): The field where extracted key phrases will be stored +- `summaryLength` (optional): The number of sentences to include in the summary (default: 2) +- `summaryPercentage` (optional): The percentage of original content to include in the summary +- `autoSummarize` (optional): Whether to automatically generate summaries (default: false) + +## Behavior + +1. The plugin checks for required configuration fields and throws an error if any are missing. +2. It processes each record in the specified sheet. +3. If `autoSummarize` is true and no existing summary is present: + - It generates a summary based on the specified length or percentage. + - If the content is shorter than the specified summary length, the entire content is used as the summary. + - For longer content, it combines sentences from the beginning and end of the text. + - Key phrases are extracted and stored in the specified field. +4. If the content field is empty, an error is added to the record. +5. The plugin uses a concurrency of 2 for processing records and includes debug information. + +Note: This plugin requires the `@flatfile/listener`, `@flatfile/plugin-record-hook`, and `compromise` packages as dependencies. \ No newline at end of file diff --git a/validators/TextSummarizer/metadata.json b/validators/TextSummarizer/metadata.json new file mode 100644 index 000000000..f9f75be10 --- /dev/null +++ b/validators/TextSummarizer/metadata.json @@ -0,0 +1,84 @@ +{ + "timestamp": "2024-09-24T07-23-50-639Z", + "task": "Develop a text summarizer Flatfile Listener plugin:\n - Create a RecordHook to summarize text fields\n - Use the 'compromise' npm package for natural language processing\n - Implement extractive summarization techniques\n - Allow configuration of summary length or percentage\n - Add summarized text to a new field in the record\n - Give the user reasonable config options to specify the Sheet Slug, the Field(s) that are the text(s), whether the summarization should be done automatically", + "summary": "This code implements a Flatfile Listener plugin for text summarization. It uses the compromise library for natural language processing and provides configurable options for automatic summarization of content in specified fields.", + "steps": [ + [ + "Retrieve information about Flatfile Listeners and RecordHook.\n", + "#E1", + "PineconeAssistant", + "Provide information about Flatfile Listeners and RecordHook, including their structure and usage", + "Plan: Retrieve information about Flatfile Listeners and RecordHook.\n#E1 = PineconeAssistant[Provide information about Flatfile Listeners and RecordHook, including their structure and usage]" + ], + [ + "Search for information about the 'compromise' npm package for natural language processing.\n", + "#E2", + "Google", + "compromise npm package natural language processing", + "Plan: Search for information about the 'compromise' npm package for natural language processing.\n#E2 = Google[compromise npm package natural language processing]" + ], + [ + "Create the basic structure of the Flatfile Listener plugin with RecordHook.\n", + "#E3", + "LLM", + "Create a basic structure for a Flatfile Listener plugin with RecordHook, using the information from #E1", + "Plan: Create the basic structure of the Flatfile Listener plugin with RecordHook.\n#E3 = LLM[Create a basic structure for a Flatfile Listener plugin with RecordHook, using the information from #E1]" + ], + [ + "Implement the text summarization logic using the 'compromise' package.\n", + "#E4", + "LLM", + "Implement extractive summarization techniques using the 'compromise' package based on the information from #E2 and integrate it into the Listener structure from #E3", + "Plan: Implement the text summarization logic using the 'compromise' package.\n#E4 = LLM[Implement extractive summarization techniques using the 'compromise' package based on the information from #E2 and integrate it into the Listener structure from #E3]" + ], + [ + "Add configuration options for summary length or percentage and field selection.\n", + "#E5", + "LLM", + "Extend the Listener code from #E4 to include configuration options for summary length or percentage, Sheet Slug, and field selection", + "Plan: Add configuration options for summary length or percentage and field selection.\n#E5 = LLM[Extend the Listener code from #E4 to include configuration options for summary length or percentage, Sheet Slug, and field selection]" + ], + [ + "Implement the logic to add the summarized text to a new field in the record.\n", + "#E6", + "LLM", + "Add code to the Listener from #E5 to create a new field in the record and populate it with the summarized text", + "Plan: Implement the logic to add the summarized text to a new field in the record.\n#E6 = LLM[Add code to the Listener from #E5 to create a new field in the record and populate it with the summarized text]" + ], + [ + "Add configuration option for automatic summarization.\n", + "#E7", + "LLM", + "Extend the Listener code from #E6 to include a configuration option for automatic summarization", + "Plan: Add configuration option for automatic summarization.\n#E7 = LLM[Extend the Listener code from #E6 to include a configuration option for automatic summarization]" + ], + [ + "Verify the Event Topics used in the Listener.\n", + "#E8", + "PineconeAssistant", + "Verify that the Event Topics used in the Listener are valid according to the event.topics.fact file", + "Plan: Verify the Event Topics used in the Listener.\n#E8 = PineconeAssistant[Verify that the Event Topics used in the Listener are valid according to the event.topics.fact file]" + ], + [ + "Finalize the Flatfile Listener plugin code.\n", + "#E9", + "LLM", + "Combine all the components from #E3, #E4, #E5, #E6, #E7, and #E8 into a complete Flatfile Listener plugin code, ensuring all imports are used and the code is valid", + "Plan: Finalize the Flatfile Listener plugin code.\n#E9 = LLM[Combine all the components from #E3, #E4, #E5, #E6, #E7, and #E8 into a complete Flatfile Listener plugin code, ensuring all imports are used and the code is valid]" + ], + [ + "Review and optimize the final code.\n", + "#E10", + "LLM", + "Review the code from #E9, remove any unused imports, validate plugin parameters, and ensure the code follows best practices for Flatfile Listener plugins", + "Plan: Review and optimize the final code.\n#E10 = LLM[Review the code from #E9, remove any unused imports, validate plugin parameters, and ensure the code follows best practices for Flatfile Listener plugins]" + ] + ], + "metrics": { + "tokens": { + "plan": 5077, + "state": 4675, + "total": 9752 + } + } +} \ No newline at end of file diff --git a/validators/TextSummarizer/package.json b/validators/TextSummarizer/package.json new file mode 100644 index 000000000..3c905af70 --- /dev/null +++ b/validators/TextSummarizer/package.json @@ -0,0 +1,72 @@ +{ + "name": "@flatfile/plugin-text-summarization", + "version": "1.0.0", + "description": "A Flatfile plugin for text summarization and key phrase extraction", + "main": "./dist/index.js", + "module": "./dist/index.mjs", + "types": "./dist/index.d.ts", + "browser": { + "./dist/index.js": "./dist/index.browser.js", + "./dist/index.mjs": "./dist/index.browser.mjs" + }, + "exports": { + "types": "./dist/index.d.ts", + "node": { + "import": "./dist/index.mjs", + "require": "./dist/index.js" + }, + "browser": { + "require": "./dist/index.browser.js", + "import": "./dist/index.browser.mjs" + }, + "default": "./dist/index.mjs" + }, + "source": "./src/index.ts", + "files": [ + "dist/**" + ], + "scripts": { + "build": "rollup -c", + "build:watch": "rollup -c --watch", + "build:prod": "NODE_ENV=production rollup -c", + "check": "tsc ./**/*.ts --noEmit --esModuleInterop", + "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" + }, + "keywords": [ + "flatfile", + "plugin", + "text summarization", + "key phrases", + "nlp", + "flatfile-plugins", + "category-transform" + ], + "author": "Your Name", + "license": "MIT", + "dependencies": { + "@flatfile/plugin-record-hook": "^1.6.1", + "compromise": "^14.14.0" + }, + "peerDependencies": { + "@flatfile/listener": "^1.0.5" + }, + "devDependencies": { + "@flatfile/hooks": "^1.5.0", + "@flatfile/rollup-config": "^0.1.1", + "@types/node": "^22.6.1", + "typescript": "^5.6.2", + "rollup": "^4.22.4", + "jest": "^29.7.0", + "@types/jest": "^29.5.13" + }, + "repository": { + "type": "git", + "url": "https://github.com/FlatFilers/flatfile-plugins.git", + "directory": "plugins/text-summarization" + }, + "browserslist": [ + "> 0.5%", + "last 2 versions", + "not dead" + ] +} \ No newline at end of file diff --git a/validators/TextSummarizer/rollup.config.mjs b/validators/TextSummarizer/rollup.config.mjs new file mode 100644 index 000000000..a0a22b94c --- /dev/null +++ b/validators/TextSummarizer/rollup.config.mjs @@ -0,0 +1,44 @@ +import { buildConfig } from '@flatfile/rollup-config'; +import typescript from '@rollup/plugin-typescript'; +import { nodeResolve } from '@rollup/plugin-node-resolve'; +import commonjs from '@rollup/plugin-commonjs'; + +const umdExternals = [ + '@flatfile/api', + '@flatfile/hooks', + '@flatfile/listener', + '@flatfile/util-common', + '@flatfile/plugin-record-hook', + 'compromise' +]; + +const config = buildConfig({ + includeUmd: true, + umdConfig: { name: 'FlatfileSummarizer', external: umdExternals }, + external: [ + ...umdExternals, + 'compromise' + ] +}); + +// Add TypeScript support to all configurations +config.forEach(conf => { + if (!conf.plugins) conf.plugins = []; + conf.plugins.unshift( + typescript({ tsconfig: './tsconfig.json' }), + nodeResolve({ preferBuiltins: true }), + commonjs() + ); +}); + +// Ensure proper handling of 'compromise' as an external dependency +config.forEach(conf => { + if (conf.output && conf.output.format === 'umd') { + conf.output.globals = { + ...conf.output.globals, + 'compromise': 'nlp' + }; + } +}); + +export default config; \ No newline at end of file diff --git a/validators/TextSummarizer/src/index.ts b/validators/TextSummarizer/src/index.ts new file mode 100644 index 000000000..015b32004 --- /dev/null +++ b/validators/TextSummarizer/src/index.ts @@ -0,0 +1,83 @@ +import { FlatfileListener } from '@flatfile/listener' +import { recordHook } from '@flatfile/plugin-record-hook' +import nlp from 'compromise' + +interface SummarizationConfig { + sheetSlug: string + contentField: string + summaryField: string + keyPhrasesField: string + summaryLength?: number + summaryPercentage?: number + autoSummarize?: boolean +} + +export default function ( + listener: FlatfileListener, + config: SummarizationConfig +) { + if ( + !config.sheetSlug || + !config.contentField || + !config.summaryField || + !config.keyPhrasesField + ) { + throw new Error('Missing required configuration fields') + } + + listener.use( + recordHook( + config.sheetSlug, + async (record, event) => { + const content = record.get(config.contentField) as string + const existingSummary = record.get(config.summaryField) as string + + if (!content) { + record.addError( + config.contentField, + 'Content is required for summarization' + ) + return record + } + + if (config.autoSummarize && !existingSummary) { + const doc = nlp(content) + const sentences = doc.sentences().out('array') + + let summaryLength = config.summaryLength || 2 + if (config.summaryPercentage) { + summaryLength = Math.max( + 1, + Math.floor((sentences.length * config.summaryPercentage) / 100) + ) + } + + let summary = '' + if (sentences.length > 0) { + if (sentences.length <= summaryLength) { + summary = sentences.join(' ') + } else { + const middleIndex = Math.floor(summaryLength / 2) + const firstPart = sentences.slice(0, middleIndex).join(' ') + const lastPart = sentences.slice(-middleIndex).join(' ') + summary = `${firstPart} ... ${lastPart}` + } + } + + record.set(config.summaryField, summary) + + const keyPhrases = doc + .match('#Noun+ (#Adjective|#Noun){0,2}') + .out('array') + record.set(config.keyPhrasesField, keyPhrases.join(', ')) + } + + return record + }, + { + concurrency: 2, + debug: true, + } + ) + ) +} From a8cdfa1755fa8eeee1db438a44f5dc33ce8d43b1 Mon Sep 17 00:00:00 2001 From: Carl Brugger Date: Mon, 30 Sep 2024 14:56:02 -0500 Subject: [PATCH 2/5] cleanup --- enrich/summarize/README.MD | 71 +++++++++++++++ .../summarize}/metadata.json | 0 .../summarize}/package.json | 63 ++++++------- enrich/summarize/rollup.config.mjs | 5 ++ enrich/summarize/src/index.ts | 60 +++++++++++++ .../src/text-summarizer.plugin.spec.ts | 89 +++++++++++++++++++ flatfilers/sandbox/src/index.ts | 49 +++------- package-lock.json | 64 ++++++++++++- package.json | 1 + validators/TextSummarizer/README.MD | 65 -------------- validators/TextSummarizer/rollup.config.mjs | 44 --------- validators/TextSummarizer/src/index.ts | 83 ----------------- 12 files changed, 327 insertions(+), 267 deletions(-) create mode 100644 enrich/summarize/README.MD rename {validators/TextSummarizer => enrich/summarize}/metadata.json (100%) rename {validators/TextSummarizer => enrich/summarize}/package.json (56%) create mode 100644 enrich/summarize/rollup.config.mjs create mode 100644 enrich/summarize/src/index.ts create mode 100644 enrich/summarize/src/text-summarizer.plugin.spec.ts delete mode 100644 validators/TextSummarizer/README.MD delete mode 100644 validators/TextSummarizer/rollup.config.mjs delete mode 100644 validators/TextSummarizer/src/index.ts diff --git a/enrich/summarize/README.MD b/enrich/summarize/README.MD new file mode 100644 index 000000000..9f92e3694 --- /dev/null +++ b/enrich/summarize/README.MD @@ -0,0 +1,71 @@ + + +# @flatfile/plugin-enrich-summarize + +This plugin provides automatic text summarization capabilities for Flatfile using natural language processing. It uses the compromise library to generate summaries and extract key phrases from specified fields. + +**Event Type:** `listener.on('commit:created')` + +**Supported Field Types:** `string` + + + +## Features + +- Automatic text summarization +- Key phrase extraction +- Configurable summary length or percentage +- Custom field mapping for content, summary, and key phrases +- Error handling for missing content + +## Parameters + +#### `sheetSlug` - `string` - (required) +The slug of the sheet to apply summarization. + +#### `contentField` - `string` - (required) +The field containing the full text content. + +#### `summaryField` - `string` - (required) +The field to store the generated summary. + +#### `keyPhrasesField` - `string` - (required) +The field to store extracted key phrases. + +#### `summaryLength` - `number` - (optional) +Number of sentences in the summary. Default is 2. + +#### `summaryPercentage` - `number` - (optional) +Percentage of content to include in summary. + +#### `autoSummarize` - `boolean` - (optional) +Automatically generate summaries. Default is false. + +## Usage + +**install** +```bash +npm install @flatfile/plugin-enrich-summarize +``` + +**import** +```javascript +import { FlatfileListener } from "@flatfile/listener"; +import { textSummarization } from "@flatfile/plugin-enrich-summarize"; +``` + +**listener.js** +```javascript +const listener = new FlatfileListener(); + +listener.use( + textSummarization({ + sheetSlug: "articles", + contentField: "full_text", + summaryField: "summary", + keyPhrasesField: "key_phrases", + summaryLength: 3, + autoSummarize: true + }) +); +``` diff --git a/validators/TextSummarizer/metadata.json b/enrich/summarize/metadata.json similarity index 100% rename from validators/TextSummarizer/metadata.json rename to enrich/summarize/metadata.json diff --git a/validators/TextSummarizer/package.json b/enrich/summarize/package.json similarity index 56% rename from validators/TextSummarizer/package.json rename to enrich/summarize/package.json index 3c905af70..ba7475835 100644 --- a/validators/TextSummarizer/package.json +++ b/enrich/summarize/package.json @@ -1,26 +1,32 @@ { - "name": "@flatfile/plugin-text-summarization", - "version": "1.0.0", + "name": "@flatfile/plugin-enrich-summarize", + "version": "0.0.0", "description": "A Flatfile plugin for text summarization and key phrase extraction", - "main": "./dist/index.js", - "module": "./dist/index.mjs", - "types": "./dist/index.d.ts", + "registryMetadata": { + "category": "records" + }, + "engines": { + "node": ">= 16" + }, "browser": { - "./dist/index.js": "./dist/index.browser.js", + "./dist/index.cjs": "./dist/index.browser.cjs", "./dist/index.mjs": "./dist/index.browser.mjs" }, "exports": { "types": "./dist/index.d.ts", "node": { "import": "./dist/index.mjs", - "require": "./dist/index.js" + "require": "./dist/index.cjs" }, "browser": { - "require": "./dist/index.browser.js", + "require": "./dist/index.browser.cjs", "import": "./dist/index.browser.mjs" }, "default": "./dist/index.mjs" }, + "main": "./dist/index.cjs", + "module": "./dist/index.mjs", + "types": "./dist/index.d.ts", "source": "./src/index.ts", "files": [ "dist/**" @@ -33,40 +39,25 @@ "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" }, "keywords": [ - "flatfile", - "plugin", - "text summarization", - "key phrases", - "nlp", "flatfile-plugins", - "category-transform" + "category-enrich" ], - "author": "Your Name", - "license": "MIT", + "author": "Flatfile", + "repository": { + "type": "git", + "url": "https://github.com/FlatFilers/flatfile-plugins.git", + "directory": "enrich/summarize" + }, + "license": "ISC", "dependencies": { - "@flatfile/plugin-record-hook": "^1.6.1", + "@flatfile/plugin-record-hook": "^1.7.0", "compromise": "^14.14.0" }, "peerDependencies": { - "@flatfile/listener": "^1.0.5" + "@flatfile/api": "^1.4.13", + "@flatfile/listener": "^1.1.0" }, "devDependencies": { - "@flatfile/hooks": "^1.5.0", - "@flatfile/rollup-config": "^0.1.1", - "@types/node": "^22.6.1", - "typescript": "^5.6.2", - "rollup": "^4.22.4", - "jest": "^29.7.0", - "@types/jest": "^29.5.13" - }, - "repository": { - "type": "git", - "url": "https://github.com/FlatFilers/flatfile-plugins.git", - "directory": "plugins/text-summarization" - }, - "browserslist": [ - "> 0.5%", - "last 2 versions", - "not dead" - ] + "@flatfile/rollup-config": "^0.1.1" + } } \ No newline at end of file diff --git a/enrich/summarize/rollup.config.mjs b/enrich/summarize/rollup.config.mjs new file mode 100644 index 000000000..fafa813c6 --- /dev/null +++ b/enrich/summarize/rollup.config.mjs @@ -0,0 +1,5 @@ +import { buildConfig } from '@flatfile/rollup-config' + +const config = buildConfig({}) + +export default config diff --git a/enrich/summarize/src/index.ts b/enrich/summarize/src/index.ts new file mode 100644 index 000000000..008f17bc6 --- /dev/null +++ b/enrich/summarize/src/index.ts @@ -0,0 +1,60 @@ +import { type FlatfileRecord, recordHook } from '@flatfile/plugin-record-hook' +import nlp from 'compromise' + +interface SummarizationConfig { + sheetSlug: string + contentField: string + summaryField: string + keyPhrasesField: string + summaryLength?: number + summaryPercentage?: number +} + +export function summarizationPlugin(config: SummarizationConfig) { + return recordHook(config.sheetSlug, (record: FlatfileRecord) => { + const content = record.get(config.contentField) as string + const existingSummary = record.get(config.summaryField) as string + + if (!content) { + record.addError( + config.contentField, + 'Content is required for summarization' + ) + return record + } + + if (!existingSummary) { + const doc = nlp(content) + const sentences = doc.sentences().out('array') + + let summaryLength = config.summaryLength || 2 + if (config.summaryPercentage) { + summaryLength = Math.max( + 1, + Math.floor((sentences.length * config.summaryPercentage) / 100) + ) + } + + let summary = '' + if (sentences.length > 0) { + if (sentences.length <= summaryLength) { + summary = sentences.join(' ') + } else { + const middleIndex = Math.floor(summaryLength / 2) + const firstPart = sentences.slice(0, middleIndex).join(' ') + const lastPart = sentences.slice(-middleIndex).join(' ') + summary = `${firstPart} ... ${lastPart}` + } + } + + record.set(config.summaryField, summary) + + const keyPhrases = doc + .match('#Noun+ (#Adjective|#Noun){0,2}') + .out('array') + record.set(config.keyPhrasesField, keyPhrases.join(', ')) + } + + return record + }) +} diff --git a/enrich/summarize/src/text-summarizer.plugin.spec.ts b/enrich/summarize/src/text-summarizer.plugin.spec.ts new file mode 100644 index 000000000..366deea38 --- /dev/null +++ b/enrich/summarize/src/text-summarizer.plugin.spec.ts @@ -0,0 +1,89 @@ +import { FlatfileClient } from '@flatfile/api' +import { + createRecords, + deleteSpace, + getRecords, + setupListener, + setupSimpleWorkbook, + setupSpace, +} from '@flatfile/utils-testing' +import { summarizationPlugin } from './index' + +const api = new FlatfileClient() + +describe('Text Summarization Plugin', () => { + const listener = setupListener() + let spaceId: string + let sheetId: string + + beforeAll(async () => { + const space = await setupSpace() + spaceId = space.id + const workbook = await setupSimpleWorkbook(space.id, [ + { key: 'content', type: 'string' }, + { key: 'summary', type: 'string' }, + { key: 'key_phrases', type: 'string' }, + ]) + sheetId = workbook.sheets![0].id + }) + + afterAll(async () => { + await deleteSpace(spaceId) + }) + + afterEach(async () => { + listener.reset() + const records = await getRecords(sheetId) + if (records.length > 0) { + const ids = records.map((record) => record.id) + await api.records.delete(sheetId, { ids }) + } + }) + + describe('summarizationPlugin', () => { + const mockConfig = { + sheetSlug: 'test', + contentField: 'content', + summaryField: 'summary', + keyPhrasesField: 'key_phrases', + summaryLength: 2, + } + + it('should add summary and key phrases to the record', async () => { + listener.use(summarizationPlugin(mockConfig)) + + await createRecords(sheetId, [ + { + content: + 'This is a test sentence. This is another test sentence. And a third one for good measure.', + }, + ]) + await listener.waitFor('commit:created') + + const records = await getRecords(sheetId) + + expect(records[0].values['summary'].value).toBeDefined() + expect(records[0].values['key_phrases'].value).toBeDefined() + expect( + (records[0].values['summary'].value as string).split('.').length + ).toBe(6) + }) + + it('should handle empty content fields', async () => { + listener.use(summarizationPlugin(mockConfig)) + + await createRecords(sheetId, [{ content: '' }]) + await listener.waitFor('commit:created') + + const records = await getRecords(sheetId) + + expect(records[0].values['summary'].value).toBeUndefined() + expect(records[0].values['key_phrases'].value).toBeUndefined() + expect(records[0].values['content'].messages).toContainEqual( + expect.objectContaining({ + message: 'Content is required for summarization', + }) + ) + }) + }) +}) diff --git a/flatfilers/sandbox/src/index.ts b/flatfilers/sandbox/src/index.ts index f8d538aea..5693b5382 100644 --- a/flatfilers/sandbox/src/index.ts +++ b/flatfilers/sandbox/src/index.ts @@ -1,14 +1,14 @@ import type { FlatfileListener } from '@flatfile/listener' +import { summarizationPlugin } from '@flatfile/plugin-enrich-summarize' import { configureSpace } from '@flatfile/plugin-space-configure' -import { validateDate } from '@flatfile/plugin-validate-date' export default async function (listener: FlatfileListener) { listener.use( - validateDate({ - sheetSlug: 'contacts', - dateFields: ['dob', 'hire_date'], - outputFormat: 'MM/dd/yyyy', - includeTime: true, + summarizationPlugin({ + sheetSlug: 'summarization', + contentField: 'content', + summaryField: 'summary', + keyPhrasesField: 'keyPhrases', }) ) listener.use( @@ -18,44 +18,23 @@ export default async function (listener: FlatfileListener) { name: 'Sandbox', sheets: [ { - name: 'Contacts', - slug: 'contacts', - allowAdditionalFields: true, + name: 'Summarization', + slug: 'summarization', fields: [ { - key: 'firstName', + key: 'content', type: 'string', - label: 'First Name', + label: 'Content', }, { - key: 'lastName', + key: 'summary', type: 'string', - label: 'Last Name', + label: 'Summary', }, { - key: 'email', + key: 'keyPhrases', type: 'string', - label: 'Email', - }, - { - key: 'phone', - type: 'string', - label: 'Phone', - }, - { - key: 'country', - type: 'string', - label: 'Country', - }, - { - key: 'dob', - type: 'string', - label: 'Date of Birth', - }, - { - key: 'hire_date', - type: 'string', - label: 'Hire Date', + label: 'Key Phrases', }, ], }, diff --git a/package-lock.json b/package-lock.json index 7d3c42608..9b0855032 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,6 +11,7 @@ "workspaces": [ "bundlers/*", "convert/*", + "enrich/*", "flatfilers/*", "plugins/*", "support/*", @@ -78,6 +79,25 @@ "@flatfile/listener": "^1.0.5" } }, + "enrich/summarize": { + "name": "@flatfile/plugin-enrich-summarize", + "version": "0.0.0", + "license": "ISC", + "dependencies": { + "@flatfile/plugin-record-hook": "^1.7.0", + "compromise": "^14.14.0" + }, + "devDependencies": { + "@flatfile/rollup-config": "^0.1.1" + }, + "engines": { + "node": ">= 16" + }, + "peerDependencies": { + "@flatfile/api": "^1.4.13", + "@flatfile/listener": "^1.1.0" + } + }, "flatfilers/playground": { "name": "@private/playground", "version": "0.0.0", @@ -3141,6 +3161,10 @@ "resolved": "plugins/dxp-configure", "link": true }, + "node_modules/@flatfile/plugin-enrich-summarize": { + "resolved": "enrich/summarize", + "link": true + }, "node_modules/@flatfile/plugin-export-workbook": { "resolved": "plugins/export-workbook", "link": true @@ -9265,8 +9289,7 @@ }, "node_modules/chrono-node": { "version": "2.7.7", - "resolved": "https://registry.npmjs.org/chrono-node/-/chrono-node-2.7.7.tgz", - "integrity": "sha512-p3S7gotuTPu5oqhRL2p1fLwQXGgdQaRTtWR3e8Di9P1Pa9mzkK5DWR5AWBieMUh2ZdOnPgrK+zCrbbtyuA+D/Q==", + "license": "MIT", "dependencies": { "dayjs": "^1.10.0" }, @@ -9478,6 +9501,19 @@ "license": "MIT", "peer": true }, + "node_modules/compromise": { + "version": "14.14.1", + "resolved": "https://registry.npmjs.org/compromise/-/compromise-14.14.1.tgz", + "integrity": "sha512-2N/wPyFwtDaskKorKMs2sB+KTcrkOUHSaKZkNQMo8hEcNJHLt8NdwNVsN14vnmsgjPOxctTOYt3DWVxXIwXXxA==", + "dependencies": { + "efrt": "2.7.0", + "grad-school": "0.0.5", + "suffix-thumb": "5.0.2" + }, + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/concat-map": { "version": "0.0.1", "license": "MIT" @@ -10387,6 +10423,14 @@ "version": "1.1.1", "license": "MIT" }, + "node_modules/efrt": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/efrt/-/efrt-2.7.0.tgz", + "integrity": "sha512-/RInbCy1d4P6Zdfa+TMVsf/ufZVotat5hCw3QXmWtjU+3pFEOvOQ7ibo3aIxyCJw2leIeAMjmPj+1SLJiCpdrQ==", + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/ejs": { "version": "3.1.10", "dev": true, @@ -11693,6 +11737,14 @@ "version": "4.2.11", "license": "ISC" }, + "node_modules/grad-school": { + "version": "0.0.5", + "resolved": "https://registry.npmjs.org/grad-school/-/grad-school-0.0.5.tgz", + "integrity": "sha512-rXunEHF9M9EkMydTBux7+IryYXEZinRk6g8OBOGDBzo/qWJjhTxy86i5q7lQYpCLHN8Sqv1XX3OIOc7ka2gtvQ==", + "engines": { + "node": ">=8.0.0" + } + }, "node_modules/grapheme-splitter": { "version": "1.0.4", "dev": true, @@ -18816,6 +18868,11 @@ "license": "MIT", "peer": true }, + "node_modules/suffix-thumb": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/suffix-thumb/-/suffix-thumb-5.0.2.tgz", + "integrity": "sha512-I5PWXAFKx3FYnI9a+dQMWNqTxoRt6vdBdb0O+BJ1sxXCWtSoQCusc13E58f+9p4MYx/qCnEMkD5jac6K2j3dgA==" + }, "node_modules/supports-color": { "version": "5.5.0", "license": "MIT", @@ -20941,8 +20998,7 @@ }, "validate/date/node_modules/date-fns": { "version": "4.1.0", - "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-4.1.0.tgz", - "integrity": "sha512-Ukq0owbQXxa/U3EGtsdVBkR1w7KOQ5gIBqdH2hkvknzZPYvBxb/aa6E8L7tmjFtkwZBu3UXBbjIgPo/Ez4xaNg==", + "license": "MIT", "funding": { "type": "github", "url": "https://github.com/sponsors/kossnocorp" diff --git a/package.json b/package.json index 77f95a101..305579ee8 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "workspaces": [ "bundlers/*", "convert/*", + "enrich/*", "flatfilers/*", "plugins/*", "support/*", diff --git a/validators/TextSummarizer/README.MD b/validators/TextSummarizer/README.MD deleted file mode 100644 index bfa481c97..000000000 --- a/validators/TextSummarizer/README.MD +++ /dev/null @@ -1,65 +0,0 @@ -# Flatfile Text Summarization Plugin - -This plugin for Flatfile provides automatic text summarization capabilities using natural language processing. It leverages the compromise library to generate summaries and extract key phrases from content in specified fields. - -## Features - -- Automatic summarization of text content -- Extraction of key phrases -- Configurable summary length or percentage -- Custom field mapping for content, summary, and key phrases -- Error handling for missing content - -## Installation - -To install the plugin, use npm: - -```bash -npm install @flatfile/plugin-text-summarization -``` - -## Example Usage - -```javascript -import { FlatfileListener } from "@flatfile/listener"; -import textSummarization from "@flatfile/plugin-text-summarization"; - -const listener = new FlatfileListener(); - -listener.use( - textSummarization({ - sheetSlug: "articles", - contentField: "full_text", - summaryField: "summary", - keyPhrasesField: "key_phrases", - summaryLength: 3, - autoSummarize: true - }) -); -``` - -## Configuration - -The plugin accepts a configuration object with the following properties: - -- `sheetSlug` (required): The slug of the sheet to apply the summarization to -- `contentField` (required): The field containing the full text content -- `summaryField` (required): The field where the generated summary will be stored -- `keyPhrasesField` (required): The field where extracted key phrases will be stored -- `summaryLength` (optional): The number of sentences to include in the summary (default: 2) -- `summaryPercentage` (optional): The percentage of original content to include in the summary -- `autoSummarize` (optional): Whether to automatically generate summaries (default: false) - -## Behavior - -1. The plugin checks for required configuration fields and throws an error if any are missing. -2. It processes each record in the specified sheet. -3. If `autoSummarize` is true and no existing summary is present: - - It generates a summary based on the specified length or percentage. - - If the content is shorter than the specified summary length, the entire content is used as the summary. - - For longer content, it combines sentences from the beginning and end of the text. - - Key phrases are extracted and stored in the specified field. -4. If the content field is empty, an error is added to the record. -5. The plugin uses a concurrency of 2 for processing records and includes debug information. - -Note: This plugin requires the `@flatfile/listener`, `@flatfile/plugin-record-hook`, and `compromise` packages as dependencies. \ No newline at end of file diff --git a/validators/TextSummarizer/rollup.config.mjs b/validators/TextSummarizer/rollup.config.mjs deleted file mode 100644 index a0a22b94c..000000000 --- a/validators/TextSummarizer/rollup.config.mjs +++ /dev/null @@ -1,44 +0,0 @@ -import { buildConfig } from '@flatfile/rollup-config'; -import typescript from '@rollup/plugin-typescript'; -import { nodeResolve } from '@rollup/plugin-node-resolve'; -import commonjs from '@rollup/plugin-commonjs'; - -const umdExternals = [ - '@flatfile/api', - '@flatfile/hooks', - '@flatfile/listener', - '@flatfile/util-common', - '@flatfile/plugin-record-hook', - 'compromise' -]; - -const config = buildConfig({ - includeUmd: true, - umdConfig: { name: 'FlatfileSummarizer', external: umdExternals }, - external: [ - ...umdExternals, - 'compromise' - ] -}); - -// Add TypeScript support to all configurations -config.forEach(conf => { - if (!conf.plugins) conf.plugins = []; - conf.plugins.unshift( - typescript({ tsconfig: './tsconfig.json' }), - nodeResolve({ preferBuiltins: true }), - commonjs() - ); -}); - -// Ensure proper handling of 'compromise' as an external dependency -config.forEach(conf => { - if (conf.output && conf.output.format === 'umd') { - conf.output.globals = { - ...conf.output.globals, - 'compromise': 'nlp' - }; - } -}); - -export default config; \ No newline at end of file diff --git a/validators/TextSummarizer/src/index.ts b/validators/TextSummarizer/src/index.ts deleted file mode 100644 index 015b32004..000000000 --- a/validators/TextSummarizer/src/index.ts +++ /dev/null @@ -1,83 +0,0 @@ -import { FlatfileListener } from '@flatfile/listener' -import { recordHook } from '@flatfile/plugin-record-hook' -import nlp from 'compromise' - -interface SummarizationConfig { - sheetSlug: string - contentField: string - summaryField: string - keyPhrasesField: string - summaryLength?: number - summaryPercentage?: number - autoSummarize?: boolean -} - -export default function ( - listener: FlatfileListener, - config: SummarizationConfig -) { - if ( - !config.sheetSlug || - !config.contentField || - !config.summaryField || - !config.keyPhrasesField - ) { - throw new Error('Missing required configuration fields') - } - - listener.use( - recordHook( - config.sheetSlug, - async (record, event) => { - const content = record.get(config.contentField) as string - const existingSummary = record.get(config.summaryField) as string - - if (!content) { - record.addError( - config.contentField, - 'Content is required for summarization' - ) - return record - } - - if (config.autoSummarize && !existingSummary) { - const doc = nlp(content) - const sentences = doc.sentences().out('array') - - let summaryLength = config.summaryLength || 2 - if (config.summaryPercentage) { - summaryLength = Math.max( - 1, - Math.floor((sentences.length * config.summaryPercentage) / 100) - ) - } - - let summary = '' - if (sentences.length > 0) { - if (sentences.length <= summaryLength) { - summary = sentences.join(' ') - } else { - const middleIndex = Math.floor(summaryLength / 2) - const firstPart = sentences.slice(0, middleIndex).join(' ') - const lastPart = sentences.slice(-middleIndex).join(' ') - summary = `${firstPart} ... ${lastPart}` - } - } - - record.set(config.summaryField, summary) - - const keyPhrases = doc - .match('#Noun+ (#Adjective|#Noun){0,2}') - .out('array') - record.set(config.keyPhrasesField, keyPhrases.join(', ')) - } - - return record - }, - { - concurrency: 2, - debug: true, - } - ) - ) -} From 0050fcd97b78658f23b196f1740d738151302bac Mon Sep 17 00:00:00 2001 From: Carl Brugger Date: Wed, 9 Oct 2024 13:04:35 -0400 Subject: [PATCH 3/5] cleanup --- enrich/summarize/src/index.ts | 61 +------------------ ...lugin.spec.ts => summarize.plugin.spec.ts} | 8 +-- enrich/summarize/src/summarize.plugin.ts | 60 ++++++++++++++++++ flatfilers/sandbox/src/index.ts | 4 +- 4 files changed, 67 insertions(+), 66 deletions(-) rename enrich/summarize/src/{text-summarizer.plugin.spec.ts => summarize.plugin.spec.ts} (92%) create mode 100644 enrich/summarize/src/summarize.plugin.ts diff --git a/enrich/summarize/src/index.ts b/enrich/summarize/src/index.ts index 008f17bc6..21f44d66f 100644 --- a/enrich/summarize/src/index.ts +++ b/enrich/summarize/src/index.ts @@ -1,60 +1 @@ -import { type FlatfileRecord, recordHook } from '@flatfile/plugin-record-hook' -import nlp from 'compromise' - -interface SummarizationConfig { - sheetSlug: string - contentField: string - summaryField: string - keyPhrasesField: string - summaryLength?: number - summaryPercentage?: number -} - -export function summarizationPlugin(config: SummarizationConfig) { - return recordHook(config.sheetSlug, (record: FlatfileRecord) => { - const content = record.get(config.contentField) as string - const existingSummary = record.get(config.summaryField) as string - - if (!content) { - record.addError( - config.contentField, - 'Content is required for summarization' - ) - return record - } - - if (!existingSummary) { - const doc = nlp(content) - const sentences = doc.sentences().out('array') - - let summaryLength = config.summaryLength || 2 - if (config.summaryPercentage) { - summaryLength = Math.max( - 1, - Math.floor((sentences.length * config.summaryPercentage) / 100) - ) - } - - let summary = '' - if (sentences.length > 0) { - if (sentences.length <= summaryLength) { - summary = sentences.join(' ') - } else { - const middleIndex = Math.floor(summaryLength / 2) - const firstPart = sentences.slice(0, middleIndex).join(' ') - const lastPart = sentences.slice(-middleIndex).join(' ') - summary = `${firstPart} ... ${lastPart}` - } - } - - record.set(config.summaryField, summary) - - const keyPhrases = doc - .match('#Noun+ (#Adjective|#Noun){0,2}') - .out('array') - record.set(config.keyPhrasesField, keyPhrases.join(', ')) - } - - return record - }) -} +export { summarize } from './summarize.plugin' diff --git a/enrich/summarize/src/text-summarizer.plugin.spec.ts b/enrich/summarize/src/summarize.plugin.spec.ts similarity index 92% rename from enrich/summarize/src/text-summarizer.plugin.spec.ts rename to enrich/summarize/src/summarize.plugin.spec.ts index 366deea38..a1eb333b4 100644 --- a/enrich/summarize/src/text-summarizer.plugin.spec.ts +++ b/enrich/summarize/src/summarize.plugin.spec.ts @@ -7,7 +7,7 @@ import { setupSimpleWorkbook, setupSpace, } from '@flatfile/utils-testing' -import { summarizationPlugin } from './index' +import { summarize } from './summarize.plugin' const api = new FlatfileClient() @@ -40,7 +40,7 @@ describe('Text Summarization Plugin', () => { } }) - describe('summarizationPlugin', () => { + describe('summarize()', () => { const mockConfig = { sheetSlug: 'test', contentField: 'content', @@ -50,7 +50,7 @@ describe('Text Summarization Plugin', () => { } it('should add summary and key phrases to the record', async () => { - listener.use(summarizationPlugin(mockConfig)) + listener.use(summarize(mockConfig)) await createRecords(sheetId, [ { @@ -70,7 +70,7 @@ describe('Text Summarization Plugin', () => { }) it('should handle empty content fields', async () => { - listener.use(summarizationPlugin(mockConfig)) + listener.use(summarize(mockConfig)) await createRecords(sheetId, [{ content: '' }]) await listener.waitFor('commit:created') diff --git a/enrich/summarize/src/summarize.plugin.ts b/enrich/summarize/src/summarize.plugin.ts new file mode 100644 index 000000000..9e6df731b --- /dev/null +++ b/enrich/summarize/src/summarize.plugin.ts @@ -0,0 +1,60 @@ +import { type FlatfileRecord, recordHook } from '@flatfile/plugin-record-hook' +import nlp from 'compromise' + +interface SummarizationConfig { + sheetSlug: string + contentField: string + summaryField: string + keyPhrasesField: string + summaryLength?: number + summaryPercentage?: number +} + +export function summarize(config: SummarizationConfig) { + return recordHook(config.sheetSlug, (record: FlatfileRecord) => { + const content = record.get(config.contentField) as string + const existingSummary = record.get(config.summaryField) as string + + if (!content) { + record.addError( + config.contentField, + 'Content is required for summarization' + ) + return record + } + + if (!existingSummary) { + const doc = nlp(content) + const sentences = doc.sentences().out('array') + + let summaryLength = config.summaryLength || 2 + if (config.summaryPercentage) { + summaryLength = Math.max( + 1, + Math.floor((sentences.length * config.summaryPercentage) / 100) + ) + } + + let summary = '' + if (sentences.length > 0) { + if (sentences.length <= summaryLength) { + summary = sentences.join(' ') + } else { + const middleIndex = Math.floor(summaryLength / 2) + const firstPart = sentences.slice(0, middleIndex).join(' ') + const lastPart = sentences.slice(-middleIndex).join(' ') + summary = `${firstPart} ... ${lastPart}` + } + } + + record.set(config.summaryField, summary) + + const keyPhrases = doc + .match('#Noun+ (#Adjective|#Noun){0,2}') + .out('array') + record.set(config.keyPhrasesField, keyPhrases.join(', ')) + } + + return record + }) +} diff --git a/flatfilers/sandbox/src/index.ts b/flatfilers/sandbox/src/index.ts index 5693b5382..7a170cad0 100644 --- a/flatfilers/sandbox/src/index.ts +++ b/flatfilers/sandbox/src/index.ts @@ -1,10 +1,10 @@ import type { FlatfileListener } from '@flatfile/listener' -import { summarizationPlugin } from '@flatfile/plugin-enrich-summarize' +import { summarize } from '@flatfile/plugin-enrich-summarize' import { configureSpace } from '@flatfile/plugin-space-configure' export default async function (listener: FlatfileListener) { listener.use( - summarizationPlugin({ + summarize({ sheetSlug: 'summarization', contentField: 'content', summaryField: 'summary', From 9417e6dceaed044918ac66522f44e371cd30dfac Mon Sep 17 00:00:00 2001 From: Carl Brugger Date: Wed, 9 Oct 2024 13:08:27 -0400 Subject: [PATCH 4/5] Update README.MD --- enrich/summarize/README.MD | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/enrich/summarize/README.MD b/enrich/summarize/README.MD index 9f92e3694..2a0b5f95c 100644 --- a/enrich/summarize/README.MD +++ b/enrich/summarize/README.MD @@ -4,7 +4,7 @@ This plugin provides automatic text summarization capabilities for Flatfile using natural language processing. It uses the compromise library to generate summaries and extract key phrases from specified fields. -**Event Type:** `listener.on('commit:created')` +**Event Type:** `commit:created` **Supported Field Types:** `string` @@ -38,9 +38,6 @@ Number of sentences in the summary. Default is 2. #### `summaryPercentage` - `number` - (optional) Percentage of content to include in summary. -#### `autoSummarize` - `boolean` - (optional) -Automatically generate summaries. Default is false. - ## Usage **install** @@ -51,7 +48,7 @@ npm install @flatfile/plugin-enrich-summarize **import** ```javascript import { FlatfileListener } from "@flatfile/listener"; -import { textSummarization } from "@flatfile/plugin-enrich-summarize"; +import { summarize } from "@flatfile/plugin-enrich-summarize"; ``` **listener.js** @@ -59,13 +56,12 @@ import { textSummarization } from "@flatfile/plugin-enrich-summarize"; const listener = new FlatfileListener(); listener.use( - textSummarization({ + summarize({ sheetSlug: "articles", contentField: "full_text", summaryField: "summary", keyPhrasesField: "key_phrases", - summaryLength: 3, - autoSummarize: true + summaryLength: 3 }) ); ``` From 3d3d981e374471e77ed35b890b059f3de9a66188 Mon Sep 17 00:00:00 2001 From: Carl Brugger Date: Wed, 9 Oct 2024 14:51:32 -0400 Subject: [PATCH 5/5] cleanup --- enrich/summarize/jest.config.js | 16 +++ enrich/summarize/package.json | 4 +- enrich/summarize/src/summarize.plugin.spec.ts | 119 ++++++------------ enrich/summarize/src/summarize.plugin.ts | 33 +---- enrich/summarize/src/summary.util.ts | 41 ++++++ 5 files changed, 106 insertions(+), 107 deletions(-) create mode 100644 enrich/summarize/jest.config.js create mode 100644 enrich/summarize/src/summary.util.ts diff --git a/enrich/summarize/jest.config.js b/enrich/summarize/jest.config.js new file mode 100644 index 000000000..e6d7ca40b --- /dev/null +++ b/enrich/summarize/jest.config.js @@ -0,0 +1,16 @@ +module.exports = { + testEnvironment: 'node', + + transform: { + '^.+\\.tsx?$': 'ts-jest', + }, + setupFiles: ['../../test/dotenv-config.js'], + setupFilesAfterEnv: [ + '../../test/betterConsoleLog.js', + '../../test/unit.cleanup.js', + ], + testTimeout: 60_000, + globalSetup: '../../test/setup-global.js', + forceExit: true, + passWithNoTests: true, +} diff --git a/enrich/summarize/package.json b/enrich/summarize/package.json index ba7475835..46ca8f354 100644 --- a/enrich/summarize/package.json +++ b/enrich/summarize/package.json @@ -36,7 +36,9 @@ "build:watch": "rollup -c --watch", "build:prod": "NODE_ENV=production rollup -c", "check": "tsc ./**/*.ts --noEmit --esModuleInterop", - "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" + "test": "jest src/*.spec.ts --detectOpenHandles", + "test:unit": "jest src/*.spec.ts --testPathIgnorePatterns=.*\\.e2e\\.spec\\.ts$ --detectOpenHandles", + "test:e2e": "jest src/*.e2e.spec.ts --detectOpenHandles" }, "keywords": [ "flatfile-plugins", diff --git a/enrich/summarize/src/summarize.plugin.spec.ts b/enrich/summarize/src/summarize.plugin.spec.ts index a1eb333b4..1b004e09f 100644 --- a/enrich/summarize/src/summarize.plugin.spec.ts +++ b/enrich/summarize/src/summarize.plugin.spec.ts @@ -1,89 +1,50 @@ -import { FlatfileClient } from '@flatfile/api' -import { - createRecords, - deleteSpace, - getRecords, - setupListener, - setupSimpleWorkbook, - setupSpace, -} from '@flatfile/utils-testing' -import { summarize } from './summarize.plugin' - -const api = new FlatfileClient() - -describe('Text Summarization Plugin', () => { - const listener = setupListener() - let spaceId: string - let sheetId: string +import { extractKeyPhrases, generateSummary } from './summary.util' + +describe('Summary Utility Functions', () => { + describe('generateSummary()', () => { + it('should generate a summary with default length', () => { + const content = + 'This is a test sentence. This is another test sentence. And a third one for good measure.' + const summary = generateSummary(content) + expect(summary).toBe( + 'This is a test sentence. ... And a third one for good measure.' + ) + }) - beforeAll(async () => { - const space = await setupSpace() - spaceId = space.id - const workbook = await setupSimpleWorkbook(space.id, [ - { key: 'content', type: 'string' }, - { key: 'summary', type: 'string' }, - { key: 'key_phrases', type: 'string' }, - ]) - sheetId = workbook.sheets![0].id - }) + it('should generate a summary with specified length', () => { + const content = + 'First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.' + const summary = generateSummary(content, { summaryLength: 3 }) + expect(summary).toBe('First sentence. ... Fifth sentence.') + }) - afterAll(async () => { - await deleteSpace(spaceId) - }) + it('should generate a summary with specified percentage', () => { + const content = + 'One. Two. Three. Four. Five. Six. Seven. Eight. Nine. Ten.' + const summary = generateSummary(content, { summaryPercentage: 30 }) + expect(summary).toBe('One. ... Ten.') + }) - afterEach(async () => { - listener.reset() - const records = await getRecords(sheetId) - if (records.length > 0) { - const ids = records.map((record) => record.id) - await api.records.delete(sheetId, { ids }) - } + it('should handle content shorter than summary length', () => { + const content = 'Short content.' + const summary = generateSummary(content, { summaryLength: 5 }) + expect(summary).toBe(content) + }) }) - describe('summarize()', () => { - const mockConfig = { - sheetSlug: 'test', - contentField: 'content', - summaryField: 'summary', - keyPhrasesField: 'key_phrases', - summaryLength: 2, - } - - it('should add summary and key phrases to the record', async () => { - listener.use(summarize(mockConfig)) - - await createRecords(sheetId, [ - { - content: - 'This is a test sentence. This is another test sentence. And a third one for good measure.', - }, - ]) - await listener.waitFor('commit:created') - - const records = await getRecords(sheetId) - - expect(records[0].values['summary'].value).toBeDefined() - expect(records[0].values['key_phrases'].value).toBeDefined() - expect( - (records[0].values['summary'].value as string).split('.').length - ).toBe(6) + describe('extractKeyPhrases()', () => { + it('should extract key phrases from content', () => { + const content = 'The quick brown fox jumps over the lazy dog.' + const keyPhrases = extractKeyPhrases(content) + console.log('keyPhrases', keyPhrases) + expect(keyPhrases[0]).toContain('quick brown fox') + expect(keyPhrases[1]).toContain('lazy dog') }) - it('should handle empty content fields', async () => { - listener.use(summarize(mockConfig)) - - await createRecords(sheetId, [{ content: '' }]) - await listener.waitFor('commit:created') - - const records = await getRecords(sheetId) - - expect(records[0].values['summary'].value).toBeUndefined() - expect(records[0].values['key_phrases'].value).toBeUndefined() - expect(records[0].values['content'].messages).toContainEqual( - expect.objectContaining({ - message: 'Content is required for summarization', - }) - ) + it('should handle content with no key phrases', () => { + const content = '0 1 2 3 4 5 6 7 8 9' + const keyPhrases = extractKeyPhrases(content) + expect(keyPhrases).toHaveLength(0) }) }) }) diff --git a/enrich/summarize/src/summarize.plugin.ts b/enrich/summarize/src/summarize.plugin.ts index 9e6df731b..f2592728d 100644 --- a/enrich/summarize/src/summarize.plugin.ts +++ b/enrich/summarize/src/summarize.plugin.ts @@ -1,5 +1,5 @@ import { type FlatfileRecord, recordHook } from '@flatfile/plugin-record-hook' -import nlp from 'compromise' +import { generateSummary, extractKeyPhrases } from './summary.util' interface SummarizationConfig { sheetSlug: string @@ -24,34 +24,13 @@ export function summarize(config: SummarizationConfig) { } if (!existingSummary) { - const doc = nlp(content) - const sentences = doc.sentences().out('array') - - let summaryLength = config.summaryLength || 2 - if (config.summaryPercentage) { - summaryLength = Math.max( - 1, - Math.floor((sentences.length * config.summaryPercentage) / 100) - ) - } - - let summary = '' - if (sentences.length > 0) { - if (sentences.length <= summaryLength) { - summary = sentences.join(' ') - } else { - const middleIndex = Math.floor(summaryLength / 2) - const firstPart = sentences.slice(0, middleIndex).join(' ') - const lastPart = sentences.slice(-middleIndex).join(' ') - summary = `${firstPart} ... ${lastPart}` - } - } - + const summary = generateSummary(content, { + summaryLength: config.summaryLength, + summaryPercentage: config.summaryPercentage + }) record.set(config.summaryField, summary) - const keyPhrases = doc - .match('#Noun+ (#Adjective|#Noun){0,2}') - .out('array') + const keyPhrases = extractKeyPhrases(content) record.set(config.keyPhrasesField, keyPhrases.join(', ')) } diff --git a/enrich/summarize/src/summary.util.ts b/enrich/summarize/src/summary.util.ts new file mode 100644 index 000000000..e91161b05 --- /dev/null +++ b/enrich/summarize/src/summary.util.ts @@ -0,0 +1,41 @@ +import nlp from 'compromise' + +export interface SummaryOptions { + summaryLength?: number + summaryPercentage?: number +} + +export function generateSummary( + content: string, + options: SummaryOptions = {} +): string { + const doc = nlp(content) + const sentences = doc.sentences().out('array') + + let summaryLength = options.summaryLength || 2 + if (options.summaryPercentage) { + summaryLength = Math.max( + 1, + Math.floor((sentences.length * options.summaryPercentage) / 100) + ) + } + + if (sentences.length <= summaryLength) { + return sentences.join(' ') + } + + const middleIndex = Math.floor(summaryLength / 2) + const firstPart = sentences.slice(0, middleIndex).join(' ') + const lastPart = sentences.slice(-middleIndex).join(' ') + return `${firstPart} ... ${lastPart}` +} + +export function extractKeyPhrases(content: string): string[] { + const doc = nlp(content) + // This line extracts key phrases from the content using compromise (nlp) + // It matches patterns of up to two optional adjectives followed by one or more nouns + // '#Adjective? #Adjective?' allows matching for up to two optional adjectives + // '#Noun+' matches one or more nouns + // The 'out('array')' method returns the matches as an array of strings + return doc.match('#Adjective? #Adjective? #Noun+').out('array') +}