diff --git a/enrich/summarize/README.MD b/enrich/summarize/README.MD new file mode 100644 index 000000000..2a0b5f95c --- /dev/null +++ b/enrich/summarize/README.MD @@ -0,0 +1,67 @@ + + +# @flatfile/plugin-enrich-summarize + +This plugin provides automatic text summarization capabilities for Flatfile using natural language processing. It uses the compromise library to generate summaries and extract key phrases from specified fields. + +**Event Type:** `commit:created` + +**Supported Field Types:** `string` + + + +## Features + +- Automatic text summarization +- Key phrase extraction +- Configurable summary length or percentage +- Custom field mapping for content, summary, and key phrases +- Error handling for missing content + +## Parameters + +#### `sheetSlug` - `string` - (required) +The slug of the sheet to apply summarization to. + +#### `contentField` - `string` - (required) +The field containing the full text content. + +#### `summaryField` - `string` - (required) +The field to store the generated summary. + +#### `keyPhrasesField` - `string` - (required) +The field to store extracted key phrases. + +#### `summaryLength` - `number` - (optional) +Number of sentences in the summary. Default is 2. + +#### `summaryPercentage` - `number` - (optional) +Percentage of the content's sentences to include in the summary (for example, `30` for 30%). When set, it takes precedence over `summaryLength`. 
+ +## Usage + +**install** +```bash +npm install @flatfile/plugin-enrich-summarize +``` + +**import** +```javascript +import { FlatfileListener } from "@flatfile/listener"; +import { summarize } from "@flatfile/plugin-enrich-summarize"; +``` + +**listener.js** +```javascript +const listener = new FlatfileListener(); + +listener.use( + summarize({ + sheetSlug: "articles", + contentField: "full_text", + summaryField: "summary", + keyPhrasesField: "key_phrases", + summaryLength: 3 + }) +); +``` diff --git a/enrich/summarize/jest.config.js b/enrich/summarize/jest.config.js new file mode 100644 index 000000000..e6d7ca40b --- /dev/null +++ b/enrich/summarize/jest.config.js @@ -0,0 +1,16 @@ +module.exports = { + testEnvironment: 'node', + + transform: { + '^.+\\.tsx?$': 'ts-jest', + }, + setupFiles: ['../../test/dotenv-config.js'], + setupFilesAfterEnv: [ + '../../test/betterConsoleLog.js', + '../../test/unit.cleanup.js', + ], + testTimeout: 60_000, + globalSetup: '../../test/setup-global.js', + forceExit: true, + passWithNoTests: true, +} diff --git a/enrich/summarize/metadata.json b/enrich/summarize/metadata.json new file mode 100644 index 000000000..f9f75be10 --- /dev/null +++ b/enrich/summarize/metadata.json @@ -0,0 +1,84 @@ +{ + "timestamp": "2024-09-24T07-23-50-639Z", + "task": "Develop a text summarizer Flatfile Listener plugin:\n - Create a RecordHook to summarize text fields\n - Use the 'compromise' npm package for natural language processing\n - Implement extractive summarization techniques\n - Allow configuration of summary length or percentage\n - Add summarized text to a new field in the record\n - Give the user reasonable config options to specify the Sheet Slug, the Field(s) that are the text(s), whether the summarization should be done automatically", + "summary": "This code implements a Flatfile Listener plugin for text summarization. 
It uses the compromise library for natural language processing and provides configurable options for automatic summarization of content in specified fields.", + "steps": [ + [ + "Retrieve information about Flatfile Listeners and RecordHook.\n", + "#E1", + "PineconeAssistant", + "Provide information about Flatfile Listeners and RecordHook, including their structure and usage", + "Plan: Retrieve information about Flatfile Listeners and RecordHook.\n#E1 = PineconeAssistant[Provide information about Flatfile Listeners and RecordHook, including their structure and usage]" + ], + [ + "Search for information about the 'compromise' npm package for natural language processing.\n", + "#E2", + "Google", + "compromise npm package natural language processing", + "Plan: Search for information about the 'compromise' npm package for natural language processing.\n#E2 = Google[compromise npm package natural language processing]" + ], + [ + "Create the basic structure of the Flatfile Listener plugin with RecordHook.\n", + "#E3", + "LLM", + "Create a basic structure for a Flatfile Listener plugin with RecordHook, using the information from #E1", + "Plan: Create the basic structure of the Flatfile Listener plugin with RecordHook.\n#E3 = LLM[Create a basic structure for a Flatfile Listener plugin with RecordHook, using the information from #E1]" + ], + [ + "Implement the text summarization logic using the 'compromise' package.\n", + "#E4", + "LLM", + "Implement extractive summarization techniques using the 'compromise' package based on the information from #E2 and integrate it into the Listener structure from #E3", + "Plan: Implement the text summarization logic using the 'compromise' package.\n#E4 = LLM[Implement extractive summarization techniques using the 'compromise' package based on the information from #E2 and integrate it into the Listener structure from #E3]" + ], + [ + "Add configuration options for summary length or percentage and field selection.\n", + "#E5", + "LLM", + 
"Extend the Listener code from #E4 to include configuration options for summary length or percentage, Sheet Slug, and field selection", + "Plan: Add configuration options for summary length or percentage and field selection.\n#E5 = LLM[Extend the Listener code from #E4 to include configuration options for summary length or percentage, Sheet Slug, and field selection]" + ], + [ + "Implement the logic to add the summarized text to a new field in the record.\n", + "#E6", + "LLM", + "Add code to the Listener from #E5 to create a new field in the record and populate it with the summarized text", + "Plan: Implement the logic to add the summarized text to a new field in the record.\n#E6 = LLM[Add code to the Listener from #E5 to create a new field in the record and populate it with the summarized text]" + ], + [ + "Add configuration option for automatic summarization.\n", + "#E7", + "LLM", + "Extend the Listener code from #E6 to include a configuration option for automatic summarization", + "Plan: Add configuration option for automatic summarization.\n#E7 = LLM[Extend the Listener code from #E6 to include a configuration option for automatic summarization]" + ], + [ + "Verify the Event Topics used in the Listener.\n", + "#E8", + "PineconeAssistant", + "Verify that the Event Topics used in the Listener are valid according to the event.topics.fact file", + "Plan: Verify the Event Topics used in the Listener.\n#E8 = PineconeAssistant[Verify that the Event Topics used in the Listener are valid according to the event.topics.fact file]" + ], + [ + "Finalize the Flatfile Listener plugin code.\n", + "#E9", + "LLM", + "Combine all the components from #E3, #E4, #E5, #E6, #E7, and #E8 into a complete Flatfile Listener plugin code, ensuring all imports are used and the code is valid", + "Plan: Finalize the Flatfile Listener plugin code.\n#E9 = LLM[Combine all the components from #E3, #E4, #E5, #E6, #E7, and #E8 into a complete Flatfile Listener plugin code, ensuring all imports are 
used and the code is valid]" + ], + [ + "Review and optimize the final code.\n", + "#E10", + "LLM", + "Review the code from #E9, remove any unused imports, validate plugin parameters, and ensure the code follows best practices for Flatfile Listener plugins", + "Plan: Review and optimize the final code.\n#E10 = LLM[Review the code from #E9, remove any unused imports, validate plugin parameters, and ensure the code follows best practices for Flatfile Listener plugins]" + ] + ], + "metrics": { + "tokens": { + "plan": 5077, + "state": 4675, + "total": 9752 + } + } +} \ No newline at end of file diff --git a/enrich/summarize/package.json b/enrich/summarize/package.json new file mode 100644 index 000000000..46ca8f354 --- /dev/null +++ b/enrich/summarize/package.json @@ -0,0 +1,65 @@ +{ + "name": "@flatfile/plugin-enrich-summarize", + "version": "0.0.0", + "description": "A Flatfile plugin for text summarization and key phrase extraction", + "registryMetadata": { + "category": "records" + }, + "engines": { + "node": ">= 16" + }, + "browser": { + "./dist/index.cjs": "./dist/index.browser.cjs", + "./dist/index.mjs": "./dist/index.browser.mjs" + }, + "exports": { + "types": "./dist/index.d.ts", + "node": { + "import": "./dist/index.mjs", + "require": "./dist/index.cjs" + }, + "browser": { + "require": "./dist/index.browser.cjs", + "import": "./dist/index.browser.mjs" + }, + "default": "./dist/index.mjs" + }, + "main": "./dist/index.cjs", + "module": "./dist/index.mjs", + "types": "./dist/index.d.ts", + "source": "./src/index.ts", + "files": [ + "dist/**" + ], + "scripts": { + "build": "rollup -c", + "build:watch": "rollup -c --watch", + "build:prod": "NODE_ENV=production rollup -c", + "check": "tsc ./**/*.ts --noEmit --esModuleInterop", + "test": "jest src/*.spec.ts --detectOpenHandles", + "test:unit": "jest src/*.spec.ts --testPathIgnorePatterns=.*\\.e2e\\.spec\\.ts$ --detectOpenHandles", + "test:e2e": "jest src/*.e2e.spec.ts --detectOpenHandles" + }, + "keywords": [ + 
"flatfile-plugins", + "category-enrich" + ], + "author": "Flatfile", + "repository": { + "type": "git", + "url": "https://github.com/FlatFilers/flatfile-plugins.git", + "directory": "enrich/summarize" + }, + "license": "ISC", + "dependencies": { + "@flatfile/plugin-record-hook": "^1.7.0", + "compromise": "^14.14.0" + }, + "peerDependencies": { + "@flatfile/api": "^1.4.13", + "@flatfile/listener": "^1.1.0" + }, + "devDependencies": { + "@flatfile/rollup-config": "^0.1.1" + } +} \ No newline at end of file diff --git a/enrich/summarize/rollup.config.mjs b/enrich/summarize/rollup.config.mjs new file mode 100644 index 000000000..fafa813c6 --- /dev/null +++ b/enrich/summarize/rollup.config.mjs @@ -0,0 +1,5 @@ +import { buildConfig } from '@flatfile/rollup-config' + +const config = buildConfig({}) + +export default config diff --git a/enrich/summarize/src/index.ts b/enrich/summarize/src/index.ts new file mode 100644 index 000000000..21f44d66f --- /dev/null +++ b/enrich/summarize/src/index.ts @@ -0,0 +1 @@ +export { summarize } from './summarize.plugin' diff --git a/enrich/summarize/src/summarize.plugin.spec.ts b/enrich/summarize/src/summarize.plugin.spec.ts new file mode 100644 index 000000000..1b004e09f --- /dev/null +++ b/enrich/summarize/src/summarize.plugin.spec.ts @@ -0,0 +1,50 @@ +import { extractKeyPhrases, generateSummary } from './summary.util' + +describe('Summary Utility Functions', () => { + describe('generateSummary()', () => { + it('should generate a summary with default length', () => { + const content = + 'This is a test sentence. This is another test sentence. And a third one for good measure.' + const summary = generateSummary(content) + expect(summary).toBe( + 'This is a test sentence. ... And a third one for good measure.' + ) + }) + + it('should generate a summary with specified length', () => { + const content = + 'First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.' 
+ const summary = generateSummary(content, { summaryLength: 3 }) + expect(summary).toBe('First sentence. ... Fifth sentence.') + }) + + it('should generate a summary with specified percentage', () => { + const content = + 'One. Two. Three. Four. Five. Six. Seven. Eight. Nine. Ten.' + const summary = generateSummary(content, { summaryPercentage: 30 }) + expect(summary).toBe('One. ... Ten.') + }) + + it('should handle content shorter than summary length', () => { + const content = 'Short content.' + const summary = generateSummary(content, { summaryLength: 5 }) + expect(summary).toBe(content) + }) + }) + + describe('extractKeyPhrases()', () => { + it('should extract key phrases from content', () => { + const content = 'The quick brown fox jumps over the lazy dog.' + const keyPhrases = extractKeyPhrases(content) + console.log('keyPhrases', keyPhrases) + expect(keyPhrases[0]).toContain('quick brown fox') + expect(keyPhrases[1]).toContain('lazy dog') + }) + + it('should handle content with no key phrases', () => { + const content = '0 1 2 3 4 5 6 7 8 9' + const keyPhrases = extractKeyPhrases(content) + expect(keyPhrases).toHaveLength(0) + }) + }) +}) diff --git a/enrich/summarize/src/summarize.plugin.ts b/enrich/summarize/src/summarize.plugin.ts new file mode 100644 index 000000000..f2592728d --- /dev/null +++ b/enrich/summarize/src/summarize.plugin.ts @@ -0,0 +1,39 @@ +import { type FlatfileRecord, recordHook } from '@flatfile/plugin-record-hook' +import { generateSummary, extractKeyPhrases } from './summary.util' + +interface SummarizationConfig { + sheetSlug: string + contentField: string + summaryField: string + keyPhrasesField: string + summaryLength?: number + summaryPercentage?: number +} + +export function summarize(config: SummarizationConfig) { + return recordHook(config.sheetSlug, (record: FlatfileRecord) => { + const content = record.get(config.contentField) as string + const existingSummary = record.get(config.summaryField) as string + + if (!content) { + 
/**
 * Builds an extractive summary by keeping sentences from the beginning and
 * the end of `content` and joining the two parts with " ... ".
 *
 * The target sentence count is `options.summaryLength` (default 2). When
 * `options.summaryPercentage` is set it takes precedence: the target becomes
 * that percentage of the total sentence count, rounded down and never less
 * than 1.
 *
 * Half of the target (rounded down) is taken from each end of the text, so an
 * odd target of 3 or more yields one sentence fewer than requested. A target
 * of 1 yields just the first sentence.
 *
 * @param content - Text to summarize.
 * @param options - Optional summary length / percentage settings.
 * @returns All sentences joined with single spaces when the content has no
 *   more sentences than the target; otherwise the shortened summary.
 */
export function generateSummary(
  content: string,
  options: SummaryOptions = {}
): string {
  const sentences: string[] = nlp(content).sentences().out('array')

  // Clamp to at least one sentence so negative or fractional lengths cannot
  // produce nonsensical slice bounds further down.
  let summaryLength = Math.max(1, options.summaryLength || 2)
  if (options.summaryPercentage) {
    summaryLength = Math.max(
      1,
      Math.floor((sentences.length * options.summaryPercentage) / 100)
    )
  }

  if (sentences.length <= summaryLength) {
    return sentences.join(' ')
  }

  const middleIndex = Math.floor(summaryLength / 2)
  if (middleIndex < 1) {
    // Target of a single sentence: there is no head/tail pair to build.
    // Without this guard `slice(-0)` is `slice(0)`, i.e. the whole array, and
    // the "summary" would be " ... " followed by the entire content.
    return sentences[0]
  }

  const firstPart = sentences.slice(0, middleIndex).join(' ')
  const lastPart = sentences.slice(-middleIndex).join(' ')
  return `${firstPart} ... ${lastPart}`
}
#Noun+').out('array') +} diff --git a/flatfilers/sandbox/src/index.ts b/flatfilers/sandbox/src/index.ts index f8d538aea..7a170cad0 100644 --- a/flatfilers/sandbox/src/index.ts +++ b/flatfilers/sandbox/src/index.ts @@ -1,14 +1,14 @@ import type { FlatfileListener } from '@flatfile/listener' +import { summarize } from '@flatfile/plugin-enrich-summarize' import { configureSpace } from '@flatfile/plugin-space-configure' -import { validateDate } from '@flatfile/plugin-validate-date' export default async function (listener: FlatfileListener) { listener.use( - validateDate({ - sheetSlug: 'contacts', - dateFields: ['dob', 'hire_date'], - outputFormat: 'MM/dd/yyyy', - includeTime: true, + summarize({ + sheetSlug: 'summarization', + contentField: 'content', + summaryField: 'summary', + keyPhrasesField: 'keyPhrases', }) ) listener.use( @@ -18,44 +18,23 @@ export default async function (listener: FlatfileListener) { name: 'Sandbox', sheets: [ { - name: 'Contacts', - slug: 'contacts', - allowAdditionalFields: true, + name: 'Summarization', + slug: 'summarization', fields: [ { - key: 'firstName', + key: 'content', type: 'string', - label: 'First Name', + label: 'Content', }, { - key: 'lastName', + key: 'summary', type: 'string', - label: 'Last Name', + label: 'Summary', }, { - key: 'email', + key: 'keyPhrases', type: 'string', - label: 'Email', - }, - { - key: 'phone', - type: 'string', - label: 'Phone', - }, - { - key: 'country', - type: 'string', - label: 'Country', - }, - { - key: 'dob', - type: 'string', - label: 'Date of Birth', - }, - { - key: 'hire_date', - type: 'string', - label: 'Hire Date', + label: 'Key Phrases', }, ], }, diff --git a/package-lock.json b/package-lock.json index 7d3c42608..9b0855032 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,6 +11,7 @@ "workspaces": [ "bundlers/*", "convert/*", + "enrich/*", "flatfilers/*", "plugins/*", "support/*", @@ -78,6 +79,25 @@ "@flatfile/listener": "^1.0.5" } }, + "enrich/summarize": { + "name": 
"@flatfile/plugin-enrich-summarize", + "version": "0.0.0", + "license": "ISC", + "dependencies": { + "@flatfile/plugin-record-hook": "^1.7.0", + "compromise": "^14.14.0" + }, + "devDependencies": { + "@flatfile/rollup-config": "^0.1.1" + }, + "engines": { + "node": ">= 16" + }, + "peerDependencies": { + "@flatfile/api": "^1.4.13", + "@flatfile/listener": "^1.1.0" + } + }, "flatfilers/playground": { "name": "@private/playground", "version": "0.0.0", @@ -3141,6 +3161,10 @@ "resolved": "plugins/dxp-configure", "link": true }, + "node_modules/@flatfile/plugin-enrich-summarize": { + "resolved": "enrich/summarize", + "link": true + }, "node_modules/@flatfile/plugin-export-workbook": { "resolved": "plugins/export-workbook", "link": true @@ -9265,8 +9289,7 @@ }, "node_modules/chrono-node": { "version": "2.7.7", - "resolved": "https://registry.npmjs.org/chrono-node/-/chrono-node-2.7.7.tgz", - "integrity": "sha512-p3S7gotuTPu5oqhRL2p1fLwQXGgdQaRTtWR3e8Di9P1Pa9mzkK5DWR5AWBieMUh2ZdOnPgrK+zCrbbtyuA+D/Q==", + "license": "MIT", "dependencies": { "dayjs": "^1.10.0" }, @@ -9478,6 +9501,19 @@ "license": "MIT", "peer": true }, + "node_modules/compromise": { + "version": "14.14.1", + "resolved": "https://registry.npmjs.org/compromise/-/compromise-14.14.1.tgz", + "integrity": "sha512-2N/wPyFwtDaskKorKMs2sB+KTcrkOUHSaKZkNQMo8hEcNJHLt8NdwNVsN14vnmsgjPOxctTOYt3DWVxXIwXXxA==", + "dependencies": { + "efrt": "2.7.0", + "grad-school": "0.0.5", + "suffix-thumb": "5.0.2" + }, + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/concat-map": { "version": "0.0.1", "license": "MIT" @@ -10387,6 +10423,14 @@ "version": "1.1.1", "license": "MIT" }, + "node_modules/efrt": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/efrt/-/efrt-2.7.0.tgz", + "integrity": "sha512-/RInbCy1d4P6Zdfa+TMVsf/ufZVotat5hCw3QXmWtjU+3pFEOvOQ7ibo3aIxyCJw2leIeAMjmPj+1SLJiCpdrQ==", + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/ejs": { "version": "3.1.10", "dev": true, @@ -11693,6 +11737,14 @@ 
"version": "4.2.11", "license": "ISC" }, + "node_modules/grad-school": { + "version": "0.0.5", + "resolved": "https://registry.npmjs.org/grad-school/-/grad-school-0.0.5.tgz", + "integrity": "sha512-rXunEHF9M9EkMydTBux7+IryYXEZinRk6g8OBOGDBzo/qWJjhTxy86i5q7lQYpCLHN8Sqv1XX3OIOc7ka2gtvQ==", + "engines": { + "node": ">=8.0.0" + } + }, "node_modules/grapheme-splitter": { "version": "1.0.4", "dev": true, @@ -18816,6 +18868,11 @@ "license": "MIT", "peer": true }, + "node_modules/suffix-thumb": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/suffix-thumb/-/suffix-thumb-5.0.2.tgz", + "integrity": "sha512-I5PWXAFKx3FYnI9a+dQMWNqTxoRt6vdBdb0O+BJ1sxXCWtSoQCusc13E58f+9p4MYx/qCnEMkD5jac6K2j3dgA==" + }, "node_modules/supports-color": { "version": "5.5.0", "license": "MIT", @@ -20941,8 +20998,7 @@ }, "validate/date/node_modules/date-fns": { "version": "4.1.0", - "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-4.1.0.tgz", - "integrity": "sha512-Ukq0owbQXxa/U3EGtsdVBkR1w7KOQ5gIBqdH2hkvknzZPYvBxb/aa6E8L7tmjFtkwZBu3UXBbjIgPo/Ez4xaNg==", + "license": "MIT", "funding": { "type": "github", "url": "https://github.com/sponsors/kossnocorp" diff --git a/package.json b/package.json index 77f95a101..305579ee8 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "workspaces": [ "bundlers/*", "convert/*", + "enrich/*", "flatfilers/*", "plugins/*", "support/*",