From fea574149c34306a1e79917b7276c142c5cd0d5c Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Thu, 3 Apr 2025 15:38:37 +0100 Subject: [PATCH 1/6] feat: text splitter --- .../core/generate-text/smooth-stream.test.ts | 139 +++++++++++++++++- .../ai/core/generate-text/smooth-stream.ts | 44 ++++-- .../ai/core/generate-text/text-splitter.ts | 109 ++++++++++++++ 3 files changed, 269 insertions(+), 23 deletions(-) create mode 100644 packages/ai/core/generate-text/text-splitter.ts diff --git a/packages/ai/core/generate-text/smooth-stream.test.ts b/packages/ai/core/generate-text/smooth-stream.test.ts index 6b01f7565ab8..f0727552a487 100644 --- a/packages/ai/core/generate-text/smooth-stream.test.ts +++ b/packages/ai/core/generate-text/smooth-stream.test.ts @@ -46,6 +46,7 @@ describe('smoothStream', () => { textDelta: 'Hello, ', type: 'text-delta', }, + 'delay 10', { textDelta: 'world!', type: 'text-delta', @@ -107,6 +108,7 @@ describe('smoothStream', () => { textDelta: 'example ', type: 'text-delta', }, + 'delay 10', { textDelta: 'text.', type: 'text-delta', @@ -146,14 +148,14 @@ describe('smoothStream', () => { }, 'delay 10', { - textDelta: 'line \n\n', + textDelta: 'line \n\n ', type: 'text-delta', }, 'delay 10', { - // note: leading whitespace is included here - // because it is part of the new chunk: - textDelta: ' Multiple ', + // note: leading whitespace not included here + // because it is part of the last chunk: + textDelta: 'Multiple ', type: 'text-delta', }, 'delay 10', @@ -161,6 +163,7 @@ describe('smoothStream', () => { textDelta: 'spaces\n ', type: 'text-delta', }, + 'delay 10', { textDelta: 'Indented', type: 'text-delta', @@ -173,6 +176,128 @@ describe('smoothStream', () => { }, ]); }); + + it('should support kanji', async () => { + const stream = convertArrayToReadableStream([ + { textDelta: 'Vercel', type: 'text-delta' }, + { textDelta: 'はサ', type: 'text-delta' }, + { textDelta: 'サーバーレス', type: 'text-delta' }, + { textDelta: 'の', type: 'text-delta' }, + { textDelta: 'フロントエンド', type: 'text-delta' }, + { textDelta: 'Hello, world!', type: 'text-delta' }, + { type: 'step-finish' }, + { type: 'finish' }, + ]).pipeThrough( + smoothStream({ + delayInMs: 10, + _internal: { delay }, + })({ tools: {} }), + ); + + await consumeStream(stream); + + expect(events).toMatchInlineSnapshot(` + [ + "delay 10", + { + "textDelta": "Vercelは", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "サ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "サ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ー", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "バ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ー", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "レ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ス", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "の", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "フ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ロ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ン", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ト", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "エ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ン", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "ド", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "Hello, ", + "type": "text-delta", + }, + "delay 10", + { + "textDelta": "world!", + "type": "text-delta", + }, + { + "type": "step-finish", + }, + { + "type": "finish", + }, + ] + `) + + }) }); describe('line chunking', () => { @@ -290,9 +415,7 @@ describe('smoothStream', () => { 'delay 10', { textDelta: 'o', type: 'text-delta' }, 'delay 10', - { textDelta: ',', type: 'text-delta' }, - 'delay 10', - { textDelta: ' ', type: 'text-delta' }, + { textDelta: ', ', type: 'text-delta' }, 'delay 10', { textDelta: 'w', type: 'text-delta' }, 'delay 10', @@ -331,6 +454,7 @@ describe('smoothStream', () => { textDelta: 'Hello, ', type: 'text-delta', }, + 'delay 10', { textDelta: 'world!', type: 'text-delta', @@ -364,6 +488,7 @@ describe('smoothStream', () => { textDelta: 'Hello, ', type: 'text-delta', }, + 'delay 20', { textDelta: 'world!', type: 'text-delta', diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts index 2f0ef085aee4..6b452af36d7f 100644 --- a/packages/ai/core/generate-text/smooth-stream.ts +++ b/packages/ai/core/generate-text/smooth-stream.ts @@ -2,10 +2,12 @@ import { InvalidArgumentError } from '@ai-sdk/provider'; import { delay as originalDelay } from '@ai-sdk/provider-utils'; import { TextStreamPart } from './stream-text-result'; import { ToolSet } from './tool-set'; +import { TextSplit, splitText } from './text-splitter'; const CHUNKING_REGEXPS = { - word: /\s*\S+\s+/m, - line: /[^\n]*\n/m, + character: /(?!\s)(?=.)/g, + word: /[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF]|\s+/gm, + line: /\r\n|\r|\n/g, }; /** @@ -44,33 +46,43 @@ export function smoothStream({ return () => { let buffer = ''; + let lastSplits: TextSplit[] = []; + let lastIndex = 0; + return new TransformStream, TextStreamPart>({ async transform(chunk, controller) { - if (chunk.type === 'step-finish') { - if (buffer.length > 0) { - controller.enqueue({ type: 'text-delta', textDelta: buffer }); - buffer = ''; + const lastSplit = lastSplits.at(-1); + + if (chunk.type !== 'text-delta') { + if (lastSplits.length > 1 && delayInMs) { + await delay(delayInMs); } - controller.enqueue(chunk); - return; - } + if (lastSplit) { + controller.enqueue({ type: 'text-delta', textDelta: lastSplit.text }); + lastSplits = []; + buffer = ''; + } - if (chunk.type !== 'text-delta') { controller.enqueue(chunk); return; } buffer += chunk.textDelta; - let match; - while ((match = chunkingRegexp.exec(buffer)) != null) { - const chunk = match[0]; - controller.enqueue({ type: 'text-delta', textDelta: chunk }); - buffer = buffer.slice(chunk.length); + const splits = splitText(buffer, chunkingRegexp); + const newSplitIndex = splits.findIndex(split => !lastSplit || split.start >= lastIndex); - await delay(delayInMs); + if (newSplitIndex !== -1) { + for (let i = newSplitIndex; i < splits.length -1; i++) { + const split = splits[i]; + controller.enqueue({ type: 'text-delta', textDelta: split.text }); + lastIndex = split.end; + await delay(delayInMs); + } } + + lastSplits = splits; }, }); }; diff --git a/packages/ai/core/generate-text/text-splitter.ts b/packages/ai/core/generate-text/text-splitter.ts new file mode 100644 index 000000000000..2ff6b42f8e01 --- /dev/null +++ b/packages/ai/core/generate-text/text-splitter.ts @@ -0,0 +1,109 @@ +export interface SplitOptions { + /** + * When true, delimiters will be separate matches rather then extending the matches near them + * @default false + */ + separateDelimiters?: boolean; +} + +export type TextSplit = T & { + start: number; + end: number; + text: string; +} + +export function splitText( + text: string, + splitter: RegExp | string, + splitOptions: SplitOptions & T = {} as SplitOptions & T, +): TextSplit[] { + const { + separateDelimiters, + ...options + } = splitOptions; + + const splits: TextSplit[] = []; + let lastIndex = 0; + + function getNextMatch() { + if (lastIndex === text.length) { + return null; + } + + if (typeof splitter === "string") { + if (splitter === "") { + return { index: lastIndex, 0: text.slice(lastIndex, lastIndex + 1) }; + } + + const index = text.indexOf(splitter, lastIndex); + return index === -1 ? null : { index, 0: splitter }; + } + + const regex = splitter.flags.includes("g") + ? splitter + : new RegExp(splitter.source, `${splitter.flags}g`); + regex.lastIndex = lastIndex; + const match = regex.exec(text); + + // If it's a zero-width match, we need to find the next match position + if (match && match[0] === "") { + regex.lastIndex = match.index + 1; + const nextMatch = regex.exec(text); + return { ...match, endIndex: nextMatch ? nextMatch.index : text.length }; + } + + return match; + } + + let match: ReturnType; + + while ((match = getNextMatch())) { + const matchEndIndex = + "endIndex" in match ? match.endIndex : match.index + match[0].length; + + const end = separateDelimiters ? match.index : matchEndIndex; + + if (end > lastIndex) { + const segment = text.slice(lastIndex, end); + + if (!segment.trim()) { + if (splits.length > 0) { + const previousSplit = splits[splits.length - 1]; + if (previousSplit) { + previousSplit.end = end; + previousSplit.text = text.slice(previousSplit.start, end); + } + } + } else { + splits.push({ + ...options, + start: lastIndex, + end, + text: segment, + } as TextSplit); + } + } + + if (separateDelimiters) { + splits.push({ + ...options, + start: match.index, + end: matchEndIndex, + text: match[0], + } as TextSplit); + } + + lastIndex = matchEndIndex; + } + + if (lastIndex < text.length) { + splits.push({ + ...options, + start: lastIndex, + end: text.length, + text: text.slice(lastIndex), + } as TextSplit); + } + + return splits; +} From 00d483d2aa88db46d36bb1caa2b4e83e68178663 Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Thu, 3 Apr 2025 15:40:29 +0100 Subject: [PATCH 2/6] chore: add changeset --- .changeset/beige-seals-tie.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/beige-seals-tie.md diff --git a/.changeset/beige-seals-tie.md b/.changeset/beige-seals-tie.md new file mode 100644 index 000000000000..6f3a98b088e9 --- /dev/null +++ b/.changeset/beige-seals-tie.md @@ -0,0 +1,5 @@ +--- +'ai': patch +--- + +feat(ai): improved text splitter From 6c06aaeb94abbdd0ecaf8234735a40c946bef5d7 Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Thu, 3 Apr 2025 15:45:56 +0100 Subject: [PATCH 3/6] chore: add comment --- packages/ai/core/generate-text/smooth-stream.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts index a315ec66dfb8..2602859b91b5 100644 --- a/packages/ai/core/generate-text/smooth-stream.ts +++ b/packages/ai/core/generate-text/smooth-stream.ts @@ -74,6 +74,9 @@ export function smoothStream({ buffer += chunk.textDelta; const splits = splitText(buffer, chunkingRegexp); + + // If there's a new split with the start index greater than the last index, + // push the new split(s) and delay. const newSplitIndex = splits.findIndex( split => !lastSplit || split.start >= lastIndex, ); From 5a6be5d1dbee1baff08d94f8c8cb654b891c0bca Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Thu, 3 Apr 2025 16:29:59 +0100 Subject: [PATCH 4/6] chore: review --- .../core/generate-text/smooth-stream.test.ts | 2 ++ .../ai/core/generate-text/smooth-stream.ts | 14 +++++--- .../ai/core/generate-text/text-splitter.ts | 36 ++++--------------- 3 files changed, 18 insertions(+), 34 deletions(-) diff --git a/packages/ai/core/generate-text/smooth-stream.test.ts b/packages/ai/core/generate-text/smooth-stream.test.ts index 649fb8daf2d9..39bd5ad657e2 100644 --- a/packages/ai/core/generate-text/smooth-stream.test.ts +++ b/packages/ai/core/generate-text/smooth-stream.test.ts @@ -226,6 +226,7 @@ describe('smoothStream', () => { "textDelta": "in ", "type": "text-delta", }, + "delay 10", { "textDelta": "London.", "type": "text-delta", @@ -302,6 +303,7 @@ describe('smoothStream', () => { "textDelta": "in ", "type": "text-delta", }, + "delay 10", { "textDelta": "London.", "type": "text-delta", diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts index 2602859b91b5..07dc707021e3 100644 --- a/packages/ai/core/generate-text/smooth-stream.ts +++ b/packages/ai/core/generate-text/smooth-stream.ts @@ -24,7 +24,7 @@ export function smoothStream({ _internal: { delay = originalDelay } = {}, }: { delayInMs?: number | null; - chunking?: 'word' | 'line' | RegExp; + chunking?: 'word' | 'line' | { split: string } | RegExp; /** * Internal. For test use only. May change without notice. */ @@ -34,10 +34,14 @@ export function smoothStream({ } = {}): (options: { tools: TOOLS; }) => TransformStream, TextStreamPart> { - const chunkingRegexp = - typeof chunking === 'string' ? CHUNKING_REGEXPS[chunking] : chunking; + const chunker = + typeof chunking === 'object' && 'split' in chunking + ? chunking.split + : typeof chunking === 'string' + ? CHUNKING_REGEXPS[chunking] + : chunking; - if (chunkingRegexp == null) { + if (chunker == null) { throw new InvalidArgumentError({ argument: 'chunking', message: `Chunking must be "word" or "line" or a RegExp. Received: ${chunking}`, @@ -73,7 +77,7 @@ export function smoothStream({ buffer += chunk.textDelta; - const splits = splitText(buffer, chunkingRegexp); + const splits = splitText(buffer, chunker); // If there's a new split with the start index greater than the last index, // push the new split(s) and delay. diff --git a/packages/ai/core/generate-text/text-splitter.ts b/packages/ai/core/generate-text/text-splitter.ts index ec86c9f5b935..94fc1a825e53 100644 --- a/packages/ai/core/generate-text/text-splitter.ts +++ b/packages/ai/core/generate-text/text-splitter.ts @@ -1,25 +1,14 @@ -export interface SplitOptions { - /** - * When true, delimiters will be separate matches rather then extending the matches near them - * @default false - */ - separateDelimiters?: boolean; -} - -export type TextSplit = T & { +export interface TextSplit { start: number; end: number; text: string; -}; +} export function splitText( text: string, splitter: RegExp | string, - splitOptions: SplitOptions & T = {} as SplitOptions & T, -): TextSplit[] { - const { separateDelimiters, ...options } = splitOptions; - - const splits: TextSplit[] = []; +): TextSplit[] { + const splits: TextSplit[] = []; let lastIndex = 0; function getNextMatch() { @@ -58,7 +47,7 @@ export function splitText( const matchEndIndex = 'endIndex' in match ? match.endIndex : match.index + match[0].length; - const end = separateDelimiters ? match.index : matchEndIndex; + const end = matchEndIndex; if (end > lastIndex) { const segment = text.slice(lastIndex, end); @@ -73,33 +62,22 @@ export function splitText( } } else { splits.push({ - ...options, start: lastIndex, end, text: segment, - } as TextSplit); + }); } } - if (separateDelimiters) { - splits.push({ - ...options, - start: match.index, - end: matchEndIndex, - text: match[0], - } as TextSplit); - } - lastIndex = matchEndIndex; } if (lastIndex < text.length) { splits.push({ - ...options, start: lastIndex, end: text.length, text: text.slice(lastIndex), - } as TextSplit); + }); } return splits; From 9555705ae4466c59dc175a21b9d431430d0be5f1 Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Thu, 3 Apr 2025 16:38:48 +0100 Subject: [PATCH 5/6] fix --- packages/ai/core/generate-text/smooth-stream.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts index 07dc707021e3..0f2fed625c9b 100644 --- a/packages/ai/core/generate-text/smooth-stream.ts +++ b/packages/ai/core/generate-text/smooth-stream.ts @@ -24,7 +24,7 @@ export function smoothStream({ _internal: { delay = originalDelay } = {}, }: { delayInMs?: number | null; - chunking?: 'word' | 'line' | { split: string } | RegExp; + chunking?: 'character' |'word' | 'line' | { split: string } | RegExp; /** * Internal. For test use only. May change without notice. */ From 61d57db206dd1e846e0d158679cef6d6ece6e16e Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Thu, 3 Apr 2025 16:42:10 +0100 Subject: [PATCH 6/6] fix: lint --- packages/ai/core/generate-text/smooth-stream.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts index 0f2fed625c9b..31c7f9243c2e 100644 --- a/packages/ai/core/generate-text/smooth-stream.ts +++ b/packages/ai/core/generate-text/smooth-stream.ts @@ -24,7 +24,7 @@ export function smoothStream({ _internal: { delay = originalDelay } = {}, }: { delayInMs?: number | null; - chunking?: 'character' |'word' | 'line' | { split: string } | RegExp; + chunking?: 'character' | 'word' | 'line' | { split: string } | RegExp; /** * Internal. For test use only. May change without notice. */