From fea574149c34306a1e79917b7276c142c5cd0d5c Mon Sep 17 00:00:00 2001
From: Sam Denty <sam@samdenty.com>
Date: Thu, 3 Apr 2025 15:38:37 +0100
Subject: [PATCH 1/6] feat: text splitter

---
 .../core/generate-text/smooth-stream.test.ts  | 139 +++++++++++++++++-
 .../ai/core/generate-text/smooth-stream.ts    |  44 ++++--
 .../ai/core/generate-text/text-splitter.ts    | 109 ++++++++++++++
 3 files changed, 269 insertions(+), 23 deletions(-)
 create mode 100644 packages/ai/core/generate-text/text-splitter.ts

diff --git a/packages/ai/core/generate-text/smooth-stream.test.ts b/packages/ai/core/generate-text/smooth-stream.test.ts
index 6b01f7565ab8..f0727552a487 100644
--- a/packages/ai/core/generate-text/smooth-stream.test.ts
+++ b/packages/ai/core/generate-text/smooth-stream.test.ts
@@ -46,6 +46,7 @@ describe('smoothStream', () => {
           textDelta: 'Hello, ',
           type: 'text-delta',
         },
+        'delay 10',
         {
           textDelta: 'world!',
           type: 'text-delta',
@@ -107,6 +108,7 @@ describe('smoothStream', () => {
           textDelta: 'example ',
           type: 'text-delta',
         },
+        'delay 10',
         {
           textDelta: 'text.',
           type: 'text-delta',
@@ -146,14 +148,14 @@ describe('smoothStream', () => {
         },
         'delay 10',
         {
-          textDelta: 'line \n\n',
+          textDelta: 'line \n\n    ',
           type: 'text-delta',
         },
         'delay 10',
         {
-          // note: leading whitespace is included here
-          // because it is part of the new chunk:
-          textDelta: '    Multiple ',
+          // note: leading whitespace is not included here
+          // because it is part of the previous chunk:
+          textDelta: 'Multiple ',
           type: 'text-delta',
         },
         'delay 10',
@@ -161,6 +163,7 @@ describe('smoothStream', () => {
           textDelta: 'spaces\n    ',
           type: 'text-delta',
         },
+        'delay 10',
         {
           textDelta: 'Indented',
           type: 'text-delta',
@@ -173,6 +176,128 @@ describe('smoothStream', () => {
         },
       ]);
     });
+
+    it('should support kanji', async () => {
+      const stream = convertArrayToReadableStream([
+        { textDelta: 'Vercel', type: 'text-delta' },
+        { textDelta: 'はサ', type: 'text-delta' },
+        { textDelta: 'サーバーレス', type: 'text-delta' },
+        { textDelta: 'の', type: 'text-delta' },
+        { textDelta: 'フロントエンド', type: 'text-delta' },
+        { textDelta: 'Hello, world!', type: 'text-delta' },
+        { type: 'step-finish' },
+        { type: 'finish' },
+      ]).pipeThrough(
+        smoothStream({
+          delayInMs: 10,
+          _internal: { delay },
+        })({ tools: {} }),
+      );
+
+      await consumeStream(stream);
+
+      expect(events).toMatchInlineSnapshot(`
+        [
+          "delay 10",
+          {
+            "textDelta": "Vercelは",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "サ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "サ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ー",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "バ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ー",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "レ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ス",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "の",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "フ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ロ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ン",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ト",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "エ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ン",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "ド",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "Hello, ",
+            "type": "text-delta",
+          },
+          "delay 10",
+          {
+            "textDelta": "world!",
+            "type": "text-delta",
+          },
+          {
+            "type": "step-finish",
+          },
+          {
+            "type": "finish",
+          },
+        ]
+      `)
+
+    })
   });
 
   describe('line chunking', () => {
@@ -290,9 +415,7 @@ describe('smoothStream', () => {
         'delay 10',
         { textDelta: 'o', type: 'text-delta' },
         'delay 10',
-        { textDelta: ',', type: 'text-delta' },
-        'delay 10',
-        { textDelta: ' ', type: 'text-delta' },
+        { textDelta: ', ', type: 'text-delta' },
         'delay 10',
         { textDelta: 'w', type: 'text-delta' },
         'delay 10',
@@ -331,6 +454,7 @@ describe('smoothStream', () => {
           textDelta: 'Hello, ',
           type: 'text-delta',
         },
+        'delay 10',
         {
           textDelta: 'world!',
           type: 'text-delta',
@@ -364,6 +488,7 @@ describe('smoothStream', () => {
           textDelta: 'Hello, ',
           type: 'text-delta',
         },
+        'delay 20',
         {
           textDelta: 'world!',
           type: 'text-delta',
diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts
index 2f0ef085aee4..6b452af36d7f 100644
--- a/packages/ai/core/generate-text/smooth-stream.ts
+++ b/packages/ai/core/generate-text/smooth-stream.ts
@@ -2,10 +2,12 @@ import { InvalidArgumentError } from '@ai-sdk/provider';
 import { delay as originalDelay } from '@ai-sdk/provider-utils';
 import { TextStreamPart } from './stream-text-result';
 import { ToolSet } from './tool-set';
+import { TextSplit, splitText } from './text-splitter';
 
 const CHUNKING_REGEXPS = {
-  word: /\s*\S+\s+/m,
-  line: /[^\n]*\n/m,
+  character: /(?!\s)(?=.)/g,
+  word: /[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF]|\s+/gm,
+  line: /\r\n|\r|\n/g,
 };
 
 /**
@@ -44,33 +46,43 @@ export function smoothStream<TOOLS extends ToolSet>({
 
   return () => {
     let buffer = '';
+    let lastSplits: TextSplit[] = [];
+    let lastIndex = 0;
+
     return new TransformStream<TextStreamPart<TOOLS>, TextStreamPart<TOOLS>>({
       async transform(chunk, controller) {
-        if (chunk.type === 'step-finish') {
-          if (buffer.length > 0) {
-            controller.enqueue({ type: 'text-delta', textDelta: buffer });
-            buffer = '';
+        const lastSplit = lastSplits.at(-1);
+
+        if (chunk.type !== 'text-delta') {
+          if (lastSplits.length > 1 && delayInMs) {
+            await delay(delayInMs);
           }
 
-          controller.enqueue(chunk);
-          return;
-        }
+          if (lastSplit) {
+            controller.enqueue({ type: 'text-delta', textDelta: lastSplit.text });
+            lastSplits = [];
+            buffer = '';
+          }
 
-        if (chunk.type !== 'text-delta') {
           controller.enqueue(chunk);
           return;
         }
 
         buffer += chunk.textDelta;
 
-        let match;
-        while ((match = chunkingRegexp.exec(buffer)) != null) {
-          const chunk = match[0];
-          controller.enqueue({ type: 'text-delta', textDelta: chunk });
-          buffer = buffer.slice(chunk.length);
+        const splits = splitText(buffer, chunkingRegexp);
+        const newSplitIndex = splits.findIndex(split => !lastSplit || split.start >= lastIndex);
 
-          await delay(delayInMs);
+        if (newSplitIndex !== -1) {
+          for (let i = newSplitIndex; i < splits.length -1; i++) {
+            const split = splits[i];
+            controller.enqueue({ type: 'text-delta', textDelta: split.text });
+            lastIndex = split.end;
+            await delay(delayInMs);
+          }
         }
+
+        lastSplits = splits;
       },
     });
   };
diff --git a/packages/ai/core/generate-text/text-splitter.ts b/packages/ai/core/generate-text/text-splitter.ts
new file mode 100644
index 000000000000..2ff6b42f8e01
--- /dev/null
+++ b/packages/ai/core/generate-text/text-splitter.ts
@@ -0,0 +1,109 @@
+export interface SplitOptions {
+	/**
+	 * When true, delimiters will be separate matches rather then extending the matches near them
+	 * @default false
+	 */
+	separateDelimiters?: boolean;
+}
+
+export type TextSplit<T = {}> = T & {
+  start: number;
+  end: number;
+  text: string;
+}
+
+export function splitText<T>(
+	text: string,
+	splitter: RegExp | string,
+	splitOptions: SplitOptions & T = {} as SplitOptions & T,
+): TextSplit<T>[] {
+	const {
+		separateDelimiters,
+		...options
+	} = splitOptions;
+
+	const splits: TextSplit<T>[] = [];
+	let lastIndex = 0;
+
+	function getNextMatch() {
+		if (lastIndex === text.length) {
+			return null;
+		}
+
+		if (typeof splitter === "string") {
+			if (splitter === "") {
+				return { index: lastIndex, 0: text.slice(lastIndex, lastIndex + 1) };
+			}
+
+			const index = text.indexOf(splitter, lastIndex);
+			return index === -1 ? null : { index, 0: splitter };
+		}
+
+		const regex = splitter.flags.includes("g")
+			? splitter
+			: new RegExp(splitter.source, `${splitter.flags}g`);
+		regex.lastIndex = lastIndex;
+		const match = regex.exec(text);
+
+		// If it's a zero-width match, we need to find the next match position
+		if (match && match[0] === "") {
+			regex.lastIndex = match.index + 1;
+			const nextMatch = regex.exec(text);
+			return { ...match, endIndex: nextMatch ? nextMatch.index : text.length };
+		}
+
+		return match;
+	}
+
+	let match: ReturnType<typeof getNextMatch>;
+
+	while ((match = getNextMatch())) {
+		const matchEndIndex =
+			"endIndex" in match ? match.endIndex : match.index + match[0].length;
+
+		const end = separateDelimiters ? match.index : matchEndIndex;
+
+		if (end > lastIndex) {
+			const segment = text.slice(lastIndex, end);
+
+			if (!segment.trim()) {
+				if (splits.length > 0) {
+					const previousSplit = splits[splits.length - 1];
+					if (previousSplit) {
+						previousSplit.end = end;
+						previousSplit.text = text.slice(previousSplit.start, end);
+					}
+				}
+			} else {
+				splits.push({
+					...options,
+					start: lastIndex,
+					end,
+					text: segment,
+				} as TextSplit<T>);
+			}
+		}
+
+		if (separateDelimiters) {
+			splits.push({
+				...options,
+				start: match.index,
+				end: matchEndIndex,
+				text: match[0],
+			} as TextSplit<T>);
+		}
+
+		lastIndex = matchEndIndex;
+	}
+
+	if (lastIndex < text.length) {
+		splits.push({
+			...options,
+			start: lastIndex,
+			end: text.length,
+			text: text.slice(lastIndex),
+		} as TextSplit<T>);
+	}
+
+	return splits;
+}

From 00d483d2aa88db46d36bb1caa2b4e83e68178663 Mon Sep 17 00:00:00 2001
From: Sam Denty <sam@samdenty.com>
Date: Thu, 3 Apr 2025 15:40:29 +0100
Subject: [PATCH 2/6] chore: add changeset

---
 .changeset/beige-seals-tie.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/beige-seals-tie.md

diff --git a/.changeset/beige-seals-tie.md b/.changeset/beige-seals-tie.md
new file mode 100644
index 000000000000..6f3a98b088e9
--- /dev/null
+++ b/.changeset/beige-seals-tie.md
@@ -0,0 +1,5 @@
+---
+'ai': patch
+---
+
+feat(ai): improved text splitter

From 6c06aaeb94abbdd0ecaf8234735a40c946bef5d7 Mon Sep 17 00:00:00 2001
From: Sam Denty <sam@samdenty.com>
Date: Thu, 3 Apr 2025 15:45:56 +0100
Subject: [PATCH 3/6] chore: add comment

---
 packages/ai/core/generate-text/smooth-stream.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts
index a315ec66dfb8..2602859b91b5 100644
--- a/packages/ai/core/generate-text/smooth-stream.ts
+++ b/packages/ai/core/generate-text/smooth-stream.ts
@@ -74,6 +74,9 @@ export function smoothStream<TOOLS extends ToolSet>({
         buffer += chunk.textDelta;
 
         const splits = splitText(buffer, chunkingRegexp);
+
+        // If there's a new split with the start index greater than the last index,
+        // push the new split(s) and delay.
         const newSplitIndex = splits.findIndex(
           split => !lastSplit || split.start >= lastIndex,
         );

From 5a6be5d1dbee1baff08d94f8c8cb654b891c0bca Mon Sep 17 00:00:00 2001
From: Sam Denty <sam@samdenty.com>
Date: Thu, 3 Apr 2025 16:29:59 +0100
Subject: [PATCH 4/6] chore: review

---
 .../core/generate-text/smooth-stream.test.ts  |  2 ++
 .../ai/core/generate-text/smooth-stream.ts    | 14 +++++---
 .../ai/core/generate-text/text-splitter.ts    | 36 ++++---------------
 3 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/packages/ai/core/generate-text/smooth-stream.test.ts b/packages/ai/core/generate-text/smooth-stream.test.ts
index 649fb8daf2d9..39bd5ad657e2 100644
--- a/packages/ai/core/generate-text/smooth-stream.test.ts
+++ b/packages/ai/core/generate-text/smooth-stream.test.ts
@@ -226,6 +226,7 @@ describe('smoothStream', () => {
             "textDelta": "in ",
             "type": "text-delta",
           },
+          "delay 10",
           {
             "textDelta": "London.",
             "type": "text-delta",
@@ -302,6 +303,7 @@ describe('smoothStream', () => {
             "textDelta": "in ",
             "type": "text-delta",
           },
+          "delay 10",
           {
             "textDelta": "London.",
             "type": "text-delta",
diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts
index 2602859b91b5..07dc707021e3 100644
--- a/packages/ai/core/generate-text/smooth-stream.ts
+++ b/packages/ai/core/generate-text/smooth-stream.ts
@@ -24,7 +24,7 @@ export function smoothStream<TOOLS extends ToolSet>({
   _internal: { delay = originalDelay } = {},
 }: {
   delayInMs?: number | null;
-  chunking?: 'word' | 'line' | RegExp;
+  chunking?: 'word' | 'line' | { split: string } | RegExp;
   /**
    * Internal. For test use only. May change without notice.
    */
@@ -34,10 +34,14 @@ export function smoothStream<TOOLS extends ToolSet>({
 } = {}): (options: {
   tools: TOOLS;
 }) => TransformStream<TextStreamPart<TOOLS>, TextStreamPart<TOOLS>> {
-  const chunkingRegexp =
-    typeof chunking === 'string' ? CHUNKING_REGEXPS[chunking] : chunking;
+  const chunker =
+    typeof chunking === 'object' && 'split' in chunking
+      ? chunking.split
+      : typeof chunking === 'string'
+        ? CHUNKING_REGEXPS[chunking]
+        : chunking;
 
-  if (chunkingRegexp == null) {
+  if (chunker == null) {
     throw new InvalidArgumentError({
       argument: 'chunking',
       message: `Chunking must be "word" or "line" or a RegExp. Received: ${chunking}`,
@@ -73,7 +77,7 @@ export function smoothStream<TOOLS extends ToolSet>({
 
         buffer += chunk.textDelta;
 
-        const splits = splitText(buffer, chunkingRegexp);
+        const splits = splitText(buffer, chunker);
 
         // If there's a new split with the start index greater than the last index,
         // push the new split(s) and delay.
diff --git a/packages/ai/core/generate-text/text-splitter.ts b/packages/ai/core/generate-text/text-splitter.ts
index ec86c9f5b935..94fc1a825e53 100644
--- a/packages/ai/core/generate-text/text-splitter.ts
+++ b/packages/ai/core/generate-text/text-splitter.ts
@@ -1,25 +1,14 @@
-export interface SplitOptions {
-  /**
-   * When true, delimiters will be separate matches rather then extending the matches near them
-   * @default false
-   */
-  separateDelimiters?: boolean;
-}
-
-export type TextSplit<T = {}> = T & {
+export interface TextSplit {
   start: number;
   end: number;
   text: string;
-};
+}
 
 export function splitText<T>(
   text: string,
   splitter: RegExp | string,
-  splitOptions: SplitOptions & T = {} as SplitOptions & T,
-): TextSplit<T>[] {
-  const { separateDelimiters, ...options } = splitOptions;
-
-  const splits: TextSplit<T>[] = [];
+): TextSplit[] {
+  const splits: TextSplit[] = [];
   let lastIndex = 0;
 
   function getNextMatch() {
@@ -58,7 +47,7 @@ export function splitText<T>(
     const matchEndIndex =
       'endIndex' in match ? match.endIndex : match.index + match[0].length;
 
-    const end = separateDelimiters ? match.index : matchEndIndex;
+    const end = matchEndIndex;
 
     if (end > lastIndex) {
       const segment = text.slice(lastIndex, end);
@@ -73,33 +62,22 @@ export function splitText<T>(
         }
       } else {
         splits.push({
-          ...options,
           start: lastIndex,
           end,
           text: segment,
-        } as TextSplit<T>);
+        });
       }
     }
 
-    if (separateDelimiters) {
-      splits.push({
-        ...options,
-        start: match.index,
-        end: matchEndIndex,
-        text: match[0],
-      } as TextSplit<T>);
-    }
-
     lastIndex = matchEndIndex;
   }
 
   if (lastIndex < text.length) {
     splits.push({
-      ...options,
       start: lastIndex,
       end: text.length,
       text: text.slice(lastIndex),
-    } as TextSplit<T>);
+    });
   }
 
   return splits;

From 9555705ae4466c59dc175a21b9d431430d0be5f1 Mon Sep 17 00:00:00 2001
From: Sam Denty <sam@samdenty.com>
Date: Thu, 3 Apr 2025 16:38:48 +0100
Subject: [PATCH 5/6] fix: add 'character' to chunking option type

---
 packages/ai/core/generate-text/smooth-stream.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts
index 07dc707021e3..0f2fed625c9b 100644
--- a/packages/ai/core/generate-text/smooth-stream.ts
+++ b/packages/ai/core/generate-text/smooth-stream.ts
@@ -24,7 +24,7 @@ export function smoothStream<TOOLS extends ToolSet>({
   _internal: { delay = originalDelay } = {},
 }: {
   delayInMs?: number | null;
-  chunking?: 'word' | 'line' | { split: string } | RegExp;
+  chunking?: 'character' |'word' | 'line' | { split: string } | RegExp;
   /**
    * Internal. For test use only. May change without notice.
    */

From 61d57db206dd1e846e0d158679cef6d6ece6e16e Mon Sep 17 00:00:00 2001
From: Sam Denty <sam@samdenty.com>
Date: Thu, 3 Apr 2025 16:42:10 +0100
Subject: [PATCH 6/6] fix: lint

---
 packages/ai/core/generate-text/smooth-stream.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/ai/core/generate-text/smooth-stream.ts b/packages/ai/core/generate-text/smooth-stream.ts
index 0f2fed625c9b..31c7f9243c2e 100644
--- a/packages/ai/core/generate-text/smooth-stream.ts
+++ b/packages/ai/core/generate-text/smooth-stream.ts
@@ -24,7 +24,7 @@ export function smoothStream<TOOLS extends ToolSet>({
   _internal: { delay = originalDelay } = {},
 }: {
   delayInMs?: number | null;
-  chunking?: 'character' |'word' | 'line' | { split: string } | RegExp;
+  chunking?: 'character' | 'word' | 'line' | { split: string } | RegExp;
   /**
    * Internal. For test use only. May change without notice.
    */