From 52d155ba2deb5f0280e4edb73c05aa5a3cfe7545 Mon Sep 17 00:00:00 2001 From: Ian Lee Date: Thu, 29 Jan 2026 14:06:42 -0800 Subject: [PATCH 1/3] inworld tts ws auto mode --- plugins/inworld/src/tts.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plugins/inworld/src/tts.ts b/plugins/inworld/src/tts.ts index 5a94b8a59..8a8050215 100644 --- a/plugins/inworld/src/tts.ts +++ b/plugins/inworld/src/tts.ts @@ -74,6 +74,7 @@ interface CreateContextConfig { maxBufferDelayMs: number; timestampType?: TimestampType; applyTextNormalization?: TextNormalization; + autoMode?: boolean; } interface WordAlignment { @@ -635,6 +636,9 @@ class SynthesizeStream extends tts.SynthesizeStream { maxBufferDelayMs: this.#opts.maxBufferDelayMs, timestampType: this.#opts.timestampType, applyTextNormalization: this.#opts.textNormalization, + // Always enable auto_mode since we use sentence tokenizer and don't expose + // mid-stream flush_context control to users yet + autoMode: true, }; return this.#send(ws, { create: config, contextId: this.#contextId }); From 2baf4a1ede37737ac730ca5944262b2f1c52504c Mon Sep 17 00:00:00 2001 From: Ian Lee Date: Wed, 4 Feb 2026 16:45:25 -0800 Subject: [PATCH 2/3] fix timestamps cumulation within a context --- plugins/inworld/src/tts.ts | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/plugins/inworld/src/tts.ts b/plugins/inworld/src/tts.ts index 8a8050215..aea9aedda 100644 --- a/plugins/inworld/src/tts.ts +++ b/plugins/inworld/src/tts.ts @@ -103,6 +103,7 @@ interface InworldResult { contextId?: string; contextCreated?: boolean; contextClosed?: boolean; + flushCompleted?: boolean; audioChunk?: AudioChunk; audioContent?: string; status?: { code: number; message: string }; @@ -477,6 +478,12 @@ class SynthesizeStream extends tts.SynthesizeStream { #opts: TTSOptions; #tts: TTS; #contextId: string; + // Cumulative timestamp tracking for monotonic timestamps across generations. + // When auto_mode is enabled or flush_context() is called, the server resets + // timestamps to 0 after each generation. We add cumulativeTime to maintain + // monotonically increasing timestamps within an agent turn. + #cumulativeTime: number = 0; + #generationEndTime: number = 0; label = 'inworld.SynthesizeStream'; constructor(ttsInstance: TTS, opts: TTSOptions) { @@ -505,13 +512,27 @@ class SynthesizeStream extends tts.SynthesizeStream { if (result.contextCreated) { } else if (result.contextClosed) { resolveProcessing(); + } else if (result.flushCompleted) { + // Signals the end of a generation. Subsequent timestamps from the server + // will reset offset to 0. Update cumulative time to maintain monotonically + // increasing timestamps within the agent turn. + this.#cumulativeTime = this.#generationEndTime; } else if (result.audioChunk) { if (result.audioChunk.timestampInfo) { const tsInfo = result.audioChunk.timestampInfo; if (tsInfo.wordAlignment) { const words = tsInfo.wordAlignment.words || []; - const starts = tsInfo.wordAlignment.wordStartTimeSeconds || []; - const ends = tsInfo.wordAlignment.wordEndTimeSeconds || []; + const rawStarts = tsInfo.wordAlignment.wordStartTimeSeconds || []; + const rawEnds = tsInfo.wordAlignment.wordEndTimeSeconds || []; + + // Apply cumulative offset for monotonic timestamps across generations + const starts = rawStarts.map((t: number) => t + this.#cumulativeTime); + const ends = rawEnds.map((t: number) => t + this.#cumulativeTime); + + // Track generation end time from last word for cumulative offset + if (ends.length > 0) { + this.#generationEndTime = Math.max(this.#generationEndTime, ends[ends.length - 1]!); + } // eslint-disable-next-line @typescript-eslint/no-explicit-any (this.#tts as any).emit('alignment', { @@ -523,8 +544,17 @@ class SynthesizeStream extends tts.SynthesizeStream { if (tsInfo.characterAlignment) { const chars = tsInfo.characterAlignment.characters || []; - const starts = tsInfo.characterAlignment.characterStartTimeSeconds || []; - const ends = tsInfo.characterAlignment.characterEndTimeSeconds || []; + const rawStarts = tsInfo.characterAlignment.characterStartTimeSeconds || []; + const rawEnds = tsInfo.characterAlignment.characterEndTimeSeconds || []; + + // Apply cumulative offset for monotonic timestamps across generations + const starts = rawStarts.map((t: number) => t + this.#cumulativeTime); + const ends = rawEnds.map((t: number) => t + this.#cumulativeTime); + + // Track generation end time from last character for cumulative offset + if (ends.length > 0) { + this.#generationEndTime = Math.max(this.#generationEndTime, ends[ends.length - 1]!); + } // eslint-disable-next-line @typescript-eslint/no-explicit-any (this.#tts as any).emit('alignment', { From 9a585c66cc628545887ddba77abc44f638a98564 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Tue, 10 Feb 2026 14:16:03 -0800 Subject: [PATCH 3/3] Create little-tables-joke.md --- .changeset/little-tables-joke.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/little-tables-joke.md diff --git a/.changeset/little-tables-joke.md b/.changeset/little-tables-joke.md new file mode 100644 index 000000000..11d1640d4 --- /dev/null +++ b/.changeset/little-tables-joke.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-inworld": patch +--- + +Add inworld tts auto mode