From 57d956cd57afba2f77ca1cd7b1c7c0d99ef4a0e7 Mon Sep 17 00:00:00 2001 From: Qiong Zhou Huang Date: Thu, 9 Apr 2026 19:29:54 +0000 Subject: [PATCH 1/4] faster handoffs --- plugins/phonic/src/realtime/realtime_model.ts | 163 +++++++++++++++--- 1 file changed, 137 insertions(+), 26 deletions(-) diff --git a/plugins/phonic/src/realtime/realtime_model.ts b/plugins/phonic/src/realtime/realtime_model.ts index 0233679e4..c94b8ddf5 100644 --- a/plugins/phonic/src/realtime/realtime_model.ts +++ b/plugins/phonic/src/realtime/realtime_model.ts @@ -23,6 +23,10 @@ const PHONIC_INPUT_FRAME_MS = 20; const DEFAULT_MODEL = 'merritt'; const WS_CLOSE_NORMAL = 1000; const TOOL_CALL_OUTPUT_TIMEOUT_MS = 60_000; +const CONVERSATION_HISTORY_PREFIX = + '\n\nThis conversation is being continued from an existing ' + + 'conversation. You are the assistant speaking to the user. ' + + 'The following is the conversation history:\n'; export interface RealtimeModelOptions { apiKey: string; @@ -56,6 +60,10 @@ export class RealtimeModel extends llm.RealtimeModel { return this._options.model; } + get provider(): string { + return 'phonic'; + } + constructor( options: { /** @@ -294,9 +302,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.logger.debug( 'updateChatCtx called with messages prior to config being sent to Phonic. Including conversation state in system instructions.', ); - this.systemPromptPostfix = - '\n\nThis conversation is being continued from an existing conversation. You are the assistant speaking to the user. The following is the conversation history:\n' + - turnHistory; + this.systemPromptPostfix = CONVERSATION_HISTORY_PREFIX + turnHistory; } this._chatCtx = chatCtx.copy(); } @@ -375,6 +381,79 @@ export class RealtimeSession extends llm.RealtimeSession { this.toolsReady.resolve(); } + async _updateSession({ + instructions, + chatCtx, + tools, + }: { + instructions?: string; + chatCtx?: llm.ChatContext; + tools?: llm.ToolContext; + } = {}): Promise { + if (!this.configSent) { + if (instructions !== undefined) { + await this.updateInstructions(instructions); + } + if (chatCtx !== undefined) { + await this.updateChatCtx(chatCtx); + } + if (tools !== undefined) { + await this.updateTools(tools); + } + return; + } + + if (instructions !== undefined) { + this.options.instructions = instructions; + } + if (tools !== undefined) { + this._tools = { ...tools }; + this.toolDefinitions = Object.entries(tools) + .filter(([, tool]) => llm.isFunctionTool(tool)) + .map(([name, tool]) => ({ + type: 'custom_websocket', + tool_schema: { + type: 'function', + function: { + name, + description: tool.description, + parameters: llm.toJsonSchema(tool.parameters), + strict: true, + }, + }, + tool_call_output_timeout_ms: TOOL_CALL_OUTPUT_TIMEOUT_MS, + wait_for_speech_before_tool_call: true, + allow_tool_chaining: false, + })); + } + if (chatCtx !== undefined) { + this._chatCtx = chatCtx.copy(); + } + + let systemPrompt = this.options.instructions ?? ''; + if (chatCtx !== undefined) { + const history = this.buildTurnHistory(chatCtx); + if (history) { + systemPrompt += CONVERSATION_HISTORY_PREFIX + history; + } + } + + this.closeCurrentGeneration({ interrupted: true }); + + const toolsPayload: Phonic.ConfigOptions.Tools.Item[] = [ + ...(this.options.phonicTools ?? []), + ...this.toolDefinitions, + ]; + + if (this.socket) { + this.logger.info('Sending mid-session reset to Phonic'); + this.socket.sendReset({ + type: 'reset', + config: this.buildConfigOptions({ systemPrompt, toolsPayload }), + }); + } + } + updateOptions(_options: { toolChoice?: llm.ToolChoice | null }): void { this.logger.warn('updateOptions is not supported by the Phonic realtime model.'); } @@ -434,6 +513,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.instructionsReady.resolve(); this.toolsReady.resolve(); this.closeCurrentGeneration({ interrupted: false }); + this.inputResampler = undefined; this.socket?.close(); await this.connectTask; await super.close(); @@ -482,30 +562,10 @@ export class RealtimeSession extends llm.RealtimeSession { this.socket.sendConfig({ type: 'config', model: this.options.model as Phonic.ConfigPayload['model'], - agent: this.options.phonicAgent, - project: this.options.project, - welcome_message: this.options.welcomeMessage, - generate_welcome_message: this.options.generateWelcomeMessage, - system_prompt: this.options.instructions + this.systemPromptPostfix, - voice_id: this.options.voice, - input_format: 'pcm_44100', - output_format: 'pcm_44100', - ...(this.options.defaultLanguage !== undefined && { - default_language: this.options.defaultLanguage, + ...this.buildConfigOptions({ + systemPrompt: this.options.instructions + this.systemPromptPostfix, + toolsPayload: [...(this.options.phonicTools ?? []), ...this.toolDefinitions], }), - ...(this.options.additionalLanguages !== undefined && { - additional_languages: this.options.additionalLanguages, - }), - ...(this.options.multilingualMode !== undefined && { - multilingual_mode: this.options.multilingualMode, - }), - audio_speed: this.options.audioSpeed, - tools: [...(this.options.phonicTools ?? []), ...this.toolDefinitions], - boosted_keywords: this.options.boostedKeywords, - generate_no_input_poke_text: this.options.generateNoInputPokeText, - no_input_poke_sec: this.options.noInputPokeSec, - no_input_poke_text: this.options.noInputPokeText, - no_input_end_conversation_sec: this.options.noInputEndConversationSec, }); } @@ -736,6 +796,57 @@ export class RealtimeSession extends llm.RealtimeSession { } satisfies llm.RealtimeModelError); } + private buildConfigOptions({ + systemPrompt, + toolsPayload, + }: { + systemPrompt: string; + toolsPayload: Phonic.ConfigOptions.Tools.Item[]; + }): Phonic.ConfigOptions { + return { + agent: this.options.phonicAgent, + project: this.options.project, + welcome_message: this.options.welcomeMessage, + generate_welcome_message: this.options.generateWelcomeMessage, + system_prompt: systemPrompt, + voice_id: this.options.voice, + input_format: 'pcm_44100', + output_format: 'pcm_44100', + ...(this.options.defaultLanguage !== undefined && { + default_language: this.options.defaultLanguage, + }), + ...(this.options.additionalLanguages !== undefined && { + additional_languages: this.options.additionalLanguages, + }), + ...(this.options.multilingualMode !== undefined && { + multilingual_mode: this.options.multilingualMode, + }), + audio_speed: this.options.audioSpeed, + tools: toolsPayload, + boosted_keywords: this.options.boostedKeywords, + ...(this.options.minWordsToInterrupt !== undefined && { + min_words_to_interrupt: this.options.minWordsToInterrupt, + }), + generate_no_input_poke_text: this.options.generateNoInputPokeText, + no_input_poke_sec: this.options.noInputPokeSec, + no_input_poke_text: this.options.noInputPokeText, + no_input_end_conversation_sec: this.options.noInputEndConversationSec, + }; + } + + private buildTurnHistory(chatCtx: llm.ChatContext): string | undefined { + const messages = chatCtx.items.filter( + (item): item is llm.ChatMessage => + item.type === 'message' && + 'textContent' in item && + item.textContent !== undefined && + item.textContent.trim() !== '', + ); + if (messages.length === 0) return undefined; + const history = messages.map((m) => `${m.role}: ${m.textContent}`).join('\n'); + return history.trim() || undefined; + } + private *resampleAudio(frame: AudioFrame): Generator { if (this.inputResampler) { if (frame.sampleRate !== this.inputResamplerInputRate) { From 343baa03a61bfc56b82bd8c1c5569c52c72b2cf0 Mon Sep 17 00:00:00 2001 From: Qiong Zhou Huang Date: Thu, 9 Apr 2026 21:30:16 +0000 Subject: [PATCH 2/4] fixes --- agents/src/voice/agent_activity.ts | 2 +- examples/src/phonic_handoff_agent.ts | 122 ++++++++++++++++++ plugins/phonic/src/realtime/realtime_model.ts | 55 +++++--- 3 files changed, 160 insertions(+), 19 deletions(-) create mode 100644 examples/src/phonic_handoff_agent.ts diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 46dcc84db..bf1314697 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -408,7 +408,7 @@ export class AgentActivity implements RecognitionHooks { // skip the update if the session is reused and no mid-session update is supported // this means the content is the same as the previous session const capabilities = this.llm.capabilities; - if (!rtReused && this.realtimeSession?.realtimeModel.provider == 'phonic') { + if (this.realtimeSession?.realtimeModel.provider == 'phonic') { try { await (this.realtimeSession as any)._updateSession( this.agent.instructions, diff --git a/examples/src/phonic_handoff_agent.ts b/examples/src/phonic_handoff_agent.ts new file mode 100644 index 000000000..f3ec958f6 --- /dev/null +++ b/examples/src/phonic_handoff_agent.ts @@ -0,0 +1,122 @@ +// For testing only! +import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents'; +import * as phonic from '@livekit/agents-plugin-phonic'; +import { fileURLToPath } from 'node:url'; +import { z } from 'zod'; + +type UserData = { + name?: string; + email?: string; + address?: string; +}; + +class NameAgent extends voice.Agent { + async onEnter() { + this.session.generateReply(); + } + + static create() { + return new NameAgent({ + instructions: + 'You are Alex, a friendly interviewer. You just started the call. ' + + 'Greet the user, then ask for their full name. ' + + 'Once you have it, thank the user and call record_name.', + tools: { + record_name: llm.tool({ + description: "Record the user's name and move on.", + parameters: z.object({ + name: z.string().describe("The user's full name"), + }), + execute: async ({ name }, { ctx }) => { + console.log(`Got name: ${name}`); + ctx.userData.name = name; + return llm.handoff({ agent: EmailAgent.create() }); + }, + }), + }, + }); + } +} + +class EmailAgent extends voice.Agent { + async onEnter() { + this.session.generateReply({ + instructions: 'Transition naturally and ask for their email address.', + }); + } + + static create() { + return new EmailAgent({ + instructions: + 'You are Alex, continuing an interview. ' + + 'Ask the user for their email address. Be conversational. ' + + 'Once you have it, thank the user and call record_email.', + tools: { + record_email: llm.tool({ + description: "Record the user's email and move on.", + parameters: z.object({ + email: z.string().describe("The user's email address"), + }), + execute: async ({ email }, { ctx }) => { + console.log(`Got email: ${email}`); + ctx.userData.email = email; + return llm.handoff({ agent: AddressAgent.create() }); + }, + }), + }, + }); + } +} + +class AddressAgent extends voice.Agent { + async onEnter() { + this.session.generateReply({ + instructions: 'Transition naturally and ask for their mailing address.', + }); + } + + static create() { + return new AddressAgent({ + instructions: + 'You are Alex, wrapping up an interview. ' + + 'Ask the user for their mailing address (city and state is fine). ' + + 'Once you have it, thank the user and call record_address.', + tools: { + record_address: llm.tool({ + description: "Record the user's address and finish.", + parameters: z.object({ + address: z.string().describe("The user's mailing address"), + }), + execute: async ({ address }, { ctx }) => { + console.log(`Got address: ${address}`); + ctx.userData.address = address; + const { name, email } = ctx.userData; + console.log(`All collected: name=${name}, email=${email}, address=${address}`); + return 'Thank the user for their time. Let them know they are all set.'; + }, + }), + }, + }); + } +} + +export default defineAgent({ + entry: async (ctx: JobContext) => { + const session = new voice.AgentSession({ + llm: new phonic.realtime.RealtimeModel({ + voice: 'sabrina', + audioSpeed: 1.2, + }), + userData: { name: undefined, email: undefined, address: undefined } as UserData, + }); + + await session.start({ + agent: NameAgent.create(), + room: ctx.room, + }); + + await ctx.connect(); + }, +}); + +cli.runApp(new ServerOptions({ agent: fileURLToPath(import.meta.url) })); diff --git a/plugins/phonic/src/realtime/realtime_model.ts b/plugins/phonic/src/realtime/realtime_model.ts index c94b8ddf5..095f2fafb 100644 --- a/plugins/phonic/src/realtime/realtime_model.ts +++ b/plugins/phonic/src/realtime/realtime_model.ts @@ -154,6 +154,10 @@ export class RealtimeModel extends llm.RealtimeModel { autoToolReplyGeneration: true, manualFunctionCalls: false, audioOutput: true, + midSessionChatCtxUpdate: true, + midSessionInstructionsUpdate: true, + midSessionToolsUpdate: true, + perResponseToolChoice: false, }); const apiKey = options.apiKey || process.env.PHONIC_API_KEY; @@ -244,7 +248,8 @@ export class RealtimeSession extends llm.RealtimeSession { private connectTask: Promise; private toolDefinitions: Record[] = []; private pendingToolCallIds = new Set(); - private readyToStart = false; + private readyToStart = new Future(); + private pendingGenerateReplyFut?: Future; private systemPromptPostfix = ''; constructor(realtimeModel: RealtimeModel) { @@ -381,15 +386,11 @@ export class RealtimeSession extends llm.RealtimeSession { this.toolsReady.resolve(); } - async _updateSession({ - instructions, - chatCtx, - tools, - }: { - instructions?: string; - chatCtx?: llm.ChatContext; - tools?: llm.ToolContext; - } = {}): Promise { + async _updateSession( + instructions?: string, + chatCtx?: llm.ChatContext, + tools?: llm.ToolContext, + ): Promise { if (!this.configSent) { if (instructions !== undefined) { await this.updateInstructions(instructions); @@ -459,7 +460,7 @@ export class RealtimeSession extends llm.RealtimeSession { } pushAudio(frame: AudioFrame): void { - if (this.closed || !this.readyToStart) { + if (this.closed || !this.readyToStart.done) { return; } @@ -480,14 +481,24 @@ export class RealtimeSession extends llm.RealtimeSession { } async generateReply(instructions?: string): Promise { - if (this.socket) { - this.socket.sendGenerateReply({ type: 'generate_reply', system_message: instructions }); - } else { - this.logger.warn('Cannot send generate_reply: WebSocket not available'); + if (this.closed) { + return Promise.reject(new Error('session is closed')); } - this.closeCurrentGeneration({ interrupted: false }); - return this.startNewAssistantTurn({ userInitiated: true }); + this.pendingGenerateReplyFut = new Future(); + this.sendGenerateReply(instructions); + + return this.pendingGenerateReplyFut.await; + } + + private async sendGenerateReply(instructions?: string): Promise { + await this.readyToStart.await; + if (this.closed || !this.socket) { + this.pendingGenerateReplyFut?.reject(new Error('session is closed')); + this.pendingGenerateReplyFut = undefined; + return; + } + this.socket.sendGenerateReply({ type: 'generate_reply', system_message: instructions }); } async commitAudio(): Promise { @@ -512,6 +523,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.closedFuture.resolve(); this.instructionsReady.resolve(); this.toolsReady.resolve(); + this.readyToStart.resolve(); this.closeCurrentGeneration({ interrupted: false }); this.inputResampler = undefined; this.socket?.close(); @@ -617,7 +629,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.handleToolCallInterrupted(message); break; case 'ready_to_start_conversation': - this.readyToStart = true; + this.readyToStart.resolve(); break; case 'assistant_chose_not_to_respond': case 'input_cancelled': @@ -758,6 +770,12 @@ export class RealtimeSession extends llm.RealtimeSession { responseId, }; + if (this.pendingGenerateReplyFut && !this.pendingGenerateReplyFut.done) { + generationEvent.userInitiated = true; + this.pendingGenerateReplyFut.resolve(generationEvent); + this.pendingGenerateReplyFut = undefined; + } + this.emit('generation_created', generationEvent); return generationEvent; } @@ -850,6 +868,7 @@ export class RealtimeSession extends llm.RealtimeSession { private *resampleAudio(frame: AudioFrame): Generator { if (this.inputResampler) { if (frame.sampleRate !== this.inputResamplerInputRate) { + this.inputResampler.close(); this.inputResampler = undefined; this.inputResamplerInputRate = undefined; } From e6ccf9887ca1f44014a7b2be2378b1288866f89f Mon Sep 17 00:00:00 2001 From: Qiong Zhou Huang Date: Thu, 9 Apr 2026 21:41:58 +0000 Subject: [PATCH 3/4] changeset --- .changeset/happy-yaks-bet.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/happy-yaks-bet.md diff --git a/.changeset/happy-yaks-bet.md b/.changeset/happy-yaks-bet.md new file mode 100644 index 000000000..887dd511c --- /dev/null +++ b/.changeset/happy-yaks-bet.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-phonic": patch +--- + +Update phonic plugin to reuse session for handoffs From 3d9a6c5b81c40b3e26d933630f56401724db01d8 Mon Sep 17 00:00:00 2001 From: Qiong Zhou Huang Date: Thu, 9 Apr 2026 22:42:21 +0000 Subject: [PATCH 4/4] remove need for configSent check --- plugins/phonic/src/realtime/realtime_model.ts | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/plugins/phonic/src/realtime/realtime_model.ts b/plugins/phonic/src/realtime/realtime_model.ts index 095f2fafb..d1c7af962 100644 --- a/plugins/phonic/src/realtime/realtime_model.ts +++ b/plugins/phonic/src/realtime/realtime_model.ts @@ -391,19 +391,7 @@ export class RealtimeSession extends llm.RealtimeSession { chatCtx?: llm.ChatContext, tools?: llm.ToolContext, ): Promise { - if (!this.configSent) { - if (instructions !== undefined) { - await this.updateInstructions(instructions); - } - if (chatCtx !== undefined) { - await this.updateChatCtx(chatCtx); - } - if (tools !== undefined) { - await this.updateTools(tools); - } - return; - } - + await this.readyToStart.await; if (instructions !== undefined) { this.options.instructions = instructions; } @@ -525,6 +513,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.toolsReady.resolve(); this.readyToStart.resolve(); this.closeCurrentGeneration({ interrupted: false }); + this.rejectPendingGenerateReply(); this.inputResampler = undefined; this.socket?.close(); await this.connectTask; @@ -547,6 +536,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.socket.on('error', (error: Error) => this.emitError(error, false)); this.socket.on('close', (event: { code?: number }) => { this.closeCurrentGeneration({ interrupted: false }); + this.rejectPendingGenerateReply(); if (!this.closed && event.code !== WS_CLOSE_NORMAL) { this.emitError(new Error(`Phonic STS socket closed with code ${event.code ?? -1}`), false); } @@ -804,6 +794,13 @@ export class RealtimeSession extends llm.RealtimeSession { this.currentGeneration = undefined; } + private rejectPendingGenerateReply(): void { + if (this.pendingGenerateReplyFut && !this.pendingGenerateReplyFut.done) { + this.pendingGenerateReplyFut.reject(new Error('session is closed')); + this.pendingGenerateReplyFut = undefined; + } + } + private emitError(error: Error, recoverable: boolean): void { this.emit('error', { timestamp: Date.now(), @@ -842,9 +839,9 @@ export class RealtimeSession extends llm.RealtimeSession { audio_speed: this.options.audioSpeed, tools: toolsPayload, boosted_keywords: this.options.boostedKeywords, - ...(this.options.minWordsToInterrupt !== undefined && { - min_words_to_interrupt: this.options.minWordsToInterrupt, - }), + // ...(this.options.minWordsToInterrupt !== undefined && { + // min_words_to_interrupt: this.options.minWordsToInterrupt, + // }), generate_no_input_poke_text: this.options.generateNoInputPokeText, no_input_poke_sec: this.options.noInputPokeSec, no_input_poke_text: this.options.noInputPokeText,