From 12f72f2d4d1bd142da298f883d8c21c0a8697230 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 23 Jul 2025 11:41:35 -0400 Subject: [PATCH 01/47] update streaming ASR Signed-off-by: stevehuang52 --- nemo/collections/asr/parts/mixins/mixins.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index d99ae3cc70b4..de32784a4032 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -18,7 +18,7 @@ import tarfile import unicodedata from abc import ABC, abstractmethod -from typing import List +from typing import List, Optional import torch from omegaconf import DictConfig, OmegaConf, open_dict @@ -602,6 +602,7 @@ def conformer_stream_step( drop_extra_pre_encoded: int = None, return_transcription: bool = True, return_log_probs: bool = False, + valid_out_len: Optional[int] = None, ): """ It simulates a forward step with caching for streaming purposes. 
@@ -659,6 +660,11 @@ def conformer_stream_step( drop_extra_pre_encoded=drop_extra_pre_encoded, ) + if valid_out_len and not keep_all_outputs: + # drop right context if any + encoded = encoded[:, :, :valid_out_len] + encoded_len = torch.ones_like(encoded_len) * valid_out_len + if isinstance(self, asr_models.EncDecCTCModel) or ( isinstance(self, asr_models.EncDecHybridRNNTCTCModel) and self.cur_decoder == "ctc" ): From e4f566364a7355e29e9fdee72ba6d86e625e103c Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 23 Jul 2025 17:41:32 -0400 Subject: [PATCH 02/47] add voice agent Signed-off-by: stevehuang52 --- .gitignore | 2 + examples/voice_agent/nemo_chatbot/README.md | 36 + .../voice_agent/nemo_chatbot/client/README.md | 27 + .../nemo_chatbot/client/index.html | 85 + .../nemo_chatbot/client/package-lock.json | 1672 +++++++++++++++++ .../nemo_chatbot/client/package.json | 26 + .../nemo_chatbot/client/src/app.ts | 522 +++++ .../nemo_chatbot/client/src/style.css | 180 ++ .../nemo_chatbot/client/tsconfig.json | 111 ++ .../nemo_chatbot/client/vite.config.js | 17 + .../voice_agent/nemo_chatbot/environment.yml | 435 +++++ .../server/bot_websocket_server.py | 381 ++++ .../nemo_chatbot/server/env.example | 2 + .../nemo_chatbot/server/prompts.py | 71 + .../nemo_chatbot/server/requirements.txt | 4 + .../voice_agent/nemo_chatbot/server/server.py | 85 + .../nemo_chatbot/server/server_config.yaml | 47 + .../asr/parts/submodules/rnnt_decoding.py | 2 + nemo/collections/voice_agent/__init__.py | 13 + .../voice_agent/pipecat/__init__.py | 18 + .../voice_agent/pipecat/frames/__init__.py | 13 + .../voice_agent/pipecat/frames/frames.py | 26 + .../pipecat/services/nemo/__init__.py | 22 + .../voice_agent/pipecat/services/nemo/diar.py | 373 ++++ .../pipecat/services/nemo/legacy_asr.py | 264 +++ .../pipecat/services/nemo/legacy_diar.py | 306 +++ .../voice_agent/pipecat/services/nemo/llm.py | 162 ++ .../voice_agent/pipecat/services/nemo/stt.py | 250 +++ 
.../voice_agent/pipecat/services/nemo/tts.py | 339 ++++ .../pipecat/services/nemo/turn_taking.py | 442 +++++ .../pipecat/services/nemo/utils.py | 196 ++ .../pipecat/transports/__init__.py | 13 + .../pipecat/transports/base_input.py | 55 + .../pipecat/transports/base_transport.py | 20 + .../pipecat/transports/network/__init__.py | 13 + .../transports/network/websocket_server.py | 128 ++ .../voice_agent/pipecat/utils/__init__.py | 13 + .../pipecat/utils/text/__init__.py | 13 + .../utils/text/simple_text_aggregator.py | 52 + 39 files changed, 6436 insertions(+) create mode 100644 examples/voice_agent/nemo_chatbot/README.md create mode 100644 examples/voice_agent/nemo_chatbot/client/README.md create mode 100644 examples/voice_agent/nemo_chatbot/client/index.html create mode 100644 examples/voice_agent/nemo_chatbot/client/package-lock.json create mode 100644 examples/voice_agent/nemo_chatbot/client/package.json create mode 100644 examples/voice_agent/nemo_chatbot/client/src/app.ts create mode 100644 examples/voice_agent/nemo_chatbot/client/src/style.css create mode 100644 examples/voice_agent/nemo_chatbot/client/tsconfig.json create mode 100644 examples/voice_agent/nemo_chatbot/client/vite.config.js create mode 100644 examples/voice_agent/nemo_chatbot/environment.yml create mode 100644 examples/voice_agent/nemo_chatbot/server/bot_websocket_server.py create mode 100644 examples/voice_agent/nemo_chatbot/server/env.example create mode 100644 examples/voice_agent/nemo_chatbot/server/prompts.py create mode 100644 examples/voice_agent/nemo_chatbot/server/requirements.txt create mode 100644 examples/voice_agent/nemo_chatbot/server/server.py create mode 100644 examples/voice_agent/nemo_chatbot/server/server_config.yaml create mode 100644 nemo/collections/voice_agent/__init__.py create mode 100644 nemo/collections/voice_agent/pipecat/__init__.py create mode 100644 nemo/collections/voice_agent/pipecat/frames/__init__.py create mode 100644 
nemo/collections/voice_agent/pipecat/frames/frames.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/__init__.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/diar.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/llm.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/stt.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/tts.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py create mode 100644 nemo/collections/voice_agent/pipecat/services/nemo/utils.py create mode 100644 nemo/collections/voice_agent/pipecat/transports/__init__.py create mode 100644 nemo/collections/voice_agent/pipecat/transports/base_input.py create mode 100644 nemo/collections/voice_agent/pipecat/transports/base_transport.py create mode 100644 nemo/collections/voice_agent/pipecat/transports/network/__init__.py create mode 100644 nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py create mode 100644 nemo/collections/voice_agent/pipecat/utils/__init__.py create mode 100644 nemo/collections/voice_agent/pipecat/utils/text/__init__.py create mode 100644 nemo/collections/voice_agent/pipecat/utils/text/simple_text_aggregator.py diff --git a/.gitignore b/.gitignore index 1aa5ef00de5e..d437cc83474c 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,5 @@ examples/neural_graphs/*.yml nemo_experiments/ slurm*.out + +node_modules/ diff --git a/examples/voice_agent/nemo_chatbot/README.md b/examples/voice_agent/nemo_chatbot/README.md new file mode 100644 index 000000000000..666292d5d9d6 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/README.md @@ -0,0 +1,36 @@ +# NeMo Voice Agent + +A Pipecat example demonstrating the simplest way to create a voice agent using 
`WebsocketTransport`, NeMo STT/TTS service, and HuggingFace LLM. Evertying is deployed locally so you can own your own agent. + +## 🚀 Quick Start + +### Install dependencies + +```bash +conda env create -f environment.yml +``` + +Activate the environment via `conda activate nemo-pipecat` + +### Run the server + +```bash +NEMO_PATH=??? # Use your own NeMo path +export PYTHONPATH=$NEMO_PATH:$PYTHONPATH +export HF_TOKEN=??? # Use your own HuggingFace token +export WEBSOCKET_SERVER=websocket_server # currently only support websocket_server +python ./server/server.py +``` + +### Launch the client +In another terminal, run the client via: + +```bash +cd client +npm install +npm run dev +``` + +### Connect to the client via browser + +Open the client via browser: `http://[YOUR SERVER IP ADDRESS]:5173/` diff --git a/examples/voice_agent/nemo_chatbot/client/README.md b/examples/voice_agent/nemo_chatbot/client/README.md new file mode 100644 index 000000000000..753c6d563780 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/README.md @@ -0,0 +1,27 @@ +# JavaScript Implementation + +Basic implementation using the [Pipecat JavaScript SDK](https://docs.pipecat.ai/client/js/introduction). + +## Setup + +1. Run the bot server. See the [server README](../README). + +2. Navigate to the `client/javascript` directory: + +```bash +cd client/javascript +``` + +3. Install dependencies: + +```bash +npm install +``` + +4. Run the client app: + +``` +npm run dev +``` + +5. Visit http://localhost:5173 in your browser. diff --git a/examples/voice_agent/nemo_chatbot/client/index.html b/examples/voice_agent/nemo_chatbot/client/index.html new file mode 100644 index 000000000000..c347fa972b0b --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/index.html @@ -0,0 +1,85 @@ + + + + + + + AI Chatbot + + + + +
+
+
+ Transport: Disconnected +
+
+ + +
+
+ + + + +
+
+ +
+
Microphone Volume:
+
+
+
+
0%
+
+ + + +
+

Debug Info

+
+
+
+ + + + + + diff --git a/examples/voice_agent/nemo_chatbot/client/package-lock.json b/examples/voice_agent/nemo_chatbot/client/package-lock.json new file mode 100644 index 000000000000..c3d6301a2843 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/package-lock.json @@ -0,0 +1,1672 @@ +{ + "name": "client", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "client", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "@pipecat-ai/client-js": "^0.4.0", + "@pipecat-ai/websocket-transport": "^0.4.1", + "protobufjs": "^7.4.0" + }, + "devDependencies": { + "@types/node": "^22.15.30", + "@types/protobufjs": "^6.0.0", + "@vitejs/plugin-react-swc": "^3.10.1", + "typescript": "^5.8.3", + "vite": "^6.3.5" + } + }, + "node_modules/@babel/runtime": { + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz", + "integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@bufbuild/protobuf": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.5.2.tgz", + "integrity": "sha512-foZ7qr0IsUBjzWIq+SuBLfdQCpJ1j8cTuNNT4owngTHoN5KsJb8L9t65fzz7SCeSWzescoOil/0ldqiL041ABg==" + }, + "node_modules/@bufbuild/protoplugin": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/@bufbuild/protoplugin/-/protoplugin-2.5.2.tgz", + "integrity": "sha512-7d/NUae/ugs/qgHEYOwkVWGDE3Bf/xjuGviVFs38+MLRdwiHNTiuvzPVwuIPo/1wuZCZn3Nax1cg1owLuY72xw==", + "dependencies": { + "@bufbuild/protobuf": "2.5.2", + "@typescript/vfs": "^1.5.2", + "typescript": "5.4.5" + } + }, + "node_modules/@bufbuild/protoplugin/node_modules/typescript": { + "version": "5.4.5", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.5.tgz", + "integrity": 
"sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/@daily-co/daily-js": { + "version": "0.79.0", + "resolved": "https://registry.npmjs.org/@daily-co/daily-js/-/daily-js-0.79.0.tgz", + "integrity": "sha512-Ii/Zi6cfTl2EZBpX8msRPNkkCHcajA+ErXpbN2Xe2KySd1Nb4IzC/QWJlSl9VA9pIlYPQicRTDoZnoym/0uEAw==", + "dependencies": { + "@babel/runtime": "^7.12.5", + "@sentry/browser": "^8.33.1", + "bowser": "^2.8.1", + "dequal": "^2.0.3", + "events": "^3.1.0" + }, + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.5.tgz", + "integrity": "sha512-9o3TMmpmftaCMepOdA5k/yDw8SfInyzWWTjYTFCX3kPSDJMROQTb8jg+h9Cnwnmm1vOzvxN7gIfB5V2ewpjtGA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.5.tgz", + "integrity": "sha512-AdJKSPeEHgi7/ZhuIPtcQKr5RQdo6OO2IL87JkianiMYMPbCtot9fxPbrMiBADOWWm3T2si9stAiVsGbTQFkbA==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.5.tgz", + "integrity": "sha512-VGzGhj4lJO+TVGV1v8ntCZWJktV7SGCs3Pn1GRWI1SBFtRALoomm8k5E9Pmwg3HOAal2VDc2F9+PM/rEY6oIDg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.5", + "resolved": 
"https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.5.tgz", + "integrity": "sha512-D2GyJT1kjvO//drbRT3Hib9XPwQeWd9vZoBJn+bu/lVsOZ13cqNdDeqIF/xQ5/VmWvMduP6AmXvylO/PIc2isw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.5.tgz", + "integrity": "sha512-GtaBgammVvdF7aPIgH2jxMDdivezgFu6iKpmT+48+F8Hhg5J/sfnDieg0aeG/jfSvkYQU2/pceFPDKlqZzwnfQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.5.tgz", + "integrity": "sha512-1iT4FVL0dJ76/q1wd7XDsXrSW+oLoquptvh4CLR4kITDtqi2e/xwXwdCVH8hVHU43wgJdsq7Gxuzcs6Iq/7bxQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.5.tgz", + "integrity": "sha512-nk4tGP3JThz4La38Uy/gzyXtpkPW8zSAmoUhK9xKKXdBCzKODMc2adkB2+8om9BDYugz+uGV7sLmpTYzvmz6Sw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.5.tgz", + "integrity": "sha512-PrikaNjiXdR2laW6OIjlbeuCPrPaAl0IwPIaRv+SMV8CiM8i2LqVUHFC1+8eORgWyY7yhQY+2U2fA55mBzReaw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.5", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.5.tgz", + "integrity": "sha512-cPzojwW2okgh7ZlRpcBEtsX7WBuqbLrNXqLU89GxWbNt6uIg78ET82qifUy3W6OVww6ZWobWub5oqZOVtwolfw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.5.tgz", + "integrity": "sha512-Z9kfb1v6ZlGbWj8EJk9T6czVEjjq2ntSYLY2cw6pAZl4oKtfgQuS4HOq41M/BcoLPzrUbNd+R4BXFyH//nHxVg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.5.tgz", + "integrity": "sha512-sQ7l00M8bSv36GLV95BVAdhJ2QsIbCuCjh/uYrWiMQSUuV+LpXwIqhgJDcvMTj+VsQmqAHL2yYaasENvJ7CDKA==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.5.tgz", + "integrity": "sha512-0ur7ae16hDUC4OL5iEnDb0tZHDxYmuQyhKhsPBV8f99f6Z9KQM02g33f93rNH5A30agMS46u2HP6qTdEt6Q1kg==", + "cpu": [ + "loong64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.5.tgz", + "integrity": "sha512-kB/66P1OsHO5zLz0i6X0RxlQ+3cu0mkxS3TKFvkb5lin6uwZ/ttOkP3Z8lfR9mJOBk14ZwZ9182SIIWFGNmqmg==", + "cpu": [ + "mips64el" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.25.5", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.5.tgz", + "integrity": "sha512-UZCmJ7r9X2fe2D6jBmkLBMQetXPXIsZjQJCjgwpVDz+YMcS6oFR27alkgGv3Oqkv07bxdvw7fyB71/olceJhkQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.5.tgz", + "integrity": "sha512-kTxwu4mLyeOlsVIFPfQo+fQJAV9mh24xL+y+Bm6ej067sYANjyEw1dNHmvoqxJUCMnkBdKpvOn0Ahql6+4VyeA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.5.tgz", + "integrity": "sha512-K2dSKTKfmdh78uJ3NcWFiqyRrimfdinS5ErLSn3vluHNeHVnBAFWC8a4X5N+7FgVE1EjXS1QDZbpqZBjfrqMTQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.5.tgz", + "integrity": "sha512-uhj8N2obKTE6pSZ+aMUbqq+1nXxNjZIIjCjGLfsWvVpy7gKCOL6rsY1MhRh9zLtUtAI7vpgLMK6DxjO8Qm9lJw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.5.tgz", + "integrity": "sha512-pwHtMP9viAy1oHPvgxtOv+OkduK5ugofNTVDilIzBLpoWAM16r7b/mxBvfpuQDpRQFMfuVr5aLcn4yveGvBZvw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.5", + "resolved": 
"https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.5.tgz", + "integrity": "sha512-WOb5fKrvVTRMfWFNCroYWWklbnXH0Q5rZppjq0vQIdlsQKuw6mdSihwSo4RV/YdQ5UCKKvBy7/0ZZYLBZKIbwQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.5.tgz", + "integrity": "sha512-7A208+uQKgTxHd0G0uqZO8UjK2R0DDb4fDmERtARjSHWxqMTye4Erz4zZafx7Di9Cv+lNHYuncAkiGFySoD+Mw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.5.tgz", + "integrity": "sha512-G4hE405ErTWraiZ8UiSoesH8DaCsMm0Cay4fsFWOOUcz8b8rC6uCvnagr+gnioEjWn0wC+o1/TAHt+It+MpIMg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.5.tgz", + "integrity": "sha512-l+azKShMy7FxzY0Rj4RCt5VD/q8mG/e+mDivgspo+yL8zW7qEwctQ6YqKX34DTEleFAvCIUviCFX1SDZRSyMQA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.5.tgz", + "integrity": "sha512-O2S7SNZzdcFG7eFKgvwUEZ2VG9D/sn/eIiz8XRZ1Q/DO5a3s76Xv0mdBzVM5j5R639lXQmPmSo0iRpHqUUrsxw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.5", + "resolved": 
"https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.5.tgz", + "integrity": "sha512-onOJ02pqs9h1iMJ1PQphR+VZv8qBMQ77Klcsqv9CNW2w6yLqoURLcgERAIurY6QE63bbLuqgP9ATqajFLK5AMQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.5.tgz", + "integrity": "sha512-TXv6YnJ8ZMVdX+SXWVBo/0p8LTcrUYngpWjvm91TMjjBQii7Oz11Lw5lbDV5Y0TzuhSJHwiH4hEtC1I42mMS0g==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@pipecat-ai/client-js": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/@pipecat-ai/client-js/-/client-js-0.4.1.tgz", + "integrity": "sha512-3jLKRzeryqLxtkqvr4Bvxe2OxoI7mdOFecm6iolZizXnk/BE480SEg2oAKyov3b5oT6+jmPlT+1HRBlTzEtL7A==", + "dependencies": { + "@types/events": "^3.0.3", + "clone-deep": "^4.0.1", + "events": "^3.3.0", + "typed-emitter": "^2.1.0", + "uuid": "^10.0.0" + } + }, + "node_modules/@pipecat-ai/websocket-transport": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/@pipecat-ai/websocket-transport/-/websocket-transport-0.4.2.tgz", + "integrity": "sha512-mOYnw9n60usODrE35D+uhFbJXl0DqXV32pAqSHu1of049s128mex6Qv+W49DBMVr8h5W6pLGrXhm+XDAtN5leg==", + "dependencies": { + "@daily-co/daily-js": "^0.79.0", + "@protobuf-ts/plugin": "^2.11.0", + "@protobuf-ts/runtime": "^2.11.0", + "x-law": "^0.3.1" + }, + "peerDependencies": { + "@pipecat-ai/client-js": "~0.4.0" + } + }, + "node_modules/@protobuf-ts/plugin": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/plugin/-/plugin-2.11.0.tgz", + "integrity": "sha512-Y+p4Axrk3thxws4BVSIO+x4CKWH2c8k3K+QPrp6Oq8agdsXPL/uwsMTIdpTdXIzTaUEZFASJL9LU56pob5GTHg==", + "dependencies": { + "@bufbuild/protobuf": "^2.4.0", + "@bufbuild/protoplugin": "^2.4.0", + 
"@protobuf-ts/protoc": "^2.11.0", + "@protobuf-ts/runtime": "^2.11.0", + "@protobuf-ts/runtime-rpc": "^2.11.0", + "typescript": "^3.9" + }, + "bin": { + "protoc-gen-dump": "bin/protoc-gen-dump", + "protoc-gen-ts": "bin/protoc-gen-ts" + } + }, + "node_modules/@protobuf-ts/plugin/node_modules/typescript": { + "version": "3.9.10", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.9.10.tgz", + "integrity": "sha512-w6fIxVE/H1PkLKcCPsFqKE7Kv7QUwhU8qQY2MueZXWx5cPZdwFupLgKK3vntcK98BtNHZtAF4LA/yl2a7k8R6Q==", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + }, + "node_modules/@protobuf-ts/protoc": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/protoc/-/protoc-2.11.0.tgz", + "integrity": "sha512-GYfmv1rjZ/7MWzUqMszhdXiuoa4Js/j6zCbcxFmeThBBUhbrXdPU42vY+QVCHL9PvAMXO+wEhUfPWYdd1YgnlA==", + "bin": { + "protoc": "protoc.js" + } + }, + "node_modules/@protobuf-ts/runtime": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/runtime/-/runtime-2.11.0.tgz", + "integrity": "sha512-DfpRpUiNvPC3Kj48CmlU4HaIEY1Myh++PIumMmohBAk8/k0d2CkxYxJfPyUAxfuUfl97F4AvuCu1gXmfOG7OJQ==" + }, + "node_modules/@protobuf-ts/runtime-rpc": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/runtime-rpc/-/runtime-rpc-2.11.0.tgz", + "integrity": "sha512-g/oMPym5LjVyCc3nlQc6cHer0R3CyleBos4p7CjRNzdKuH/FlRXzfQYo6EN5uv8vLtn7zEK9Cy4YBKvHStIaag==", + "dependencies": { + "@protobuf-ts/runtime": "^2.11.0" + } + }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": 
"sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==" + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": 
"sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==" + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.11", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.11.tgz", + "integrity": "sha512-L/gAA/hyCSuzTF1ftlzUSI/IKr2POHsv1Dd78GfqkR83KMNuswWD61JxGV2L7nRwBBBSDr6R1gCkdTmoN7W4ag==", + "dev": true + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.43.0.tgz", + "integrity": "sha512-Krjy9awJl6rKbruhQDgivNbD1WuLb8xAclM4IR4cN5pHGAs2oIMMQJEiC3IC/9TZJ+QZkmZhlMO/6MBGxPidpw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.43.0.tgz", + "integrity": "sha512-ss4YJwRt5I63454Rpj+mXCXicakdFmKnUNxr1dLK+5rv5FJgAxnN7s31a5VchRYxCFWdmnDWKd0wbAdTr0J5EA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.43.0.tgz", + "integrity": "sha512-eKoL8ykZ7zz8MjgBenEF2OoTNFAPFz1/lyJ5UmmFSz5jW+7XbH1+MAgCVHy72aG59rbuQLcJeiMrP8qP5d/N0A==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.43.0.tgz", + 
"integrity": "sha512-SYwXJgaBYW33Wi/q4ubN+ldWC4DzQY62S4Ll2dgfr/dbPoF50dlQwEaEHSKrQdSjC6oIe1WgzosoaNoHCdNuMg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.43.0.tgz", + "integrity": "sha512-SV+U5sSo0yujrjzBF7/YidieK2iF6E7MdF6EbYxNz94lA+R0wKl3SiixGyG/9Klab6uNBIqsN7j4Y/Fya7wAjQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.43.0.tgz", + "integrity": "sha512-J7uCsiV13L/VOeHJBo5SjasKiGxJ0g+nQTrBkAsmQBIdil3KhPnSE9GnRon4ejX1XDdsmK/l30IYLiAaQEO0Cg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.43.0.tgz", + "integrity": "sha512-gTJ/JnnjCMc15uwB10TTATBEhK9meBIY+gXP4s0sHD1zHOaIh4Dmy1X9wup18IiY9tTNk5gJc4yx9ctj/fjrIw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.43.0.tgz", + "integrity": "sha512-ZJ3gZynL1LDSIvRfz0qXtTNs56n5DI2Mq+WACWZ7yGHFUEirHBRt7fyIk0NsCKhmRhn7WAcjgSkSVVxKlPNFFw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.43.0.tgz", + "integrity": 
"sha512-8FnkipasmOOSSlfucGYEu58U8cxEdhziKjPD2FIa0ONVMxvl/hmONtX/7y4vGjdUhjcTHlKlDhw3H9t98fPvyA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.43.0.tgz", + "integrity": "sha512-KPPyAdlcIZ6S9C3S2cndXDkV0Bb1OSMsX0Eelr2Bay4EsF9yi9u9uzc9RniK3mcUGCLhWY9oLr6er80P5DE6XA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loongarch64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.43.0.tgz", + "integrity": "sha512-HPGDIH0/ZzAZjvtlXj6g+KDQ9ZMHfSP553za7o2Odegb/BEfwJcR0Sw0RLNpQ9nC6Gy8s+3mSS9xjZ0n3rhcYg==", + "cpu": [ + "loong64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.43.0.tgz", + "integrity": "sha512-gEmwbOws4U4GLAJDhhtSPWPXUzDfMRedT3hFMyRAvM9Mrnj+dJIFIeL7otsv2WF3D7GrV0GIewW0y28dOYWkmw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.43.0.tgz", + "integrity": "sha512-XXKvo2e+wFtXZF/9xoWohHg+MuRnvO29TI5Hqe9xwN5uN8NKUYy7tXUG3EZAlfchufNCTHNGjEx7uN78KsBo0g==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.43.0.tgz", + 
"integrity": "sha512-ruf3hPWhjw6uDFsOAzmbNIvlXFXlBQ4nk57Sec8E8rUxs/AI4HD6xmiiasOOx/3QxS2f5eQMKTAwk7KHwpzr/Q==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.43.0.tgz", + "integrity": "sha512-QmNIAqDiEMEvFV15rsSnjoSmO0+eJLoKRD9EAa9rrYNwO/XRCtOGM3A5A0X+wmG+XRrw9Fxdsw+LnyYiZWWcVw==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.43.0.tgz", + "integrity": "sha512-jAHr/S0iiBtFyzjhOkAics/2SrXE092qyqEg96e90L3t9Op8OTzS6+IX0Fy5wCt2+KqeHAkti+eitV0wvblEoQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.43.0.tgz", + "integrity": "sha512-3yATWgdeXyuHtBhrLt98w+5fKurdqvs8B53LaoKD7P7H7FKOONLsBVMNl9ghPQZQuYcceV5CDyPfyfGpMWD9mQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.43.0.tgz", + "integrity": "sha512-wVzXp2qDSCOpcBCT5WRWLmpJRIzv23valvcTwMHEobkjippNf+C3ys/+wf07poPkeNix0paTNemB2XrHr2TnGw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.43.0.tgz", + "integrity": 
"sha512-fYCTEyzf8d+7diCw8b+asvWDCLMjsCEA8alvtAutqJOJp/wL5hs1rWSqJ1vkjgW0L2NB4bsYJrpKkiIPRR9dvw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.43.0.tgz", + "integrity": "sha512-SnGhLiE5rlK0ofq8kzuDkM0g7FN1s5VYY+YSMTibP7CqShxCQvqtNxTARS4xX4PFJfHjG0ZQYX9iGzI3FQh5Aw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@sentry-internal/browser-utils": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/browser-utils/-/browser-utils-8.55.0.tgz", + "integrity": "sha512-ROgqtQfpH/82AQIpESPqPQe0UyWywKJsmVIqi3c5Fh+zkds5LUxnssTj3yNd1x+kxaPDVB023jAP+3ibNgeNDw==", + "dependencies": { + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry-internal/feedback": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/feedback/-/feedback-8.55.0.tgz", + "integrity": "sha512-cP3BD/Q6pquVQ+YL+rwCnorKuTXiS9KXW8HNKu4nmmBAyf7urjs+F6Hr1k9MXP5yQ8W3yK7jRWd09Yu6DHWOiw==", + "dependencies": { + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry-internal/replay": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/replay/-/replay-8.55.0.tgz", + "integrity": "sha512-roCDEGkORwolxBn8xAKedybY+Jlefq3xYmgN2fr3BTnsXjSYOPC7D1/mYqINBat99nDtvgFvNfRcZPiwwZ1hSw==", + "dependencies": { + "@sentry-internal/browser-utils": "8.55.0", + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry-internal/replay-canvas": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/replay-canvas/-/replay-canvas-8.55.0.tgz", + "integrity": 
"sha512-nIkfgRWk1091zHdu4NbocQsxZF1rv1f7bbp3tTIlZYbrH62XVZosx5iHAuZG0Zc48AETLE7K4AX9VGjvQj8i9w==", + "dependencies": { + "@sentry-internal/replay": "8.55.0", + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry/browser": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry/browser/-/browser-8.55.0.tgz", + "integrity": "sha512-1A31mCEWCjaMxJt6qGUK+aDnLDcK6AwLAZnqpSchNysGni1pSn1RWSmk9TBF8qyTds5FH8B31H480uxMPUJ7Cw==", + "dependencies": { + "@sentry-internal/browser-utils": "8.55.0", + "@sentry-internal/feedback": "8.55.0", + "@sentry-internal/replay": "8.55.0", + "@sentry-internal/replay-canvas": "8.55.0", + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry/core": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry/core/-/core-8.55.0.tgz", + "integrity": "sha512-6g7jpbefjHYs821Z+EBJ8r4Z7LT5h80YSWRJaylGS4nW5W5Z2KXzpdnyFarv37O7QjauzVC2E+PABmpkw5/JGA==", + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@swc/core": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core/-/core-1.12.1.tgz", + "integrity": "sha512-aKXdDTqxTVFl/bKQZ3EQUjEMBEoF6JBv29moMZq0kbVO43na6u/u+3Vcbhbrh+A2N0X5OL4RaveuWfAjEgOmeA==", + "dev": true, + "hasInstallScript": true, + "dependencies": { + "@swc/counter": "^0.1.3", + "@swc/types": "^0.1.23" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/swc" + }, + "optionalDependencies": { + "@swc/core-darwin-arm64": "1.12.1", + "@swc/core-darwin-x64": "1.12.1", + "@swc/core-linux-arm-gnueabihf": "1.12.1", + "@swc/core-linux-arm64-gnu": "1.12.1", + "@swc/core-linux-arm64-musl": "1.12.1", + "@swc/core-linux-x64-gnu": "1.12.1", + "@swc/core-linux-x64-musl": "1.12.1", + "@swc/core-win32-arm64-msvc": "1.12.1", + "@swc/core-win32-ia32-msvc": "1.12.1", + "@swc/core-win32-x64-msvc": "1.12.1" + }, + "peerDependencies": { + 
"@swc/helpers": ">=0.5.17" + }, + "peerDependenciesMeta": { + "@swc/helpers": { + "optional": true + } + } + }, + "node_modules/@swc/core-darwin-arm64": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-darwin-arm64/-/core-darwin-arm64-1.12.1.tgz", + "integrity": "sha512-nUjWVcJ3YS2N40ZbKwYO2RJ4+o2tWYRzNOcIQp05FqW0+aoUCVMdAUUzQinPDynfgwVshDAXCKemY8X7nN5MaA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-darwin-x64": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-darwin-x64/-/core-darwin-x64-1.12.1.tgz", + "integrity": "sha512-OGm4a4d3OeJn+tRt8H/eiHgTFrJbS6r8mi/Ob65tAEXZGHN900T2kR7c5ALr0V2hBOQ8BfhexwPoQlGQP/B95w==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-arm-gnueabihf": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.12.1.tgz", + "integrity": "sha512-76YeeQKyK0EtNkQiNBZ0nbVGooPf9IucY0WqVXVpaU4wuG7ZyLEE2ZAIgXafIuzODGQoLfetue7I8boMxh1/MA==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-arm64-gnu": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.12.1.tgz", + "integrity": "sha512-BxJDIJPq1+aCh9UsaSAN6wo3tuln8UhNXruOrzTI8/ElIig/3sAueDM6Eq7GvZSGGSA7ljhNATMJ0elD7lFatQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-arm64-musl": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.12.1.tgz", + "integrity": 
"sha512-NhLdbffSXvY0/FwUSAl4hKBlpe5GHQGXK8DxTo3HHjLsD9sCPYieo3vG0NQoUYAy4ZUY1WeGjyxeq4qZddJzEQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-x64-gnu": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.12.1.tgz", + "integrity": "sha512-CrYnV8SZIgArQ9LKH0xEF95PKXzX9WkRSc5j55arOSBeDCeDUQk1Bg/iKdnDiuj5HC1hZpvzwMzSBJjv+Z70jA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-x64-musl": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.12.1.tgz", + "integrity": "sha512-BQMl3d0HaGB0/h2xcKlGtjk/cGRn2tnbsaChAKcjFdCepblKBCz1pgO/mL7w5iXq3s57wMDUn++71/a5RAkZOA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-win32-arm64-msvc": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.12.1.tgz", + "integrity": "sha512-b7NeGnpqTfmIGtUqXBl0KqoSmOnH64nRZoT5l4BAGdvwY7nxitWR94CqZuwyLPty/bLywmyDA9uO12Kvgb3+gg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-win32-ia32-msvc": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.12.1.tgz", + "integrity": "sha512-iU/29X2D7cHBp1to62cUg/5Xk8K+lyOJiKIGGW5rdzTW/c2zz3d/ehgpzVP/rqC4NVr88MXspqHU4il5gmDajw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-win32-x64-msvc": { + "version": "1.12.1", + "resolved": 
"https://registry.npmjs.org/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.12.1.tgz", + "integrity": "sha512-+Zh+JKDwiFqV5N9yAd2DhYVGPORGh9cfenu1ptr9yge+eHAf7vZJcC3rnj6QMR1QJh0Y5VC9+YBjRFjZVA7XDw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/counter": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz", + "integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==", + "dev": true + }, + "node_modules/@swc/types": { + "version": "0.1.23", + "resolved": "https://registry.npmjs.org/@swc/types/-/types-0.1.23.tgz", + "integrity": "sha512-u1iIVZV9Q0jxY+yM2vw/hZGDNudsN85bBpTqzAQ9rzkxW9D+e3aEM4Han+ow518gSewkXgjmEK0BD79ZcNVgPw==", + "dev": true, + "dependencies": { + "@swc/counter": "^0.1.3" + } + }, + "node_modules/@types/estree": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", + "integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==", + "dev": true + }, + "node_modules/@types/events": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/events/-/events-3.0.3.tgz", + "integrity": "sha512-trOc4AAUThEz9hapPtSd7wf5tiQKvTtu5b371UxXdTuqzIh0ArcRspRP0i0Viu+LXstIQ1z96t1nsPxT9ol01g==" + }, + "node_modules/@types/node": { + "version": "22.15.32", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.32.tgz", + "integrity": "sha512-3jigKqgSjsH6gYZv2nEsqdXfZqIFGAV36XYYjf9KGZ3PSG+IhLecqPnI310RvjutyMwifE2hhhNEklOUrvx/wA==", + "dev": true, + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/@types/node/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": 
"sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true + }, + "node_modules/@types/protobufjs": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/@types/protobufjs/-/protobufjs-6.0.0.tgz", + "integrity": "sha512-A27RDExpAf3rdDjIrHKiJK6x8kqqJ4CmoChwtipfhVAn1p7+wviQFFP7dppn8FslSbHtQeVPvi8wNKkDjSYjHw==", + "deprecated": "This is a stub types definition for protobufjs (https://github.com/dcodeIO/ProtoBuf.js). protobufjs provides its own type definitions, so you don't need @types/protobufjs installed!", + "dev": true, + "dependencies": { + "protobufjs": "*" + } + }, + "node_modules/@typescript/vfs": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/@typescript/vfs/-/vfs-1.6.1.tgz", + "integrity": "sha512-JwoxboBh7Oz1v38tPbkrZ62ZXNHAk9bJ7c9x0eI5zBfBnBYGhURdbnh7Z4smN/MV48Y5OCcZb58n972UtbazsA==", + "dependencies": { + "debug": "^4.1.1" + }, + "peerDependencies": { + "typescript": "*" + } + }, + "node_modules/@vitejs/plugin-react-swc": { + "version": "3.10.2", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react-swc/-/plugin-react-swc-3.10.2.tgz", + "integrity": "sha512-xD3Rdvrt5LgANug7WekBn1KhcvLn1H3jNBfJRL3reeOIua/WnZOEV5qi5qIBq5T8R0jUDmRtxuvk4bPhzGHDWw==", + "dev": true, + "dependencies": { + "@rolldown/pluginutils": "1.0.0-beta.11", + "@swc/core": "^1.11.31" + }, + "peerDependencies": { + "vite": "^4 || ^5 || ^6 || ^7.0.0-beta.0" + } + }, + "node_modules/bowser": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/bowser/-/bowser-2.11.0.tgz", + "integrity": "sha512-AlcaJBi/pqqJBIQ8U9Mcpc9i8Aqxn88Skv5d+xBX006BY5u8N3mGLHa5Lgppa7L/HfwgwLgZ6NYs+Ag6uUmJRA==" + }, + "node_modules/clone-deep": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-4.0.1.tgz", + "integrity": "sha512-neHB9xuzh/wk0dIHweyAXv2aPGZIVk3pLMe+/RNzINf17fe0OG96QroktYAUm7SM1PBnzTabaLboqqxDyMU+SQ==", + "dependencies": { + "is-plain-object": "^2.0.4", + 
"kind-of": "^6.0.2", + "shallow-clone": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/debug": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz", + "integrity": "sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "engines": { + "node": ">=6" + } + }, + "node_modules/esbuild": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.5.tgz", + "integrity": "sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==", + "dev": true, + "hasInstallScript": true, + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.5", + "@esbuild/android-arm": "0.25.5", + "@esbuild/android-arm64": "0.25.5", + "@esbuild/android-x64": "0.25.5", + "@esbuild/darwin-arm64": "0.25.5", + "@esbuild/darwin-x64": "0.25.5", + "@esbuild/freebsd-arm64": "0.25.5", + "@esbuild/freebsd-x64": "0.25.5", + "@esbuild/linux-arm": "0.25.5", + "@esbuild/linux-arm64": "0.25.5", + "@esbuild/linux-ia32": "0.25.5", + "@esbuild/linux-loong64": "0.25.5", + "@esbuild/linux-mips64el": "0.25.5", + "@esbuild/linux-ppc64": "0.25.5", + "@esbuild/linux-riscv64": "0.25.5", + "@esbuild/linux-s390x": "0.25.5", + "@esbuild/linux-x64": "0.25.5", + "@esbuild/netbsd-arm64": "0.25.5", + "@esbuild/netbsd-x64": "0.25.5", + "@esbuild/openbsd-arm64": "0.25.5", + "@esbuild/openbsd-x64": "0.25.5", + "@esbuild/sunos-x64": "0.25.5", + "@esbuild/win32-arm64": "0.25.5", + 
"@esbuild/win32-ia32": "0.25.5", + "@esbuild/win32-x64": "0.25.5" + } + }, + "node_modules/events": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", + "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", + "engines": { + "node": ">=0.8.x" + } + }, + "node_modules/fdir": { + "version": "6.4.6", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.6.tgz", + "integrity": "sha512-hiFoqpyZcfNm1yc4u8oWCf9A2c4D3QjCrks3zmoVKVxpQRzmPNar1hUJcBG2RQHvEVGDN+Jm81ZheVLAQMK6+w==", + "dev": true, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/kind-of": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz", + "integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==", + 
"engines": { + "node": ">=0.10.0" + } + }, + "node_modules/long": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==" + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true + }, + "node_modules/picomatch": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.2.tgz", + "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": 
"https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/protobufjs": { + "version": "7.5.3", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.3.tgz", + "integrity": "sha512-sildjKwVqOI2kmFDiXQ6aEB0fjYTafpEvIBs8tOR8qI4spuL9OPROLVu2qZqi/xgCfsHIwVqlaF8JBjWFHnKbw==", + "hasInstallScript": true, + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/node": ">=13.7.0", + "long": "^5.0.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/protobufjs/node_modules/@types/node": { + "version": "24.0.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.3.tgz", + "integrity": "sha512-R4I/kzCYAdRLzfiCabn9hxWfbuHS573x+r0dJMkkzThEa7pbrcDWK+9zu3e7aBOouf+rQAciqPFMnxwr0aWgKg==", + "dependencies": { + "undici-types": "~7.8.0" + } + }, + "node_modules/rollup": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.43.0.tgz", + "integrity": "sha512-wdN2Kd3Twh8MAEOEJZsuxuLKCsBEo4PVNLK6tQWAn10VhsVewQLzcucMgLolRlhFybGxfclbPeEYBaP6RvUFGg==", + "dev": true, + "dependencies": { + "@types/estree": "1.0.7" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.43.0", + "@rollup/rollup-android-arm64": "4.43.0", + "@rollup/rollup-darwin-arm64": "4.43.0", + "@rollup/rollup-darwin-x64": "4.43.0", + "@rollup/rollup-freebsd-arm64": "4.43.0", 
+ "@rollup/rollup-freebsd-x64": "4.43.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.43.0", + "@rollup/rollup-linux-arm-musleabihf": "4.43.0", + "@rollup/rollup-linux-arm64-gnu": "4.43.0", + "@rollup/rollup-linux-arm64-musl": "4.43.0", + "@rollup/rollup-linux-loongarch64-gnu": "4.43.0", + "@rollup/rollup-linux-powerpc64le-gnu": "4.43.0", + "@rollup/rollup-linux-riscv64-gnu": "4.43.0", + "@rollup/rollup-linux-riscv64-musl": "4.43.0", + "@rollup/rollup-linux-s390x-gnu": "4.43.0", + "@rollup/rollup-linux-x64-gnu": "4.43.0", + "@rollup/rollup-linux-x64-musl": "4.43.0", + "@rollup/rollup-win32-arm64-msvc": "4.43.0", + "@rollup/rollup-win32-ia32-msvc": "4.43.0", + "@rollup/rollup-win32-x64-msvc": "4.43.0", + "fsevents": "~2.3.2" + } + }, + "node_modules/rxjs": { + "version": "7.8.2", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.2.tgz", + "integrity": "sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==", + "optional": true, + "dependencies": { + "tslib": "^2.1.0" + } + }, + "node_modules/shallow-clone": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-3.0.1.tgz", + "integrity": "sha512-/6KqX+GVUdqPuPPd2LxDDxzX6CAbjJehAAOKlNpqqUpAqPM6HeL8f+o3a+JsyGjn2lv0WY8UsTgUJjU9Ok55NA==", + "dependencies": { + "kind-of": "^6.0.2" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/tinyglobby": { + "version": "0.2.14", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.14.tgz", + "integrity": "sha512-tX5e7OM1HnYr2+a2C/4V0htOcSQcoSTH9KgJnVvNm5zm/cyEWKJ7j7YutsH9CxMdtOkkLFy2AHrMci9IM8IPZQ==", + "dev": true, + "dependencies": { + "fdir": "^6.4.4", + 
"picomatch": "^4.0.2" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "optional": true + }, + "node_modules/typed-emitter": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/typed-emitter/-/typed-emitter-2.1.0.tgz", + "integrity": "sha512-g/KzbYKbH5C2vPkaXGu8DJlHrGKHLsM25Zg9WuC9pMGfuvT+X25tZQWo5fK1BjBm8+UrVE9LDCvaY0CQk+fXDA==", + "optionalDependencies": { + "rxjs": "*" + } + }, + "node_modules/typescript": { + "version": "5.8.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", + "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz", + "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==" + }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/vite": { + "version": "6.3.5", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", + "integrity": "sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "dev": true, + "dependencies": { + "esbuild": "^0.25.0", + "fdir": "^6.4.4", + 
"picomatch": "^4.0.2", + "postcss": "^8.5.3", + "rollup": "^4.34.9", + "tinyglobby": "^0.2.13" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || ^20.0.0 || >=22.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", + "jiti": ">=1.21.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.16.0", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "jiti": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/x-law": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/x-law/-/x-law-0.3.1.tgz", + "integrity": "sha512-Nvo6OKj6UL2LuzAc08uJkwIDkK2PsTEdpLiY82NkwMptuRpAA1V7arUl7ZY12BcgRYNq8uh1pdAu7G6VeQn7Hg==", + "engines": { + "node": ">=18" + } + } + } +} diff --git a/examples/voice_agent/nemo_chatbot/client/package.json b/examples/voice_agent/nemo_chatbot/client/package.json new file mode 100644 index 000000000000..d2df048f50f8 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/package.json @@ -0,0 +1,26 @@ +{ + "name": "client", + "version": "1.0.0", + "main": "index.js", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "preview": "vite preview" + }, + "keywords": [], + "author": "", + "license": "ISC", + "description": "", + "devDependencies": { + "@types/node": "^22.15.30", + "@types/protobufjs": "^6.0.0", + "@vitejs/plugin-react-swc": "^3.10.1", + 
"typescript": "^5.8.3", + "vite": "^6.3.5" + }, + "dependencies": { + "@pipecat-ai/client-js": "^0.4.0", + "@pipecat-ai/websocket-transport": "^0.4.1", + "protobufjs": "^7.4.0" + } +} diff --git a/examples/voice_agent/nemo_chatbot/client/src/app.ts b/examples/voice_agent/nemo_chatbot/client/src/app.ts new file mode 100644 index 000000000000..871d3fcc551f --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/src/app.ts @@ -0,0 +1,522 @@ +/** + * Copyright (c) 2024–2025, Daily + * + * SPDX-License-Identifier: BSD 2-Clause License + */ + +/** + * RTVI Client Implementation + * + * This client connects to an RTVI-compatible bot server using WebSocket. + * + * Requirements: + * - A running RTVI bot server (defaults to http://localhost:7860) + */ + +import { + RTVIClient, + RTVIClientOptions, + RTVIEvent, +} from '@pipecat-ai/client-js'; +import { + WebSocketTransport +} from "@pipecat-ai/websocket-transport"; + +class WebsocketClientApp { + private rtviClient: RTVIClient | null = null; + private connectBtn: HTMLButtonElement | null = null; + private disconnectBtn: HTMLButtonElement | null = null; + private muteBtn: HTMLButtonElement | null = null; + private resetBtn: HTMLButtonElement | null = null; + private serverSelect: HTMLSelectElement | null = null; + private statusSpan: HTMLElement | null = null; + private debugLog: HTMLElement | null = null; + private volumeBar: HTMLElement | null = null; + private volumeText: HTMLElement | null = null; + private botAudio: HTMLAudioElement; + private isConnecting: boolean = false; + private isDisconnecting: boolean = false; + private isMuted: boolean = false; + private audioContext: AudioContext | null = null; + private analyser: AnalyserNode | null = null; + private microphone: MediaStreamAudioSourceNode | null = null; + private volumeUpdateInterval: number | null = null; + + // Server configurations + private readonly serverConfigs = { + websocket: { + name: 'WebSocket Server', + baseUrl: 'http://10.110.41.36:7860', + 
// baseUrl: 'http://localhost:7860', + port: 8765 + }, + fastapi: { + name: 'FastAPI Server', + baseUrl: 'http://localhost:8000', + port: 8000 + } + }; + + constructor() { + console.log("WebsocketClientApp"); + this.botAudio = document.createElement('audio'); + this.botAudio.autoplay = true; + //this.botAudio.playsInline = true; + document.body.appendChild(this.botAudio); + + this.setupDOMElements(); + this.setupEventListeners(); + } + + /** + * Set up references to DOM elements and create necessary media elements + */ + private setupDOMElements(): void { + this.connectBtn = document.getElementById('connect-btn') as HTMLButtonElement; + this.disconnectBtn = document.getElementById('disconnect-btn') as HTMLButtonElement; + this.muteBtn = document.getElementById('mute-btn') as HTMLButtonElement; + this.resetBtn = document.getElementById('reset-btn') as HTMLButtonElement; + this.serverSelect = document.getElementById('server-select') as HTMLSelectElement; + this.statusSpan = document.getElementById('connection-status'); + this.debugLog = document.getElementById('debug-log'); + this.volumeBar = document.getElementById('volume-bar'); + this.volumeText = document.getElementById('volume-text'); + } + + /** + * Set up event listeners for connect/disconnect buttons + */ + private setupEventListeners(): void { + this.connectBtn?.addEventListener('click', () => this.connect()); + this.disconnectBtn?.addEventListener('click', () => this.disconnect()); + this.muteBtn?.addEventListener('click', () => this.toggleMute()); + this.resetBtn?.addEventListener('click', () => this.reset()); + this.serverSelect?.addEventListener('change', () => this.updateServerUrl()); + } + + /** + * Add a timestamped message to the debug log + */ + private log(message: string): void { + if (!this.debugLog) return; + const entry = document.createElement('div'); + entry.textContent = `${new Date().toISOString()} - ${message}`; + if (message.startsWith('User: ')) { + entry.style.color = '#2196F3'; + } 
else if (message.startsWith('Bot: ')) { + entry.style.color = '#4CAF50'; + } + this.debugLog.appendChild(entry); + this.debugLog.scrollTop = this.debugLog.scrollHeight; + console.log(message); + } + + /** + * Update the connection status display + */ + private updateStatus(status: string): void { + if (this.statusSpan) { + this.statusSpan.textContent = status; + } + this.log(`Status: ${status}`); + } + + /** + * Check for available media tracks and set them up if present + * This is called when the bot is ready or when the transport state changes to ready + */ + setupMediaTracks() { + if (!this.rtviClient) return; + const tracks = this.rtviClient.tracks(); + if (tracks.bot?.audio) { + this.setupAudioTrack(tracks.bot.audio); + } + } + + /** + * Set up listeners for track events (start/stop) + * This handles new tracks being added during the session + */ + setupTrackListeners() { + if (!this.rtviClient) { + this.log('Cannot setup track listeners: client is null'); + return; + } + + try { + // Listen for new tracks starting + this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => { + // Only handle non-local (bot) tracks + if (!participant?.local && track.kind === 'audio') { + this.setupAudioTrack(track); + } + }); + + // Listen for tracks stopping + this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => { + this.log(`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`); + }); + } catch (error) { + this.log(`Error setting up track listeners: ${error}`); + } + } + + /** + * Set up an audio track for playback + * Handles both initial setup and track updates + */ + private setupAudioTrack(track: MediaStreamTrack): void { + this.log('Setting up audio track'); + if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) { + const oldTrack = this.botAudio.srcObject.getAudioTracks()[0]; + if (oldTrack?.id === track.id) return; + } + this.botAudio.srcObject = new MediaStream([track]); + } + + /** + * Initialize and 
connect to the bot + * This sets up the RTVI client, initializes devices, and establishes the connection + */ + public async connect(): Promise { + if (this.isConnecting) { + this.log('Connection already in progress, ignoring...'); + return; + } + + try { + this.isConnecting = true; + const startTime = Date.now(); + + //const transport = new DailyTransport(); + const transport = new WebSocketTransport(); + const RTVIConfig: RTVIClientOptions = { + transport, + params: { + // The baseURL and endpoint of your bot server that the client will connect to + baseUrl: this.getSelectedServerConfig().baseUrl, + endpoints: { connect: '/connect' }, + }, + enableMic: true, + enableCam: false, + callbacks: { + onConnected: () => { + this.updateStatus('Connected'); + if (this.connectBtn) this.connectBtn.disabled = true; + if (this.disconnectBtn) this.disconnectBtn.disabled = false; + if (this.muteBtn) { + this.muteBtn.disabled = false; + this.muteBtn.textContent = 'Mute'; + } + if (this.resetBtn) this.resetBtn.disabled = false; + if (this.serverSelect) this.serverSelect.disabled = true; + // Start volume monitoring when connected + if (!this.isMuted) { + this.startVolumeMonitoring(); + } + }, + onDisconnected: () => { + // Only handle disconnect if we're not in the middle of error cleanup + if (!this.isConnecting) { + this.updateStatus('Disconnected'); + if (this.connectBtn) this.connectBtn.disabled = false; + if (this.disconnectBtn) this.disconnectBtn.disabled = true; + if (this.muteBtn) { + this.muteBtn.disabled = true; + this.muteBtn.textContent = 'Mute'; + } + if (this.resetBtn) this.resetBtn.disabled = true; + if (this.serverSelect) this.serverSelect.disabled = false; + // Stop volume monitoring when disconnected + this.stopVolumeMonitoring(); + this.log('Client disconnected'); + } + }, + onBotReady: (data) => { + this.log(`Bot ready: ${JSON.stringify(data)}`); + this.setupMediaTracks(); + }, + onUserTranscript: (data) => { + if (data.final) { + this.log(`User: 
${data.text}`); + } + }, + onBotTranscript: (data) => this.log(`Bot: ${data.text}`), + onMessageError: (error) => console.error('Message error:', error), + onError: (error) => console.error('Error:', error), + }, + } + + // Create the client with error handling + try { + this.rtviClient = new RTVIClient(RTVIConfig); + this.setupTrackListeners(); + } catch (clientError) { + this.log(`Error creating RTVI client: ${clientError}`); + throw clientError; + } + + this.log('Initializing devices...'); + await this.rtviClient.initDevices(); + + this.log('Connecting to bot...'); + await this.rtviClient.connect(); + + const timeTaken = Date.now() - startTime; + this.log(`Connection complete, timeTaken: ${timeTaken}`); + } catch (error) { + this.log(`Error connecting: ${(error as Error).message}`); + this.updateStatus('Error'); + // Clean up if there's an error + await this.cleanupOnError(); + } finally { + this.isConnecting = false; + } + } + + /** + * Clean up resources when there's an error during connection + */ + private async cleanupOnError(): Promise { + // Set disconnecting flag to prevent onDisconnected callback interference + this.isDisconnecting = true; + + // Store reference to client before it might become null + const client = this.rtviClient; + + if (client) { + try { + // Check if the client is in a state where disconnect can be called + if (typeof client.disconnect === 'function') { + await client.disconnect(); + } + } catch (disconnectError) { + this.log(`Error during cleanup disconnect: ${disconnectError}`); + } finally { + // Always reset the client to null to allow reconnection + this.rtviClient = null; + } + } else { + this.log('Client was already null during cleanup'); + } + + // Reset button states + if (this.connectBtn) this.connectBtn.disabled = false; + if (this.disconnectBtn) this.disconnectBtn.disabled = true; + if (this.muteBtn) { + this.muteBtn.disabled = true; + this.muteBtn.textContent = 'Mute'; + } + if (this.resetBtn) this.resetBtn.disabled = 
true; + if (this.serverSelect) this.serverSelect.disabled = false; + + // Stop volume monitoring + this.stopVolumeMonitoring(); + + // Reset mute state + this.isMuted = false; + + // Reset disconnecting flag + this.isDisconnecting = false; + } + + /** + * Disconnect from the bot and clean up media resources + */ + public async disconnect(): Promise { + if (this.isDisconnecting) { + this.log('Disconnection already in progress, ignoring...'); + return; + } + + this.isDisconnecting = true; + + // Store reference to client before it might become null + const client = this.rtviClient; + + if (client) { + try { + // Check if the client is in a state where disconnect can be called + if (typeof client.disconnect === 'function') { + await client.disconnect(); + } + } catch (error) { + this.log(`Error disconnecting: ${(error as Error).message}`); + } finally { + // Always clean up resources and reset the client + this.rtviClient = null; + if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) { + this.botAudio.srcObject.getAudioTracks().forEach((track) => track.stop()); + this.botAudio.srcObject = null; + } + } + } else { + this.log('Client was already null during disconnect'); + } + + // Stop volume monitoring + this.stopVolumeMonitoring(); + + // Reset mute state + this.isMuted = false; + + this.isDisconnecting = false; + } + + /** + * Toggle microphone mute/unmute + */ + private toggleMute(): void { + if (!this.rtviClient) { + this.log('Cannot toggle mute: client is null'); + return; + } + + this.isMuted = !this.isMuted; + this.rtviClient.enableMic(!this.isMuted); + + // Update button text + if (this.muteBtn) { + this.muteBtn.textContent = this.isMuted ? 'Unmute' : 'Mute'; + } + + // Update volume monitoring + if (this.isMuted) { + this.stopVolumeMonitoring(); + } else { + this.startVolumeMonitoring(); + } + + this.log(this.isMuted ? 
'Microphone muted' : 'Microphone unmuted'); + } + + /** + * Start monitoring microphone volume + */ + private async startVolumeMonitoring(): Promise { + try { + if (!this.audioContext) { + this.audioContext = new AudioContext(); + } + + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + + this.analyser = this.audioContext.createAnalyser(); + this.analyser.fftSize = 256; + this.analyser.smoothingTimeConstant = 0.8; + + this.microphone = this.audioContext.createMediaStreamSource(stream); + this.microphone.connect(this.analyser); + + // Start continuous volume updates + this.volumeUpdateInterval = window.setInterval(() => { + this.updateVolumeDisplay(); + }, 100); // Update every 100ms + + this.log('Volume monitoring started'); + } catch (error) { + this.log(`Error starting volume monitoring: ${error}`); + } + } + + /** + * Stop monitoring microphone volume + */ + private stopVolumeMonitoring(): void { + if (this.volumeUpdateInterval) { + clearInterval(this.volumeUpdateInterval); + this.volumeUpdateInterval = null; + } + + if (this.microphone) { + this.microphone.disconnect(); + this.microphone = null; + } + + // Reset volume display + this.updateVolumeDisplay(0); + this.log('Volume monitoring stopped'); + } + + /** + * Update the volume display + */ + private updateVolumeDisplay(volume?: number): void { + if (!this.volumeBar || !this.volumeText) return; + + if (volume === undefined && this.analyser) { + const dataArray = new Uint8Array(this.analyser.frequencyBinCount); + this.analyser.getByteFrequencyData(dataArray); + + // Calculate average volume + const average = dataArray.reduce((sum, value) => sum + value, 0) / dataArray.length; + volume = (average / 255) * 100; + } + + const displayVolume = volume || 0; + const clampedVolume = Math.min(100, Math.max(0, displayVolume)); + + this.volumeBar.style.width = `${clampedVolume}%`; + this.volumeText.textContent = `${Math.round(clampedVolume)}%`; + + // Update color based on volume level + if 
(clampedVolume < 30) { + this.volumeBar.style.background = '#4caf50'; // Green + } else if (clampedVolume < 70) { + this.volumeBar.style.background = '#ff9800'; // Orange + } else { + this.volumeBar.style.background = '#f44336'; // Red + } + } + + /** + * Reset the conversation context by calling the server action + */ + private async reset(): Promise { + if (!this.rtviClient) { + this.log('Cannot reset: not connected to server'); + return; + } + + try { + this.log('Resetting conversation context...'); + + // Call the reset action on the server + const result = await this.rtviClient.action({ service: 'context', action: 'reset', arguments: [] }); + + if (result) { + this.log('Conversation context reset successfully'); + } else { + this.log('Failed to reset conversation context'); + } + } catch (error) { + this.log(`Error resetting context: ${error}`); + } + } + + private getSelectedServerConfig(): { name: string; baseUrl: string; port: number } { + const selectedValue = this.serverSelect?.value || 'websocket'; + return this.serverConfigs[selectedValue as keyof typeof this.serverConfigs]; + } + + private updateServerUrl(): void { + const selectedConfig = this.getSelectedServerConfig(); + this.log(`Server changed to: ${selectedConfig.name} (${selectedConfig.baseUrl})`); + + // If connected, show a message that they need to reconnect + if (this.rtviClient) { + this.log('Please disconnect and reconnect to use the new server'); + } + } +} + +declare global { + interface Window { + WebsocketClientApp: typeof WebsocketClientApp; + } +} + +window.addEventListener('DOMContentLoaded', () => { + window.WebsocketClientApp = WebsocketClientApp; + new WebsocketClientApp(); +}); diff --git a/examples/voice_agent/nemo_chatbot/client/src/style.css b/examples/voice_agent/nemo_chatbot/client/src/style.css new file mode 100644 index 000000000000..a19b4e0f1bc0 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/src/style.css @@ -0,0 +1,180 @@ +body { + margin: 0; + padding: 
20px; + font-family: Arial, sans-serif; + background-color: #f0f0f0; +} + +.container { + max-width: 1200px; + margin: 0 auto; +} + +.status-bar { + display: flex; + justify-content: space-between; + align-items: center; + padding: 10px; + background-color: #fff; + border-radius: 8px; + margin-bottom: 20px; +} + +.controls button { + padding: 8px 16px; + margin-left: 10px; + border: none; + border-radius: 4px; + cursor: pointer; +} + +#connect-btn { + background-color: #4caf50; + color: white; +} + +#disconnect-btn { + background-color: #f44336; + color: white; +} + +#mute-btn { + background-color: #ff9800; + color: white; +} + +#mute-btn:disabled { + background-color: #ccc; + color: #666; +} + +button:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.volume-indicator { + display: flex; + align-items: center; + gap: 10px; + padding: 10px; + background-color: #fff; + border-radius: 8px; + margin-bottom: 20px; +} + +.volume-label { + font-weight: bold; + min-width: 120px; +} + +.volume-bar-container { + flex: 1; + height: 20px; + background-color: #e0e0e0; + border-radius: 10px; + overflow: hidden; + position: relative; +} + +.volume-bar { + height: 100%; + background: linear-gradient(90deg, #4caf50, #ff9800, #f44336); + width: 0%; + transition: width 0.1s ease; + border-radius: 10px; +} + +.volume-text { + min-width: 40px; + text-align: right; + font-weight: bold; + font-size: 14px; +} + +.main-content { + background-color: #fff; + border-radius: 8px; + padding: 20px; + margin-bottom: 20px; +} + +.bot-container { + display: flex; + flex-direction: column; + align-items: center; +} + +#bot-video-container { + width: 640px; + height: 360px; + background-color: #e0e0e0; + border-radius: 8px; + margin: 20px auto; + overflow: hidden; + display: flex; + align-items: center; + justify-content: center; +} + +#bot-video-container video { + width: 100%; + height: 100%; + object-fit: cover; +} + +.debug-panel { + background-color: #fff; + border-radius: 8px; + padding: 
20px; +} + +.debug-panel h3 { + margin: 0 0 10px 0; + font-size: 16px; + font-weight: bold; +} + +#debug-log { + height: 500px; + overflow-y: auto; + background-color: #f8f8f8; + padding: 10px; + border-radius: 4px; + font-family: monospace; + font-size: 12px; + line-height: 1.4; +} + +.server-selection { + display: flex; + align-items: center; + gap: 8px; + margin-bottom: 10px; +} + +.server-selection label { + font-weight: bold; + color: #333; +} + +.server-selection select { + padding: 6px 12px; + border: 1px solid #ccc; + border-radius: 4px; + background-color: white; + font-size: 14px; + cursor: pointer; +} + +.server-selection select:focus { + outline: none; + border-color: #2196F3; + box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2); +} + +.server-selection select:disabled { + background-color: #f5f5f5; + cursor: not-allowed; + opacity: 0.6; +} diff --git a/examples/voice_agent/nemo_chatbot/client/tsconfig.json b/examples/voice_agent/nemo_chatbot/client/tsconfig.json new file mode 100644 index 000000000000..c9c555d96f35 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/tsconfig.json @@ -0,0 +1,111 @@ +{ + "compilerOptions": { + /* Visit https://aka.ms/tsconfig to read more about this file */ + + /* Projects */ + // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ + // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ + // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ + // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ + // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ + // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. 
*/ + + /* Language and Environment */ + "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ + // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ + // "jsx": "preserve", /* Specify what JSX code is generated. */ + // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ + // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ + // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ + // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ + // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ + // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ + // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ + // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ + // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ + + /* Modules */ + "module": "commonjs", /* Specify what module code is generated. */ + // "rootDir": "./", /* Specify the root folder within your source files. */ + // "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */ + // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ + // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ + // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. 
*/ + // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ + // "types": [], /* Specify type package names to be included without being referenced in a source file. */ + // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ + // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ + // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */ + // "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */ + // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ + // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ + // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ + // "noUncheckedSideEffectImports": true, /* Check side effect imports. */ + // "resolveJsonModule": true, /* Enable importing .json files. */ + // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ + // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ + + /* JavaScript Support */ + // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ + // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ + // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. 
*/ + + /* Emit */ + // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + // "declarationMap": true, /* Create sourcemaps for d.ts files. */ + // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ + // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ + // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ + // "noEmit": true, /* Disable emitting files from a compilation. */ + // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ + // "outDir": "./", /* Specify an output folder for all emitted files. */ + // "removeComments": true, /* Disable emitting comments. */ + // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ + // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ + // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ + // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ + // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ + // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ + // "newLine": "crlf", /* Set the newline character for emitting files. */ + // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ + // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ + // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. 
*/ + // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ + // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + + /* Interop Constraints */ + // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ + // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ + // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */ + // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ + "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ + // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ + "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ + + /* Type Checking */ + "strict": true, /* Enable all strict type-checking options. */ + // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ + // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ + // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ + // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ + // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. 
*/ + // "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */ + // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ + // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ + // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ + // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ + // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ + // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ + // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ + // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ + // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ + // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ + // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ + // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ + // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ + + /* Completeness */ + // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + "skipLibCheck": true /* Skip type checking all .d.ts files. 
*/ + } +} diff --git a/examples/voice_agent/nemo_chatbot/client/vite.config.js b/examples/voice_agent/nemo_chatbot/client/vite.config.js new file mode 100644 index 000000000000..936725c03697 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/client/vite.config.js @@ -0,0 +1,17 @@ +import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react-swc'; + +export default defineConfig({ + plugins: [react()], + server: { + host: '0.0.0.0', // Bind to all interfaces + port: 5173, // Back to default Vite port + proxy: { + // Proxy /api requests to the backend server + '/connect': { + target: 'http://0.0.0.0:7860', // Replace with your backend URL + changeOrigin: true, + }, + }, + }, +}); diff --git a/examples/voice_agent/nemo_chatbot/environment.yml b/examples/voice_agent/nemo_chatbot/environment.yml new file mode 100644 index 000000000000..8fe2883649a5 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/environment.yml @@ -0,0 +1,435 @@ +name: nemo-pipecat +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2025.2.25=h06a4308_0 + - cudatoolkit=11.8.0=h6a678d5_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - libxcb=1.17.0=h9b100fa_0 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.16=h5eee18b_0 + - pip=25.1=pyhc872135_2 + - pthread-stubs=0.3=h0ce48e5_1 + - python=3.10.12=h955ad1f_0 + - readline=8.2=h5eee18b_0 + - setuptools=78.1.1=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tk=8.6.14=h993c535_1 + - wheel=0.45.1=py310h06a4308_0 + - xorg-libx11=1.8.12=h9b100fa_1 + - xorg-libxau=1.0.12=h9b100fa_0 + - xorg-libxdmcp=1.1.5=h9b100fa_0 + - xorg-xorgproto=2024.1=h5eee18b_1 + - xz=5.6.4=h5eee18b_1 + - zlib=1.2.13=h5eee18b_1 + - pip: + - absl-py==2.3.0 + - accelerate==1.7.0 + - accelerated-scan==0.2.0 + - addict==2.4.0 + - 
aiofiles==24.1.0 + - aiohappyeyeballs==2.6.1 + - aiohttp==3.11.18 + - aiosignal==1.3.2 + - alabaster==1.0.0 + - alembic==1.16.2 + - aniso8601==10.0.1 + - annotated-types==0.7.0 + - antlr4-python3-runtime==4.9.3 + - anyio==4.9.0 + - asciitree==0.3.3 + - asttokens==3.0.0 + - async-timeout==5.0.1 + - attrdict==2.0.1 + - attrs==25.3.0 + - audioread==3.0.1 + - av==14.4.0 + - babel==2.17.0 + - backoff==2.2.1 + - bcrypt==4.3.0 + - beautifulsoup4==4.13.1 + - bitsandbytes==0.46.0 + - black==24.10.0 + - blinker==1.9.0 + - boto3==1.38.38 + - botocore==1.38.38 + - braceexpand==0.1.7 + - bracex==2.5.post1 + - catalogue==2.0.10 + - cdifflib==1.2.6 + - certifi==2025.6.15 + - cffi==1.17.1 + - chardet==5.2.0 + - charset-normalizer==3.4.2 + - click==8.2.1 + - clip==0.2.0 + - cloudpickle==3.1.1 + - colorama==0.4.6 + - coloredlogs==15.0.1 + - colorlog==6.9.0 + - contourpy==1.3.2 + - coverage==7.9.1 + - cryptography==42.0.8 + - cycler==0.12.1 + - cytoolz==1.0.1 + - dataclasses-json==0.6.7 + - dataproperty==1.1.0 + - datasets==3.6.0 + - decorator==5.2.1 + - decord==0.6.0 + - defusedxml==0.7.1 + - deprecated==1.2.18 + - diffusers==0.33.1 + - dill==0.3.8 + - distance==0.1.3 + - distro==1.9.0 + - docker==7.1.0 + - docopt==0.6.2 + - docstring-parser==0.16 + - docutils==0.21.2 + - dotenv==0.9.9 + - editdistance==0.8.1 + - einops==0.8.1 + - einops-exts==0.0.4 + - emoji==2.14.1 + - eval-type-backport==0.2.2 + - evaluate==0.4.3 + - exceptiongroup==1.3.0 + - executing==2.2.0 + - fabric==3.2.2 + - faiss-cpu==1.11.0 + - fastapi==0.115.13 + - fasteners==0.19 + - fiddle==0.3.0 + - filelock==3.18.0 + - filetype==1.2.0 + - flashlight==0.1.1 + - flashlight-text==0.0.7 + - flask==3.1.1 + - flask-restful==0.3.10 + - flatbuffers==25.2.10 + - fonttools==4.58.4 + - frozenlist==1.7.0 + - fsspec==2024.12.0 + - ftfy==6.3.1 + - future==1.0.0 + - g2p-en==2.1.0 + - gdown==5.2.0 + - gitdb==4.0.12 + - gitpython==3.1.44 + - glibc==0.6.1 + - greenlet==3.2.3 + - grpcio==1.67.1 + - grpcio-tools==1.67.1 + - h11==0.16.0 
+ - h5py==3.14.0 + - hf-xet==1.1.4 + - httpcore==1.0.9 + - httptools==0.6.4 + - httpx==0.28.1 + - huggingface-hub==0.33.0 + - humanfriendly==10.0 + - hydra-core==1.3.2 + - idna==3.10 + - ijson==3.4.0 + - imageio==2.37.0 + - imagesize==1.4.1 + - immutabledict==4.2.0 + - importlib-metadata==8.7.0 + - indic-numtowords==1.0.2 + - inflect==7.5.0 + - iniconfig==2.1.0 + - inquirerpy==0.3.4 + - intervaltree==3.1.0 + - invoke==2.2.0 + - ipython==8.37.0 + - isort==5.13.2 + - itsdangerous==2.2.0 + - janome==0.5.0 + - jedi==0.19.2 + - jieba==0.42.1 + - jinja2==3.1.6 + - jiter==0.10.0 + - jiwer==3.1.0 + - jmespath==1.0.1 + - joblib==1.5.1 + - jsonlines==4.0.0 + - jsonschema==4.24.0 + - jsonschema-specifications==2025.4.1 + - kaldi-python-io==1.2.2 + - kaldiio==2.18.1 + - kiwisolver==1.4.8 + - kornia==0.8.1 + - kornia-rs==0.1.9 + - langdetect==1.0.9 + - latexcodec==3.0.1 + - lazy-loader==0.4 + - levenshtein==0.27.1 + - lhotse==1.30.3 + - libcst==1.8.2 + - librosa==0.11.0 + - lightning==2.4.0 + - lightning-utilities==0.14.3 + - lilcom==1.8.1 + - livekit==1.0.9 + - livekit-agents==1.1.1 + - livekit-api==1.0.2 + - livekit-plugins-turn-detector==1.1.1 + - livekit-protocol==1.0.3 + - llvmlite==0.44.0 + - loguru==0.7.3 + - lxml==5.4.0 + - mako==1.3.10 + - markdown==3.8 + - markdown-it-py==3.0.0 + - markdown2==2.5.3 + - markupsafe==3.0.2 + - marshmallow==3.26.1 + - matplotlib==3.10.3 + - matplotlib-inline==0.1.7 + - mbstrdecoder==1.1.4 + - mdurl==0.1.2 + - mediapy==1.1.6 + - megatron-core==0.12.1 + - megatron-energon==5.2.0 + - ml-dtypes==0.5.1 + - more-itertools==10.7.0 + - mpmath==1.3.0 + - msgpack==1.1.1 + - multi-storage-client==0.23.0 + - multidict==6.5.0 + - multiprocess==0.70.16 + - mypy-extensions==1.1.0 + - nemo-run==0.4.0 + - nemo-text-processing==1.1.0 + - nemo-toolkit==2.5.0rc0 + - nerfacc==0.5.3 + - nest-asyncio==1.6.0 + - networkx==3.4.2 + - ninja==1.11.1.4 + - nltk==3.9.1 + - num2words==0.5.14 + - numba==0.61.2 + - numcodecs==0.13.1 + - numexpr==2.11.0 + - numpy==1.26.4 
+ - nvidia-cublas-cu12==12.6.4.1 + - nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-lm-eval==25.5 + - nvidia-ml-py==12.575.51 + - nvidia-modelopt==0.31.0 + - nvidia-modelopt-core==0.31.0 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - nvidia-resiliency-ext==0.4.0 + - nvidia-riva-client==2.21.0 + - nvtx==0.2.12 + - omegaconf==2.3.0 + - onnx==1.17.0 + - onnxruntime==1.22.0 + - open-clip-torch==2.24.0 + - openai==1.70.0 + - opencc==1.1.9 + - opencc-python-reimplemented==0.1.7 + - opentelemetry-api==1.34.1 + - optuna==4.4.0 + - packaging==24.2 + - pandas==2.3.0 + - pangu==4.0.6.1 + - parameterized==0.9.0 + - paramiko==3.5.1 + - parso==0.8.4 + - pathspec==0.12.1 + - pathvalidate==3.3.1 + - peft==0.15.2 + - pesq==0.0.4 + - pexpect==4.9.0 + - pfzy==0.3.4 + - pillow==11.1.0 + - pipecat-ai==0.1.dev4182 + - plac==1.4.5 + - platformdirs==4.3.8 + - pluggy==1.6.0 + - pooch==1.8.2 + - portalocker==3.2.0 + - prettytable==3.16.0 + - progress==1.6 + - prompt-toolkit==3.0.51 + - propcache==0.3.2 + - protobuf==5.29.5 + - psutil==7.0.0 + - ptyprocess==0.7.0 + - pulp==3.2.1 + - pure-eval==0.2.3 + - pyannote-core==5.0.0 + - pyannote-database==5.1.3 + - pyannote-metrics==3.2.1 + - pyarrow==20.0.0 + - pybind11==2.13.6 + - pybtex==0.24.0 + - pybtex-docutils==1.0.3 + - pycparser==2.22 + - pydantic==2.10.6 + - pydantic-core==2.27.2 + - pydub==0.25.1 + - pygments==2.19.1 + - pyjwt==2.10.1 + - pyloudnorm==0.1.1 + - pynacl==1.5.0 + - pynini==2.1.6.post1 + - pynvml==12.0.0 + - pyparsing==3.2.3 + - pypdf==5.6.0 + - pypinyin==0.54.0 + - pypinyin-dict==0.9.0 + - pyre-extensions==0.0.32 + - pysocks==1.7.1 + - pystoi==0.4.1 + - pytablewriter==1.2.1 + - 
pytest==8.4.0 + - pytest-cov==6.2.1 + - pytest-httpserver==1.1.3 + - pytest-mock==3.14.1 + - pytest-random-order==1.1.1 + - pytest-runner==6.0.1 + - python-dateutil==2.9.0.post0 + - python-dotenv==1.1.0 + - python-graphviz==0.21 + - python-iso639==2025.2.18 + - python-magic==0.4.27 + - pytorch-lightning==2.5.1.post0 + - pytz==2025.2 + - pyyaml==6.0.2 + - qwen-vl-utils==0.0.11 + - rapidfuzz==3.13.0 + - referencing==0.36.2 + - regex==2024.11.6 + - requests==2.32.4 + - requests-toolbelt==1.0.0 + - resampy==0.4.3 + - rich==14.0.0 + - rouge-score==0.1.2 + - rpds-py==0.25.1 + - ruamel-yaml==0.18.14 + - ruamel-yaml-clib==0.2.12 + - s3fs==0.4.2 + - s3transfer==0.13.0 + - sacrebleu==2.5.1 + - sacremoses==0.1.1 + - safetensors==0.5.3 + - sanic==0.7.0 + - scikit-learn==1.7.0 + - scipy==1.15.3 + - seaborn==0.13.2 + - sentence-transformers==4.1.0 + - sentencepiece==0.2.0 + - sentry-sdk==2.30.0 + - setproctitle==1.3.6 + - shellingham==1.5.4 + - six==1.17.0 + - smmap==5.0.2 + - sniffio==1.3.1 + - snowballstemmer==3.0.1 + - sortedcontainers==2.4.0 + - sounddevice==0.5.2 + - soundfile==0.13.1 + - soupsieve==2.7 + - sox==1.5.0 + - soxr==0.5.0.post1 + - sphinx==8.1.3 + - sphinxcontrib-applehelp==2.0.0 + - sphinxcontrib-bibtex==2.6.4 + - sphinxcontrib-devhelp==2.0.0 + - sphinxcontrib-htmlhelp==2.1.0 + - sphinxcontrib-jsmath==1.0.1 + - sphinxcontrib-qthelp==2.0.0 + - sphinxcontrib-serializinghtml==2.0.0 + - sqlalchemy==2.0.41 + - stack-data==0.6.3 + - starlette==0.46.2 + - structlog==25.4.0 + - sympy==1.14.0 + - tabledata==1.3.4 + - tabulate==0.9.0 + - taming-transformers==0.0.1 + - tcolorpy==0.1.7 + - tenacity==9.1.2 + - tensorboard==2.19.0 + - tensorboard-data-server==0.7.2 + - tensorstore==0.1.71 + - termcolor==3.1.0 + - text-unidecode==1.3 + - textdistance==4.6.3 + - texterrors==0.5.1 + - threadpoolctl==3.6.0 + - tiktoken==0.7.0 + - timm==1.0.15 + - tokenizers==0.21.1 + - tomli==2.2.1 + - toolz==1.0.0 + - torch==2.7.1 + - torchaudio==2.7.1 + - torchdiffeq==0.2.5 + - 
torchmetrics==1.7.3 + - torchprofile==0.0.4 + - torchsde==0.2.6 + - torchvision==0.22.1 + - torchx==0.7.0 + - tqdm==4.67.1 + - tqdm-multiprocess==0.0.11 + - traitlets==5.14.3 + - trampoline==0.1.2 + - transformers==4.51.3 + - tree-sitter==0.24.0 + - tree-sitter-python==0.23.6 + - trimesh==4.6.12 + - triton==3.3.1 + - typeguard==4.4.3 + - typepy==1.3.4 + - typer==0.16.0 + - types-protobuf==4.25.0.20240417 + - typing-extensions==4.14.0 + - typing-inspect==0.9.0 + - typing-inspection==0.4.1 + - tzdata==2025.2 + - ujson==5.10.0 + - unstructured==0.14.9 + - unstructured-client==0.36.0 + - urllib3==1.26.20 + - uvicorn==0.34.3 + - uvloop==0.21.0 + - wandb==0.20.1 + - watchfiles==1.1.0 + - wcmatch==10.0 + - wcwidth==0.2.13 + - webdataset==0.2.111 + - websockets==15.0.1 + - werkzeug==3.1.3 + - wget==3.2 + - whisper-normalizer==0.1.12 + - word2number==1.1 + - wrapt==1.17.2 + - xattr==1.1.4 + - xxhash==3.5.0 + - yarl==1.20.1 + - zarr==2.18.3 + - zipp==3.23.0 + - zstandard==0.23.0 diff --git a/examples/voice_agent/nemo_chatbot/server/bot_websocket_server.py b/examples/voice_agent/nemo_chatbot/server/bot_websocket_server.py new file mode 100644 index 000000000000..a0629f75fa94 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/server/bot_websocket_server.py @@ -0,0 +1,381 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import asyncio +import copy +import os +import signal +import sys + +from loguru import logger +from omegaconf import OmegaConf + +# Configure loguru to output to both console and file +logger.remove() # Remove default handler +logger.add( + sys.stderr, + format="{time:YYYY-MM-DD HH:mm:ss.SSSS} | {level: <8} | {name}:{function}:{line} - {message}", + level="DEBUG", +) + +logger.add("bot_server.log", rotation="1 day", level="DEBUG") + +# Global flag for graceful shutdown +shutdown_event = asyncio.Event() + +from pipecat.audio.vad.silero import SileroVADAnalyzer, VADParams +from pipecat.frames.frames import EndTaskFrame, MetricsFrame +from pipecat.metrics.metrics import LLMUsageMetricsData, ProcessingMetricsData, TTFBMetricsData, TTSUsageMetricsData +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.processors.frame_processor import Frame, FrameDirection, FrameProcessor +from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserver, RTVIProcessor +from pipecat.serializers.protobuf import ProtobufFrameSerializer + +from nemo.collections.voice_agent.pipecat.services.nemo.diar import NeMoDiarInputParams, NemoDiarService +from nemo.collections.voice_agent.pipecat.services.nemo.llm import HuggingFaceLLMService +from nemo.collections.voice_agent.pipecat.services.nemo.stt import NeMoSTTInputParams, NemoSTTService +from nemo.collections.voice_agent.pipecat.services.nemo.tts import NeMoFastPitchHiFiGANTTSService +from nemo.collections.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService +from nemo.collections.voice_agent.pipecat.transports.network.websocket_server import ( + WebsocketServerParams, + WebsocketServerTransport, +) +from nemo.collections.voice_agent.pipecat.utils.text.simple_text_aggregator import 
SimpleSegmentedTextAggregator + +SERVER_CONFIG_PATH = os.environ.get( + "SERVER_CONFIG_PATH", f"{os.path.dirname(os.path.abspath(__file__))}/server_config.yaml" +) + +server_config = OmegaConf.load(SERVER_CONFIG_PATH) + +logger.info(f"Server config: {server_config}") + +# Default Configuration +SAMPLE_RATE = 16000 # Standard sample rate for speech recognition +RAW_AUDIO_FRAME_LEN_IN_SECS = 0.016 # 16ms for websocket transport +BOT_PROMPT = """ +You are a helpful AI agent named Lisa. +Start by greeting the user warmly and introducing yourself within one sentence. +Your answer should be concise and to the point. +""" + +################ Start of Configuration ################# +if server_config.get("bot_prompt", None) is not None: + bot_prompt = server_config.bot_prompt + if os.path.isfile(bot_prompt): + with open(bot_prompt, "r") as f: + bot_prompt = f.read() + BOT_PROMPT = bot_prompt + +logger.info(f"BOT_PROMPT: {BOT_PROMPT}") + +TRANSPORT_AUDIO_OUT_10MS_CHUNKS = server_config.transport.audio_out_10ms_chunks + +vad_params = VADParams( + confidence=server_config.vad.confidence, + start_secs=server_config.vad.start_secs, + stop_secs=server_config.vad.stop_secs, + min_volume=server_config.vad.min_volume, +) + +STT_MODEL_PATH = server_config.stt.model +STT_DEVICE = server_config.stt.device +stt_params = NeMoSTTInputParams( + att_context_size=server_config.stt.att_context_size, + frame_len_in_secs=server_config.stt.frame_len_in_secs, + raw_audio_frame_len_in_secs=RAW_AUDIO_FRAME_LEN_IN_SECS, +) + +DIAR_MODEL = server_config.diar.model +USE_DIAR = server_config.diar.enabled +diar_params = NeMoDiarInputParams( + frame_len_in_secs=server_config.diar.frame_len_in_secs, + threshold=server_config.diar.threshold, +) + +TURN_TAKING_MAX_BUFFER_SIZE = server_config.turn_taking.max_buffer_size + +LLM_MODEL = server_config.llm.model +LLM_DEVICE = server_config.llm.device +LLM_TEMPERATURE = server_config.llm.temperature +LLM_MAX_TOKENS = server_config.llm.max_tokens +LLM_TOP_P = 
server_config.llm.top_p + + +TTS_FASTPITCH_MODEL = server_config.tts.fastpitch_model +TTS_HIFIGAN_MODEL = server_config.tts.hifigan_model +TTS_DEVICE = server_config.tts.device + +EXTRA_SEPARATOR = server_config.tts.get("extra_separator", ":,!?") + +################ End of Configuration ################# + + +def signal_handler(signum, frame): + """Handle shutdown signals gracefully""" + logger.info(f"Received signal {signum}, initiating graceful shutdown...") + shutdown_event.set() + + +async def run_bot_websocket_server(): + # Set up signal handlers for graceful shutdown + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + logger.info("Initializing WebSocket server transport...") + logger.info("Server configured to run indefinitely with no timeouts") + + """ + NO-TIMEOUT CONFIGURATION: + - session_timeout=None: Disables WebSocket session timeout + - idle_timeout=None: Disables pipeline idle timeout + - asyncio.wait_for(timeout=None): No timeout on pipeline runner + - Server will run indefinitely until manually stopped (Ctrl+C) + """ + + vad_analyzer = SileroVADAnalyzer( + sample_rate=SAMPLE_RATE, + params=vad_params, + ) + logger.info("VAD analyzer initialized") + + ws_transport = WebsocketServerTransport( + params=WebsocketServerParams( + serializer=ProtobufFrameSerializer(), + audio_in_enabled=True, + audio_out_enabled=True, + add_wav_header=False, + vad_analyzer=vad_analyzer, + session_timeout=None, # Disable session timeout + audio_in_sample_rate=SAMPLE_RATE, + can_create_user_frames=False, + audio_out_10ms_chunks=TRANSPORT_AUDIO_OUT_10MS_CHUNKS, + ), + host="0.0.0.0", # Bind to all interfaces + port=8765, + ) + + logger.info("Initializing STT service...") + + stt = NemoSTTService( + model=STT_MODEL_PATH, + device=STT_DEVICE, + params=stt_params, + sample_rate=SAMPLE_RATE, + audio_passthrough=True, + has_turn_taking=True, + backend="legacy", + decoder_type="rnnt", + ) + logger.info("STT service initialized") + + 
diar = NemoDiarService( + model=DIAR_MODEL, + device=STT_DEVICE, + params=diar_params, + sample_rate=SAMPLE_RATE, + backend="legacy", + enabled=USE_DIAR, + ) + logger.info("Diarization service initialized") + + turn_taking = NeMoTurnTakingService( + use_vad=True, + use_diar=USE_DIAR, + max_buffer_size=TURN_TAKING_MAX_BUFFER_SIZE, + ) + logger.info("Turn taking service initialized") + + logger.info("Initializing LLM service...") + + llm = HuggingFaceLLMService( + model=LLM_MODEL, + device=LLM_DEVICE, + temperature=LLM_TEMPERATURE, + max_tokens=LLM_MAX_TOKENS, + top_p=LLM_TOP_P, + ) + logger.info("LLM service initialized") + + text_aggregator = SimpleSegmentedTextAggregator(punctuation_marks=EXTRA_SEPARATOR) + + tts = NeMoFastPitchHiFiGANTTSService( + fastpitch_model=TTS_FASTPITCH_MODEL, + hifigan_model=TTS_HIFIGAN_MODEL, + device=TTS_DEVICE, + text_aggregator=text_aggregator, + ) + + logger.info("TTS service initialized") + + context = OpenAILLMContext( + [ + { + "role": "system", + "content": BOT_PROMPT, + } + ], + ) + + original_messages = copy.deepcopy(context.get_messages()) + original_context = copy.deepcopy(context) + original_context.set_llm_adapter(llm.get_llm_adapter()) + + context_aggregator = llm.create_context_aggregator(context) + user_context_aggregator = context_aggregator.user() + assistant_context_aggregator = context_aggregator.assistant() + + # RTVI events for Pipecat client UI + rtvi = RTVIProcessor(config=RTVIConfig(config=[])) + + # Add reset action to RTVI processor + async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arguments: dict[str, any]) -> bool: + """Reset both user and assistant context aggregators""" + logger.info("Resetting conversation context...") + try: + user_context_aggregator.reset() + assistant_context_aggregator.reset() + user_context_aggregator.set_messages(copy.deepcopy(original_messages)) + assistant_context_aggregator.set_messages(copy.deepcopy(original_messages)) + + logger.info("Conversation 
context reset successfully") + return True + except Exception as e: + logger.error(f"Error resetting context: {e}") + return False + + reset_action = RTVIAction( + service="context", + action="reset", + result="bool", + arguments=[], + handler=reset_context_handler, + ) + rtvi.register_action(reset_action) + + logger.info("Setting up pipeline...") + + class MetricsLogger(FrameProcessor): + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, MetricsFrame): + for d in frame.data: + if isinstance(d, TTFBMetricsData): + logger.debug(f"TTFB Metrics: {d.processor} = {d.value:.3f}s") + elif isinstance(d, ProcessingMetricsData): + logger.debug(f"Processing Metrics: {d.processor} = {d.value:.3f}s") + elif isinstance(d, LLMUsageMetricsData): + tokens = d.value + logger.debug( + f"LLM Usage: {d.processor} - prompt: {tokens.prompt_tokens}, completion: {tokens.completion_tokens}" + ) + elif isinstance(d, TTSUsageMetricsData): + logger.debug(f"TTS Usage: {d.processor} = {d.value} characters") + await self.push_frame(frame, direction) + + pipeline = Pipeline( + [ + ws_transport.input(), + rtvi, + stt, + diar, + turn_taking, + user_context_aggregator, + llm, # LLM + tts, + ws_transport.output(), + assistant_context_aggregator, + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=False, + enable_usage_metrics=False, + send_initial_empty_metrics=True, + report_only_initial_ttfb=True, + idle_timeout=None, # Disable idle timeout + ), + observers=[RTVIObserver(rtvi)], + idle_timeout_secs=None, + cancel_on_idle_timeout=False, + ) + + # Track task state + task_running = True + + @rtvi.event_handler("on_client_ready") + async def on_client_ready(rtvi: RTVIProcessor): + logger.info("Pipecat client ready.") + await rtvi.set_bot_ready() + # Kick off the conversation. 
+ try: + await task.queue_frames([user_context_aggregator.get_context_frame()]) + except Exception as e: + logger.error(f"Error queuing context frame: {e}") + + @ws_transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Pipecat Client connected from {client.remote_address}") + # Reset RTVI state for new connection + rtvi._client_ready = False + rtvi._bot_ready = False + + @ws_transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Pipecat Client disconnected from {client.remote_address}") + # Don't cancel the task immediately - let it handle the disconnection gracefully + # The task will continue running and can accept new connections + # Only send an EndTaskFrame to clean up the current session + if task_running: + try: + await task.queue_frames([EndTaskFrame()]) + except Exception as e: + # Don't log warnings for normal connection closures + if "ConnectionClosedOK" not in str(e) and "1005" not in str(e): + logger.warning(f"Error sending EndTaskFrame: {e}") + else: + logger.debug(f"Normal connection closure: {e}") + + @ws_transport.event_handler("on_session_timeout") + async def on_session_timeout(transport, client): + logger.info(f"Session timeout for {client.remote_address}") + # Don't cancel the task - keep server running indefinitely + logger.info("Session timeout occurred but keeping server running") + # Note: With session_timeout=None, this handler should never be called + + logger.info("Starting pipeline runner...") + + try: + runner = PipelineRunner() + # Run the task until shutdown is requested + await asyncio.wait_for(runner.run(task), timeout=None) # No timeout - run indefinitely + except asyncio.TimeoutError: + logger.info("Pipeline runner timeout (should not happen with no timeout)") + except Exception as e: + logger.error(f"Pipeline runner error: {e}") + task_running = False + finally: + logger.info("Pipeline runner stopped") + 
+ +if __name__ == "__main__": + asyncio.run(run_bot_websocket_server()) diff --git a/examples/voice_agent/nemo_chatbot/server/env.example b/examples/voice_agent/nemo_chatbot/server/env.example new file mode 100644 index 000000000000..65caf95bdd81 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/server/env.example @@ -0,0 +1,2 @@ +HF_TOKEN= # Your HuggingFace API key +WEBSOCKET_SERVER='websocket_server' # Options: 'fast_api' or 'websocket_server' \ No newline at end of file diff --git a/examples/voice_agent/nemo_chatbot/server/prompts.py b/examples/voice_agent/nemo_chatbot/server/prompts.py new file mode 100644 index 000000000000..be27fed6d4a2 --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/server/prompts.py @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +lunch_menu = """ +Fast Bites Lunch Menu + +Burgers and Sandwiches +1. Classic Cheeseburger – $5.99 + Juicy beef patty, cheddar cheese, pickles, ketchup & mustard on a toasted bun. + - Make it a double cheeseburger by adding another patty - $1.50 +2. Crispy Chicken Sandwich – $6.49 + Fried chicken filet, lettuce, mayo, and pickles on a brioche bun. +3. Veggie Wrap – $5.49 + Grilled vegetables, hummus, lettuce, and tomato in a spinach wrap. + +Combo Deals (includes small fries and fountain soda) +4. Cheeseburger Combo – $8.99 +5. Chicken Sandwich Combo – $9.49 +6. Veggie Wrap Combo – $8.49 + +Sides +7. 
French Fries + - Small - $2.49 + - Medium - $3.49 + - Large - $4.49 +8. Chicken Nuggets + - 4 pcs - $3.29 + - 8 pcs - $5.99 + - 12 pcs - $8.99 +9. Side Salad - $2.99 + +Drinks +10. Fountain Soda (16 oz, choices: Coke, Diet Coke, Sprite, Fanta) – $1.99 +11. Iced Tea or Lemonade – $2.29 +12. Bottled Water – $1.49 +""" + +bot_prompt = f""" +{lunch_menu}\n\n +You are a helpful assistant named Lisa that helps customers order food from the lunch menu.\n +Start by greeting the user warmly and introducing yourself within one sentence "Hi, welcome to Fast Bites! I'm Lisa, what can I help you with?".\n +Your answer should be concise and to the point.\n +Do not include the whole lunch menu in your response, only include the items that are relevant to the user's question.\n +If the user asks about a specific item, you should include the price of that item.\n +If the user asks about the menu, you should include the entire lunch menu.\n +If the user asks about the prices, you should include the prices of the items.\n +If the user asks about the location, you should include the location of the restaurant (123 Main St, Anytown, USA).\n +If the user asks about the hours, you should include the hours of the restaurant (11:00 AM - 9:00 PM).\n +When a user asks for the total price of the order, you should include the total price of the order.\n +When the conversation is done, you should say "Thank you for your order! Your total is <total_price>. Please come back soon!", where <total_price> is the total price of the orders of all speakers.\n +If a speaker finishes their order and you don't know their name, you should ask them for their name and associate it with their order.\n +When introducing an item from the menu, you should include the name of the item and the price.\n +Stick strictly to the lunch menu and do not make up any items.\n +You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context.\n +You should respond to the user based on the speaker tag and the context of that speaker. 
\n +Do not include the speaker tags in your response, use them only to identify the speaker.\n +If there are multiple speakers, you should handle the order of each speaker separately and not mix up the speakers.\n +Do not respond only with "Hi" or "Hi there", you should focus on the task of taking the order and not just greeting the user. \n +""" diff --git a/examples/voice_agent/nemo_chatbot/server/requirements.txt b/examples/voice_agent/nemo_chatbot/server/requirements.txt new file mode 100644 index 000000000000..270c9195f11b --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/server/requirements.txt @@ -0,0 +1,4 @@ +fastapi[all] +pipecat-ai[silero,websocket] +python-dotenv +uvicorn diff --git a/examples/voice_agent/nemo_chatbot/server/server.py b/examples/voice_agent/nemo_chatbot/server/server.py new file mode 100644 index 000000000000..d3e4df98125f --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/server/server.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import os +from contextlib import asynccontextmanager +from typing import Any, Dict + +import uvicorn +from dotenv import load_dotenv +from fastapi import FastAPI, Request, WebSocket +from fastapi.middleware.cors import CORSMiddleware + +# Load environment variables +load_dotenv(override=True) + +from bot_websocket_server import run_bot_websocket_server + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Handles FastAPI startup and shutdown.""" + yield # Run app + + +# Initialize FastAPI app with lifespan manager +app = FastAPI(lifespan=lifespan) + +# Configure CORS to allow requests from any origin +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.websocket("/ws") +async def websocket_endpoint(websocket: WebSocket, path: str = "/ws"): + raise NotImplementedError("FastAPI websocket endpoint is not implemented") + + +@app.post("/connect") +async def bot_connect(request: Request) -> Dict[Any, Any]: + print("Received /connect request") + server_mode = os.getenv("WEBSOCKET_SERVER", "websocket_server") + if server_mode == "websocket_server": + # Use the host that the client connected to (from the request) + server_host = request.url.hostname or request.headers.get("host", "").split(":")[0] + ws_url = f"ws://{server_host}:8765" + else: + ws_url = "ws://localhost:7860/ws" + print(f"Returning WebSocket URL: {ws_url}") + return {"ws_url": ws_url} + + +async def main(): + server_mode = os.getenv("WEBSOCKET_SERVER", "websocket_server") + tasks = [] + try: + if server_mode == "websocket_server": + tasks.append(run_bot_websocket_server()) + config = uvicorn.Config(app, host="0.0.0.0", port=7860) + server = uvicorn.Server(config) + tasks.append(server.serve()) + + await asyncio.gather(*tasks) + except asyncio.CancelledError: + print("Tasks cancelled (probably due to shutdown).") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git 
a/examples/voice_agent/nemo_chatbot/server/server_config.yaml b/examples/voice_agent/nemo_chatbot/server/server_config.yaml new file mode 100644 index 000000000000..c195817f991c --- /dev/null +++ b/examples/voice_agent/nemo_chatbot/server/server_config.yaml @@ -0,0 +1,47 @@ + +bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker." + + +transport: + audio_out_10ms_chunks: 8 + +vad: + type: silero + confidence: 0.6 + start_secs: 0.1 + stop_secs: 0.8 + min_volume: 0.4 + +stt: + type: nemo + model: "/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms.nemo" # "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi" + device: "cuda" + att_context_size: [70, 1] + frame_len_in_secs: 0.08 # default for FastConformer, do not change + +diar: + type: nemo + enabled: true + model: "/home/heh/codes/niva-kunal/im417-normNA-ft3-mem14_epoch23-36.apr23_2025.nemo" + device: "cuda" + threshold: 0.4 + frame_len_in_secs: 0.08 # default for FastConformer, do not change + +turn_taking: + max_buffer_size: 2 + +llm: + type: hf + model: "/media/data/cache2/meta-llama/Meta-Llama-3-8B-Instruct" # "meta-llama/Meta-Llama-3-8B-Instruct" + device: "cuda" + temperature: 0.7 + max_tokens: 128 + top_p: 0.9 + +tts: + type: nemo + model: fastpitch-hifigan + fastpitch_model: "/media/data/cache2/nvidia/tts_en_fastpitch/tts_en_fastpitch.nemo" # "nvidia/tts_en_fastpitch" + hifigan_model: "/media/data/cache2/nvidia/tts_hifigan/tts_hifigan.nemo" # "nvidia/tts_hifigan" + device: "cuda" + extra_separator: ":,?!" 
diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 44b532e25e5c..3dcc6b95ed8b 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -1003,6 +1003,8 @@ def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = num_flattened_tokens += len([c for c in char_offsets[t]['char'] if c != self.blank_id]) if num_flattened_tokens != len(hypothesis.text): + print(f"alignments: {alignments}") + print(f"token_repetitions: {token_repetitions}") raise ValueError( f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}" " have to be of the same length, but are: " diff --git a/nemo/collections/voice_agent/__init__.py b/nemo/collections/voice_agent/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/collections/voice_agent/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/voice_agent/pipecat/__init__.py b/nemo/collections/voice_agent/pipecat/__init__.py new file mode 100644 index 000000000000..55fb128340af --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import pipecat +except ImportError: + raise ImportError("pipecat is not installed. Please install it with `pip install pipecat-ai`.") diff --git a/nemo/collections/voice_agent/pipecat/frames/__init__.py b/nemo/collections/voice_agent/pipecat/frames/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/frames/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/voice_agent/pipecat/frames/frames.py b/nemo/collections/voice_agent/pipecat/frames/frames.py new file mode 100644 index 000000000000..fbe80fe7fecc --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/frames/frames.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from dataclasses import dataclass, field +import numpy as np +from pipecat.frames.frames import DataFrame + + +@dataclass +class DiarResultFrame(DataFrame): + """Diarization frame.""" + + diar_result: np.ndarray | int + stream_id: str = "default" diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py b/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py new file mode 100644 index 000000000000..8b8ab6f75eee --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import sys + +from .diar import NemoDiarService +from .llm import HuggingFaceLLMService +from .stt import NemoSTTService +from .tts import NeMoFastPitchHiFiGANTTSService +from .turn_taking import NeMoTurnTakingService diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py new file mode 100644 index 000000000000..168950ef5e76 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py @@ -0,0 +1,373 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + + +import asyncio +import time +from typing import AsyncGenerator, List, Mapping, Optional, Tuple + +import numpy as np +import torch +from loguru import logger +from omegaconf import OmegaConf +from pipecat.frames.frames import ( + AudioRawFrame, + CancelFrame, + EndFrame, + ErrorFrame, + Frame, + InterimTranscriptionFrame, + StartFrame, + TranscriptionFrame, + VADUserStartedSpeakingFrame, + VADUserStoppedSpeakingFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.services.stt_service import STTService +from pipecat.transcriptions.language import Language +from pipecat.utils.time import time_now_iso8601 +from pipecat.utils.tracing.service_decorators import traced_stt +from pydantic import BaseModel + +from nemo.collections.voice_agent.pipecat.frames.frames import DiarResultFrame +from nemo.collections.voice_agent.pipecat.services.nemo.legacy_diar import DiarizationConfig, NeMoLegacyDiarService + +try: + import nemo.collections.asr as nemo_asr + from nemo.collections.asr.models import ASRModel + + # disable nemo logging + from nemo.utils import logging + + level = logging.getEffectiveLevel() + logging.setLevel(logging.CRITICAL) + + +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error('In order to use NVIDIA NeMo STT, you need to `pip install "nemo_toolkit[asr]"`.') + raise Exception(f"Missing 
module: {e}") + + +class NeMoDiarInputParams(BaseModel): + threshold: Optional[float] = 0.5 + language: Optional[Language] = Language.EN_US + frame_len_in_secs: Optional[float] = 0.08 # 80ms for FastConformer model + config_path: Optional[str] = None # path to the Niva ASR config file + raw_audio_frame_len_in_secs: Optional[float] = 0.016 # 16ms for websocket transport + buffer_size: Optional[int] = ( + 30 # number of audio frames to buffer, 1 frame is 16ms, streaming Sortformer was trained with 6*0.08=0.48s chunks + ) + + +class NemoDiarService(STTService): + def __init__( + self, + *, + model: Optional[str] = "", + device: Optional[str] = "cuda:0", + sample_rate: Optional[int] = 16000, + params: Optional[NeMoDiarInputParams] = None, + use_vad: bool = True, + audio_passthrough: bool = True, + backend: Optional[str] = "legacy", + enabled: bool = True, + **kwargs, + ): + super().__init__(audio_passthrough=audio_passthrough, **kwargs) + + self._enabled = enabled + self._queue = asyncio.Queue() + self._response_queue = asyncio.Queue() # Add response queue + self._processing_task = None # Add processing task + self._response_task = None # Add response task + self._device = device + self._sample_rate = sample_rate + self._audio_passthrough = audio_passthrough + params.buffer_size = params.frame_len_in_secs // params.raw_audio_frame_len_in_secs + self._params = params + self._model_name = model + self._use_vad = use_vad + self._backend = backend + if not params: + raise ValueError("params is required") + + self._load_model() + logger.info(f"Diarization service initialized on device: {self._model.device}") + + self._vad_user_speaking = False + self._audio_buffer = [] + self._current_speaker_id = None + self._processing_running = False + + if not self._use_vad: + self._vad_user_speaking = True + + def _load_model(self): + if not self._enabled: + self._model = None + return + + if self._backend == "legacy": + cfg = DiarizationConfig() + cfg.device = self._device + 
self._model = NeMoLegacyDiarService( + cfg, self._model_name, frame_len_in_secs=self._params.frame_len_in_secs, sample_rate=self.sample_rate + ) + else: + raise ValueError(f"Invalid backend: {self._backend}") + + def can_generate_metrics(self) -> bool: + """Indicates whether this service can generate metrics. + + Returns: + bool: True, as this service supports metric generation. + """ + return True + + async def start(self, frame: StartFrame): + """Handle service start.""" + await super().start(frame) + + # Initialize the model if not already done + if not hasattr(self, "_model"): + self._load_model() + + # Start background processing task + if not self._processing_task: + self._processing_task = self.create_task(self._processing_task_handler()) + + # Start response handling task + if not self._response_task: + self._response_task = self.create_task(self._response_task_handler()) + + async def stop(self, frame: EndFrame): + """Handle service stop.""" + await super().stop(frame) + await self._stop_tasks() + + async def cancel(self, frame: CancelFrame): + """Handle service cancellation.""" + await super().cancel(frame) + await self._stop_tasks() + + async def _stop_tasks(self): + """Stop background processing tasks.""" + await self._queue.put(None) # Signal to stop processing + if self._processing_task: + await self.cancel_task(self._processing_task) + self._processing_task = None + + if self._response_task: + await self.cancel_task(self._response_task) + self._response_task = None + + def _diarization_processor(self): + """Background processor that handles diarization calls.""" + try: + while self._processing_running: + try: + # Get audio from queue - blocking call that will be interrupted by cancellation + future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop()) + audio = future.result() + + if audio is None: # Stop signal + logger.debug("Received stop signal in background processor") + break + + # logger.debug(f"Processing audio chunk of 
size {len(audio)} bytes") + + # Process diarization + diar_result = self._model.diarize(audio) + # logger.debug(f"Diarization result: {diar_result is not None}") + + # Send result back to async loop + asyncio.run_coroutine_threadsafe(self._response_queue.put(diar_result), self.get_event_loop()) + + except Exception as e: + logger.error(f"Error in background diarization processor: {e}") + # Send error back to async loop + asyncio.run_coroutine_threadsafe(self._response_queue.put(('error', e)), self.get_event_loop()) + + except Exception as e: + logger.error(f"Background diarization processor fatal error: {e}") + finally: + logger.debug("Background diarization processor stopped") + + async def _processing_task_handler(self): + """Handler for background processing task.""" + try: + self._processing_running = True + logger.debug("Starting background processing task") + await asyncio.to_thread(self._diarization_processor) + except asyncio.CancelledError: + logger.debug("Background processing task cancelled") + self._processing_running = False + raise + finally: + self._processing_running = False + + async def _handle_diarization_result(self, diar_result): + """Handle diarization result from background processing.""" + try: + if diar_result is None: + return + dominant_speaker_id = self._get_dominant_speaker_id(diar_result) + # logger.debug(f"Dominant speaker ID: {dominant_speaker_id}") + if dominant_speaker_id is not None and dominant_speaker_id != self._current_speaker_id: + self._current_speaker_id = dominant_speaker_id + logger.debug(f"Pushing DiarResultFrame with speaker {dominant_speaker_id}") + await self.push_frame(DiarResultFrame(dominant_speaker_id, stream_id="default")) + except Exception as e: + logger.error(f"Error handling diarization result: {e}") + await self.push_frame( + ErrorFrame( + str(e), + time_now_iso8601(), + ) + ) + + async def _response_task_handler(self): + """Handler for processing diarization results.""" + logger.debug("Response task handler 
started") + try: + while True: + try: + result = await self._response_queue.get() + + if isinstance(result, tuple) and result[0] == 'error': + # Handle error from background processing + error = result[1] + logger.error(f"Error in NeMo Diarization processing: {error}") + await self.push_frame( + ErrorFrame( + str(error), + time_now_iso8601(), + ) + ) + else: + # Handle successful diarization result + await self._handle_diarization_result(result) + + except Exception as e: + logger.error(f"Error in response task handler: {e}") + except asyncio.CancelledError: + logger.debug("Response task handler cancelled") + raise + + async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: + """Process audio data and generate transcription frames. + + Args: + audio: Raw audio bytes to transcribe + + Yields: + Frame: Transcription frames containing the results + """ + if self._vad_user_speaking and self._enabled: + self._audio_buffer.append(audio) + if len(self._audio_buffer) >= self._params.buffer_size: + await self.start_ttfb_metrics() + await self.start_processing_metrics() + audio = b"".join(self._audio_buffer) + self._audio_buffer = [] + # Queue audio for background processing + await self._queue.put(audio) + yield None + + @traced_stt + async def _handle_transcription(self, transcript: str, is_final: bool, language: Optional[str] = None): + """Handle a transcription result. + + Args: + transcript: The transcribed text + is_final: Whether this is a final transcription + language: The language of the transcription + """ + pass # Base implementation - can be extended for specific handling needs + + async def set_language(self, language: Language): + """Update the service's recognition language. 
+ + Args: + language: New language for recognition + """ + if self._params: + self._params.language = language + else: + self._params = NeMoDiarInputParams(language=language) + + logger.info(f"Switching STT language to: {language}") + + async def set_model(self, model: str): + """Update the service's model. + + Args: + model: New model name/path to use + """ + await super().set_model(model) + self._model_name = model + self._load_model() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + """Process audio data and generate transcription frames. + + Args: + audio: Raw audio bytes to transcribe + + Yields: + Frame: Transcription frames containing the results + """ + if not self._enabled: + # if diarization is disabled, just pass the frame through + await self.push_frame(frame, direction) + return + + await super().process_frame(frame, direction) + if isinstance(frame, VADUserStartedSpeakingFrame): + self._vad_user_speaking = True + self._audio_buffer = [] + logger.debug("VADUserStartedSpeakingFrame received") + elif isinstance(frame, VADUserStoppedSpeakingFrame): + self._vad_user_speaking = False + logger.debug("VADUserStoppedSpeakingFrame received") + self._current_speaker_id = None + self._audio_buffer = [] + + def reset(self): + self._current_speaker_id = None + self._audio_buffer = [] + self._vad_user_speaking = False + self._model.reset_state() + + def _get_dominant_speaker_id(self, spk_pred: np.ndarray): + spk_pred = (spk_pred > self._params.threshold).astype(int) + dominant_speaker_id = None + if spk_pred.sum() > 0: + # get the dominant speaker id + # Filter to only keep frames that have any speaker probability > 0.0 + valid_frame_mask = spk_pred.sum(axis=1) > 0 + + # Filter diar_result to only keep valid frames + filtered_diar_result = spk_pred[valid_frame_mask] # ndarray of shape [num_valid_frames, num_speakers] + + # Get the primary speaker for each valid frame + primary_spk = np.argmax(filtered_diar_result, axis=1) # ndarray of 
shape [num_valid_frames] + # logger.debug(f"Primary speaker for valid frames: {primary_spk}") + + # count the number of different speakers in the primary speaker sequence + num_speakers = len(np.unique(primary_spk)) + # logger.debug(f"Number of different speakers: {num_speakers}") + + # If there are multiple speakers, get the dominant one + if num_speakers > 1: + # Count occurrences of each speaker + speaker_counts = np.bincount(primary_spk) + dominant_speaker_id = np.argmax(speaker_counts) + else: + # Only one speaker, return that speaker ID + dominant_speaker_id = primary_spk[0] + return dominant_speaker_id diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py new file mode 100644 index 000000000000..b25db5466fc9 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -0,0 +1,264 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from typing import List + +import numpy as np +import torch +from omegaconf import open_dict +from pipecat.services.nemo.utils import CacheFeatureBufferer + +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer + + +class NemoLegacyASRService: + def __init__( + self, + model: str = "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi", + att_context_size: List[int] = [70, 1], + device: str = "cuda", + eou_string: str = "", + eob_string: str = "", + decoder_type: str = None, + chunk_size: int = -1, + shift_size: int = -1, + left_chunks: int = 2, + sample_rate: int = 16000, + frame_len_in_secs: float = 0.08, + use_amp: bool = False, + ): + self.model = model + self.eou_string = eou_string + self.eob_string = eob_string + self.device = device + self.att_context_size = att_context_size + self.decoder_type = decoder_type + self.chunk_size = chunk_size + self.shift_size = shift_size + self.left_chunks = left_chunks + self.asr_model = self._load_model(model) + self.tokenizer = self.asr_model.tokenizer # type: SentencePieceTokenizer + self.use_amp = use_amp + self.pad_and_drop_preencoded = False + self.blank_id = self.get_blank_id() + + print("NemoLegacyASRService initialized") + + assert len(self.att_context_size) == 2, "Att context size must be a list of two integers" + assert ( + self.att_context_size[0] >= 0 + ), f"Left att context size must be greater than 0: {self.att_context_size[0]}" + assert ( + self.att_context_size[1] >= 0 + ), f"Right att context size must be greater than 0: {self.att_context_size[1]}" + + self.buffer_size_in_secs = (1 + sum(self.att_context_size)) * frame_len_in_secs + self.chunk_size_in_secs = frame_len_in_secs # (1 + self.att_context_size[1]) * frame_len_in_secs + + window_stride_in_secs = self.asr_model.cfg.preprocessor.window_stride + model_stride = 
self.asr_model.cfg.encoder.subsampling_factor + self.tokens_per_frame = math.ceil(np.trunc(self.chunk_size_in_secs / window_stride_in_secs) / model_stride) + self.model_chunk_size = self.asr_model.encoder.streaming_cfg.chunk_size + if isinstance(self.model_chunk_size, list): + self.model_chunk_size = self.model_chunk_size[1] + # overwrite the encoder streaming params with proper shift size for cache aware streaming + self.asr_model.encoder.setup_streaming_params( + chunk_size=self.model_chunk_size // model_stride, shift_size=self.tokens_per_frame + ) + self._audio_buffer = CacheFeatureBufferer( + sample_rate=sample_rate, + buffer_size_in_secs=self.buffer_size_in_secs, + chunk_size_in_secs=self.chunk_size_in_secs, + preprocessor_cfg=self.asr_model.cfg.preprocessor, + device=self.device, + ) + self._reset_cache() + self._previous_hypotheses = self._get_blank_hypothesis() + self._prev_num_tokens = 0 + + def _reset_cache(self): + ( + self._cache_last_channel, # [17, B, 70, 512] + self._cache_last_time, # [17, B, 512, 8] + self._cache_last_channel_len, # B + ) = self.asr_model.encoder.get_initial_cache_state( + 1 + ) # batch size is 1 + + def _get_blank_hypothesis(self) -> List[Hypothesis]: + blank_hypothesis = Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) + return [blank_hypothesis] + + def calc_drop_extra_pre_encoded(self, step_num) -> int: + # for the first step there is no need to drop any tokens after the downsampling as no caching is being used + if step_num == 0 and not self.pad_and_drop_preencoded: + return 0 + else: + return self.asr_model.encoder.streaming_cfg.drop_extra_pre_encoded + + def get_blank_id(self): + return len(self.tokenizer.vocab) + + def get_text_from_tokens(self, tokens: List[int]) -> str: + sep = "\u2581" # '▁' + tokens = [int(t) for t in tokens if t != self.blank_id] + if tokens: + pieces = self.tokenizer.ids_to_tokens(tokens) + text = "".join([p.replace(sep, ' ') if p.startswith(sep) else p for p in 
pieces]) + else: + text = "" + return text + + def _load_model(self, model: str): + if model.endswith(".nemo"): + asr_model = nemo_asr.models.ASRModel.restore_from(model, map_location=torch.device(self.device)) + else: + asr_model = nemo_asr.models.ASRModel.from_pretrained(model, map_location=torch.device(self.device)) + + if self.decoder_type is not None and hasattr(asr_model, "cur_decoder"): + asr_model.change_decoding_strategy(decoder_type=self.decoder_type) + elif isinstance(asr_model, nemo_asr.models.EncDecCTCModel): + self.decoder_type = "ctc" + elif isinstance(asr_model, nemo_asr.models.EncDecRNNTModel): + self.decoder_type = "rnnt" + else: + raise ValueError("Decoder type not supported for this model.") + + if self.att_context_size is not None: + if hasattr(asr_model.encoder, "set_default_att_context_size"): + asr_model.encoder.set_default_att_context_size(att_context_size=self.att_context_size) + else: + raise ValueError("Model does not support multiple lookaheads.") + else: + self.att_context_size = asr_model.cfg.encoder.att_context_size + + decoding_cfg = asr_model.cfg.decoding + with open_dict(decoding_cfg): + decoding_cfg.strategy = "greedy" + decoding_cfg.compute_timestamps = None + decoding_cfg.preserve_alignments = True + if hasattr(asr_model, 'joint'): # if an RNNT model + decoding_cfg.greedy.max_symbols = 10 + decoding_cfg.fused_batch_size = -1 + asr_model.change_decoding_strategy(decoding_cfg) + + if hasattr(asr_model.encoder, "set_default_att_context_size"): + asr_model.encoder.set_default_att_context_size(att_context_size=self.att_context_size) + + # chunk_size is set automatically for models trained for streaming. For models trained for offline mode with full context, we need to pass the chunk_size explicitly. 
+ if self.chunk_size > 0: + if self.shift_size < 0: + shift_size = self.chunk_size + else: + shift_size = self.shift_size + asr_model.encoder.setup_streaming_params( + chunk_size=self.chunk_size, left_chunks=self.left_chunks, shift_size=shift_size + ) + + asr_model.eval() + return asr_model + + def _get_best_hypothesis(self, encoded, encoded_len, partial_hypotheses=None): + if self.decoder_type == "ctc": + best_hyp = self.asr_model.decoding.ctc_decoder_predictions_tensor( + encoded, + encoded_len, + return_hypotheses=True, + ) + elif self.decoder_type == "rnnt": + best_hyp = self.asr_model.decoding.rnnt_decoder_predictions_tensor( + encoded, encoded_len, return_hypotheses=True, partial_hypotheses=partial_hypotheses + ) + else: + raise ValueError("Decoder type not supported for this model.") + return best_hyp + + def _get_tokens_from_alignments(self, alignments): + tokens = [] + if self.decoder_type == "ctc": + tokens = alignments[1] + tokens = [int(t) for t in tokens if t != self.blank_id] + elif self.decoder_type == "rnnt": + for t in range(len(alignments)): + for u in range(len(alignments[t])): + logprob, token_id = alignments[t][u] # (logprob, token_id) + token_id = int(token_id) + if token_id != self.blank_id: + tokens.append(token_id) + else: + raise ValueError("Decoder type not supported for this model.") + return tokens + + def transcribe(self, audio: bytes, stream_id: str = "default", valid_out_len: int = 1) -> str: + # Convert bytes to numpy array + audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0 + + audio_len_in_secs = len(audio_array) / 16000 + + self._audio_buffer.update(audio_array) + + features = self._audio_buffer.get_feature_buffer() + feature_lengths = torch.tensor([features.shape[1]], device=self.device) + features = features.unsqueeze(0) # Add batch dimension + is_first_step = self._audio_buffer.is_buffer_empty() + step_num = int(not is_first_step) + keep_all_outputs = False + + with torch.no_grad(): + ( + encoded, 
+ encoded_len, + cache_last_channel, + cache_last_time, + cache_last_channel_len, + ) = self.asr_model.encoder.cache_aware_stream_step( + processed_signal=features, + processed_signal_length=feature_lengths, + cache_last_channel=self._cache_last_channel, + cache_last_time=self._cache_last_time, + cache_last_channel_len=self._cache_last_channel_len, + keep_all_outputs=False, + drop_extra_pre_encoded=self.calc_drop_extra_pre_encoded(step_num), + ) + + if valid_out_len and not keep_all_outputs: + # drop right context if any + encoded = encoded[:, :, :valid_out_len] + encoded_len = torch.ones_like(encoded_len) * valid_out_len + + best_hyp = self._get_best_hypothesis(encoded, encoded_len, partial_hypotheses=self._previous_hypotheses) + + self._previous_hypotheses = best_hyp + self._cache_last_channel = cache_last_channel + self._cache_last_time = cache_last_time + self._cache_last_channel_len = cache_last_channel_len + + tokens = self._get_tokens_from_alignments(best_hyp[0].alignments) + + text = self.get_text_from_tokens(tokens) + + is_final = False + if self.eou_string in text or self.eob_string in text: + is_final = True + self.reset_state(stream_id=stream_id) + return text, is_final + + def reset_state(self, stream_id: str = "default"): + self._audio_buffer.reset() + self._reset_cache() + self._previous_hypotheses = self._get_blank_hypothesis() + self._prev_num_tokens = 0 diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py new file mode 100644 index 000000000000..a64cd75e5369 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -0,0 +1,306 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple + +import numpy as np +import torch +from pipecat.services.nemo.utils import CacheFeatureBufferer +from torch import Tensor + +from nemo.collections.asr.models import SortformerEncLabelModel + + +@dataclass +class PostProcessingParams: + """ + Postprocessing parameters for end-to-end speaker diarization models. + These parameters can significantly affect DER performance depending on the evaluation style and the dataset. + It is recommended to tune these parameters based on the evaluation style and the dataset + to achieve the desired DER performance. 
+ """ + + onset: float = 0.5 # Onset threshold for detecting the beginning and end of a speech + offset: float = 0.5 # Offset threshold for detecting the end of a speech + pad_onset: float = 0.0 # Adding durations before each speech segment + pad_offset: float = 0.0 # Adding durations after each speech segment + min_duration_on: float = 0.0 # Threshold for short speech segment deletion + min_duration_off: float = 0.0 # Threshold for small non-speech deletion + + +@dataclass +class DiarizationConfig: + """Diarization configuration parameters for inference.""" + + model_path: str = "nvidia/diar_sortformer_4spk-v1" + device: str = "cuda" + + log: bool = False # If True, log will be printed + max_num_speakers: int = 4 + spkcache_len: int = 188 + spkcache_refresh_rate: int = 144 + fifo_len: int = 188 + chunk_len: int = 6 + chunk_left_context: int = 1 + chunk_right_context: int = 7 + + +@dataclass +class SortformerStreamingState: + """ + A dataclass that holds the streaming state for the Sortformer diarization model. + This is based on the streaming state in SortformerEncLabelModel in NeMo. + """ + + spkcache: Optional[Tensor] = None + spkcache_lengths: Optional[Tensor] = None + spkcache_preds: Optional[Tensor] = None + fifo: Optional[Tensor] = None + fifo_lengths: Optional[Tensor] = None + fifo_preds: Optional[Tensor] = None + spk_perm: Optional[Tensor] = None + + def to(self, device): + """ + Move all tensors to the specified device. + + Args: + device: The device to move the tensors to. + + Returns: + SortformerStreamingState: The state with tensors moved to the specified device. 
+ """ + if self.spkcache is not None: + self.spkcache = self.spkcache.to(device) + if self.spkcache_lengths is not None: + self.spkcache_lengths = self.spkcache_lengths.to(device) + if self.spkcache_preds is not None: + self.spkcache_preds = self.spkcache_preds.to(device) + if self.fifo is not None: + self.fifo = self.fifo.to(device) + if self.fifo_lengths is not None: + self.fifo_lengths = self.fifo_lengths.to(device) + if self.fifo_preds is not None: + self.fifo_preds = self.fifo_preds.to(device) + if self.spk_perm is not None: + self.spk_perm = self.spk_perm.to(device) + return self + + +class NeMoLegacyDiarService: + def __init__( + self, + cfg: DiarizationConfig, + model: str, + frame_len_in_secs: float = 0.08, + sample_rate: int = 16000, + left_offset: int = 8, + right_offset: int = 8, + use_amp: bool = False, + compute_dtype: torch.dtype = torch.float32, + ): + self.model = model + self.cfg = cfg + self.cfg.model_path = model + self.diarizer = self.build_diarizer() + self.device = cfg.device + self.use_amp = use_amp + self.compute_dtype = compute_dtype + self.frame_len_in_secs = frame_len_in_secs + self.left_offset = left_offset + self.right_offset = right_offset + self.chunk_size = self.cfg.chunk_len + self.buffer_size_in_secs = ( + self.cfg.chunk_len * self.frame_len_in_secs + (self.left_offset + self.right_offset) * 0.01 + ) + self.max_num_speakers = self.cfg.max_num_speakers + + self.feature_bufferer = CacheFeatureBufferer( + sample_rate=sample_rate, + buffer_size_in_secs=self.buffer_size_in_secs, + chunk_size_in_secs=self.cfg.chunk_len * self.frame_len_in_secs, + preprocessor_cfg=self.diarizer.cfg.preprocessor, + device=self.device, + ) + self.streaming_state = self.init_streaming_state(batch_size=1) + self.total_preds = torch.zeros((1, 0, self.max_num_speakers), device=self.diarizer.device) + + print("NivaDiarService initialized") + + def build_diarizer(self): + if self.cfg.model_path.endswith(".nemo"): + diar_model = 
SortformerEncLabelModel.restore_from(self.cfg.model_path, map_location=self.cfg.device) + else: + diar_model = SortformerEncLabelModel.from_pretrained(self.cfg.model_path, map_location=self.cfg.device) + + # Steaming mode setup + diar_model.sortformer_modules.chunk_len = self.cfg.chunk_len + diar_model.sortformer_modules.spkcache_len = self.cfg.spkcache_len + diar_model.sortformer_modules.chunk_left_context = self.cfg.chunk_left_context + diar_model.sortformer_modules.chunk_right_context = self.cfg.chunk_right_context + diar_model.sortformer_modules.fifo_len = self.cfg.fifo_len + diar_model.sortformer_modules.log = self.cfg.log + diar_model.sortformer_modules.spkcache_refresh_rate = self.cfg.spkcache_refresh_rate + diar_model.eval() + + return diar_model + + def print_diar_result(self, diar_result: np.ndarray): + for t in range(diar_result.shape[0]): + spk_probs = "" + for s in range(diar_result.shape[1]): + spk_probs += f"{diar_result[t, s]:.2f} " + print(f"Time {t}: {spk_probs}") + + def diarize(self, audio: bytes, stream_id: str = "default") -> str: + + audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0 + + self.feature_bufferer.update(audio_array) + + features = self.feature_bufferer.get_feature_buffer() + feature_buffers = features.unsqueeze(0) # add batch dimension + feature_buffers = feature_buffers.transpose(1, 2) # [batch, feature, time] -> [batch, time, feature] + feature_buffer_lens = torch.tensor([feature_buffers.shape[1]], device=self.device) + self.streaming_state, chunk_preds = self.stream_step( + processed_signal=feature_buffers, + processed_signal_length=feature_buffer_lens, + streaming_state=self.streaming_state, + total_preds=self.total_preds, + left_offset=self.left_offset, + right_offset=self.right_offset, + ) + self.total_preds = chunk_preds + diar_result = chunk_preds[:, -self.chunk_size :, :].clone().cpu().numpy() + return diar_result[0] # tensor of shape [6, 4] + + def reset_state(self, stream_id: str = 
"default"): + self.feature_bufferer.reset() + self.streaming_state = self.init_streaming_state(batch_size=1) + self.total_preds = torch.zeros((1, 0, self.max_num_speakers), device=self.diarizer.device) + + def init_streaming_state(self, batch_size: int = 1) -> SortformerStreamingState: + """ + Initialize the streaming state for the diarization model. + + Args: + batch_size: The batch size to use. + + Returns: + SortformerStreamingState: The initialized streaming state. + """ + # Use the model's init_streaming_state method but convert to SortformerStreamingState format + nemo_state = self.diarizer.sortformer_modules.init_streaming_state( + batch_size=batch_size, async_streaming=self.diarizer.async_streaming, device=self.device + ) + + # Convert SortformerStreamingState format + state = SortformerStreamingState( + spkcache=nemo_state.spkcache, + spkcache_lengths=nemo_state.spkcache_lengths, + spkcache_preds=nemo_state.spkcache_preds, + fifo=nemo_state.fifo, + fifo_lengths=nemo_state.fifo_lengths, + fifo_preds=nemo_state.fifo_preds, + spk_perm=nemo_state.spk_perm, + ) + + return state + + def stream_step( + self, + processed_signal: Tensor, + processed_signal_length: Tensor, + streaming_state: SortformerStreamingState, + total_preds: Tensor, + left_offset: int = 0, + right_offset: int = 0, + ) -> Tuple[SortformerStreamingState, Tensor]: + """ + Execute a single streaming step for diarization. + + Args: + processed_signal: The processed audio signal. + processed_signal_length: The length of the processed signal. + streaming_state: The current streaming state. + total_preds: The total predictions so far. + left_offset: The left offset for the current chunk. + right_offset: The right offset for the current chunk. + + Returns: + Tuple[SortformerStreamingState, Tensor]: The updated streaming state and predictions. 
+ """ + # Move tensors to correct device + if processed_signal.device != self.device: + processed_signal = processed_signal.to(self.device) + + if processed_signal_length.device != self.device: + processed_signal_length = processed_signal_length.to(self.device) + + # Make sure state is on the correct device + streaming_state = streaming_state.to(self.device) + + if total_preds is not None and total_preds.device != self.device: + total_preds = total_preds.to(self.device) + + # Convert SortformerStreamingState to NeMo's format + class NemoStreamingState: + def __init__(self, state): + self.spkcache = state.spkcache + self.spkcache_lengths = state.spkcache_lengths + self.spkcache_preds = state.spkcache_preds + self.fifo = state.fifo + self.fifo_lengths = state.fifo_lengths + self.fifo_preds = state.fifo_preds + self.spk_perm = state.spk_perm + + nemo_streaming_state = NemoStreamingState(streaming_state) + + with ( + torch.amp.autocast(device_type=self.device, dtype=self.compute_dtype, enabled=self.use_amp), + torch.inference_mode(), + torch.no_grad(), + ): + try: + # Call the model's forward_streaming_step method + nemo_streaming_state, diar_pred_out_stream = self.diarizer.forward_streaming_step( + processed_signal=processed_signal, + processed_signal_length=processed_signal_length, + streaming_state=nemo_streaming_state, + total_preds=total_preds, + left_offset=left_offset, + right_offset=right_offset, + ) + except Exception as e: + print(f"Error in diarizer streaming step: {e}") + # print the stack trace + import traceback + + traceback.print_exc() + # Return the existing state and preds if there's an error + return streaming_state, total_preds + + # Convert back to SortformerStreamingState format + new_streaming_state = SortformerStreamingState( + spkcache=nemo_streaming_state.spkcache, + spkcache_lengths=nemo_streaming_state.spkcache_lengths, + spkcache_preds=nemo_streaming_state.spkcache_preds, + fifo=nemo_streaming_state.fifo, + 
fifo_lengths=nemo_streaming_state.fifo_lengths, + fifo_preds=nemo_streaming_state.fifo_preds, + spk_perm=nemo_streaming_state.spk_perm, + ) + + return new_streaming_state, diar_pred_out_stream diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py new file mode 100644 index 000000000000..1e8d048c8db5 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py @@ -0,0 +1,162 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import time
import uuid
from threading import Thread
from typing import AsyncGenerator, List

import torch
from loguru import logger
from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
from pipecat.frames.frames import LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMTextFrame
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai.llm import OpenAILLMService
from transformers import AsyncTextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer


class HuggingFaceLLMLocalService:
    """Runs a local HuggingFace causal LM and streams completions as OpenAI-style chunks."""

    def __init__(
        self,
        model: str = "meta-llama/Meta-Llama-3-8B-Instruct",
        device: str = "cuda:0",
        temperature=0.7,
        max_tokens=256,
        top_p=0.9,
    ):
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        # bfloat16 halves memory vs fp32 while staying numerically stable for inference.
        self.model = AutoModelForCausalLM.from_pretrained(
            model, device_map=device, torch_dtype=torch.bfloat16
        )  # type: AutoModelForCausalLM
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p

    async def generate_stream(
        self, messages: List[ChatCompletionMessageParam], **kwargs
    ) -> AsyncGenerator[ChatCompletionChunk, None]:
        """Generate a streaming completion for `messages`, yielding OpenAI-compatible chunks.

        Args:
            messages: OpenAI-format chat messages forming the conversation so far.

        Yields:
            ChatCompletionChunk: one chunk per decoded text fragment.
        """
        # Convert messages to prompt format
        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        logger.debug(f"LLM prompt: {prompt}")

        inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(self.device)

        # Generate with streaming
        streamer = AsyncTextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": self.max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "do_sample": True,
        }

        # Run generate() in a background thread; the streamer feeds decoded
        # tokens back to this coroutine asynchronously.
        thread = Thread(
            target=self.model.generate,
            kwargs=generation_kwargs,
        )
        thread.start()

        # Stream the output as OpenAI-style chunks.
        async for text in streamer:
            chunk = ChatCompletionChunk(
                id="hf-" + str(uuid.uuid4()),
                choices=[{"delta": {"content": text}, "finish_reason": None, "index": 0}],
                created=int(time.time()),
                model=self.model.config._name_or_path,
                object="chat.completion.chunk",
            )
            yield chunk


class HuggingFaceLLMService(OpenAILLMService):
    """Pipecat LLM service backed by a locally loaded HuggingFace model instead of the OpenAI API."""

    def __init__(
        self,
        *,
        model: str = "google/gemma-7b-it",
        device: str = "cuda",
        temperature=0.7,
        max_tokens=256,
        top_p=0.9,
        **kwargs,
    ):
        # These attributes must be set before super().__init__, which calls
        # create_client() and therefore reads them.
        self.model = model
        self.device = device
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p
        super().__init__(model=model, **kwargs)

    def create_client(self, api_key=None, base_url=None, **kwargs):
        """Build the local HuggingFace client; api_key/base_url are accepted for interface parity and ignored."""
        return HuggingFaceLLMLocalService(
            model=self.model,
            device=self.device,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            top_p=self.top_p,
        )

    async def _process_context(self, context: OpenAILLMContext):
        """Process a context through the LLM and push text frames.

        Args:
            context (OpenAILLMContext): The context to process, containing messages
                and other information needed for the LLM interaction.
        """
        await self.push_frame(LLMFullResponseStartFrame())

        try:
            await self.start_ttfb_metrics()
            messages = context.get_messages()
            async for chunk in self._client.generate_stream(messages):
                if chunk.choices[0].delta.content:
                    await self.stop_ttfb_metrics()
                    text = chunk.choices[0].delta.content
                    frame = LLMTextFrame(text)
                    await self.push_frame(frame)
        except Exception as e:
            # loguru's logger does not take the stdlib-logging `exc_info` kwarg;
            # logger.exception() is the loguru way to capture the traceback.
            logger.exception(f"Error in _process_context: {e}")
            raise
        finally:
            await self.push_frame(LLMFullResponseEndFrame())

    async def get_chat_completions(
        self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
    ) -> AsyncGenerator[ChatCompletionChunk, None]:
        """Create a streaming chat completion using HuggingFace model.

        Args:
            context (OpenAILLMContext): The context object containing tools configuration
                and other settings for the chat completion.
            messages (List[ChatCompletionMessageParam]): The list of messages comprising
                the conversation history and current request.

        Returns:
            AsyncGenerator[ChatCompletionChunk]: A streaming response of chat completion
                chunks that can be processed asynchronously.
        """
        params = {
            "max_tokens": self._settings["max_tokens"],
            "temperature": self._settings["temperature"],
            "top_p": self._settings["top_p"],
        }
        params.update(self._settings["extra"])

        return self._client.generate_stream(messages, **params)


import asyncio
from typing import AsyncGenerator, List, Mapping, Optional, Tuple

import numpy as np
import torch
from loguru import logger
from omegaconf import OmegaConf
from pipecat.frames.frames import (
    AudioRawFrame,
    CancelFrame,
    EndFrame,
    ErrorFrame,
    Frame,
    InterimTranscriptionFrame,
    StartFrame,
    TranscriptionFrame,
    VADUserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.stt_service import SegmentedSTTService, STTService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_stt
from pydantic import BaseModel

from nemo.collections.voice_agent.pipecat.services.nemo.legacy_asr import NemoLegacyASRService

try:
    import nemo.collections.asr as nemo_asr
    from nemo.collections.asr.models import ASRModel

    # disable nemo logging
    from nemo.utils import logging

    level = logging.getEffectiveLevel()
    logging.setLevel(logging.CRITICAL)


except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error('In order to use NVIDIA NeMo STT, you need to `pip install "nemo_toolkit[asr]"`.')
    raise Exception(f"Missing module: {e}")


class NeMoSTTInputParams(BaseModel):
    """Configuration for NeMo streaming STT."""

    language: Optional[Language] = Language.EN_US
    att_context_size: Optional[List] = [70, 1]
    frame_len_in_secs: Optional[float] = 0.08  # 80ms for FastConformer model
    config_path: Optional[str] = None  # path to the Niva ASR config file
    raw_audio_frame_len_in_secs: Optional[float] = 0.016  # 16ms for websocket transport
    buffer_size: Optional[int] = 5  # number of audio frames to buffer, 1 frame is 16ms


class NemoSTTService(STTService):
    """Pipecat STT service that runs a NeMo streaming ASR model locally."""

    def __init__(
        self,
        *,
        model: Optional[str] = "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi",
        device: Optional[str] = "cuda:0",
        sample_rate: Optional[int] = 16000,
        params: Optional[NeMoSTTInputParams] = None,
        has_turn_taking: bool = False,
        backend: Optional[str] = "legacy",
        decoder_type: Optional[str] = "rnnt",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Validate before use: the original code dereferenced `params` first,
        # so passing params=None raised AttributeError instead of this error.
        if not params:
            raise ValueError("params is required")

        self._queue = asyncio.Queue()
        self._sample_rate = sample_rate
        # Number of raw transport frames per ASR frame. round() on the true
        # ratio avoids float floor-division coming up one frame short when the
        # ratio is not exactly representable (e.g. 0.08 // 0.016), and keeps
        # buffer_size an int as declared.
        params.buffer_size = round(params.frame_len_in_secs / params.raw_audio_frame_len_in_secs)
        self._params = params
        self._model_name = model
        self._has_turn_taking = has_turn_taking
        self._backend = backend
        self._decoder_type = decoder_type

        self._device = device

        self._load_model()

        # Accumulates raw audio chunks until a full ASR frame is available.
        self.audio_buffer = []

    def _load_model(self):
        """Instantiate the ASR backend selected by `self._backend`."""
        if self._backend == "legacy":
            self._model = NemoLegacyASRService(self._model_name, device=self._device, decoder_type=self._decoder_type)
        else:
            raise ValueError(f"Invalid ASR backend: {self._backend}")

    def can_generate_metrics(self) -> bool:
        """Indicates whether this service can generate metrics.

        Returns:
            bool: True, as this service supports metric generation.
        """
        return True

    async def start(self, frame: StartFrame):
        """Handle service start.

        Args:
            frame: StartFrame containing initial configuration
        """
        await super().start(frame)

        # Initialize the model if not already done
        if not hasattr(self, "_model"):
            self._load_model()

    async def stop(self, frame: EndFrame):
        """Handle service stop.

        Args:
            frame: EndFrame that triggered this method
        """
        await super().stop(frame)
        # Clear any internal state if needed
        await self._queue.put(None)  # Signal to stop processing

    async def cancel(self, frame: CancelFrame):
        """Handle service cancellation.

        Args:
            frame: CancelFrame that triggered this method
        """
        await super().cancel(frame)
        # Clear any internal state
        await self._queue.put(None)  # Signal to stop processing
        self._queue = asyncio.Queue()  # Reset the queue

    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
        """Process audio data and generate transcription frames.

        Args:
            audio: Raw audio bytes to transcribe

        Yields:
            Frame: Transcription frames containing the results
        """
        await self.start_ttfb_metrics()
        await self.start_processing_metrics()

        try:
            transcription = None
            self.audio_buffer.append(audio)
            if len(self.audio_buffer) >= self._params.buffer_size:
                audio = b"".join(self.audio_buffer)
                self.audio_buffer = []

                transcription, is_final = self._model.transcribe(audio)
                await self.stop_ttfb_metrics()
                await self.stop_processing_metrics()

            if transcription:
                logger.debug(f"Transcription (is_final={is_final}): `{transcription}`")

                # Get the language from params or default to EN_US
                language = self._params.language if self._params else Language.EN_US

                # Create and push the transcription frame. With turn-taking
                # enabled, finality is decided downstream, so everything is
                # pushed as interim.
                if self._has_turn_taking or not is_final:
                    frame_type = InterimTranscriptionFrame
                else:
                    frame_type = TranscriptionFrame
                await self.push_frame(
                    frame_type(
                        transcription,
                        "",  # No speaker ID in this implementation
                        time_now_iso8601(),
                        language,
                        result={"text": transcription},
                    )
                )

                # Handle the transcription
                await self._handle_transcription(
                    transcript=transcription,
                    is_final=is_final,
                    language=language,
                )

            yield None

        except Exception as e:
            logger.error(f"Error in NeMo STT processing: {e}")
            # ErrorFrame's second positional parameter is `fatal`, not a
            # timestamp — passing a truthy string would have marked every
            # transcription error as fatal.
            await self.push_frame(ErrorFrame(str(e)))
            yield None

    @traced_stt
    async def _handle_transcription(self, transcript: str, is_final: bool, language: Optional[str] = None):
        """Handle a transcription result.

        Args:
            transcript: The transcribed text
            is_final: Whether this is a final transcription
            language: The language of the transcription
        """
        pass  # Base implementation - can be extended for specific handling needs

    async def set_language(self, language: Language):
        """Update the service's recognition language.

        Args:
            language: New language for recognition
        """
        if self._params:
            self._params.language = language
        else:
            self._params = NeMoSTTInputParams(language=language)

        logger.info(f"Switching STT language to: {language}")

    async def set_model(self, model: str):
        """Update the service's model.

        Args:
            model: New model name/path to use
        """
        await super().set_model(model)
        self._model_name = model
        self._load_model()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Intercept VAD end-of-speech frames to reset the legacy ASR state."""
        if isinstance(frame, VADUserStoppedSpeakingFrame) and isinstance(self._model, NemoLegacyASRService):
            # manually reset the state of the model when end of utterance is detected by VAD
            logger.debug("Resetting state of the model due to VADUserStoppedSpeakingFrame")
            self._model.reset_state()
        await super().process_frame(frame, direction)

import asyncio
import inspect
import uuid
from collections.abc import AsyncGenerator
from typing import Iterator, Union

import numpy as np
import torch
from loguru import logger
from pipecat.frames.frames import (
    CancelFrame,
    EndFrame,
    ErrorFrame,
    Frame,
    StartFrame,
    TTSAudioRawFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
)
from pipecat.services.tts_service import TTSService

from nemo.collections.tts.models import FastPitchModel, HifiGanModel


class BaseNemoTTSService(TTSService):
    """Text-to-Speech service using Nemo TTS models.

    This service works with any TTS model that exposes a generate(text) method
    that returns audio data. The TTS generation runs in a dedicated background thread to
    avoid blocking the main asyncio event loop, following the same pattern as NemoDiarService.

    Args:
        model: TTS model instance with a generate(text) method
        sample_rate: Audio sample rate in Hz (defaults to 22050)
        **kwargs: Additional arguments passed to TTSService
    """

    def __init__(
        self,
        *,
        model,
        device: str = "cuda",
        sample_rate: int = 22050,
        **kwargs,
    ):
        super().__init__(sample_rate=sample_rate, **kwargs)
        self._model_name = model
        self._device = device
        self._model = self._setup_model()

        # Background processing infrastructure - no response handler needed
        self._tts_queue = asyncio.Queue()
        self._processing_task = None
        self._processing_running = False

        # Track pending requests with their response queues
        self._pending_requests = {}

    def _setup_model(self):
        """Load and return the underlying TTS model(s)."""
        raise NotImplementedError("Subclass must implement _setup_model")

    def _generate_audio(self, text: str) -> Iterator[np.ndarray]:
        """Synthesize `text` and yield audio arrays."""
        raise NotImplementedError("Subclass must implement _generate_audio")

    def can_generate_metrics(self) -> bool:
        return True

    async def start(self, frame: StartFrame):
        """Handle service start."""
        await super().start(frame)

        # Initialize the model if not already done
        if not hasattr(self, "_model") or self._model is None:
            self._model = self._setup_model()

        # Only start background processing task - no response handler needed
        if not self._processing_task:
            self._processing_task = self.create_task(self._processing_task_handler())

    async def stop(self, frame: EndFrame):
        """Handle service stop."""
        await super().stop(frame)
        await self._stop_tasks()

    async def cancel(self, frame: CancelFrame):
        """Handle service cancellation."""
        await super().cancel(frame)
        await self._stop_tasks()

    async def _stop_tasks(self):
        """Stop background processing tasks."""
        self._processing_running = False
        await self._tts_queue.put(None)  # Signal to stop processing

        if self._processing_task:
            await self.cancel_task(self._processing_task)
            self._processing_task = None

    def _tts_processor(self):
        """Background processor that handles TTS generation calls.

        Runs in a worker thread (via asyncio.to_thread); all queue interaction
        with the event loop goes through run_coroutine_threadsafe.
        """
        try:
            while self._processing_running:
                try:
                    future = asyncio.run_coroutine_threadsafe(self._tts_queue.get(), self.get_event_loop())
                    request = future.result()

                    if request is None:  # Stop signal
                        logger.debug("Received stop signal in TTS background processor")
                        break

                    text, request_id = request
                    logger.debug(f"Processing TTS request for text: [{text}]")

                    # Get the response queue for this request
                    response_queue = None
                    future = asyncio.run_coroutine_threadsafe(
                        self._get_response_queue(request_id), self.get_event_loop()
                    )
                    response_queue = future.result()

                    if response_queue is None:
                        logger.warning(f"No response queue found for request {request_id}")
                        continue

                    # Process TTS generation
                    try:
                        audio_result = self._generate_audio(text)

                        # Send result directly to the waiting request
                        asyncio.run_coroutine_threadsafe(
                            response_queue.put(('success', audio_result)), self.get_event_loop()
                        )
                    except Exception as e:
                        logger.error(f"Error in TTS generation: {e}")
                        # Send error directly to the waiting request
                        asyncio.run_coroutine_threadsafe(response_queue.put(('error', e)), self.get_event_loop())

                except Exception as e:
                    logger.error(f"Error in background TTS processor: {e}")

        except Exception as e:
            logger.error(f"Background TTS processor fatal error: {e}")
        finally:
            logger.debug("Background TTS processor stopped")

    async def _get_response_queue(self, request_id: str):
        """Get the response queue for a specific request."""
        return self._pending_requests.get(request_id)

    async def _processing_task_handler(self):
        """Handler for background processing task."""
        try:
            self._processing_running = True
            logger.debug("Starting background TTS processing task")
            await asyncio.to_thread(self._tts_processor)
        except asyncio.CancelledError:
            logger.debug("Background TTS processing task cancelled")
            self._processing_running = False
            raise
        finally:
            self._processing_running = False

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using the Nemo TTS model."""
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_ttfb_metrics()
            yield TTSStartedFrame()

            # Generate unique request ID (uuid is imported at module level)
            request_id = str(uuid.uuid4())

            # Create response queue for this specific request
            request_queue = asyncio.Queue()
            self._pending_requests[request_id] = request_queue

            try:
                # Queue the TTS request for background processing
                await self._tts_queue.put((text, request_id))

                # Wait for the result directly from our request queue
                result = await request_queue.get()
                status, data = result

                if status == 'error':
                    logger.error(f"{self} TTS generation error: {data}")
                    yield ErrorFrame(error=f"TTS generation error: {str(data)}")
                    return

                audio_result = data
                if audio_result is None:
                    logger.error(f"{self} TTS model returned None for text: [{text}]")
                    yield ErrorFrame(error="TTS generation failed - no audio returned")
                    return

                await self.start_tts_usage_metrics(text)

                # Process the audio result (same as before)
                if inspect.isgenerator(audio_result) or (
                    hasattr(audio_result, '__iter__') and hasattr(audio_result, '__next__')
                ):
                    # Handle generator/iterator case
                    first_chunk = True
                    for audio_chunk in audio_result:
                        if first_chunk:
                            await self.stop_ttfb_metrics()
                            first_chunk = False

                        if audio_chunk is None:
                            break

                        audio_bytes = self._convert_to_bytes(audio_chunk)
                        chunk_size = self.chunk_size
                        for i in range(0, len(audio_bytes), chunk_size):
                            audio_chunk_bytes = audio_bytes[i : i + chunk_size]
                            if not audio_chunk_bytes:
                                break

                            frame = TTSAudioRawFrame(
                                audio=audio_chunk_bytes, sample_rate=self.sample_rate, num_channels=1
                            )
                            yield frame
                else:
                    # Handle single result case
                    await self.stop_ttfb_metrics()
                    audio_bytes = self._convert_to_bytes(audio_result)

                    chunk_size = self.chunk_size
                    for i in range(0, len(audio_bytes), chunk_size):
                        chunk = audio_bytes[i : i + chunk_size]
                        if not chunk:
                            break

                        frame = TTSAudioRawFrame(audio=chunk, sample_rate=self.sample_rate, num_channels=1)
                        yield frame

                yield TTSStoppedFrame()

            finally:
                # Clean up the pending request
                if request_id in self._pending_requests:
                    del self._pending_requests[request_id]

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            error_message = f"TTS generation error: {str(e)}"
            yield ErrorFrame(error=error_message)

    def _convert_to_bytes(self, audio_data) -> bytes:
        """Convert various audio data formats to 16-bit PCM bytes.

        numpy is imported unconditionally at module level, so the previous
        in-function re-import and ImportError fallback were dead code.
        """
        if isinstance(audio_data, (bytes, bytearray)):
            return bytes(audio_data)

        if isinstance(audio_data, np.ndarray):
            # Ensure it's in the right format (16-bit PCM)
            if audio_data.dtype in [np.float32, np.float64]:
                # Convert float [-1, 1] to int16 [-32768, 32767]
                audio_data = np.clip(audio_data, -1.0, 1.0)  # Ensure values are in range
                audio_data = (audio_data * 32767).astype(np.int16)
            elif audio_data.dtype != np.int16:
                # Convert other integer types to int16
                audio_data = audio_data.astype(np.int16)
            return audio_data.tobytes()
        if hasattr(audio_data, 'tobytes'):
            return audio_data.tobytes()
        return bytes(audio_data)


class NeMoFastPitchHiFiGANTTSService(BaseNemoTTSService):
    """Two-stage NeMo TTS: FastPitch text->spectrogram, then HiFi-GAN spectrogram->audio."""

    def __init__(
        self,
        fastpitch_model: str = "nvidia/tts_en_fastpitch",
        hifigan_model: str = "nvidia/tts_hifigan",
        device: str = "cuda",
        **kwargs,
    ):
        model_name = f"{fastpitch_model}+{hifigan_model}"
        self._fastpitch_model_name = fastpitch_model
        self._hifigan_model_name = hifigan_model
        super().__init__(model=model_name, device=device, **kwargs)

    def _setup_model(self):
        logger.debug("Loading model...")
        self._fastpitch_model = self._setup_fastpitch_model(self._fastpitch_model_name)
        self._hifigan_model = self._setup_hifigan_model(self._hifigan_model_name)
        return self._fastpitch_model, self._hifigan_model

    def _setup_fastpitch_model(self, model_name: str):
        """Load FastPitch from a local .nemo checkpoint or the NGC model hub."""
        if model_name.endswith(".nemo"):
            fastpitch_model = FastPitchModel.restore_from(model_name, map_location=torch.device(self._device))
        else:
            fastpitch_model = FastPitchModel.from_pretrained(model_name, map_location=torch.device(self._device))
        fastpitch_model.eval()
        return fastpitch_model

    def _setup_hifigan_model(self, model_name: str):
        """Load HiFi-GAN from a local .nemo checkpoint or the NGC model hub."""
        if model_name.endswith(".nemo"):
            hifigan_model = HifiGanModel.restore_from(model_name, map_location=torch.device(self._device))
        else:
            hifigan_model = HifiGanModel.from_pretrained(model_name, map_location=torch.device(self._device))
        hifigan_model.eval()
        return hifigan_model

    def _generate_audio(self, text: str) -> Iterator[np.ndarray]:
        """Synthesize `text` to a single mono float waveform (yielded once)."""
        with torch.no_grad():
            parsed = self._fastpitch_model.parse(text)
            spectrogram = self._fastpitch_model.generate_spectrogram(tokens=parsed)
            audio = self._hifigan_model.convert_spectrogram_to_audio(spec=spectrogram)
            audio = audio.detach().view(-1).cpu().numpy()
            yield audio
+ +from typing import List + +from loguru import logger +from pipecat.frames.frames import ( + AudioRawFrame, + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame, + Frame, + InterimTranscriptionFrame, + StartFrame, + StartInterruptionFrame, + StopInterruptionFrame, + TextFrame, + TranscriptionFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, + VADUserStartedSpeakingFrame, + VADUserStoppedSpeakingFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.transcriptions.language import Language +from pipecat.utils.time import time_now_iso8601 + +from nemo.collections.voice_agent.pipecat.frames.frames import DiarResultFrame + +DEFAULT_BACKCHANNEL_PHRASES = [ + "cool", + "huh", + "okay okay", + "mhmm", + "mmhmm", + 'uhhuh', + 'uhhuh okay', + 'sure thing', + 'uh huh', + 'mm hmm', + 'hmm', + 'humm', + 'absolutely', + 'ah', + 'all right', + 'alright', + 'but yeah', + 'definitely', + 'exactly', + 'go ahead', + 'good', + 'great', + 'great thanks', + 'ha ha', + 'hi', + 'i know', + 'i know right', + 'i see', + 'indeed', + 'interesting', + 'mhmm', + 'mhmm mhmm', + 'mhmm right', + 'mhmm yeah', + 'mhmm yes', + 'nice', + 'of course', + 'oh', + 'oh dear', + 'oh man', + 'oh okay', + 'oh wow', + 'oh yes', + 'ok', + 'ok thanks', + 'okay', + 'okay okay', + 'okay thanks', + 'perfect', + 'really', + 'right', + 'right exactly', + 'right right', + 'right yeah', + 'so yeah', + 'sounds good', + 'sure', + 'thank you', + 'thanks', + "that's awesome", + 'thats right', + 'thats true', + 'true', + 'uh-huh', + 'uh-huh yeah', + 'uhhuh', + 'um-humm', + 'well', + 'what', + 'wow', + 'yeah', + 'yeah i know', + 'yeah i see', + 'yeah mhmm', + 'yeah okay', + 'yeah right', + 'yeah uh-huh', + 'yeah yeah', + 'yep', + 'yes', + 'yes please', + 'yes yes', +] + + +class NeMoTurnTakingService(FrameProcessor): + def __init__( + self, + eou_string: str = "", + eob_string: str = "", + language: Language = Language.EN_US, + use_vad: bool = True, + use_diar: 
bool = False, + max_buffer_size: int = 5, + backchannel_phrases: List[str] = DEFAULT_BACKCHANNEL_PHRASES, + **kwargs, + ): + super().__init__(**kwargs) + self.eou_string = eou_string + self.eob_string = eob_string + self.language = language + self.use_vad = use_vad + self.use_diar = use_diar + self.max_buffer_size = max_buffer_size + self.backchannel_phrases = backchannel_phrases + self.backchannel_phrases_nopc = set([self.clean_text(phrase) for phrase in self.backchannel_phrases]) + + # internal data + self._current_speaker_id = None + self._prev_speaker_id = None + self._bot_speaking = False + self._vad_user_speaking = False + self._have_sent_user_started_speaking = False + self._user_speaking_buffer = "" + if not self.use_vad: + # if vad is not used, we assume the user is always speaking + self._vad_user_speaking = True + + def clean_text(self, text: str) -> str: + """ + Clean the text so that it can be used for backchannel detection. + """ + if self.language != Language.EN_US: + raise ValueError(f"Language {self.language} not supported, currently only English is supported.") + for eou_string in [self.eou_string, self.eob_string]: + if text.endswith(eou_string): + text = text[: -len(eou_string)].strip() + text = text.lower() + valid_chars = "abcdefghijklmnopqrstuvwxyz'" + text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + return " ".join(text.split()).strip() + + def is_backchannel(self, text: str) -> bool: + """ + Check if the text is a backchannel phrase. 
+ """ + if text.startswith("") :] + text = self.clean_text(text) + return text in self.backchannel_phrases_nopc + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + if isinstance(frame, (TranscriptionFrame, InterimTranscriptionFrame)): + await self._handle_transcription(frame, direction) + elif isinstance(frame, VADUserStartedSpeakingFrame): + await self._handle_user_started_speaking(frame, direction) + elif isinstance(frame, VADUserStoppedSpeakingFrame): + await self._handle_user_stopped_speaking(frame, direction) + elif isinstance(frame, BotStartedSpeakingFrame): + logger.debug("BotStartedSpeakingFrame received") + self._bot_speaking = True + elif isinstance(frame, BotStoppedSpeakingFrame): + logger.debug("BotStoppedSpeakingFrame received") + self._bot_speaking = False + elif isinstance(frame, DiarResultFrame): + logger.debug("DiarResultFrame received") + await self._handle_diar_result(frame, direction) + else: + await self.push_frame(frame, direction) + + async def _handle_transcription( + self, frame: TranscriptionFrame | InterimTranscriptionFrame, direction: FrameDirection + ): + text_segment = frame.text + if self._vad_user_speaking: + self._user_speaking_buffer += text_segment + has_eou = self._user_speaking_buffer.endswith(self.eou_string) + has_eob = self._user_speaking_buffer.endswith(self.eob_string) + if has_eou: + # EOU detected, we assume the user is done speaking, so we push the completed text and interrupt the bot + logger.debug(f" Detected: `{self._user_speaking_buffer}`") + completed_text = self._user_speaking_buffer[: -len(self.eou_string)].strip() + self._user_speaking_buffer = "" + if self._bot_speaking and self.is_backchannel(completed_text): + logger.debug(f" detected for a backchannel phrase while bot is speaking: `{completed_text}`") + else: + await self._handle_completed_text(completed_text, direction) + await self._handle_user_interruption(UserStoppedSpeakingFrame()) 
+ self._have_sent_user_started_speaking = False # user is done speaking, so we reset the flag + elif has_eob and self._bot_speaking: + # ignore the backchannel string while bot is speaking + logger.debug(f"Ignoring backchannel string while bot is speaking: `{self._user_speaking_buffer}`") + # push the backchannel string upstream, not downstream + await self.push_frame( + TranscriptionFrame( + text=f"({self._user_speaking_buffer})", + user_id="", + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": f"Backchannel detected: {self._user_speaking_buffer}"}, + ), + direction=FrameDirection.UPSTREAM, + ) + self._have_sent_user_started_speaking = False # treat it as if the user is not speaking + completed_text = "" + self._user_speaking_buffer = "" + else: + # if bot is not speaking, the backchannel string is not considered a backchannel phrase + # user is still speaking, so we append the text segment to the buffer + logger.debug(f"User is speaking: `{self._user_speaking_buffer}`") + if has_eob: + logger.debug( + f"{self.eob_string} detected but ignored because bot is NOT speaking: `{self._user_speaking_buffer}`" + ) + self._user_speaking_buffer = self._user_speaking_buffer[: -len(self.eob_string)].strip() + completed_words = self._user_speaking_buffer.strip().split()[ + :-1 + ] # assume the last word is not completed + if len(completed_words) >= self.max_buffer_size: + completed_text = " ".join(completed_words) + await self._handle_completed_text(completed_text, direction, is_final=False) + else: + # if vad is not detecting user speaking + logger.debug( + f"VAD is not detecting user speaking, but still received text segment from STT: `{text_segment}`" + ) + is_backchannel = self.is_backchannel(text_segment) + if text_segment.endswith(self.eob_string): + is_backchannel = True + logger.debug(f"Dropping EOB token: `{text_segment}`") + text_segment = text_segment[: -len(self.eob_string)].strip() + elif 
text_segment.endswith(self.eou_string): + logger.debug(f"Dropping EOU token: `{text_segment}`") + text_segment = text_segment[: -len(self.eou_string)].strip() + + if not text_segment.strip(): + return + if is_backchannel and self._bot_speaking: + logger.debug(f"Backchannel detected while bot is speaking: `{text_segment}`") + # push the backchannel string upstream, not downstream + curr_text = str(self._user_speaking_buffer + text_segment) + self._user_speaking_buffer = "" + await self.push_frame( + TranscriptionFrame( + text=f"({curr_text})", + user_id="", + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": f"Backchannel detected: {self._user_speaking_buffer+text_segment}"}, + ), + direction=FrameDirection.UPSTREAM, + ) + else: + # if the text segment is not empty and have non-space characters, we append it to the buffer + self._user_speaking_buffer += text_segment + if self.is_backchannel(self._user_speaking_buffer): + logger.debug(f"Backchannel detected: `{self._user_speaking_buffer}`") + self._user_speaking_buffer = "" + self._have_sent_user_started_speaking = False + return + logger.debug(f"Appending text segment to user speaking buffer: `{self._user_speaking_buffer}`") + + async def _handle_completed_text(self, completed_text: str, direction: FrameDirection, is_final: bool = True): + if not self._have_sent_user_started_speaking: + # if we haven't sent the user started speaking frame, we send it now + # so that the bot can be interrupted and be ready to respond to the new user turn + await self._handle_user_interruption(UserStartedSpeakingFrame()) + self._have_sent_user_started_speaking = True + + completed_text = completed_text.strip() + completed_text = completed_text.replace(self.eou_string, "").replace(self.eob_string, "") + + if self.use_diar and not completed_text.startswith(" {completed_text}" + + frame_type = TranscriptionFrame if is_final else InterimTranscriptionFrame + text_frame = 
frame_type( + text=completed_text, + user_id="", # No speaker ID in this implementation + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": completed_text}, + ) + logger.debug(f"Pushing text frame: {text_frame}") + await self.push_frame(text_frame, direction) + + async def _handle_user_started_speaking(self, frame: VADUserStartedSpeakingFrame, direction: FrameDirection): + self._vad_user_speaking = True + logger.debug("NeMoTurnTakingService: VADUserStartedSpeakingFrame") + await self.push_frame(frame, direction) + + def _contains_only_speaker_tags(self, text: str) -> bool: + """ + Check if the text contains only speaker tags. + """ + return text.strip().startswith("") + + async def _handle_user_stopped_speaking(self, frame: VADUserStoppedSpeakingFrame, direction: FrameDirection): + """ + Handle the user stopped speaking frame. + If the buffer is not empty: + If the bot is not speaking, we push the completed text frame regardless of whether it is a backchannel string. + If the bot is speaking, we ignore the backchannel string if it is a backchannel string. + If the buffer is empty, we do nothing. 
+ """ + if self.use_vad: + self._vad_user_speaking = False + logger.debug("NeMoTurnTakingService: VADUserStoppedSpeakingFrame") + await self.push_frame(frame, direction) + + # if user buffer only contains speaker tags, we don't push the completed text frame + if self._contains_only_speaker_tags(self._user_speaking_buffer): + logger.debug(f"User buffer only contains speaker tags: `{self._user_speaking_buffer}`, ignoring") + return + + is_backchannel = self.is_backchannel(self._user_speaking_buffer) + if not self._user_speaking_buffer: + return + if not self._bot_speaking or not is_backchannel: + logger.debug(f"Bot talking: {self._bot_speaking}, backchannel: {is_backchannel}") + logger.debug(f"Pushing completed text frame for VAD user stopped speaking: {self._user_speaking_buffer}") + await self._handle_completed_text(self._user_speaking_buffer, direction) + self._user_speaking_buffer = "" + if self._have_sent_user_started_speaking: + await self._handle_user_interruption(UserStoppedSpeakingFrame()) + self._have_sent_user_started_speaking = False + elif is_backchannel: + logger.debug(f"Backchannel detected: `{self._user_speaking_buffer}`") + # push the backchannel string upstream, not downstream + await self.push_frame( + TranscriptionFrame( + text=f"({self._user_speaking_buffer})", + user_id="", + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": f"Backchannel detected: {self._user_speaking_buffer}"}, + ), + direction=FrameDirection.UPSTREAM, + ) + self._user_speaking_buffer = "" + self._have_sent_user_started_speaking = False + + async def _handle_user_interruption(self, frame: Frame): + # Adapted from BaseInputTransport._handle_user_interruption + if isinstance(frame, UserStartedSpeakingFrame): + logger.debug("User started speaking") + await self.push_frame(frame) + await self.push_frame(StartInterruptionFrame(), direction=FrameDirection.DOWNSTREAM) + elif isinstance(frame, UserStoppedSpeakingFrame): + 
logger.debug("User stopped speaking") + await self.push_frame(frame) + if self.interruptions_allowed: + await self.push_frame(StopInterruptionFrame(), direction=FrameDirection.DOWNSTREAM) + else: + logger.debug(f"Unknown frame type for _handle_user_interruption: {type(frame)}") + + async def _handle_diar_result(self, frame: DiarResultFrame, direction: FrameDirection): + if not self.use_diar: + logger.debug("Diarization is disabled, skipping") + return + + new_speaker_id = frame.diar_result # speaker id of the dominant speaker + + # logger.debug(f"Dominant speaker ID: {dominant_speaker_id}") + self._prev_speaker_id = self._current_speaker_id + last_speaker_id = self._current_speaker_id + + if not self._user_speaking_buffer.startswith(" to the beginning of the current utterance + self._user_speaking_buffer = f" {self._user_speaking_buffer}" + elif last_speaker_id != new_speaker_id: + # change the speaker tag to the dominant speaker id + self._user_speaking_buffer = self._user_speaking_buffer[len("") :] + self._user_speaking_buffer = f" {self._user_speaking_buffer}" + logger.debug(f"Speaker changed from {last_speaker_id} to {new_speaker_id}") + self._current_speaker_id = new_speaker_id + + +class NeMoTextTurnTakingService(NeMoTurnTakingService): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def _handle_transcription( + self, frame: TranscriptionFrame | InterimTranscriptionFrame, direction: FrameDirection + ): + text_segment = frame.text + if self._vad_user_speaking: + self._user_speaking_buffer = " " + text_segment + is_backchannel = self.is_backchannel(self._user_speaking_buffer) + # num_words = len(self._user_speaking_buffer.strip().split()) + if isinstance(frame, TranscriptionFrame): + logger.debug(f"Completed user turn detected: `{self._user_speaking_buffer}`") + if is_backchannel: + logger.debug(f"Backchannel detected: `{self._user_speaking_buffer}`") + self._user_speaking_buffer = "" + self._have_sent_user_started_speaking = False + 
return + + logger.debug(f"Completed user turn: `{self._user_speaking_buffer}`") + completed_text = self._user_speaking_buffer + await self._handle_completed_text(completed_text, direction) + await self._handle_user_interruption(UserStoppedSpeakingFrame()) + self._have_sent_user_started_speaking = False + self._user_speaking_buffer = "" + + elif isinstance(frame, InterimTranscriptionFrame): + logger.debug(f"InterimTranscription Detected: `{self._user_speaking_buffer}`") + else: + logger.debug(f"User is not speaking, ignoring text segment: `{text_segment}`") diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/utils.py b/nemo/collections/voice_agent/pipecat/services/nemo/utils.py new file mode 100644 index 000000000000..e125d9efd577 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/services/nemo/utils.py @@ -0,0 +1,196 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
 + +import math + +import numpy as np +import torch +from omegaconf import DictConfig + +import nemo.collections.asr as nemo_asr + +LOG_MEL_ZERO = -16.635 + + +class AudioBufferer: + def __init__(self, sample_rate: int, buffer_size_in_secs: float): + self.buffer_size = int(buffer_size_in_secs * sample_rate) + self.sample_buffer = torch.zeros(self.buffer_size, dtype=torch.float32) + + def reset(self) -> None: + """ + Reset the buffer to zero + """ + self.sample_buffer.zero_() + + def update(self, audio: np.ndarray) -> None: + """ + Update the buffer with the new audio samples + Args: + audio (np.ndarray): audio samples to update the buffer with + """ + if not isinstance(audio, torch.Tensor): + audio = torch.from_numpy(audio) + + audio_size = audio.shape[0] + if audio_size > self.buffer_size: + raise ValueError(f"Frame size ({audio_size}) exceeds buffer size ({self.buffer_size})") + + shift = audio_size + self.sample_buffer[:-shift] = self.sample_buffer[shift:].clone() + self.sample_buffer[-shift:] = audio.clone() + + def get_buffer(self) -> torch.Tensor: + """ + Get the current buffer + Returns: + torch.Tensor: current state of the buffer + """ + return self.sample_buffer.clone() + + def is_buffer_empty(self) -> bool: + """ + Check if the buffer is empty + Returns: + bool: True if the buffer is empty, False otherwise + """ + return self.sample_buffer.sum() == 0 + + +class CacheFeatureBufferer: + def __init__( + self, + sample_rate: int, + buffer_size_in_secs: float, + chunk_size_in_secs: float, + preprocessor_cfg: DictConfig, + device: torch.device, + fill_value: float = LOG_MEL_ZERO, + ): + + if buffer_size_in_secs < chunk_size_in_secs: + raise ValueError( + f"Buffer size ({buffer_size_in_secs}s) should be no less than chunk size ({chunk_size_in_secs}s)" + ) + + self.sample_rate = sample_rate + self.buffer_size_in_secs = buffer_size_in_secs + self.chunk_size_in_secs = chunk_size_in_secs + self.device = device + + if hasattr(preprocessor_cfg, 'log') and preprocessor_cfg.log: + 
self.ZERO_LEVEL_SPEC_DB_VAL = LOG_MEL_ZERO # Log-Mel spectrogram value for zero signals + else: + self.ZERO_LEVEL_SPEC_DB_VAL = fill_value + + self.n_feat = preprocessor_cfg.features + self.timestep_duration = preprocessor_cfg.window_stride + self.n_chunk_look_back = int(self.timestep_duration * self.sample_rate) + self.chunk_size = int(self.chunk_size_in_secs * self.sample_rate) + self.sample_buffer = AudioBufferer(sample_rate, buffer_size_in_secs) + + self.feature_buffer_len = int(buffer_size_in_secs / self.timestep_duration) + self.feature_chunk_len = int(chunk_size_in_secs / self.timestep_duration) + self.feature_buffer = torch.full( + [self.n_feat, self.feature_buffer_len], + self.ZERO_LEVEL_SPEC_DB_VAL, + dtype=torch.float32, + device=self.device, + ) + + self.preprocessor = nemo_asr.models.ASRModel.from_config_dict(preprocessor_cfg) + self.preprocessor.to(self.device) + + def is_buffer_empty(self) -> bool: + """ + Check if the buffer is empty + Returns: + bool: True if the buffer is empty, False otherwise + """ + return self.sample_buffer.is_buffer_empty() + + def reset(self) -> None: + """ + Reset the buffer to zero + """ + self.sample_buffer.reset() + self.feature_buffer.fill_(self.ZERO_LEVEL_SPEC_DB_VAL) + + def _update_feature_buffer(self, feat_chunk: torch.Tensor) -> None: + """ + Add an extracted feature to `feature_buffer` + """ + self.feature_buffer[:, : -self.feature_chunk_len] = self.feature_buffer[:, self.feature_chunk_len :].clone() + self.feature_buffer[:, -self.feature_chunk_len :] = feat_chunk.clone() + + def preprocess(self, audio_signal: torch.Tensor) -> torch.Tensor: + """ + Preprocess the audio signal using the preprocessor + Args: + audio_signal (torch.Tensor): audio signal + Returns: + torch.Tensor: preprocessed features + """ + audio_signal = audio_signal.unsqueeze_(0).to(self.device) + audio_signal_len = torch.tensor([audio_signal.shape[1]], device=self.device) + features, _ = self.preprocessor( + input_signal=audio_signal, + 
 length=audio_signal_len, + ) + features = features.squeeze() + return features + + def update(self, audio: np.ndarray) -> None: + """ + Update the sample and feature buffers with the new audio samples + Args: + audio (np.ndarray): audio samples to update the buffers with + """ + + # Update the sample buffer with the new frame + self.sample_buffer.update(audio) + + if math.isclose(self.buffer_size_in_secs, self.chunk_size_in_secs): + # If the buffer size is equal to the chunk size, just take the whole buffer + samples = self.sample_buffer.sample_buffer.clone() + else: + # Add look_back to have context for the first feature + samples = self.sample_buffer.sample_buffer[-(self.n_chunk_look_back + self.chunk_size) :] + + # Get the mel spectrogram + features = self.preprocess(samples) + + # If the features are longer than supposed to be, drop the last frames + # Drop the last diff frames because they might be incomplete + if (diff := features.shape[1] - self.feature_chunk_len - 1) > 0: + features = features[:, :-diff] + + # Update the feature buffer with the new features + self._update_feature_buffer(features[:, -self.feature_chunk_len :]) + + def get_buffer(self) -> torch.Tensor: + """ + Get the current sample buffer + Returns: + torch.Tensor: current state of the buffer + """ + return self.sample_buffer.get_buffer() + + def get_feature_buffer(self) -> torch.Tensor: + """ + Get the current feature buffer + Returns: + torch.Tensor: current state of the feature buffer + """ + return self.feature_buffer.clone() diff --git a/nemo/collections/voice_agent/pipecat/transports/__init__.py b/nemo/collections/voice_agent/pipecat/transports/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/transports/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/voice_agent/pipecat/transports/base_input.py b/nemo/collections/voice_agent/pipecat/transports/base_input.py new file mode 100644 index 000000000000..22113b1c719e --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/transports/base_input.py @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from pipecat.audio.vad.vad_analyzer import VADState +from pipecat.frames.frames import ( + InputAudioRawFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, + VADUserStartedSpeakingFrame, + VADUserStoppedSpeakingFrame, +) +from pipecat.transports.base_input import BaseInputTransport as _BaseInputTransport + + +class BaseInputTransport(_BaseInputTransport): + async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState): + """Handle Voice Activity Detection results and generate appropriate frames.""" + new_vad_state = await self._vad_analyze(audio_frame) + # if new_vad_state != VADState.QUIET and vad_state != VADState.QUIET: + # logger.debug(f"base_input: VAD state changed from {vad_state} to {new_vad_state}") + if new_vad_state != vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING: + frame = None + # If the turn analyser is enabled, this will prevent: + # - Creating the UserStoppedSpeakingFrame + # - Creating the UserStartedSpeakingFrame multiple times + can_create_user_frames = ( + self._params.turn_analyzer is None or not self._params.turn_analyzer.speech_triggered + ) and self._params.can_create_user_frames + + if new_vad_state == VADState.SPEAKING: + await self.push_frame(VADUserStartedSpeakingFrame()) + if can_create_user_frames: + frame = UserStartedSpeakingFrame() + elif new_vad_state == VADState.QUIET: + await self.push_frame(VADUserStoppedSpeakingFrame()) + if can_create_user_frames: + frame = UserStoppedSpeakingFrame() + + if frame: + await self._handle_user_interruption(frame) + + vad_state = new_vad_state + return vad_state diff --git a/nemo/collections/voice_agent/pipecat/transports/base_transport.py b/nemo/collections/voice_agent/pipecat/transports/base_transport.py new file mode 100644 index 000000000000..eb57024611b6 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/transports/base_transport.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from pipecat.transports.base_transport import TransportParams as _TransportParams + + +class TransportParams(_TransportParams): + can_create_user_frames: bool = True diff --git a/nemo/collections/voice_agent/pipecat/transports/network/__init__.py b/nemo/collections/voice_agent/pipecat/transports/network/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/transports/network/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py b/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py new file mode 100644 index 000000000000..1f75c73b2094 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py @@ -0,0 +1,128 @@ +from typing import Optional + +from loguru import logger +from pipecat.serializers.base_serializer import FrameSerializer +from pipecat.transports.base_transport import BaseTransport +from pipecat.transports.network.websocket_server import ( + WebsocketServerCallbacks, + WebsocketServerInputTransport, + WebsocketServerOutputTransport, + WebsocketServerParams, +) + +from nemo.collections.voice_agent.pipecat.transports.base_transport import TransportParams + +try: + import websockets +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use websockets, you need to `pip install pipecat-ai[websocket]`.") + raise Exception(f"Missing module: {e}") + + +class WebsocketServerParams(TransportParams): + """Configuration parameters for WebSocket server transport. + + Parameters: + add_wav_header: Whether to add WAV headers to audio frames. + serializer: Frame serializer for message encoding/decoding. + session_timeout: Timeout in seconds for client sessions. + """ + + add_wav_header: bool = False + serializer: Optional[FrameSerializer] = None + session_timeout: Optional[int] = None + + +class WebsocketServerTransport(BaseTransport): + """WebSocket server transport for bidirectional real-time communication. + + Provides a complete WebSocket server implementation with separate input and + output transports, client connection management, and event handling for + real-time audio and data streaming applications. 
+ """ + + def __init__( + self, + params: WebsocketServerParams, + host: str = "localhost", + port: int = 8765, + input_name: Optional[str] = None, + output_name: Optional[str] = None, + ): + """Initialize the WebSocket server transport. + + Args: + params: WebSocket server configuration parameters. + host: Host address to bind the server to. Defaults to "localhost". + port: Port number to bind the server to. Defaults to 8765. + input_name: Optional name for the input processor. + output_name: Optional name for the output processor. + """ + super().__init__(input_name=input_name, output_name=output_name) + self._host = host + self._port = port + self._params = params + + self._callbacks = WebsocketServerCallbacks( + on_client_connected=self._on_client_connected, + on_client_disconnected=self._on_client_disconnected, + on_session_timeout=self._on_session_timeout, + on_websocket_ready=self._on_websocket_ready, + ) + self._input: Optional[WebsocketServerInputTransport] = None + self._output: Optional[WebsocketServerOutputTransport] = None + self._websocket: Optional[websockets.WebSocketServerProtocol] = None + + # Register supported handlers. The user will only be able to register + # these handlers. + self._register_event_handler("on_client_connected") + self._register_event_handler("on_client_disconnected") + self._register_event_handler("on_session_timeout") + self._register_event_handler("on_websocket_ready") + + def input(self) -> WebsocketServerInputTransport: + """Get the input transport for receiving client data. + + Returns: + The WebSocket server input transport instance. + """ + if not self._input: + self._input = WebsocketServerInputTransport( + self, self._host, self._port, self._params, self._callbacks, name=self._input_name + ) + return self._input + + def output(self) -> WebsocketServerOutputTransport: + """Get the output transport for sending data to clients. + + Returns: + The WebSocket server output transport instance. 
+ """ + if not self._output: + self._output = WebsocketServerOutputTransport(self, self._params, name=self._output_name) + return self._output + + async def _on_client_connected(self, websocket): + """Handle client connection events.""" + if self._output: + await self._output.set_client_connection(websocket) + await self._call_event_handler("on_client_connected", websocket) + else: + logger.error("A WebsocketServerTransport output is missing in the pipeline") + + async def _on_client_disconnected(self, websocket): + """Handle client disconnection events.""" + if self._output: + await self._output.set_client_connection(None) + await self._call_event_handler("on_client_disconnected", websocket) + else: + logger.error("A WebsocketServerTransport output is missing in the pipeline") + + async def _on_session_timeout(self, websocket): + """Handle client session timeout events.""" + await self._call_event_handler("on_session_timeout", websocket) + + async def _on_websocket_ready(self): + """Handle WebSocket server ready events.""" + await self._call_event_handler("on_websocket_ready") diff --git a/nemo/collections/voice_agent/pipecat/utils/__init__.py b/nemo/collections/voice_agent/pipecat/utils/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo/collections/voice_agent/pipecat/utils/text/__init__.py b/nemo/collections/voice_agent/pipecat/utils/text/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/utils/text/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/voice_agent/pipecat/utils/text/simple_text_aggregator.py b/nemo/collections/voice_agent/pipecat/utils/text/simple_text_aggregator.py new file mode 100644 index 000000000000..ada66aef6dec --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/utils/text/simple_text_aggregator.py @@ -0,0 +1,52 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +from pipecat.utils.string import match_endofsentence +from pipecat.utils.text.simple_text_aggregator import SimpleTextAggregator + + +class SimpleSegmentedTextAggregator(SimpleTextAggregator): + def __init__(self, punctuation_marks: str | list[str] = ",!?", **kwargs): + super().__init__(**kwargs) + if not punctuation_marks: + self._punctuation_marks = set() + else: + self._punctuation_marks = set(punctuation_marks) + + def _find_segment_end(self, text: str) -> Optional[int]: + for punc in self._punctuation_marks: + idx = text.find(punc) + if idx != -1: + return idx + return None + + async def aggregate(self, text: str) -> Optional[str]: + result: Optional[str] = None + + self._text += text + + self._text = self._text.replace("*", "") + + eos_end_marker = match_endofsentence(self._text) + + if not eos_end_marker: + eos_end_marker = self._find_segment_end(self._text) + + if eos_end_marker: + result = self._text[:eos_end_marker] + self._text = self._text[eos_end_marker:] + + return result From fda5450ea6f7ef9b0e133d5efe4ae2d3afede4e6 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 23 Jul 2025 17:45:47 -0400 Subject: [PATCH 03/47] update readme Signed-off-by: stevehuang52 --- examples/voice_agent/nemo_chatbot/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/voice_agent/nemo_chatbot/README.md b/examples/voice_agent/nemo_chatbot/README.md index 666292d5d9d6..3a1bf776cc41 100644 --- a/examples/voice_agent/nemo_chatbot/README.md +++ b/examples/voice_agent/nemo_chatbot/README.md @@ -12,6 +12,11 @@ conda env create -f environment.yml Activate the environment via `conda activate nemo-pipecat` +### Configure the server + +Edit the `server/server_config.yaml` file to configure the server. 
+ + ### Run the server ```bash From f843762699ca0623e404fe30b9d0ffa3cc2dd1c9 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 23 Jul 2025 21:24:38 -0400 Subject: [PATCH 04/47] update websocket Signed-off-by: stevehuang52 --- .../transports/network/websocket_server.py | 194 +++++++++++++++++- 1 file changed, 192 insertions(+), 2 deletions(-) diff --git a/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py b/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py index 1f75c73b2094..5b8518fa37f5 100644 --- a/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py +++ b/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py @@ -1,15 +1,46 @@ -from typing import Optional +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import asyncio +import io +import time +import wave +from typing import Awaitable, Callable, Optional from loguru import logger +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + Frame, + InputAudioRawFrame, + OutputAudioRawFrame, + StartFrame, + StartInterruptionFrame, + StopInterruptionFrame, + TransportMessageFrame, + TransportMessageUrgentFrame, +) from pipecat.serializers.base_serializer import FrameSerializer from pipecat.transports.base_transport import BaseTransport from pipecat.transports.network.websocket_server import ( WebsocketServerCallbacks, - WebsocketServerInputTransport, WebsocketServerOutputTransport, WebsocketServerParams, ) +from nemo.collections.voice_agent.pipecat.transports.base_input import BaseInputTransport from nemo.collections.voice_agent.pipecat.transports.base_transport import TransportParams try: @@ -34,6 +65,165 @@ class WebsocketServerParams(TransportParams): session_timeout: Optional[int] = None +class WebsocketServerInputTransport(BaseInputTransport): + """WebSocket server input transport for receiving client data. + + Handles incoming WebSocket connections, message processing, and client + session management including timeout monitoring and connection lifecycle. + """ + + def __init__( + self, + transport: BaseTransport, + host: str, + port: int, + params: WebsocketServerParams, + callbacks: WebsocketServerCallbacks, + **kwargs, + ): + """Initialize the WebSocket server input transport. + + Args: + transport: The parent transport instance. + host: Host address to bind the WebSocket server to. + port: Port number to bind the WebSocket server to. + params: WebSocket server configuration parameters. + callbacks: Callback functions for WebSocket events. + **kwargs: Additional arguments passed to parent class. 
+ """ + super().__init__(params, **kwargs) + + self._transport = transport + self._host = host + self._port = port + self._params = params + self._callbacks = callbacks + + self._websocket: Optional[websockets.WebSocketServerProtocol] = None + + self._server_task = None + + # This task will monitor the websocket connection periodically. + self._monitor_task = None + + self._stop_server_event = asyncio.Event() + + # Whether we have seen a StartFrame already. + self._initialized = False + + async def start(self, frame: StartFrame): + """Start the WebSocket server and initialize components. + + Args: + frame: The start frame containing initialization parameters. + """ + await super().start(frame) + + if self._initialized: + return + + self._initialized = True + + if self._params.serializer: + await self._params.serializer.setup(frame) + if not self._server_task: + self._server_task = self.create_task(self._server_task_handler()) + await self.set_transport_ready(frame) + + async def stop(self, frame: EndFrame): + """Stop the WebSocket server and cleanup resources. + + Args: + frame: The end frame signaling transport shutdown. + """ + await super().stop(frame) + self._stop_server_event.set() + if self._monitor_task: + await self.cancel_task(self._monitor_task) + self._monitor_task = None + if self._server_task: + await self.wait_for_task(self._server_task) + self._server_task = None + + async def cancel(self, frame: CancelFrame): + """Cancel the WebSocket server and stop all processing. + + Args: + frame: The cancel frame signaling immediate cancellation. 
+ """ + await super().cancel(frame) + if self._monitor_task: + await self.cancel_task(self._monitor_task) + self._monitor_task = None + if self._server_task: + await self.cancel_task(self._server_task) + self._server_task = None + + async def cleanup(self): + """Cleanup resources and parent transport.""" + await super().cleanup() + await self._transport.cleanup() + + async def _server_task_handler(self): + """Handle WebSocket server startup and client connections.""" + logger.info(f"Starting websocket server on {self._host}:{self._port}") + async with websockets.serve(self._client_handler, self._host, self._port) as server: + await self._callbacks.on_websocket_ready() + await self._stop_server_event.wait() + + async def _client_handler(self, websocket: websockets.WebSocketServerProtocol, path: Optional[str] = None): + """Handle individual client connections and message processing.""" + logger.info(f"New client connection from {websocket.remote_address}") + if self._websocket: + await self._websocket.close() + logger.warning("Only one client connected, using new connection") + + self._websocket = websocket + + # Notify + await self._callbacks.on_client_connected(websocket) + + # Create a task to monitor the websocket connection + if not self._monitor_task and self._params.session_timeout: + self._monitor_task = self.create_task(self._monitor_websocket(websocket, self._params.session_timeout)) + + # Handle incoming messages + try: + async for message in websocket: + if not self._params.serializer: + continue + + frame = await self._params.serializer.deserialize(message) + + if not frame: + continue + + if isinstance(frame, InputAudioRawFrame): + await self.push_audio_frame(frame) + else: + await self.push_frame(frame) + except Exception as e: + logger.error(f"{self} exception receiving data: {e.__class__.__name__} ({e})") + + # Notify disconnection + await self._callbacks.on_client_disconnected(websocket) + + await self._websocket.close() + self._websocket = None + + 
logger.info(f"Client {websocket.remote_address} disconnected") + + async def _monitor_websocket(self, websocket: websockets.WebSocketServerProtocol, session_timeout: int): + """Monitor WebSocket connection for session timeout.""" + try: + await asyncio.sleep(session_timeout) + if not websocket.closed: + await self._callbacks.on_session_timeout(websocket) + except asyncio.CancelledError: + logger.info(f"Monitoring task cancelled for: {websocket.remote_address}") + raise + + class WebsocketServerTransport(BaseTransport): """WebSocket server transport for bidirectional real-time communication. From 16a27baf689d3b0d2980d2f2efadb3c55a89f23b Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 12:59:43 -0400 Subject: [PATCH 05/47] update Signed-off-by: stevehuang52 --- .gitignore | 1 + examples/voice_agent/README.md | 126 ++++++++++++++++++ .../{nemo_chatbot => }/client/README.md | 0 .../{nemo_chatbot => }/client/index.html | 0 .../client/package-lock.json | 0 .../{nemo_chatbot => }/client/package.json | 0 .../{nemo_chatbot => }/client/src/app.ts | 0 .../{nemo_chatbot => }/client/src/style.css | 0 .../{nemo_chatbot => }/client/tsconfig.json | 0 .../{nemo_chatbot => }/client/vite.config.js | 0 .../{nemo_chatbot => }/environment.yml | 14 +- .../fast-bite.txt} | 58 +++----- examples/voice_agent/nemo_chatbot/README.md | 41 ------ examples/voice_agent/requirements.txt | 5 + .../server/bot_websocket_server.py | 2 +- .../{nemo_chatbot => }/server/env.example | 0 .../server/requirements.txt | 0 .../{nemo_chatbot => }/server/server.py | 0 .../server/server_config.yaml | 19 +-- .../pipecat/services/nemo/legacy_asr.py | 2 +- .../pipecat/services/nemo/legacy_diar.py | 4 +- .../pipecat/transports/base_input.py | 5 + 22 files changed, 177 insertions(+), 100 deletions(-) create mode 100644 examples/voice_agent/README.md rename examples/voice_agent/{nemo_chatbot => }/client/README.md (100%) rename examples/voice_agent/{nemo_chatbot => }/client/index.html (100%) rename 
examples/voice_agent/{nemo_chatbot => }/client/package-lock.json (100%) rename examples/voice_agent/{nemo_chatbot => }/client/package.json (100%) rename examples/voice_agent/{nemo_chatbot => }/client/src/app.ts (100%) rename examples/voice_agent/{nemo_chatbot => }/client/src/style.css (100%) rename examples/voice_agent/{nemo_chatbot => }/client/tsconfig.json (100%) rename examples/voice_agent/{nemo_chatbot => }/client/vite.config.js (100%) rename examples/voice_agent/{nemo_chatbot => }/environment.yml (98%) rename examples/voice_agent/{nemo_chatbot/server/prompts.py => example_prompts/fast-bite.txt} (62%) delete mode 100644 examples/voice_agent/nemo_chatbot/README.md create mode 100644 examples/voice_agent/requirements.txt rename examples/voice_agent/{nemo_chatbot => }/server/bot_websocket_server.py (99%) rename examples/voice_agent/{nemo_chatbot => }/server/env.example (100%) rename examples/voice_agent/{nemo_chatbot => }/server/requirements.txt (100%) rename examples/voice_agent/{nemo_chatbot => }/server/server.py (100%) rename examples/voice_agent/{nemo_chatbot => }/server/server_config.yaml (68%) diff --git a/.gitignore b/.gitignore index d437cc83474c..6184f338cb60 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,4 @@ nemo_experiments/ slurm*.out node_modules/ +.vite/ \ No newline at end of file diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md new file mode 100644 index 000000000000..6fbe3305bf41 --- /dev/null +++ b/examples/voice_agent/README.md @@ -0,0 +1,126 @@ +# NeMo Voice Agent + +A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everthing is deployed locally so you can have your own voice agent. + + + +## ✨ Key Features + +- [x] Open-source, local deployment, and flexible customization. +- [x] Talk to most LLMs from HuggingFace, use different prompts to configure the agent. 
+- [x] Speaker diarization up to 4 speakers.
+- [x] Streaming speech recognition.
+- [x] FastPitch-HiFiGAN TTS.
+- [x] WebSocket server for easy deployment.
+
+
+## 💻 Hardware Requirements
+
+- A computer with at least one GPU. At least 18GB VRAM is recommended for using 8B LLMs, and 10GB VRAM for 4B LLMs.
+- A microphone.
+- A speaker.
+
+
+## 🚀 Quick Start
+
+### Install dependencies
+
+Create a new conda environment with the dependencies:
+```bash
+conda env create -f environment.yml
+```
+
+Activate the environment via `conda activate nemo-voice`
+
+Alternatively, you can install the dependencies manually in an existing environment:
+```bash
+pip install -r requirements.txt
+```
+The incompatibility errors from pip can be ignored.
+
+### Configure the server
+
+Edit the `server/server_config.yaml` file to configure the server, for example:
+- Changing the LLM and prompt you want to use, by either putting a local path to a text file or the whole prompt string. See `example_prompts/` for examples to start with.
+- Configure the LLM parameters, such as temperature, max tokens, etc.
+- Distribute different components to different GPUs if you have more than one.
+- Adjust VAD parameters for sensitivity and end-of-turn detection timeout.
+
+
+### Start the server
+
+Open a terminal and run the server via:
+
+```bash
+NEMO_PATH=??? # Use your local NeMo path for the latest version
+export PYTHONPATH=$NEMO_PATH:$PYTHONPATH
+
+export HF_TOKEN=??? # Use your own HuggingFace token if needed
+export WEBSOCKET_SERVER=websocket_server # currently only support websocket_server
+python ./server/server.py
+```
+
+### Launch the client
+In another terminal, run the client via:
+
+```bash
+cd client
+npm install
+npm run dev
+```
+
+### Connect to the client via browser
+
+Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button.
+
+
+## 📑 Supported Models
+
+### 🤖 LLM
+
+Most LLMs from HuggingFace are supported. A few examples are:
+- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)
+- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
+- [nvidia/Nemotron-Mini-4B-Instruct](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct)
+
+### 🎤 ASR
+
+We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech. While new models are to be released, we use the existing English models for now:
+- [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi)
+- [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms)
+
+### 💬 Diarization
+
+We use [streaming Sortformer](https://arxiv.org/abs/2409.06656) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation.
+
+### 🔉 TTS
+
+We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to generate the speech for the LLM response; more TTS models will be supported in the future.
+
+
+## 📝 Notes
+- If you load the model directly from HuggingFace and get I/O errors, you can set `llm.model=<local_path>`, where the model is downloaded via something like `huggingface-cli download Qwen/Qwen3-8B --local-dir <local_path>`.
+- The current ASR and diarization models are not noise-robust; you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon.
+- The diarization model works best with speakers whose voices are very different from each other, while it might not work well on some accents due to the limited training data.
+
+
+## ☁️ NVIDIA NIM Services
+
+You can also modify the `server/bot_websocket_server.py` to use NVIDIA NIM services for better LLM, ASR, and TTS performance, by referring to these Pipecat services:
+- [NVIDIA NIM LLM Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/nim/llm.py)
+- [NVIDIA Riva ASR Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py)
+- [NVIDIA Riva TTS Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/tts.py)
+
+For details of available NVIDIA NIM services, please refer to:
+- [NVIDIA NIM LLM Service](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html)
+- [NVIDIA Riva ASR NIM Service](https://docs.nvidia.com/nim/riva/asr/latest/overview.html)
+- [NVIDIA Riva TTS NIM Service](https://docs.nvidia.com/nim/riva/tts/latest/overview.html)
+
+
+
+## 💡 Upcoming Next
+- [ ] Faster EOU detection and backchannel handling (e.g., bot will not stop speaking when user is saying something like "uhuh", "wow", "i see").
+- [ ] Better streaming ASR and diarization pipeline.
+- [ ] Better TTS model with more natural voice.
+- [ ] More noise-robust diarization models.
+- [ ] Joint ASR and diarization model.
diff --git a/examples/voice_agent/nemo_chatbot/client/README.md b/examples/voice_agent/client/README.md similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/README.md rename to examples/voice_agent/client/README.md diff --git a/examples/voice_agent/nemo_chatbot/client/index.html b/examples/voice_agent/client/index.html similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/index.html rename to examples/voice_agent/client/index.html diff --git a/examples/voice_agent/nemo_chatbot/client/package-lock.json b/examples/voice_agent/client/package-lock.json similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/package-lock.json rename to examples/voice_agent/client/package-lock.json diff --git a/examples/voice_agent/nemo_chatbot/client/package.json b/examples/voice_agent/client/package.json similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/package.json rename to examples/voice_agent/client/package.json diff --git a/examples/voice_agent/nemo_chatbot/client/src/app.ts b/examples/voice_agent/client/src/app.ts similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/src/app.ts rename to examples/voice_agent/client/src/app.ts diff --git a/examples/voice_agent/nemo_chatbot/client/src/style.css b/examples/voice_agent/client/src/style.css similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/src/style.css rename to examples/voice_agent/client/src/style.css diff --git a/examples/voice_agent/nemo_chatbot/client/tsconfig.json b/examples/voice_agent/client/tsconfig.json similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/tsconfig.json rename to examples/voice_agent/client/tsconfig.json diff --git a/examples/voice_agent/nemo_chatbot/client/vite.config.js b/examples/voice_agent/client/vite.config.js similarity index 100% rename from examples/voice_agent/nemo_chatbot/client/vite.config.js rename to examples/voice_agent/client/vite.config.js 
diff --git a/examples/voice_agent/nemo_chatbot/environment.yml b/examples/voice_agent/environment.yml similarity index 98% rename from examples/voice_agent/nemo_chatbot/environment.yml rename to examples/voice_agent/environment.yml index 8fe2883649a5..6589bcb04ab7 100644 --- a/examples/voice_agent/nemo_chatbot/environment.yml +++ b/examples/voice_agent/environment.yml @@ -1,4 +1,4 @@ -name: nemo-pipecat +name: nemo-voice channels: - defaults dependencies: @@ -210,7 +210,7 @@ dependencies: - mypy-extensions==1.1.0 - nemo-run==0.4.0 - nemo-text-processing==1.1.0 - - nemo-toolkit==2.5.0rc0 + - nemo-toolkit==2.4.0rc2 - nerfacc==0.5.3 - nest-asyncio==1.6.0 - networkx==3.4.2 @@ -246,7 +246,7 @@ dependencies: - onnx==1.17.0 - onnxruntime==1.22.0 - open-clip-torch==2.24.0 - - openai==1.70.0 + - openai==1.74.0 - opencc==1.1.9 - opencc-python-reimplemented==0.1.7 - opentelemetry-api==1.34.1 @@ -264,7 +264,7 @@ dependencies: - pexpect==4.9.0 - pfzy==0.3.4 - pillow==11.1.0 - - pipecat-ai==0.1.dev4182 + - pipecat-ai==0.0.76 - plac==1.4.5 - platformdirs==4.3.8 - pluggy==1.6.0 @@ -312,7 +312,7 @@ dependencies: - pytest-runner==6.0.1 - python-dateutil==2.9.0.post0 - python-dotenv==1.1.0 - - python-graphviz==0.21 + - graphviz==0.21 - python-iso639==2025.2.18 - python-magic==0.4.27 - pytorch-lightning==2.5.1.post0 @@ -411,8 +411,8 @@ dependencies: - typing-inspection==0.4.1 - tzdata==2025.2 - ujson==5.10.0 - - unstructured==0.14.9 - - unstructured-client==0.36.0 + - unstructured + - unstructured-client - urllib3==1.26.20 - uvicorn==0.34.3 - uvloop==0.21.0 diff --git a/examples/voice_agent/nemo_chatbot/server/prompts.py b/examples/voice_agent/example_prompts/fast-bite.txt similarity index 62% rename from examples/voice_agent/nemo_chatbot/server/prompts.py rename to examples/voice_agent/example_prompts/fast-bite.txt index be27fed6d4a2..7170000fdfed 100644 --- a/examples/voice_agent/nemo_chatbot/server/prompts.py +++ b/examples/voice_agent/example_prompts/fast-bite.txt @@ -1,19 +1,3 @@ 
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -lunch_menu = """ Fast Bites Lunch Menu Burgers and Sandwiches @@ -45,27 +29,23 @@ 10. Fountain Soda (16 oz, choices: Coke, Diet Coke, Sprite, Fanta) – $1.99 11. Iced Tea or Lemonade – $2.29 12. Bottled Water – $1.49 -""" -bot_prompt = f""" -{lunch_menu}\n\n -You are a helpful assistant named Lisa that helps customers order food from the lunch menu.\n -Start by greeting the user warmly and introducing yourself within one sentence "Hi welcome to Fast Bites! I'm Lisa, what can I help you with?".\n -Your answer should be concise and to the point.\n -Do not include the whole lunch menu in your response, only include the items that are relevant to the user's question.\n -If the user asks about a specific item, you should include the price of that item.\n -If the user asks about the menu, you should include the entire lunch menu.\n -If the user asks about the prices, you should include the prices of the items.\n -If the user asks about the location, you should include the location of the restaurant (123 Main St, Anytown, USA).\n -If the user asks about the hours, you should include the hours of the restaurant (11:00 AM - 9:00 PM).\n -When a user asks for the total price of the order, you should include the total price of the order.\n -When the conversation is done, you should say "Thank you for your order! Your total is . 
Please come back soon!", where is the total price of the orders of all speakers.\n -If a speaker finishes their order and you don't know their name, you should ask them for their name and associate it with their order.\n -When introducing an item from the menu, you should include the name of the item and the price.\n -Stick strictly to the lunch menu and do not make up any items.\n -You might also see speaker tags (, , etc.) in the user context.\n -You should respond to the user based on the speaker tag and the context of that speaker. \n -Do not include the speaker tags in your response, use them only to identify the speaker.\n -If there are multiple speakers, you should handle the order of each speaker separately and not mix up the speakers.\n -Do not respond only with "Hi" or "Hi there", you should focus on the task of taking the order and not just greeting the user. \n -""" +You are a helpful assistant named Lisa that helps customers order food from the lunch menu. +Start by greeting the user warmly and introducing yourself within one sentence "Hi welcome to Fast Bites! I'm Lisa, what can I help you with?". +Your answer should be concise and to the point. +Do not include the whole lunch menu in your response, only include the items that are relevant to the user's question. +If the user asks about a specific item, you should include the price of that item. +If the user asks about the menu, you should include the entire lunch menu. +If the user asks about the prices, you should include the prices of the items. +If the user asks about the location, you should include the location of the restaurant (123 Main St, Anytown, USA). +If the user asks about the hours, you should include the hours of the restaurant (11:00 AM - 9:00 PM). +When a user asks for the total price of the order, you should include the total price of the order. +When the conversation is done, you should say "Thank you for your order! Your total is . 
Please come back soon!", where is the total price of the orders of all speakers. +If a speaker finishes their order and you don't know their name, you should ask them for their name and associate it with their order. +When introducing an item from the menu, you should include the name of the item and the price. +Stick strictly to the lunch menu and do not make up any items. +You might also see speaker tags (, , etc.) in the user context. +You should respond to the user based on the speaker tag and the context of that speaker. +Do not include the speaker tags in your response, use them only to identify the speaker. +If there are multiple speakers, you should handle the order of each speaker separately and not mix up the speakers. +Do not respond only with "Hi" or "Hi there", you should focus on the task of taking the order and not just greeting the user. diff --git a/examples/voice_agent/nemo_chatbot/README.md b/examples/voice_agent/nemo_chatbot/README.md deleted file mode 100644 index 3a1bf776cc41..000000000000 --- a/examples/voice_agent/nemo_chatbot/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# NeMo Voice Agent - -A Pipecat example demonstrating the simplest way to create a voice agent using `WebsocketTransport`, NeMo STT/TTS service, and HuggingFace LLM. Evertying is deployed locally so you can own your own agent. - -## 🚀 Quick Start - -### Install dependencies - -```bash -conda env create -f environment.yml -``` - -Activate the environment via `conda activate nemo-pipecat` - -### Configure the server - -Edit the `server/server_config.yaml` file to configure the server. - - -### Run the server - -```bash -NEMO_PATH=??? # Use your own NeMo path -export PYTHONPATH=$NEMO_PATH:$PYTHONPATH -export HF_TOKEN=??? 
# Use your own HuggingFace token -export WEBSOCKET_SERVER=websocket_server # currently only support websocket_server -python ./server/server.py -``` - -### Launch the client -In another terminal, run the client via: - -```bash -cd client -npm install -npm run dev -``` - -### Connect to the client via browser - -Open the client via browser: `http://[YOUR SERVER IP ADDRESS]:5173/` diff --git a/examples/voice_agent/requirements.txt b/examples/voice_agent/requirements.txt new file mode 100644 index 000000000000..6415c597d999 --- /dev/null +++ b/examples/voice_agent/requirements.txt @@ -0,0 +1,5 @@ +huggingface-hub +nemo-toolkit +onnxruntime +pipecat-ai +websockets diff --git a/examples/voice_agent/nemo_chatbot/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py similarity index 99% rename from examples/voice_agent/nemo_chatbot/server/bot_websocket_server.py rename to examples/voice_agent/server/bot_websocket_server.py index a0629f75fa94..a9ab1d710a35 100644 --- a/examples/voice_agent/nemo_chatbot/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -121,7 +121,7 @@ TTS_HIFIGAN_MODEL = server_config.tts.hifigan_model TTS_DEVICE = server_config.tts.device -EXTRA_SEPARATOR = server_config.tts.get("extra_separator", ":,!?") +EXTRA_SEPARATOR = server_config.tts.get("extra_separator", None) ################ End of Configuration ################# diff --git a/examples/voice_agent/nemo_chatbot/server/env.example b/examples/voice_agent/server/env.example similarity index 100% rename from examples/voice_agent/nemo_chatbot/server/env.example rename to examples/voice_agent/server/env.example diff --git a/examples/voice_agent/nemo_chatbot/server/requirements.txt b/examples/voice_agent/server/requirements.txt similarity index 100% rename from examples/voice_agent/nemo_chatbot/server/requirements.txt rename to examples/voice_agent/server/requirements.txt diff --git 
a/examples/voice_agent/nemo_chatbot/server/server.py b/examples/voice_agent/server/server.py similarity index 100% rename from examples/voice_agent/nemo_chatbot/server/server.py rename to examples/voice_agent/server/server.py diff --git a/examples/voice_agent/nemo_chatbot/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml similarity index 68% rename from examples/voice_agent/nemo_chatbot/server/server_config.yaml rename to examples/voice_agent/server/server_config.yaml index c195817f991c..5632bd83c472 100644 --- a/examples/voice_agent/nemo_chatbot/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -1,16 +1,17 @@ +# bot_prompt: /path/to/prompt.txt bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker." 
 transport:
-  audio_out_10ms_chunks: 8
+  audio_out_10ms_chunks: 8 # use 4 as websocket default, but increasing to larger number might have fewer glitches in TTS audio
 
 vad:
   type: silero
-  confidence: 0.6
-  start_secs: 0.1
-  stop_secs: 0.8
-  min_volume: 0.4
+  confidence: 0.6 # VAD threshold for detecting speech versus non-speech
+  start_secs: 0.1 # min amount of speech to trigger UserStartSpeaking
+  stop_secs: 0.8 # min amount of silence to trigger UserStopSpeaking
+  min_volume: 0.4 # Microphone volume threshold for VAD
 
 stt:
   type: nemo
@@ -34,9 +35,9 @@ llm:
   type: hf
   model: "/media/data/cache2/meta-llama/Meta-Llama-3-8B-Instruct" # "meta-llama/Meta-Llama-3-8B-Instruct"
   device: "cuda"
-  temperature: 0.7
-  max_tokens: 128
-  top_p: 0.9
+  temperature: 0.7 # LLM sampling params
+  top_p: 0.9 # LLM sampling params
+  max_tokens: 128 # max num of tokens per LLM output
 
 tts:
   type: nemo
@@ -44,4 +45,4 @@ tts:
   fastpitch_model: "/media/data/cache2/nvidia/tts_en_fastpitch/tts_en_fastpitch.nemo" # "nvidia/tts_en_fastpitch"
   hifigan_model: "/media/data/cache2/nvidia/tts_hifigan/tts_hifigan.nemo" # "nvidia/tts_hifigan"
   device: "cuda"
-  extra_separator: ":,?!"
+  extra_separator: ":,?!"
# some punctuations to chunk LLM response into segments for faster TTS output diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py index b25db5466fc9..8ce3bc886e2c 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -18,11 +18,11 @@ import numpy as np import torch from omegaconf import open_dict -from pipecat.services.nemo.utils import CacheFeatureBufferer import nemo.collections.asr as nemo_asr from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer +from nemo.collections.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer class NemoLegacyASRService: diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py index a64cd75e5369..5d869f2e89b5 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -17,10 +17,10 @@ import numpy as np import torch -from pipecat.services.nemo.utils import CacheFeatureBufferer from torch import Tensor from nemo.collections.asr.models import SortformerEncLabelModel +from nemo.collections.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer @dataclass @@ -137,7 +137,7 @@ def __init__( self.streaming_state = self.init_streaming_state(batch_size=1) self.total_preds = torch.zeros((1, 0, self.max_num_speakers), device=self.diarizer.device) - print("NivaDiarService initialized") + print("NeMoLegacyDiarService initialized") def build_diarizer(self): if self.cfg.model_path.endswith(".nemo"): diff --git a/nemo/collections/voice_agent/pipecat/transports/base_input.py b/nemo/collections/voice_agent/pipecat/transports/base_input.py index 
22113b1c719e..73f964cf52ee 100644 --- a/nemo/collections/voice_agent/pipecat/transports/base_input.py +++ b/nemo/collections/voice_agent/pipecat/transports/base_input.py @@ -13,6 +13,7 @@ # limitations under the License. +from loguru import logger from pipecat.audio.vad.vad_analyzer import VADState from pipecat.frames.frames import ( InputAudioRawFrame, @@ -43,10 +44,14 @@ async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState await self.push_frame(VADUserStartedSpeakingFrame()) if can_create_user_frames: frame = UserStartedSpeakingFrame() + else: + logger.debug("base_input: VAD state changed to SPEAKING but can_create_user_frames is False") elif new_vad_state == VADState.QUIET: await self.push_frame(VADUserStoppedSpeakingFrame()) if can_create_user_frames: frame = UserStoppedSpeakingFrame() + else: + logger.debug("base_input: VAD state changed to QUIET but can_create_user_frames is False") if frame: await self._handle_user_interruption(frame) From 94b43bc6a6bfaa54767a9e12d50f061ba1d700f7 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 13:13:26 -0400 Subject: [PATCH 06/47] update Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 38 ++++++++++++++------------ examples/voice_agent/client/src/app.ts | 3 +- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 6fbe3305bf41..e64824156a34 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -6,23 +6,33 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim ## ✨ Key Features -- [x] Open-source, local deployment, and flexible customization. -- [x] Talk to most LLMs from HuggingFace, use different prompts to configure the agent. -- [x] Speaker diarization up to 4 speakers. -- [x] Streaming speech recognition. -- [x] FastPitch-HiFiGAN TTS. -- [x] WebSocket server for easy deployment. 
+- Open-source, local deployment, and flexible customization. +- Talk to most LLMs from HuggingFace, use different prompts to configure the agent. +- Speaker diarization up to 4 speakers. +- Streaming speech recognition. +- FastPitch-HiFiGAN TTS. +- WebSocket server for easy deployment. + + +## 💡 Upcoming Next +- More accurate and noise-robust streaming ASR and diarization models. +- Faster EOU detection and backchannel handling (e.g., bot will not stop speaking when user is saying something like "uhuh", "wow", "i see"). +- Better streaming ASR and diarization pipeline. +- Better TTS model with more natural voice. +- Joint ASR and diarization model. -## 💻 Hardware Requirements -- A computer with at least one GPU. At least 18GB VRAM is recommended for using 8B LLMs, and 10GB VRAM for 4B LLMs. -- A microphone. -- A speaker. ## 🚀 Quick Start +### Hardware requirements + +- A computer with at least one GPU. At least 18GB VRAM is recommended for using 8B LLMs, and 10GB VRAM for 4B LLMs. +- A microphone connected to the computer. +- A speaker connected to the computer. + ### Install dependencies Create a new conda environment with the dependencies: @@ -102,6 +112,7 @@ We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to ge - If directly loading from HuggingFace and got I/O erros, you can set `llm.model=`, where the model is downloaded via somehing like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. - The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. 
+- If using chrome browser, you might need to allow microphone access in the browser settings and add the ip address of the machine to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. ## ☁️ NVIDIA NIM Services @@ -117,10 +128,3 @@ For details of available NVIDIA NIM services, please refer to: - [NVIDIA Riva TTS NIM Service](https://docs.nvidia.com/nim/riva/tts/latest/overview.html) - -## 💡 Upcoming Next -- [ ] Faster EOU detection and backchannel handling (e.g., bot will not stop speaking when user is saying something like "uhuh", "wow", "i see"). -- [ ] Better streaming ASR and diarization pipeline. -- [ ] Better TTS model with more natural voice. -- [ ] More noise-robust diarization models. -- [ ] Joint ASR and diarization model. diff --git a/examples/voice_agent/client/src/app.ts b/examples/voice_agent/client/src/app.ts index 871d3fcc551f..c9809fa69c8a 100644 --- a/examples/voice_agent/client/src/app.ts +++ b/examples/voice_agent/client/src/app.ts @@ -46,8 +46,7 @@ class WebsocketClientApp { private readonly serverConfigs = { websocket: { name: 'WebSocket Server', - baseUrl: 'http://10.110.41.36:7860', - // baseUrl: 'http://localhost:7860', + baseUrl: 'http://localhost:7860', port: 8765 }, fastapi: { From 6ff5302f7d23bc50efc99a8d3e528ca354349ec7 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 13:20:37 -0400 Subject: [PATCH 07/47] update readme Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index e64824156a34..6d1f1dfdd505 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -55,6 +55,8 @@ Edit the `server/server_config.yaml` file to configure the server, for example: - Configure the LLM parameters, such as temperature, max tokens, etc. - Distribute different components to different GPUs if you have more than one. 
- Adjust VAD parameters for sensitivity and end-of-turn detection timeout. +- If you want to access the client from another machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine. + ### Start the server @@ -115,6 +117,7 @@ We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to ge - If using chrome browser, you might need to allow microphone access in the browser settings and add the ip address of the machine to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. + ## ☁️ NVIDIA NIM Services You can also modify the `server/bot_websocket_server.py` to use NVIDIA NIM services for better LLM,ASR and TTS performance, by refering to these Pipecat services: From b45cb1aab8d40a84164ebd9013baa3f881cf42cc Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 13:31:28 -0400 Subject: [PATCH 08/47] update Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 6 +++--- examples/voice_agent/server/server_config.yaml | 12 ++++++------ .../voice_agent/pipecat/services/nemo/diar.py | 3 ++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 6d1f1dfdd505..45c1a37e7d0d 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -8,13 +8,13 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim - Open-source, local deployment, and flexible customization. - Talk to most LLMs from HuggingFace, use different prompts to configure the agent. -- Speaker diarization up to 4 speakers. - Streaming speech recognition. - FastPitch-HiFiGAN TTS. - WebSocket server for easy deployment. ## 💡 Upcoming Next +- Speaker diarization up to 4 speakers (checkpoint will be released verysoon). - More accurate and noise-robust streaming ASR and diarization models. 
- Faster EOU detection and backchannel handling (e.g., bot will not stop speaking when user is saying something like "uhuh", "wow", "i see"). - Better streaming ASR and diarization pipeline. @@ -83,7 +83,7 @@ npm run dev ### Connect to the client via browser -Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. +Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. If using chrome browser, you might need to allow microphone access in the browser settings and add the ip address of the machine to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. ## 📑 Supported Models @@ -114,7 +114,7 @@ We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to ge - If directly loading from HuggingFace and got I/O erros, you can set `llm.model=`, where the model is downloaded via somehing like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. - The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. -- If using chrome browser, you might need to allow microphone access in the browser settings and add the ip address of the machine to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. 
+ diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 5632bd83c472..b29ae7193d6b 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -15,15 +15,15 @@ vad: stt: type: nemo - model: "/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms.nemo" # "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi" + model: "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi" device: "cuda" att_context_size: [70, 1] frame_len_in_secs: 0.08 # default for FastConformer, do not change diar: type: nemo - enabled: true - model: "/home/heh/codes/niva-kunal/im417-normNA-ft3-mem14_epoch23-36.apr23_2025.nemo" + enabled: false + model: null device: "cuda" threshold: 0.4 frame_len_in_secs: 0.08 # default for FastConformer, do not change @@ -33,7 +33,7 @@ turn_taking: llm: type: hf - model: "/media/data/cache2/meta-llama/Meta-Llama-3-8B-Instruct" # "meta-llama/Meta-Llama-3-8B-Instruct" + model: "meta-llama/Meta-Llama-3-8B-Instruct" device: "cuda" temperature: 0.7 # LLM sampling params top_p: 0.9 # LLM sampling params @@ -42,7 +42,7 @@ llm: tts: type: nemo model: fastpitch-hifigan - fastpitch_model: "/media/data/cache2/nvidia/tts_en_fastpitch/tts_en_fastpitch.nemo" # "nvidia/tts_en_fastpitch" - hifigan_model: "/media/data/cache2/nvidia/tts_hifigan/tts_hifigan.nemo" # "nvidia/tts_hifigan" + fastpitch_model: "nvidia/tts_en_fastpitch" + hifigan_model: "nvidia/tts_hifigan" device: "cuda" extra_separator: ":,?!" 
# some punctuations to chunk LLM response into segments for faster TTS output diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py index 168950ef5e76..df002cc9b7c1 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py @@ -107,8 +107,9 @@ def __init__( self._vad_user_speaking = True def _load_model(self): - if not self._enabled: + if not self._enabled or not self._model_name: self._model = None + self._enabled = False return if self._backend == "legacy": From 6c21c77bf5e756601e20104ba6c5c927ca5535bd Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 13:38:44 -0400 Subject: [PATCH 09/47] clean up Signed-off-by: stevehuang52 --- nemo/collections/asr/parts/submodules/rnnt_decoding.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 3dcc6b95ed8b..44b532e25e5c 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -1003,8 +1003,6 @@ def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = num_flattened_tokens += len([c for c in char_offsets[t]['char'] if c != self.blank_id]) if num_flattened_tokens != len(hypothesis.text): - print(f"alignments: {alignments}") - print(f"token_repetitions: {token_repetitions}") raise ValueError( f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}" " have to be of the same length, but are: " From 6118ac995ec17837d20621ce4e111e56b42aba1a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 13:41:47 -0400 Subject: [PATCH 10/47] clean up Signed-off-by: stevehuang52 --- .../voice_agent/pipecat/services/__init__.py | 13 +++++++++++++ .../voice_agent/pipecat/services/nemo/diar.py | 12 ++++++++++-- 2 files changed, 23 
insertions(+), 2 deletions(-) create mode 100644 nemo/collections/voice_agent/pipecat/services/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/services/__init__.py b/nemo/collections/voice_agent/pipecat/services/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/collections/voice_agent/pipecat/services/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py index df002cc9b7c1..b2fa9dbc8b9c 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py @@ -1,8 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # -# Copyright (c) 2024–2025, Daily +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# SPDX-License-Identifier: BSD 2-Clause License +# http://www.apache.org/licenses/LICENSE-2.0 # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. import asyncio From 7e1e62a17c5a06fb3bb8e85d2e50c321b72e78ed Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 13:44:59 -0400 Subject: [PATCH 11/47] fix typo Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 45c1a37e7d0d..cce34463774f 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -1,6 +1,6 @@ # NeMo Voice Agent -A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everthing is deployed locally so you can have your own voice agent. +A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is deployed locally so you can have your own voice agent. 
From ab723a0d444547da8d5446181edccca619946b14 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 14:14:34 -0400 Subject: [PATCH 12/47] fix codeQL Signed-off-by: stevehuang52 --- examples/voice_agent/requirements.txt | 3 +++ .../server/bot_websocket_server.py | 19 -------------- examples/voice_agent/server/requirements.txt | 4 --- .../voice_agent/pipecat/frames/frames.py | 2 +- .../pipecat/services/nemo/__init__.py | 2 -- .../voice_agent/pipecat/services/nemo/diar.py | 26 ++----------------- .../pipecat/services/nemo/legacy_asr.py | 2 -- .../voice_agent/pipecat/services/nemo/stt.py | 14 +++------- .../voice_agent/pipecat/services/nemo/tts.py | 2 +- .../pipecat/services/nemo/turn_taking.py | 3 --- .../pipecat/transports/base_input.py | 2 -- .../transports/network/websocket_server.py | 18 ++----------- 12 files changed, 13 insertions(+), 84 deletions(-) delete mode 100644 examples/voice_agent/server/requirements.txt diff --git a/examples/voice_agent/requirements.txt b/examples/voice_agent/requirements.txt index 6415c597d999..c78bc299602d 100644 --- a/examples/voice_agent/requirements.txt +++ b/examples/voice_agent/requirements.txt @@ -1,5 +1,8 @@ +fastapi[all] huggingface-hub nemo-toolkit onnxruntime pipecat-ai +python-dotenv +uvicorn websockets diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index a9ab1d710a35..0eb6e23882d6 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -270,25 +270,6 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg logger.info("Setting up pipeline...") - class MetricsLogger(FrameProcessor): - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - - if isinstance(frame, MetricsFrame): - for d in frame.data: - if isinstance(d, TTFBMetricsData): - logger.debug(f"TTFB Metrics: {d.processor} 
= {d.value:.3f}s") - elif isinstance(d, ProcessingMetricsData): - logger.debug(f"Processing Metrics: {d.processor} = {d.value:.3f}s") - elif isinstance(d, LLMUsageMetricsData): - tokens = d.value - logger.debug( - f"LLM Usage: {d.processor} - prompt: {tokens.prompt_tokens}, completion: {tokens.completion_tokens}" - ) - elif isinstance(d, TTSUsageMetricsData): - logger.debug(f"TTS Usage: {d.processor} = {d.value} characters") - await self.push_frame(frame, direction) - pipeline = Pipeline( [ ws_transport.input(), diff --git a/examples/voice_agent/server/requirements.txt b/examples/voice_agent/server/requirements.txt deleted file mode 100644 index 270c9195f11b..000000000000 --- a/examples/voice_agent/server/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -fastapi[all] -pipecat-ai[silero,websocket] -python-dotenv -uvicorn diff --git a/nemo/collections/voice_agent/pipecat/frames/frames.py b/nemo/collections/voice_agent/pipecat/frames/frames.py index fbe80fe7fecc..df5f1c2c6fef 100644 --- a/nemo/collections/voice_agent/pipecat/frames/frames.py +++ b/nemo/collections/voice_agent/pipecat/frames/frames.py @@ -13,7 +13,7 @@ # limitations under the License. -from dataclasses import dataclass, field +from dataclasses import dataclass import numpy as np from pipecat.frames.frames import DataFrame diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py b/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py index 8b8ab6f75eee..2830b8a94443 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py @@ -13,8 +13,6 @@ # limitations under the License. 
-import sys - from .diar import NemoDiarService from .llm import HuggingFaceLLMService from .stt import NemoSTTService diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py index b2fa9dbc8b9c..51edc717f74b 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py @@ -14,26 +14,20 @@ import asyncio -import time -from typing import AsyncGenerator, List, Mapping, Optional, Tuple +from typing import AsyncGenerator, Optional import numpy as np -import torch from loguru import logger -from omegaconf import OmegaConf from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, ErrorFrame, Frame, - InterimTranscriptionFrame, StartFrame, - TranscriptionFrame, VADUserStartedSpeakingFrame, VADUserStoppedSpeakingFrame, ) -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.processors.frame_processor import FrameDirection from pipecat.services.stt_service import STTService from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 @@ -43,22 +37,6 @@ from nemo.collections.voice_agent.pipecat.frames.frames import DiarResultFrame from nemo.collections.voice_agent.pipecat.services.nemo.legacy_diar import DiarizationConfig, NeMoLegacyDiarService -try: - import nemo.collections.asr as nemo_asr - from nemo.collections.asr.models import ASRModel - - # disable nemo logging - from nemo.utils import logging - - level = logging.getEffectiveLevel() - logging.setLevel(logging.CRITICAL) - - -except ModuleNotFoundError as e: - logger.error(f"Exception: {e}") - logger.error('In order to use NVIDIA NeMo STT, you need to `pip install "nemo_toolkit[asr]"`.') - raise Exception(f"Missing module: {e}") - class NeMoDiarInputParams(BaseModel): threshold: Optional[float] = 0.5 diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py 
b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py index 8ce3bc886e2c..cd759608a73d 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -207,8 +207,6 @@ def transcribe(self, audio: bytes, stream_id: str = "default", valid_out_len: in # Convert bytes to numpy array audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0 - audio_len_in_secs = len(audio_array) / 16000 - self._audio_buffer.update(audio_array) features = self._audio_buffer.get_feature_buffer() diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/stt.py b/nemo/collections/voice_agent/pipecat/services/nemo/stt.py index e3c207c675e0..1e77bdb20cec 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/stt.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/stt.py @@ -14,14 +14,10 @@ import asyncio -from typing import AsyncGenerator, List, Mapping, Optional, Tuple +from typing import AsyncGenerator, List, Optional -import numpy as np -import torch from loguru import logger -from omegaconf import OmegaConf from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, ErrorFrame, @@ -32,7 +28,7 @@ VADUserStoppedSpeakingFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.stt_service import SegmentedSTTService, STTService +from pipecat.services.stt_service import STTService from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 from pipecat.utils.tracing.service_decorators import traced_stt @@ -41,9 +37,6 @@ from nemo.collections.voice_agent.pipecat.services.nemo.legacy_asr import NemoLegacyASRService try: - import nemo.collections.asr as nemo_asr - from nemo.collections.asr.models import ASRModel - # disable nemo logging from nemo.utils import logging @@ -53,7 +46,7 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - 
logger.error('In order to use NVIDIA NeMo STT, you need to `pip install "nemo_toolkit[asr]"`.') + logger.error('In order to use NVIDIA NeMo STT, you need to `pip install "nemo_toolkit[all]"`.') raise Exception(f"Missing module: {e}") @@ -158,6 +151,7 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: await self.start_processing_metrics() try: + is_final = False transcription = None self.audio_buffer.append(audio) if len(self.audio_buffer) >= self._params.buffer_size: diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py index 57dfaf35b242..37a28f1c873f 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py @@ -15,7 +15,7 @@ import asyncio import inspect from collections.abc import AsyncGenerator -from typing import Iterator, Union +from typing import Iterator import numpy as np import torch diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py b/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py index 90eff8b65d3e..3eb1dfc1867f 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py @@ -16,15 +16,12 @@ from loguru import logger from pipecat.frames.frames import ( - AudioRawFrame, BotStartedSpeakingFrame, BotStoppedSpeakingFrame, Frame, InterimTranscriptionFrame, - StartFrame, StartInterruptionFrame, StopInterruptionFrame, - TextFrame, TranscriptionFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, diff --git a/nemo/collections/voice_agent/pipecat/transports/base_input.py b/nemo/collections/voice_agent/pipecat/transports/base_input.py index 73f964cf52ee..79a477ad3416 100644 --- a/nemo/collections/voice_agent/pipecat/transports/base_input.py +++ b/nemo/collections/voice_agent/pipecat/transports/base_input.py @@ -29,8 +29,6 @@ class 
BaseInputTransport(_BaseInputTransport): async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState): """Handle Voice Activity Detection results and generate appropriate frames.""" new_vad_state = await self._vad_analyze(audio_frame) - # if new_vad_state != VADState.QUIET and vad_state != VADState.QUIET: - # logger.debug(f"base_input: VAD state changed from {vad_state} to {new_vad_state}") if new_vad_state != vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING: frame = None # If the turn analyser is enabled, this will prevent: diff --git a/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py b/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py index 5b8518fa37f5..bc52c579cf73 100644 --- a/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py +++ b/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py @@ -14,24 +14,10 @@ import asyncio -import io -import time -import wave -from typing import Awaitable, Callable, Optional +from typing import Optional from loguru import logger -from pipecat.frames.frames import ( - CancelFrame, - EndFrame, - Frame, - InputAudioRawFrame, - OutputAudioRawFrame, - StartFrame, - StartInterruptionFrame, - StopInterruptionFrame, - TransportMessageFrame, - TransportMessageUrgentFrame, -) +from pipecat.frames.frames import CancelFrame, EndFrame, InputAudioRawFrame, StartFrame from pipecat.serializers.base_serializer import FrameSerializer from pipecat.transports.base_transport import BaseTransport from pipecat.transports.network.websocket_server import ( From af9b5237537887c1adb4b6181c318100ab73ef48 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 21:27:40 -0400 Subject: [PATCH 13/47] update cfg Signed-off-by: stevehuang52 --- examples/voice_agent/server/server_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index b29ae7193d6b..5157c1fd9722 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -45,4 +45,4 @@ tts: fastpitch_model: "nvidia/tts_en_fastpitch" hifigan_model: "nvidia/tts_hifigan" device: "cuda" - extra_separator: ":,?!" # some punctuations to chunk LLM response into segments for faster TTS output + extra_separator: null # additional punctuations to chunk LLM response into segments for faster TTS output, e.g., "," From 08ba934d6a41ba0422f4b94606f8b06b226097b6 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Jul 2025 21:29:30 -0400 Subject: [PATCH 14/47] remove unused Signed-off-by: stevehuang52 --- examples/voice_agent/server/bot_websocket_server.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index 0eb6e23882d6..d63ce71236d8 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -36,13 +36,11 @@ shutdown_event = asyncio.Event() from pipecat.audio.vad.silero import SileroVADAnalyzer, VADParams -from pipecat.frames.frames import EndTaskFrame, MetricsFrame -from pipecat.metrics.metrics import LLMUsageMetricsData, ProcessingMetricsData, TTFBMetricsData, TTSUsageMetricsData +from pipecat.frames.frames import EndTaskFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.processors.frame_processor import Frame, FrameDirection, FrameProcessor from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserver, RTVIProcessor from pipecat.serializers.protobuf import 
ProtobufFrameSerializer From d23e113f30aa4f00c00d55969765aa2c51f58d75 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 25 Jul 2025 15:57:18 -0400 Subject: [PATCH 15/47] update readme Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index cce34463774f..130d1a48188a 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -10,11 +10,11 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim - Talk to most LLMs from HuggingFace, use different prompts to configure the agent. - Streaming speech recognition. - FastPitch-HiFiGAN TTS. +- Speaker diarization up to 4 speakers (checkpoint will be released very soon). - WebSocket server for easy deployment. ## 💡 Upcoming Next -- Speaker diarization up to 4 speakers (checkpoint will be released verysoon). - More accurate and noise-robust streaming ASR and diarization models. - Faster EOU detection and backchannel handling (e.g., bot will not stop speaking when user is saying something like "uhuh", "wow", "i see"). - Better streaming ASR and diarization pipeline. @@ -103,7 +103,7 @@ We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) t ### 💬 Diarization -We use [streaming Sortformer](https://arxiv.org/abs/2409.06656) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. +We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. 
### 🔉 TTS From de7cacc2f2c66d5ca5126d0d86b2d65361e36857 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 25 Jul 2025 18:30:13 -0400 Subject: [PATCH 16/47] change default models Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 2 +- examples/voice_agent/server/server_config.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 130d1a48188a..54c50ca376c2 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -46,7 +46,7 @@ Alternatively, you can install the dependencies manually in an existing environm ```bash pip install -r requirements.txt ``` -The incompatabilities errors from pip can be ignored. +The incompatability errors from pip can be ignored. ### Configure the server diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 5157c1fd9722..ca6ace3a0a70 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -15,7 +15,7 @@ vad: stt: type: nemo - model: "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi" + model: "stt_en_fastconformer_hybrid_large_streaming_80ms" device: "cuda" att_context_size: [70, 1] frame_len_in_secs: 0.08 # default for FastConformer, do not change @@ -33,7 +33,7 @@ turn_taking: llm: type: hf - model: "meta-llama/Meta-Llama-3-8B-Instruct" + model: "Qwen/Qwen3-8B" device: "cuda" temperature: 0.7 # LLM sampling params top_p: 0.9 # LLM sampling params From 371193827f9c4d680817f0d07657a81abb8d7595 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 25 Jul 2025 18:42:07 -0400 Subject: [PATCH 17/47] fix diar diable Signed-off-by: stevehuang52 --- nemo/collections/voice_agent/pipecat/services/nemo/diar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py 
b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py index 51edc717f74b..a2eb653462ad 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py @@ -82,7 +82,6 @@ def __init__( raise ValueError("params is required") self._load_model() - logger.info(f"Diarization service initialized on device: {self._model.device}") self._vad_user_speaking = False self._audio_buffer = [] @@ -106,6 +105,7 @@ def _load_model(self): ) else: raise ValueError(f"Invalid backend: {self._backend}") + logger.info(f"Diarization service initialized on device: {self._device}") def can_generate_metrics(self) -> bool: """Indicates whether this service can generate metrics. From 9200a520a689fe7765ff4645f760aef96af87eba Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 25 Jul 2025 19:08:24 -0400 Subject: [PATCH 18/47] fix diar disable Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 2 + .../server/bot_websocket_server.py | 39 ++++++++++++------- .../voice_agent/server/server_config.yaml | 4 +- .../voice_agent/pipecat/services/nemo/diar.py | 3 -- .../pipecat/services/nemo/legacy_asr.py | 3 +- 5 files changed, 30 insertions(+), 21 deletions(-) diff --git a/examples/voice_agent/README.md index 54c50ca376c2..988abcb76490 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -81,6 +81,8 @@ npm install npm run dev ``` +If you see errors like `SyntaxError: Unexpected reserved word`, please update the Node.js version. + ### Connect to the client via browser Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. 
If using chrome browser, you might need to allow microphone access in the browser settings and add the ip address of the machine to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index d63ce71236d8..4bdace40458e 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -182,15 +182,18 @@ async def run_bot_websocket_server(): ) logger.info("STT service initialized") - diar = NemoDiarService( - model=DIAR_MODEL, - device=STT_DEVICE, - params=diar_params, - sample_rate=SAMPLE_RATE, - backend="legacy", - enabled=USE_DIAR, - ) - logger.info("Diarization service initialized") + if USE_DIAR: + diar = NemoDiarService( + model=DIAR_MODEL, + device=STT_DEVICE, + params=diar_params, + sample_rate=SAMPLE_RATE, + backend="legacy", + enabled=USE_DIAR, + ) + logger.info("Diarization service initialized") + else: + diar = None turn_taking = NeMoTurnTakingService( use_vad=True, @@ -268,21 +271,27 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg logger.info("Setting up pipeline...") - pipeline = Pipeline( + pipeline = [ + ws_transport.input(), + rtvi, + stt, + ] + + if USE_DIAR: + pipeline.append(diar) + + pipeline.extend( [ - ws_transport.input(), - rtvi, - stt, - diar, turn_taking, user_context_aggregator, llm, # LLM tts, ws_transport.output(), - assistant_context_aggregator, ] ) + pipeline = Pipeline(pipeline) + task = PipelineTask( pipeline, params=PipelineParams( diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index ca6ace3a0a70..110d307eb116 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -22,8 +22,8 @@ stt: diar: type: nemo - enabled: false - model: null + enabled: false # the checkpoint is under 
release process + model: null # the checkpoint is under release process device: "cuda" threshold: 0.4 frame_len_in_secs: 0.08 # default for FastConformer, do not change diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py index a2eb653462ad..e923aeaee52d 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/diar.py @@ -165,11 +165,8 @@ def _diarization_processor(self): logger.debug("Received stop signal in background processor") break - # logger.debug(f"Processing audio chunk of size {len(audio)} bytes") - # Process diarization diar_result = self._model.diarize(audio) - # logger.debug(f"Diarization result: {diar_result is not None}") # Send result back to async loop asyncio.run_coroutine_threadsafe(self._response_queue.put(diar_result), self.get_event_loop()) diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py index cd759608a73d..39258b6dc8fe 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -159,7 +159,8 @@ def _load_model(self, model: str): if hasattr(asr_model.encoder, "set_default_att_context_size"): asr_model.encoder.set_default_att_context_size(att_context_size=self.att_context_size) - # chunk_size is set automatically for models trained for streaming. For models trained for offline mode with full context, we need to pass the chunk_size explicitly. + # chunk_size is set automatically for models trained for streaming. + # For models trained for offline mode with full context, we need to pass the chunk_size explicitly. 
if self.chunk_size > 0: if self.shift_size < 0: shift_size = self.chunk_size From af315bd7315e4d17907444fcfd59302c4b82f2e9 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 25 Jul 2025 20:20:31 -0400 Subject: [PATCH 19/47] update ux Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 14 +++++-- .../server/bot_websocket_server.py | 11 ++--- .../voice_agent/server/server_config.yaml | 4 +- .../voice_agent/pipecat/services/nemo/tts.py | 40 ++++++++++++++++++- 4 files changed, 58 insertions(+), 11 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 988abcb76490..6de468e347a0 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -35,6 +35,13 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim ### Install dependencies +First, install or update the npm and node.js to the latest version, for example in Ubuntu: + +```bash +sudo apt-get update +sudo apt-get install -y npm nodejs +``` + Create a new conda environment with the dependencies: ```bash conda env create -f environment.yml @@ -55,7 +62,7 @@ Edit the `server/server_config.yaml` file to configure the server, for example: - Configure the LLM parameters, such as temperature, max tokens, etc. - Distribute different components to different GPUs if you have more than one. - Adjust VAD parameters for sensitivity and end-of-turn detection timeout. -- If you want to access the client from another machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine. +- **If you want to access the client from another machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine.** @@ -81,8 +88,6 @@ npm install npm run dev ``` -If you see errors like `SyntaxError: Unexpected reserved word`, please update the Node.js version. 
- ### Connect to the client via browser Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. If using chrome browser, you might need to allow microphone access in the browser settings and add the ip address of the machine to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. @@ -93,8 +98,9 @@ Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can m ### 🤖 LLM Most LLMs from HuggingFace are supported. A few examples are: -- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) +- [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) - [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) +- [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) - [nvidia/Nemotron-Mini-4B-Instruct](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct) ### 🎤 ASR diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index 4bdace40458e..3cab7f4839e8 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -113,13 +113,13 @@ LLM_TEMPERATURE = server_config.llm.temperature LLM_MAX_TOKENS = server_config.llm.max_tokens LLM_TOP_P = server_config.llm.top_p - +SYSTEM_ROLE = server_config.llm.get("system_role", "system") TTS_FASTPITCH_MODEL = server_config.tts.fastpitch_model TTS_HIFIGAN_MODEL = server_config.tts.hifigan_model TTS_DEVICE = server_config.tts.device - -EXTRA_SEPARATOR = server_config.tts.get("extra_separator", None) +TTS_THINK_TOKENS = server_config.tts.get("think_tokens", None) +TTS_EXTRA_SEPARATOR = server_config.tts.get("extra_separator", None) ################ End of Configuration ################# @@ -213,13 +213,14 @@ 
async def run_bot_websocket_server(): ) logger.info("LLM service initialized") - text_aggregator = SimpleSegmentedTextAggregator(punctuation_marks=EXTRA_SEPARATOR) + text_aggregator = SimpleSegmentedTextAggregator(punctuation_marks=TTS_EXTRA_SEPARATOR) tts = NeMoFastPitchHiFiGANTTSService( fastpitch_model=TTS_FASTPITCH_MODEL, hifigan_model=TTS_HIFIGAN_MODEL, device=TTS_DEVICE, text_aggregator=text_aggregator, + think_tokens=TTS_THINK_TOKENS, ) logger.info("TTS service initialized") @@ -227,7 +228,7 @@ async def run_bot_websocket_server(): context = OpenAILLMContext( [ { - "role": "system", + "role": SYSTEM_ROLE, "content": BOT_PROMPT, } ], diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 110d307eb116..aea6673eb843 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -33,8 +33,9 @@ turn_taking: llm: type: hf - model: "Qwen/Qwen3-8B" + model: "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" device: "cuda" + system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt temperature: 0.7 # LLM sampling params top_p: 0.9 # LLM sampling params max_tokens: 128 # max num of tokens per LLM output @@ -46,3 +47,4 @@ tts: hifigan_model: "nvidia/tts_hifigan" device: "cuda" extra_separator: null # additional punctuations to chunk LLM response into segments for faster TTS output, e.g., "," + think_tokens: ["", ""] # specify them to avoid TTS for thinking process, set to `null` to allow thinking out loud diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py index 37a28f1c873f..64319266f6bf 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py @@ -15,7 +15,7 @@ import asyncio import inspect from collections.abc import AsyncGenerator -from typing import Iterator 
+from typing import Iterator, List, Optional import numpy as np import torch @@ -54,12 +54,18 @@ def __init__( model, device: str = "cuda", sample_rate: int = 22050, + think_tokens: Optional[List[str]] = None, **kwargs, ): super().__init__(sample_rate=sample_rate, **kwargs) self._model_name = model self._device = device self._model = self._setup_model() + self._think_tokens = think_tokens + if think_tokens is not None: + assert ( + isinstance(think_tokens, list) and len(think_tokens) == 2 + ), "think_tokens must be a list of two strings" # Background processing infrastructure - no response handler needed self._tts_queue = asyncio.Queue() @@ -68,6 +74,7 @@ def __init__( # Track pending requests with their response queues self._pending_requests = {} + self._have_seen_think_tokens = False def _setup_model(self): raise NotImplementedError("Subclass must implement _setup_model") @@ -173,8 +180,39 @@ async def _processing_task_handler(self): finally: self._processing_running = False + def _handle_think_tokens(self, text: str) -> Optional[str]: + if not self._think_tokens: + return text + elif self._have_seen_think_tokens: + # LLM is thinking + if self._think_tokens[1] not in text: + # LLM is still thinking + return None + else: + # LLM is done thinking + idx = text.index(self._think_tokens[1]) + # only return the text after the end of thinking tokens + text = text[idx + len(self._think_tokens[1]) :] + self._have_seen_think_tokens = False + return text + elif self._think_tokens[0] in text: + # LLM now starts thinking + self._have_seen_think_tokens = True + # return text before the start of thinking tokens + idx = text.index(self._think_tokens[0]) + text = text[:idx] + return text + else: + return text + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using the Nemo TTS model.""" + text = self._handle_think_tokens(text) + + if not text: + yield None + return + logger.debug(f"{self}: Generating TTS [{text}]") try: From 
c22f41e53c9ab6d099f891959660224b4a5fd350 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 25 Jul 2025 20:23:06 -0400 Subject: [PATCH 20/47] update tts Signed-off-by: stevehuang52 --- .../collections/voice_agent/pipecat/services/nemo/tts.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py index 64319266f6bf..ba65d239ff38 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py @@ -181,6 +181,14 @@ async def _processing_task_handler(self): self._processing_running = False def _handle_think_tokens(self, text: str) -> Optional[str]: + """ + Handle the thinking tokens for TTS. + If the thinking tokens are not provided, return the text as is. + If the thinking tokens are provided, and the LLM is thinking, return None. + If the thinking tokens are provided, and the LLM is done thinking, return the text after the end of thinking tokens. + If the thinking tokens are provided, and the LLM starts thinking, return the text before the start of thinking tokens. + If the thinking tokens are provided, and the LLM is not thinking, return the text as is. 
+ """ if not self._think_tokens: return text elif self._have_seen_think_tokens: @@ -203,6 +211,7 @@ def _handle_think_tokens(self, text: str) -> Optional[str]: text = text[:idx] return text else: + # LLM is not thinking return text async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: From d2451b46e6e7cf4f5c22942e12a24d95a0462d27 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 25 Jul 2025 21:48:53 -0400 Subject: [PATCH 21/47] update readme Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 6de468e347a0..64535e00c7c0 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -42,6 +42,14 @@ sudo apt-get update sudo apt-get install -y npm nodejs ``` +or: + +```bash +curl -fsSL https://fnm.vercel.app/install | bash +. ~/.bashrc +fnm use --install-if-missing 20 +``` + Create a new conda environment with the dependencies: ```bash conda env create -f environment.yml @@ -62,7 +70,8 @@ Edit the `server/server_config.yaml` file to configure the server, for example: - Configure the LLM parameters, such as temperature, max tokens, etc. - Distribute different components to different GPUs if you have more than one. - Adjust VAD parameters for sensitivity and end-of-turn detection timeout. -- **If you want to access the client from another machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine.** + +**If you want to access the client from another machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine.** @@ -90,7 +99,9 @@ npm run dev ### Connect to the client via browser -Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. 
You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. If using chrome browser, you might need to allow microphone access in the browser settings and add the ip address of the machine to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. +Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. + +If using chrome browser, you need to add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. ## 📑 Supported Models @@ -118,11 +129,12 @@ We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the spe We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to generate the speech for the LLM response, more TTS models will be supported in the future. -## 📝 Notes +## 📝 Notes & FAQ - If directly loading from HuggingFace and got I/O erros, you can set `llm.model=`, where the model is downloaded via somehing like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. - The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. - +- If you see errors like `SyntaxError: Unexpected reserved word` when running `npm run dev`, please update the Node.js version. +- If you see the error `Error connecting: Cannot read properties of undefined (reading 'enumerateDevices')`, it usually means the browser is not allowed to access the microphone. 
Please check the browser settings and add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list, e.g., via `chrome://flags/#unsafely-treat-insecure-origin-as-secure` for chrome browser. From ff705be0193a6bc0a38ff059f262c08e177e13f7 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 28 Jul 2025 15:34:38 -0400 Subject: [PATCH 22/47] fix and update Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 5 +- .../server/bot_websocket_server.py | 20 ++----- .../voice_agent/server/server_config.yaml | 12 ++-- .../pipecat/services/nemo/legacy_asr.py | 17 ++---- .../voice_agent/pipecat/services/nemo/llm.py | 58 ++++++++++++------- .../voice_agent/pipecat/services/nemo/tts.py | 2 +- .../pipecat/services/nemo/turn_taking.py | 22 ++++++- 7 files changed, 78 insertions(+), 58 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 64535e00c7c0..2a56609d445f 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -23,8 +23,6 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim - - ## 🚀 Quick Start ### Hardware requirements @@ -117,7 +115,6 @@ Most LLMs from HuggingFace are supported. A few examples are: ### 🎤 ASR We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech. 
While new models are to be released, we use the existing Englishmodels for now: -- [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) - [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms) ### 💬 Diarization @@ -130,7 +127,7 @@ We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to ge ## 📝 Notes & FAQ -- If directly loading from HuggingFace and got I/O erros, you can set `llm.model=`, where the model is downloaded via somehing like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. +- If directly loading from HuggingFace and got I/O errors, you can set `llm.model=`, where the model is downloaded via something like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. Same for TTS models. - The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. - If you see errors like `SyntaxError: Unexpected reserved word` when running `npm run dev`, please update the Node.js version. 
diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index 3cab7f4839e8..89b46d1f3561 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -110,15 +110,14 @@ LLM_MODEL = server_config.llm.model LLM_DEVICE = server_config.llm.device -LLM_TEMPERATURE = server_config.llm.temperature -LLM_MAX_TOKENS = server_config.llm.max_tokens -LLM_TOP_P = server_config.llm.top_p +LLM_GENERATION_KWARGS = OmegaConf.to_container(server_config.llm.get("generation_kwargs", {})) +LLM_APPLY_CHAT_TEMPLATE_KWARGS = OmegaConf.to_container(server_config.llm.get("apply_chat_template_kwargs", None)) SYSTEM_ROLE = server_config.llm.get("system_role", "system") TTS_FASTPITCH_MODEL = server_config.tts.fastpitch_model TTS_HIFIGAN_MODEL = server_config.tts.hifigan_model TTS_DEVICE = server_config.tts.device -TTS_THINK_TOKENS = server_config.tts.get("think_tokens", None) +TTS_THINK_TOKENS = OmegaConf.to_container(server_config.tts.get("think_tokens", None)) TTS_EXTRA_SEPARATOR = server_config.tts.get("extra_separator", None) ################ End of Configuration ################# @@ -207,9 +206,8 @@ async def run_bot_websocket_server(): llm = HuggingFaceLLMService( model=LLM_MODEL, device=LLM_DEVICE, - temperature=LLM_TEMPERATURE, - max_tokens=LLM_MAX_TOKENS, - top_p=LLM_TOP_P, + generation_kwargs=LLM_GENERATION_KWARGS, + apply_chat_template_kwargs=LLM_APPLY_CHAT_TEMPLATE_KWARGS, ) logger.info("LLM service initialized") @@ -282,13 +280,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg pipeline.append(diar) pipeline.extend( - [ - turn_taking, - user_context_aggregator, - llm, # LLM - tts, - ws_transport.output(), - ] + [turn_taking, user_context_aggregator, llm, tts, ws_transport.output(), assistant_context_aggregator] # LLM ) pipeline = Pipeline(pipeline) diff --git a/examples/voice_agent/server/server_config.yaml 
b/examples/voice_agent/server/server_config.yaml index aea6673eb843..6799c3a99e2d 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -29,16 +29,20 @@ diar: frame_len_in_secs: 0.08 # default for FastConformer, do not change turn_taking: - max_buffer_size: 2 + max_buffer_size: 2 # num of words more than this amount will interrupt the LLM immediately + bot_stop_delay: 0.5 # in seconds, a delay between server and client audio output llm: type: hf model: "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" device: "cuda" system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt - temperature: 0.7 # LLM sampling params - top_p: 0.9 # LLM sampling params - max_tokens: 128 # max num of tokens per LLM output + apply_chat_template_kwargs: null # please refer to the model page of each HF LLM model to set them correctly, by default `tokenize=False` and `add_generation_prompt=True` are applied + generation_kwargs: # kwargs that will be passed into model.generate() function of HF models + temperature: 0.7 # LLM sampling params + top_p: 0.9 # LLM sampling params + max_tokens: 128 # max num of tokens per LLM output + do_sample: true tts: type: nemo diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py index 39258b6dc8fe..ea59791bc72e 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -88,7 +88,6 @@ def __init__( ) self._reset_cache() self._previous_hypotheses = self._get_blank_hypothesis() - self._prev_num_tokens = 0 def _reset_cache(self): ( @@ -103,12 +102,9 @@ def _get_blank_hypothesis(self) -> List[Hypothesis]: blank_hypothesis = Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) return [blank_hypothesis] - def calc_drop_extra_pre_encoded(self, step_num) 
-> int: - # for the first step there is no need to drop any tokens after the downsampling as no caching is being used - if step_num == 0 and not self.pad_and_drop_preencoded: - return 0 - else: - return self.asr_model.encoder.streaming_cfg.drop_extra_pre_encoded + @property + def drop_extra_pre_encoded(self): + return self.asr_model.encoder.streaming_cfg.drop_extra_pre_encoded def get_blank_id(self): return len(self.tokenizer.vocab) @@ -149,7 +145,7 @@ def _load_model(self, model: str): decoding_cfg = asr_model.cfg.decoding with open_dict(decoding_cfg): decoding_cfg.strategy = "greedy" - decoding_cfg.compute_timestamps = None + decoding_cfg.compute_timestamps = False decoding_cfg.preserve_alignments = True if hasattr(asr_model, 'joint'): # if an RNNT model decoding_cfg.greedy.max_symbols = 10 @@ -213,8 +209,6 @@ def transcribe(self, audio: bytes, stream_id: str = "default", valid_out_len: in features = self._audio_buffer.get_feature_buffer() feature_lengths = torch.tensor([features.shape[1]], device=self.device) features = features.unsqueeze(0) # Add batch dimension - is_first_step = self._audio_buffer.is_buffer_empty() - step_num = int(not is_first_step) keep_all_outputs = False with torch.no_grad(): @@ -231,7 +225,7 @@ def transcribe(self, audio: bytes, stream_id: str = "default", valid_out_len: in cache_last_time=self._cache_last_time, cache_last_channel_len=self._cache_last_channel_len, keep_all_outputs=False, - drop_extra_pre_encoded=self.calc_drop_extra_pre_encoded(step_num), + drop_extra_pre_encoded=self.drop_extra_pre_encoded, ) if valid_out_len and not keep_all_outputs: @@ -260,4 +254,3 @@ def reset_state(self, stream_id: str = "default"): self._audio_buffer.reset() self._reset_cache() self._previous_hypotheses = self._get_blank_hypothesis() - self._prev_num_tokens = 0 diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py index 1e8d048c8db5..84241be2c3c9 100644 --- 
a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py @@ -25,31 +25,53 @@ from pipecat.services.openai.llm import OpenAILLMService from transformers import AsyncTextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer +DEFAULT_GENERATION_KWARGS = { + "max_new_tokens": 256, + "temperature": 0.7, + "top_p": 0.9, + "do_sample": True, +} + class HuggingFaceLLMLocalService: def __init__( self, model: str = "meta-llama/Meta-Llama-3-8B-Instruct", device: str = "cuda:0", - temperature=0.7, - max_tokens=256, - top_p=0.9, + generation_kwargs: dict = None, + apply_chat_template_kwargs: dict = None, ): self.device = device self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForCausalLM.from_pretrained( model, device_map=device, torch_dtype=torch.bfloat16 ) # type: AutoModelForCausalLM - self.temperature = temperature - self.max_tokens = max_tokens - self.top_p = top_p + + self.generation_kwargs = generation_kwargs if generation_kwargs else DEFAULT_GENERATION_KWARGS + self.apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs else {} + print(f"LLM generation kwargs: {self.generation_kwargs}") + + if "tokenize" in self.apply_chat_template_kwargs: + logger.warning( + f"`tokenize` is not configurable in apply_chat_template_kwargs, it will be ignored and forced to False" + ) + self.apply_chat_template_kwargs.pop("tokenize") + + if "add_generation_prompt" in self.apply_chat_template_kwargs: + logger.warning( + f"`add_generation_prompt` is not configurable in apply_chat_template_kwargs, it will be ignored and forced to True" + ) + self.apply_chat_template_kwargs.pop("add_generation_prompt") + + print(f"LLM apply_chat_template kwargs: {self.apply_chat_template_kwargs}") async def generate_stream( self, messages: List[ChatCompletionMessageParam], **kwargs ) -> AsyncGenerator[ChatCompletionChunk, None]: # Convert messages to prompt format - - prompt = 
self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + prompt = self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs + ) logger.debug(f"LLM prompt: {prompt}") @@ -60,10 +82,7 @@ async def generate_stream( generation_kwargs = { **inputs, "streamer": streamer, - "max_new_tokens": self.max_tokens, - "temperature": self.temperature, - "top_p": self.top_p, - "do_sample": True, + **self.generation_kwargs, } # Start generation in background @@ -92,25 +111,22 @@ def __init__( *, model: str = "google/gemma-7b-it", device: str = "cuda", - temperature=0.7, - max_tokens=256, - top_p=0.9, + generation_kwargs: dict = None, + apply_chat_template_kwargs: dict = None, **kwargs, ): self.model = model self.device = device - self.temperature = temperature - self.max_tokens = max_tokens - self.top_p = top_p + self.generation_kwargs = generation_kwargs if generation_kwargs is not None else DEFAULT_GENERATION_KWARGS + self.apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs is not None else {} super().__init__(model=model, **kwargs) def create_client(self, api_key=None, base_url=None, **kwargs): return HuggingFaceLLMLocalService( model=self.model, device=self.device, - temperature=self.temperature, - max_tokens=self.max_tokens, - top_p=self.top_p, + generation_kwargs=self.generation_kwargs, + apply_chat_template_kwargs=self.apply_chat_template_kwargs, ) async def _process_context(self, context: OpenAILLMContext): diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py index ba65d239ff38..a2954e9ca49f 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/tts.py @@ -65,7 +65,7 @@ def __init__( if think_tokens is not None: assert ( isinstance(think_tokens, list) and len(think_tokens) == 2 - ), 
"think_tokens must be a list of two strings" + ), f"think_tokens must be a list of two strings: {think_tokens}" # Background processing infrastructure - no response handler needed self._tts_queue = asyncio.Queue() diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py b/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py index 3eb1dfc1867f..a1e186cd015d 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time from typing import List from loguru import logger @@ -130,6 +131,7 @@ def __init__( use_diar: bool = False, max_buffer_size: int = 5, backchannel_phrases: List[str] = DEFAULT_BACKCHANNEL_PHRASES, + bot_stop_delay: float = 0.5, **kwargs, ): super().__init__(**kwargs) @@ -141,10 +143,11 @@ def __init__( self.max_buffer_size = max_buffer_size self.backchannel_phrases = backchannel_phrases self.backchannel_phrases_nopc = set([self.clean_text(phrase) for phrase in self.backchannel_phrases]) - + self.bot_stop_delay = bot_stop_delay # internal data self._current_speaker_id = None self._prev_speaker_id = None + self._bot_stop_time = None self._bot_speaking = False self._vad_user_speaking = False self._have_sent_user_started_speaking = False @@ -179,6 +182,17 @@ def is_backchannel(self, text: str) -> bool: async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) + + if self._bot_stop_time is not None: + # check if the bot has stopped speaking for more than the delay + if time.time() - self._bot_stop_time > self.bot_stop_delay: + # set the _bot_speaking flag to False to actually consider the bot as stopped speaking + logger.debug( + f"Bot stopped speaking for more than {self.bot_stop_delay} seconds, setting _bot_speaking to False" + ) + 
self._bot_stop_time = None + self._bot_speaking = False + if isinstance(frame, (TranscriptionFrame, InterimTranscriptionFrame)): await self._handle_transcription(frame, direction) elif isinstance(frame, VADUserStartedSpeakingFrame): @@ -190,7 +204,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): self._bot_speaking = True elif isinstance(frame, BotStoppedSpeakingFrame): logger.debug("BotStoppedSpeakingFrame received") - self._bot_speaking = False + self._bot_stop_time = time.time() + if self.bot_stop_delay is None or self.bot_stop_delay <= 0: + # only set the flag if the delay is not set or is 0 + self._bot_speaking = False + logger.debug(f"Setting _bot_speaking to False") elif isinstance(frame, DiarResultFrame): logger.debug("DiarResultFrame received") await self._handle_diar_result(frame, direction) From 79f799651b8a15fba306d47378f96a24199bde5c Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 28 Jul 2025 17:03:51 -0400 Subject: [PATCH 23/47] fix asr Signed-off-by: stevehuang52 --- .../server/bot_websocket_server.py | 45 ++++++++++++------- .../voice_agent/server/server_config.yaml | 4 +- .../pipecat/services/nemo/legacy_asr.py | 25 ++++++----- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index 89b46d1f3561..c159f5c11226 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -73,17 +73,9 @@ """ ################ Start of Configuration ################# -if server_config.get("bot_prompt", None) is not None: - bot_prompt = server_config.bot_prompt - if os.path.isfile(bot_prompt): - with open(bot_prompt, "r") as f: - bot_prompt = f.read() - BOT_PROMPT = bot_prompt - -logger.info(f"BOT_PROMPT: {BOT_PROMPT}") - TRANSPORT_AUDIO_OUT_10MS_CHUNKS = server_config.transport.audio_out_10ms_chunks +### VAD vad_params = VADParams( 
confidence=server_config.vad.confidence, start_secs=server_config.vad.start_secs, @@ -91,6 +83,7 @@ min_volume=server_config.vad.min_volume, ) +### STT STT_MODEL_PATH = server_config.stt.model STT_DEVICE = server_config.stt.device stt_params = NeMoSTTInputParams( @@ -99,6 +92,7 @@ raw_audio_frame_len_in_secs=RAW_AUDIO_FRAME_LEN_IN_SECS, ) +### Diarization DIAR_MODEL = server_config.diar.model USE_DIAR = server_config.diar.enabled diar_params = NeMoDiarInputParams( @@ -106,19 +100,39 @@ threshold=server_config.diar.threshold, ) +### Turn taking TURN_TAKING_MAX_BUFFER_SIZE = server_config.turn_taking.max_buffer_size +TURN_TAKING_BOT_STOP_DELAY = server_config.turn_taking.bot_stop_delay -LLM_MODEL = server_config.llm.model -LLM_DEVICE = server_config.llm.device -LLM_GENERATION_KWARGS = OmegaConf.to_container(server_config.llm.get("generation_kwargs", {})) -LLM_APPLY_CHAT_TEMPLATE_KWARGS = OmegaConf.to_container(server_config.llm.get("apply_chat_template_kwargs", None)) +### LLM SYSTEM_ROLE = server_config.llm.get("system_role", "system") +if server_config.get("bot_prompt", None) is not None: + bot_prompt = server_config.bot_prompt + if os.path.isfile(bot_prompt): + with open(bot_prompt, "r") as f: + bot_prompt = f.read() + BOT_PROMPT = bot_prompt +logger.info(f"BOT_PROMPT: {BOT_PROMPT}") +LLM_MODEL = server_config.llm.model +LLM_DEVICE = server_config.llm.device +LLM_GENERATION_KWARGS = server_config.llm.get("generation_kwargs", {}) +if LLM_GENERATION_KWARGS is not None: + LLM_GENERATION_KWARGS = OmegaConf.to_container(LLM_GENERATION_KWARGS) +LLM_APPLY_CHAT_TEMPLATE_KWARGS = server_config.llm.get("apply_chat_template_kwargs", None) +if LLM_APPLY_CHAT_TEMPLATE_KWARGS is not None: + LLM_APPLY_CHAT_TEMPLATE_KWARGS = OmegaConf.to_container(LLM_APPLY_CHAT_TEMPLATE_KWARGS) + +### TTS TTS_FASTPITCH_MODEL = server_config.tts.fastpitch_model TTS_HIFIGAN_MODEL = server_config.tts.hifigan_model TTS_DEVICE = server_config.tts.device -TTS_THINK_TOKENS = 
OmegaConf.to_container(server_config.tts.get("think_tokens", None)) +TTS_THINK_TOKENS = server_config.tts.get("think_tokens", None) +if TTS_THINK_TOKENS is not None: + TTS_THINK_TOKENS = OmegaConf.to_container(TTS_THINK_TOKENS) TTS_EXTRA_SEPARATOR = server_config.tts.get("extra_separator", None) +if TTS_EXTRA_SEPARATOR is not None: + TTS_EXTRA_SEPARATOR = OmegaConf.to_container(TTS_EXTRA_SEPARATOR) ################ End of Configuration ################# @@ -198,6 +212,7 @@ async def run_bot_websocket_server(): use_vad=True, use_diar=USE_DIAR, max_buffer_size=TURN_TAKING_MAX_BUFFER_SIZE, + bot_stop_delay=TURN_TAKING_BOT_STOP_DELAY, ) logger.info("Turn taking service initialized") @@ -280,7 +295,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg pipeline.append(diar) pipeline.extend( - [turn_taking, user_context_aggregator, llm, tts, ws_transport.output(), assistant_context_aggregator] # LLM + [turn_taking, user_context_aggregator, llm, tts, ws_transport.output(), assistant_context_aggregator] ) pipeline = Pipeline(pipeline) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 6799c3a99e2d..5ad3b68c4ddf 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -41,8 +41,8 @@ llm: generation_kwargs: # kwargs that will be passed into model.generate() function of HF models temperature: 0.7 # LLM sampling params top_p: 0.9 # LLM sampling params - max_tokens: 128 # max num of tokens per LLM output - do_sample: true + max_new_tokens: 128 # max num of output tokens from LLM + do_sample: true # enable sampling tts: type: nemo diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py index ea59791bc72e..709d1bb6baa9 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py +++ 
b/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -40,6 +40,7 @@ def __init__( sample_rate: int = 16000, frame_len_in_secs: float = 0.08, use_amp: bool = False, + chunk_size_in_secs: float = 0.08, ): self.model = model self.eou_string = eou_string @@ -55,6 +56,7 @@ def __init__( self.use_amp = use_amp self.pad_and_drop_preencoded = False self.blank_id = self.get_blank_id() + self.chunk_size_in_secs = chunk_size_in_secs print("NemoLegacyASRService initialized") @@ -66,19 +68,26 @@ def __init__( self.att_context_size[1] >= 0 ), f"Right att context size must be greater than 0: {self.att_context_size[1]}" - self.buffer_size_in_secs = (1 + sum(self.att_context_size)) * frame_len_in_secs - self.chunk_size_in_secs = frame_len_in_secs # (1 + self.att_context_size[1]) * frame_len_in_secs - window_stride_in_secs = self.asr_model.cfg.preprocessor.window_stride model_stride = self.asr_model.cfg.encoder.subsampling_factor - self.tokens_per_frame = math.ceil(np.trunc(self.chunk_size_in_secs / window_stride_in_secs) / model_stride) self.model_chunk_size = self.asr_model.encoder.streaming_cfg.chunk_size if isinstance(self.model_chunk_size, list): self.model_chunk_size = self.model_chunk_size[1] + self.pre_encode_cache_size = self.asr_model.encoder.streaming_cfg.pre_encode_cache_size + if isinstance(self.pre_encode_cache_size, list): + self.pre_encode_cache_size = self.pre_encode_cache_size[1] + self.pre_encode_cache_size_in_secs = self.pre_encode_cache_size * window_stride_in_secs + + self.tokens_per_frame = math.ceil(np.trunc(self.chunk_size_in_secs / window_stride_in_secs) / model_stride) # overwrite the encoder streaming params with proper shift size for cache aware streaming self.asr_model.encoder.setup_streaming_params( chunk_size=self.model_chunk_size // model_stride, shift_size=self.tokens_per_frame ) + + model_chunk_size_in_secs = self.model_chunk_size * window_stride_in_secs + + self.buffer_size_in_secs = self.pre_encode_cache_size_in_secs + 
model_chunk_size_in_secs + self._audio_buffer = CacheFeatureBufferer( sample_rate=sample_rate, buffer_size_in_secs=self.buffer_size_in_secs, @@ -200,7 +209,7 @@ def _get_tokens_from_alignments(self, alignments): raise ValueError("Decoder type not supported for this model.") return tokens - def transcribe(self, audio: bytes, stream_id: str = "default", valid_out_len: int = 1) -> str: + def transcribe(self, audio: bytes, stream_id: str = "default") -> str: # Convert bytes to numpy array audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0 @@ -209,7 +218,6 @@ def transcribe(self, audio: bytes, stream_id: str = "default", valid_out_len: in features = self._audio_buffer.get_feature_buffer() feature_lengths = torch.tensor([features.shape[1]], device=self.device) features = features.unsqueeze(0) # Add batch dimension - keep_all_outputs = False with torch.no_grad(): ( @@ -228,11 +236,6 @@ def transcribe(self, audio: bytes, stream_id: str = "default", valid_out_len: in drop_extra_pre_encoded=self.drop_extra_pre_encoded, ) - if valid_out_len and not keep_all_outputs: - # drop right context if any - encoded = encoded[:, :, :valid_out_len] - encoded_len = torch.ones_like(encoded_len) * valid_out_len - best_hyp = self._get_best_hypothesis(encoded, encoded_len, partial_hypotheses=self._previous_hypotheses) self._previous_hypotheses = best_hyp From 04e6bb6894948dbc69be89e101fc3758223e9d72 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 28 Jul 2025 17:14:00 -0400 Subject: [PATCH 24/47] update readmme Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 2a56609d445f..a5c86fb8218a 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -69,7 +69,7 @@ Edit the `server/server_config.yaml` file to configure the server, for example: - Distribute different components to 
different GPUs if you have more than one. - Adjust VAD parameters for sensitivity and end-of-turn detection timeout. -**If you want to access the client from another machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine.** +**If you want to access the server from a different machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine.** @@ -81,13 +81,14 @@ Open a terminal and run the server via: NEMO_PATH=??? # Use your local NeMo path for the latest version export PYTHONPATH=$NEMO_PATH:$PYTHONPATH -export HF_TOKEN=??? # Use your own HuggingFace token if needed -export WEBSOCKET_SERVER=websocket_server # currently only support websocket_server +# export HF_TOKEN="hf_..." # Use your own HuggingFace API token if needed, as some models may require. +# export HF_HUB_CACHE="/path/to/your/huggingface/cache" # change where HF cache is stored if you don't want to use the default cache +export WEBSOCKET_SERVER=websocket_server # currently only support `websocket_server` mode python ./server/server.py ``` ### Launch the client -In another terminal, run the client via: +In another terminal on the server machine, start the client via: ```bash cd client @@ -99,7 +100,7 @@ npm run dev Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. -If using chrome browser, you need to add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`. +**If using chrome browser, you need to add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`.** ## 📑 Supported Models @@ -116,6 +117,7 @@ Most LLMs from HuggingFace are supported. 
A few examples are: We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech. While new models are to be released, we use the existing Englishmodels for now: - [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms) +- [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) ### 💬 Diarization From a0df9f3f08c60670449f7294c33e6982657de2ba Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Jul 2025 11:21:28 -0400 Subject: [PATCH 25/47] update doc and llm dtype Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 8 +++++-- .../server/bot_websocket_server.py | 10 ++++++++ .../voice_agent/server/server_config.yaml | 7 +++--- .../voice_agent/pipecat/services/nemo/llm.py | 23 +++++++++++-------- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index a5c86fb8218a..97ba5096aaa8 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -108,15 +108,17 @@ Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can m ### 🤖 LLM Most LLMs from HuggingFace are supported. 
A few examples are: -- [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) +- [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) (default) - [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) - [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) - [nvidia/Nemotron-Mini-4B-Instruct](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct) +Please refer to the HuggingFace webpage of each model to configure the model parameters `llm.generation_kwargs` and `llm.apply_chat_template_kwargs` in `server/server_config.yaml` as needed. + ### 🎤 ASR We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech. While new models are to be released, we use the existing Englishmodels for now: -- [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms) +- [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms) (default) - [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) ### 💬 Diarization @@ -139,6 +141,8 @@ We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to ge ## ☁️ NVIDIA NIM Services +NVIDIA also provides a variety of [NIM](https://developer.nvidia.com/nim?sortBy=developer_learning_library%2Fsort%2Ffeatured_in.nim%3Adesc%2Ctitle%3Aasc&hitsPerPage=12) services for better ASR, TTS and LLM performance with more efficient deployment on either cloud or local servers. 
+ You can also modify the `server/bot_websocket_server.py` to use NVIDIA NIM services for better LLM,ASR and TTS performance, by refering to these Pipecat services: - [NVIDIA NIM LLM Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/nim/llm.py) - [NVIDIA Riva ASR Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py) diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index c159f5c11226..24263d79f9f3 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -73,8 +73,11 @@ """ ################ Start of Configuration ################# + +### Transport TRANSPORT_AUDIO_OUT_10MS_CHUNKS = server_config.transport.audio_out_10ms_chunks + ### VAD vad_params = VADParams( confidence=server_config.vad.confidence, @@ -83,6 +86,7 @@ min_volume=server_config.vad.min_volume, ) + ### STT STT_MODEL_PATH = server_config.stt.model STT_DEVICE = server_config.stt.device @@ -92,6 +96,7 @@ raw_audio_frame_len_in_secs=RAW_AUDIO_FRAME_LEN_IN_SECS, ) + ### Diarization DIAR_MODEL = server_config.diar.model USE_DIAR = server_config.diar.enabled @@ -100,10 +105,12 @@ threshold=server_config.diar.threshold, ) + ### Turn taking TURN_TAKING_MAX_BUFFER_SIZE = server_config.turn_taking.max_buffer_size TURN_TAKING_BOT_STOP_DELAY = server_config.turn_taking.bot_stop_delay + ### LLM SYSTEM_ROLE = server_config.llm.get("system_role", "system") if server_config.get("bot_prompt", None) is not None: @@ -116,6 +123,7 @@ LLM_MODEL = server_config.llm.model LLM_DEVICE = server_config.llm.device +LLM_DTYPE = server_config.llm.dtype LLM_GENERATION_KWARGS = server_config.llm.get("generation_kwargs", {}) if LLM_GENERATION_KWARGS is not None: LLM_GENERATION_KWARGS = OmegaConf.to_container(LLM_GENERATION_KWARGS) @@ -123,6 +131,7 @@ if LLM_APPLY_CHAT_TEMPLATE_KWARGS is not None: LLM_APPLY_CHAT_TEMPLATE_KWARGS = 
OmegaConf.to_container(LLM_APPLY_CHAT_TEMPLATE_KWARGS) + ### TTS TTS_FASTPITCH_MODEL = server_config.tts.fastpitch_model TTS_HIFIGAN_MODEL = server_config.tts.hifigan_model @@ -221,6 +230,7 @@ async def run_bot_websocket_server(): llm = HuggingFaceLLMService( model=LLM_MODEL, device=LLM_DEVICE, + dtype=LLM_DTYPE, generation_kwargs=LLM_GENERATION_KWARGS, apply_chat_template_kwargs=LLM_APPLY_CHAT_TEMPLATE_KWARGS, ) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 5ad3b68c4ddf..69f28e584aff 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -1,10 +1,10 @@ -# bot_prompt: /path/to/prompt.txt +# bot_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt.s bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker." 
transport: - audio_out_10ms_chunks: 8 # use 4 as websocket default, but increasing to larger number might have less glitches in TTS audio + audio_out_10ms_chunks: 8 # use 4 as websocket default, but increasing to a larger number might have less glitches in TTS output vad: type: silero @@ -34,7 +34,8 @@ turn_taking: llm: type: hf - model: "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" + dtype: bfloat16 # torch.dtype for LLM + model: "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" # model name for HF models, will be used via `AutoModelForCausalLM.from_pretrained()` device: "cuda" system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt apply_chat_template_kwargs: null # please refer to the model page of each HF LLM model to set them correctly, by default `tokenize=False` and `add_generation_prompt=True` are applied diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py index 84241be2c3c9..7f40c9fff26e 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py @@ -38,13 +38,15 @@ def __init__( self, model: str = "meta-llama/Meta-Llama-3-8B-Instruct", device: str = "cuda:0", + dtype: str = "bfloat16", generation_kwargs: dict = None, apply_chat_template_kwargs: dict = None, ): self.device = device + self.dtype = dtype self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForCausalLM.from_pretrained( - model, device_map=device, torch_dtype=torch.bfloat16 + model, device_map=device, torch_dtype=dtype ) # type: AutoModelForCausalLM self.generation_kwargs = generation_kwargs if generation_kwargs else DEFAULT_GENERATION_KWARGS @@ -111,22 +113,25 @@ def __init__( *, model: str = "google/gemma-7b-it", device: str = "cuda", + dtype: str = "bfloat16", generation_kwargs: dict = None, apply_chat_template_kwargs: dict = None, **kwargs, ): - self.model = model - self.device = 
device - self.generation_kwargs = generation_kwargs if generation_kwargs is not None else DEFAULT_GENERATION_KWARGS - self.apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs is not None else {} + self._model_name = model + self._device = device + self._dtype = dtype + self._generation_kwargs = generation_kwargs if generation_kwargs is not None else DEFAULT_GENERATION_KWARGS + self._apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs is not None else {} super().__init__(model=model, **kwargs) def create_client(self, api_key=None, base_url=None, **kwargs): return HuggingFaceLLMLocalService( - model=self.model, - device=self.device, - generation_kwargs=self.generation_kwargs, - apply_chat_template_kwargs=self.apply_chat_template_kwargs, + model=self._model_name, + device=self._device, + dtype=self._dtype, + generation_kwargs=self._generation_kwargs, + apply_chat_template_kwargs=self._apply_chat_template_kwargs, ) async def _process_context(self, context: OpenAILLMContext): From 5c51c2914eef1016cfe1b96b8b22e06a4d6b4382 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Jul 2025 11:33:14 -0400 Subject: [PATCH 26/47] refactor and add example prompts Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 1 + examples/voice_agent/client/README.md | 27 ------------------- .../example_prompts/simple_chatbot.txt | 3 +++ .../example_prompts/simple_chatbot_diar.txt | 6 +++++ .../voice_agent/server/server_config.yaml | 4 +-- 5 files changed, 12 insertions(+), 29 deletions(-) delete mode 100644 examples/voice_agent/client/README.md create mode 100644 examples/voice_agent/example_prompts/simple_chatbot.txt create mode 100644 examples/voice_agent/example_prompts/simple_chatbot_diar.txt diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 97ba5096aaa8..52f60092d164 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -83,6 +83,7 @@ export 
PYTHONPATH=$NEMO_PATH:$PYTHONPATH # export HF_TOKEN="hf_..." # Use your own HuggingFace API token if needed, as some models may require. # export HF_HUB_CACHE="/path/to/your/huggingface/cache" # change where HF cache is stored if you don't want to use the default cache +# export SERVER_CONFIG_PATH="/path/to/your/server_config.yaml" # change where the server config is stored if you have a couple of different configs export WEBSOCKET_SERVER=websocket_server # currently only support `websocket_server` mode python ./server/server.py ``` diff --git a/examples/voice_agent/client/README.md b/examples/voice_agent/client/README.md deleted file mode 100644 index 753c6d563780..000000000000 --- a/examples/voice_agent/client/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# JavaScript Implementation - -Basic implementation using the [Pipecat JavaScript SDK](https://docs.pipecat.ai/client/js/introduction). - -## Setup - -1. Run the bot server. See the [server README](../README). - -2. Navigate to the `client/javascript` directory: - -```bash -cd client/javascript -``` - -3. Install dependencies: - -```bash -npm install -``` - -4. Run the client app: - -``` -npm run dev -``` - -5. Visit http://localhost:5173 in your browser. diff --git a/examples/voice_agent/example_prompts/simple_chatbot.txt b/examples/voice_agent/example_prompts/simple_chatbot.txt new file mode 100644 index 000000000000..9ded9d0e841d --- /dev/null +++ b/examples/voice_agent/example_prompts/simple_chatbot.txt @@ -0,0 +1,3 @@ +You are a helpful AI agent named Lisa. +Start by greeting the user warmly and introducing yourself within one sentence. +Your answer should be concise and to the point. 
\ No newline at end of file diff --git a/examples/voice_agent/example_prompts/simple_chatbot_diar.txt b/examples/voice_agent/example_prompts/simple_chatbot_diar.txt new file mode 100644 index 000000000000..abe8db2f3ee3 --- /dev/null +++ b/examples/voice_agent/example_prompts/simple_chatbot_diar.txt @@ -0,0 +1,6 @@ +You are a helpful AI agent named Lisa. +Start by greeting the user warmly and introducing yourself within one sentence. +Your answer should be concise and to the point. +You might also see speaker tags (, , etc.) in the user context. +You should respond to the user based on the speaker tag and the context of that speaker. +Do not include the speaker tags in your response, use them only to identify the speaker. \ No newline at end of file diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 69f28e584aff..a2670166f7d4 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -1,6 +1,6 @@ -# bot_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt.s -bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker." +# bot_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast_bite.txt` +bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point." 
transport: From 025244d3505f1e16ebfb86988600a06110d6880a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Jul 2025 14:43:16 -0400 Subject: [PATCH 27/47] update readme Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 52f60092d164..dde37108276a 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -1,6 +1,6 @@ # NeMo Voice Agent -A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is deployed locally so you can have your own voice agent. +A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is open-source and deployed locally so you can have your own voice agent. Feel free to explore the code and see how different speech technologies can be integrated with LLMs to create a seamless conversation experience. @@ -20,6 +20,7 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim - Better streaming ASR and diarization pipeline. - Better TTS model with more natural voice. - Joint ASR and diarization model. +- Function calling, RAG, etc. 
@@ -33,7 +34,7 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim ### Install dependencies -First, install or update the npm and node.js to the latest version, for example in Ubuntu: +First, install or update the npm and node.js to the latest version, for example: ```bash sudo apt-get update @@ -48,18 +49,19 @@ curl -fsSL https://fnm.vercel.app/install | bash fnm use --install-if-missing 20 ``` -Create a new conda environment with the dependencies: +Second, create a new conda environment with the dependencies: + ```bash conda env create -f environment.yml ``` -Activate the environment via `conda activate nemo-voice` +Then you can activate the environment via `conda activate nemo-voice`. -Alternatively, you can install the dependencies manually in an existing environment: +Alternatively, you can install the dependencies manually in an existing environment via: ```bash pip install -r requirements.txt ``` -The incompatability errors from pip can be ignored. +The incompatability errors from pip can be ignored, if any. ### Configure the server @@ -84,7 +86,6 @@ export PYTHONPATH=$NEMO_PATH:$PYTHONPATH # export HF_TOKEN="hf_..." # Use your own HuggingFace API token if needed, as some models may require. # export HF_HUB_CACHE="/path/to/your/huggingface/cache" # change where HF cache is stored if you don't want to use the default cache # export SERVER_CONFIG_PATH="/path/to/your/server_config.yaml" # change where the server config is stored if you have a couple of different configs -export WEBSOCKET_SERVER=websocket_server # currently only support `websocket_server` mode python ./server/server.py ``` @@ -132,6 +133,7 @@ We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to ge ## 📝 Notes & FAQ +- Only one connection to the server is supported at a time, a new connection will disconnect the previous one, but the context will be preserved. 
- If directly loading from HuggingFace and got I/O erros, you can set `llm.model=`, where the model is downloaded via somehing like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. Same for TTS models. - The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. From 216278a055de22999a20666640a6e60424a412a9 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Jul 2025 14:45:02 -0400 Subject: [PATCH 28/47] update readme Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index dde37108276a..c43a35fb9593 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -119,7 +119,7 @@ Please refer to the HuggingFace webpage of each model to configure the model par ### 🎤 ASR -We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech. While new models are to be released, we use the existing Englishmodels for now: +We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech. 
While new models are to be released, we use the existing English models for now: - [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms) (default) - [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) @@ -129,7 +129,7 @@ We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the spe ### 🔉 TTS -We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to generate the speech for the LLM response, more TTS models will be supported in the future. +We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to generate the speech for the LLM response, and it only supports English output. More TTS models will be supported in the future. ## 📝 Notes & FAQ From 72d0c67d1e3ca93f3a77642fed706409c740feeb Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Jul 2025 16:56:43 -0400 Subject: [PATCH 29/47] clean up Signed-off-by: stevehuang52 --- nemo/collections/asr/parts/mixins/mixins.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index de32784a4032..d99ae3cc70b4 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -18,7 +18,7 @@ import tarfile import unicodedata from abc import ABC, abstractmethod -from typing import List, Optional +from typing import List import torch from omegaconf import DictConfig, OmegaConf, open_dict @@ -602,7 +602,6 @@ def conformer_stream_step( drop_extra_pre_encoded: int = None, return_transcription: bool = True, return_log_probs: bool = False, - valid_out_len: Optional[int] = None, ): """ It simulates a forward step with caching for streaming purposes. 
@@ -660,11 +659,6 @@ def conformer_stream_step( drop_extra_pre_encoded=drop_extra_pre_encoded, ) - if valid_out_len and not keep_all_outputs: - # drop right context if any - encoded = encoded[:, :, :valid_out_len] - encoded_len = torch.ones_like(encoded_len) * valid_out_len - if isinstance(self, asr_models.EncDecCTCModel) or ( isinstance(self, asr_models.EncDecHybridRNNTCTCModel) and self.cur_decoder == "ctc" ): From 149487e5895dc430f6287ac93676e47153934786 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 1 Aug 2025 18:19:30 -0400 Subject: [PATCH 30/47] clean up Signed-off-by: stevehuang52 --- nemo/collections/voice_agent/pipecat/services/nemo/llm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py index 7f40c9fff26e..8feb62cadc0e 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py +++ b/nemo/collections/voice_agent/pipecat/services/nemo/llm.py @@ -17,7 +17,6 @@ from threading import Thread from typing import AsyncGenerator, List -import torch from loguru import logger from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam from pipecat.frames.frames import LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMTextFrame From 9c1470275d92d871bec36c69ced7d019d80c270a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 19 Aug 2025 11:56:33 -0400 Subject: [PATCH 31/47] update info on streaming sortformer Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 5 +++-- examples/voice_agent/server/server_config.yaml | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index c43a35fb9593..50b6013f9af8 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -10,7 +10,7 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim - Talk to most LLMs from HuggingFace, 
use different prompts to configure the agent. - Streaming speech recognition. - FastPitch-HiFiGAN TTS. -- Speaker diarization up to 4 speakers (checkpoint will be released very soon). +- Speaker diarization up to 4 speakers. - WebSocket server for easy deployment. @@ -125,7 +125,8 @@ We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) t ### 💬 Diarization -We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. +We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. Currently supported models are: + - [nvidia/diar_streaming_sortformer_4spk-v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2) (default) ### 🔉 TTS diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index a2670166f7d4..9cbb36ec77b4 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -1,6 +1,6 @@ # bot_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast_bite.txt` -bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point." +bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. 
You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker." transport: @@ -22,8 +22,8 @@ stt: diar: type: nemo - enabled: false # the checkpoint is under release process - model: null # the checkpoint is under release process + enabled: true # set to false to disable + model: "nvidia/diar_streaming_sortformer_4spk-v2" device: "cuda" threshold: 0.4 frame_len_in_secs: 0.08 # default for FastConformer, do not change From de1b138e70267b2576fe84f0d67d2f9e2a9b4289 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 19 Aug 2025 21:37:19 -0400 Subject: [PATCH 32/47] move code to 'nemo/agents/voice_agent' Signed-off-by: stevehuang52 --- .../voice_agent/server/bot_websocket_server.py | 14 +++++++------- .../voice_agent => agents}/__init__.py | 0 .../frames => agents/voice_agent}/__init__.py | 0 .../voice_agent/pipecat/__init__.py | 0 .../voice_agent/pipecat/frames}/__init__.py | 0 .../voice_agent/pipecat/frames/frames.py | 0 .../voice_agent/pipecat/services}/__init__.py | 0 .../voice_agent/pipecat/services/nemo/__init__.py | 0 .../voice_agent/pipecat/services/nemo/diar.py | 4 ++-- .../pipecat/services/nemo/legacy_asr.py | 2 +- .../pipecat/services/nemo/legacy_diar.py | 2 +- .../voice_agent/pipecat/services/nemo/llm.py | 0 .../voice_agent/pipecat/services/nemo/stt.py | 2 +- .../voice_agent/pipecat/services/nemo/tts.py | 0 .../pipecat/services/nemo/turn_taking.py | 2 +- .../voice_agent/pipecat/services/nemo/utils.py | 0 .../voice_agent/pipecat/transports}/__init__.py | 0 .../voice_agent/pipecat/transports/base_input.py | 0 .../pipecat/transports/base_transport.py | 0 .../pipecat/transports/network}/__init__.py | 0 .../pipecat/transports/network/websocket_server.py | 4 ++-- .../voice_agent/pipecat/utils}/__init__.py | 0 .../voice_agent/pipecat/utils/text/__init__.py | 13 +++++++++++++ .../pipecat/utils/text/simple_text_aggregator.py | 0 24 files changed, 28 
insertions(+), 15 deletions(-) rename nemo/{collections/voice_agent => agents}/__init__.py (100%) rename nemo/{collections/voice_agent/pipecat/frames => agents/voice_agent}/__init__.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/__init__.py (100%) rename nemo/{collections/voice_agent/pipecat/services => agents/voice_agent/pipecat/frames}/__init__.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/frames/frames.py (100%) rename nemo/{collections/voice_agent/pipecat/transports => agents/voice_agent/pipecat/services}/__init__.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/__init__.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/diar.py (98%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/legacy_asr.py (99%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/legacy_diar.py (99%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/llm.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/stt.py (98%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/tts.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/turn_taking.py (99%) rename nemo/{collections => agents}/voice_agent/pipecat/services/nemo/utils.py (100%) rename nemo/{collections/voice_agent/pipecat/transports/network => agents/voice_agent/pipecat/transports}/__init__.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/transports/base_input.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/transports/base_transport.py (100%) rename nemo/{collections/voice_agent/pipecat/utils => agents/voice_agent/pipecat/transports/network}/__init__.py (100%) rename nemo/{collections => agents}/voice_agent/pipecat/transports/network/websocket_server.py (98%) rename nemo/{collections/voice_agent/pipecat/utils/text => agents/voice_agent/pipecat/utils}/__init__.py (100%) create 
mode 100644 nemo/agents/voice_agent/pipecat/utils/text/__init__.py rename nemo/{collections => agents}/voice_agent/pipecat/utils/text/simple_text_aggregator.py (100%) diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index 24263d79f9f3..4a28e3a62a15 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -44,16 +44,16 @@ from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserver, RTVIProcessor from pipecat.serializers.protobuf import ProtobufFrameSerializer -from nemo.collections.voice_agent.pipecat.services.nemo.diar import NeMoDiarInputParams, NemoDiarService -from nemo.collections.voice_agent.pipecat.services.nemo.llm import HuggingFaceLLMService -from nemo.collections.voice_agent.pipecat.services.nemo.stt import NeMoSTTInputParams, NemoSTTService -from nemo.collections.voice_agent.pipecat.services.nemo.tts import NeMoFastPitchHiFiGANTTSService -from nemo.collections.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService -from nemo.collections.voice_agent.pipecat.transports.network.websocket_server import ( +from nemo.agents.voice_agent.pipecat.services.nemo.diar import NeMoDiarInputParams, NemoDiarService +from nemo.agents.voice_agent.pipecat.services.nemo.llm import HuggingFaceLLMService +from nemo.agents.voice_agent.pipecat.services.nemo.stt import NeMoSTTInputParams, NemoSTTService +from nemo.agents.voice_agent.pipecat.services.nemo.tts import NeMoFastPitchHiFiGANTTSService +from nemo.agents.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService +from nemo.agents.voice_agent.pipecat.transports.network.websocket_server import ( WebsocketServerParams, WebsocketServerTransport, ) -from nemo.collections.voice_agent.pipecat.utils.text.simple_text_aggregator import SimpleSegmentedTextAggregator +from 
nemo.agents.voice_agent.pipecat.utils.text.simple_text_aggregator import SimpleSegmentedTextAggregator SERVER_CONFIG_PATH = os.environ.get( "SERVER_CONFIG_PATH", f"{os.path.dirname(os.path.abspath(__file__))}/server_config.yaml" diff --git a/nemo/collections/voice_agent/__init__.py b/nemo/agents/__init__.py similarity index 100% rename from nemo/collections/voice_agent/__init__.py rename to nemo/agents/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/frames/__init__.py b/nemo/agents/voice_agent/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/frames/__init__.py rename to nemo/agents/voice_agent/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/__init__.py b/nemo/agents/voice_agent/pipecat/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/__init__.py rename to nemo/agents/voice_agent/pipecat/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/services/__init__.py b/nemo/agents/voice_agent/pipecat/frames/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/services/__init__.py rename to nemo/agents/voice_agent/pipecat/frames/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/frames/frames.py b/nemo/agents/voice_agent/pipecat/frames/frames.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/frames/frames.py rename to nemo/agents/voice_agent/pipecat/frames/frames.py diff --git a/nemo/collections/voice_agent/pipecat/transports/__init__.py b/nemo/agents/voice_agent/pipecat/services/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/transports/__init__.py rename to nemo/agents/voice_agent/pipecat/services/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/__init__.py b/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/services/nemo/__init__.py rename to 
nemo/agents/voice_agent/pipecat/services/nemo/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/diar.py similarity index 98% rename from nemo/collections/voice_agent/pipecat/services/nemo/diar.py rename to nemo/agents/voice_agent/pipecat/services/nemo/diar.py index e923aeaee52d..83c717fdb998 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/diar.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/diar.py @@ -34,8 +34,8 @@ from pipecat.utils.tracing.service_decorators import traced_stt from pydantic import BaseModel -from nemo.collections.voice_agent.pipecat.frames.frames import DiarResultFrame -from nemo.collections.voice_agent.pipecat.services.nemo.legacy_diar import DiarizationConfig, NeMoLegacyDiarService +from nemo.agents.voice_agent.pipecat.frames.frames import DiarResultFrame +from nemo.agents.voice_agent.pipecat.services.nemo.legacy_diar import DiarizationConfig, NeMoLegacyDiarService class NeMoDiarInputParams(BaseModel): diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py similarity index 99% rename from nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py rename to nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py index 709d1bb6baa9..118a9567b688 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_asr.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -20,9 +20,9 @@ from omegaconf import open_dict import nemo.collections.asr as nemo_asr +from nemo.agents.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer -from nemo.collections.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer class NemoLegacyASRService: diff --git 
a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py similarity index 99% rename from nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py rename to nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py index 5d869f2e89b5..c345c700d073 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/legacy_diar.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -19,8 +19,8 @@ import torch from torch import Tensor +from nemo.agents.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer from nemo.collections.asr.models import SortformerEncLabelModel -from nemo.collections.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer @dataclass diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/llm.py b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/services/nemo/llm.py rename to nemo/agents/voice_agent/pipecat/services/nemo/llm.py diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/stt.py b/nemo/agents/voice_agent/pipecat/services/nemo/stt.py similarity index 98% rename from nemo/collections/voice_agent/pipecat/services/nemo/stt.py rename to nemo/agents/voice_agent/pipecat/services/nemo/stt.py index 1e77bdb20cec..916f5e2fcd75 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/stt.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/stt.py @@ -34,7 +34,7 @@ from pipecat.utils.tracing.service_decorators import traced_stt from pydantic import BaseModel -from nemo.collections.voice_agent.pipecat.services.nemo.legacy_asr import NemoLegacyASRService +from nemo.agents.voice_agent.pipecat.services.nemo.legacy_asr import NemoLegacyASRService try: # disable nemo logging diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/tts.py b/nemo/agents/voice_agent/pipecat/services/nemo/tts.py similarity index 100% rename from 
nemo/collections/voice_agent/pipecat/services/nemo/tts.py rename to nemo/agents/voice_agent/pipecat/services/nemo/tts.py diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py similarity index 99% rename from nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py rename to nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py index a1e186cd015d..dfc062c4c559 100644 --- a/nemo/collections/voice_agent/pipecat/services/nemo/turn_taking.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py @@ -33,7 +33,7 @@ from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 -from nemo.collections.voice_agent.pipecat.frames.frames import DiarResultFrame +from nemo.agents.voice_agent.pipecat.frames.frames import DiarResultFrame DEFAULT_BACKCHANNEL_PHRASES = [ "cool", diff --git a/nemo/collections/voice_agent/pipecat/services/nemo/utils.py b/nemo/agents/voice_agent/pipecat/services/nemo/utils.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/services/nemo/utils.py rename to nemo/agents/voice_agent/pipecat/services/nemo/utils.py diff --git a/nemo/collections/voice_agent/pipecat/transports/network/__init__.py b/nemo/agents/voice_agent/pipecat/transports/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/transports/network/__init__.py rename to nemo/agents/voice_agent/pipecat/transports/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/transports/base_input.py b/nemo/agents/voice_agent/pipecat/transports/base_input.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/transports/base_input.py rename to nemo/agents/voice_agent/pipecat/transports/base_input.py diff --git a/nemo/collections/voice_agent/pipecat/transports/base_transport.py b/nemo/agents/voice_agent/pipecat/transports/base_transport.py similarity index 100% rename from 
nemo/collections/voice_agent/pipecat/transports/base_transport.py rename to nemo/agents/voice_agent/pipecat/transports/base_transport.py diff --git a/nemo/collections/voice_agent/pipecat/utils/__init__.py b/nemo/agents/voice_agent/pipecat/transports/network/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/utils/__init__.py rename to nemo/agents/voice_agent/pipecat/transports/network/__init__.py diff --git a/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py b/nemo/agents/voice_agent/pipecat/transports/network/websocket_server.py similarity index 98% rename from nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py rename to nemo/agents/voice_agent/pipecat/transports/network/websocket_server.py index bc52c579cf73..800d9ddb860e 100644 --- a/nemo/collections/voice_agent/pipecat/transports/network/websocket_server.py +++ b/nemo/agents/voice_agent/pipecat/transports/network/websocket_server.py @@ -26,8 +26,8 @@ WebsocketServerParams, ) -from nemo.collections.voice_agent.pipecat.transports.base_input import BaseInputTransport -from nemo.collections.voice_agent.pipecat.transports.base_transport import TransportParams +from nemo.agents.voice_agent.pipecat.transports.base_input import BaseInputTransport +from nemo.agents.voice_agent.pipecat.transports.base_transport import TransportParams try: import websockets diff --git a/nemo/collections/voice_agent/pipecat/utils/text/__init__.py b/nemo/agents/voice_agent/pipecat/utils/__init__.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/utils/text/__init__.py rename to nemo/agents/voice_agent/pipecat/utils/__init__.py diff --git a/nemo/agents/voice_agent/pipecat/utils/text/__init__.py b/nemo/agents/voice_agent/pipecat/utils/text/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/utils/text/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/voice_agent/pipecat/utils/text/simple_text_aggregator.py b/nemo/agents/voice_agent/pipecat/utils/text/simple_text_aggregator.py similarity index 100% rename from nemo/collections/voice_agent/pipecat/utils/text/simple_text_aggregator.py rename to nemo/agents/voice_agent/pipecat/utils/text/simple_text_aggregator.py From 4997febf8b3c3a43cb33d71b287bca6c0f56e79e Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 21 Aug 2025 11:59:40 -0400 Subject: [PATCH 33/47] update doc Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 50b6013f9af8..156bdc9c64ae 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -1,16 +1,16 @@ # NeMo Voice Agent -A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is open-source and deployed locally so you can have your own voice agent. Feel free to explore the code and see how different speech technologies can be integrated with LLMs to create a seamless conversation experience. 
+A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is open-source and deployed locally so you can have your own voice agent. Feel free to explore the code and see how different speech technologies can be integrated with LLMs to create a seamless conversation experience. As of now, we only support English input and output, but more languages will be supported in the future. ## ✨ Key Features - Open-source, local deployment, and flexible customization. -- Talk to most LLMs from HuggingFace, use different prompts to configure the agent. -- Streaming speech recognition. -- FastPitch-HiFiGAN TTS. -- Speaker diarization up to 4 speakers. +- Allow users to talk to most LLMs from HuggingFace with configurable prompts. +- Streaming speech recognition with low latency. +- FastPitch-HiFiGAN TTS for fast audio response generation. +- Speaker diarization up to 4 speakers across different turns. - WebSocket server for easy deployment. @@ -128,6 +128,9 @@ We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) t We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. Currently supported models are: - [nvidia/diar_streaming_sortformer_4spk-v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2) (default) + +Please note that in some circumstances, the diarization model might not work well in noisy environments, or it may confuse the speakers. In this case, you can disable the diarization by setting `diar.enabled` to `false` in `server/server_config.yaml`. 
+ ### 🔉 TTS We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to generate the speech for the LLM response, and it only supports English output. More TTS models will be supported in the future. From f31bd13aa778dae03768a2cce608f5ce5d667b4a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 25 Aug 2025 13:47:39 -0400 Subject: [PATCH 34/47] clean up Signed-off-by: stevehuang52 --- .../pipecat/services/nemo/turn_taking.py | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py index dfc062c4c559..f658dda0c190 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py @@ -129,7 +129,7 @@ def __init__( language: Language = Language.EN_US, use_vad: bool = True, use_diar: bool = False, - max_buffer_size: int = 5, + max_buffer_size: int = 3, backchannel_phrases: List[str] = DEFAULT_BACKCHANNEL_PHRASES, bot_stop_delay: float = 0.5, **kwargs, @@ -249,8 +249,7 @@ async def _handle_transcription( direction=FrameDirection.UPSTREAM, ) self._have_sent_user_started_speaking = False # treat it as if the user is not speaking - completed_text = "" - self._user_speaking_buffer = "" + self._user_speaking_buffer = "" # discard backchannel string and reset the buffer else: # if bot is not speaking, the backchannel string is not considered a backchannel phrase # user is still speaking, so we append the text segment to the buffer @@ -422,36 +421,3 @@ async def _handle_diar_result(self, frame: DiarResultFrame, direction: FrameDire self._user_speaking_buffer = f" {self._user_speaking_buffer}" logger.debug(f"Speaker changed from {last_speaker_id} to {new_speaker_id}") self._current_speaker_id = new_speaker_id - - -class NeMoTextTurnTakingService(NeMoTurnTakingService): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - 
async def _handle_transcription( - self, frame: TranscriptionFrame | InterimTranscriptionFrame, direction: FrameDirection - ): - text_segment = frame.text - if self._vad_user_speaking: - self._user_speaking_buffer = " " + text_segment - is_backchannel = self.is_backchannel(self._user_speaking_buffer) - # num_words = len(self._user_speaking_buffer.strip().split()) - if isinstance(frame, TranscriptionFrame): - logger.debug(f"Completed user turn detected: `{self._user_speaking_buffer}`") - if is_backchannel: - logger.debug(f"Backchannel detected: `{self._user_speaking_buffer}`") - self._user_speaking_buffer = "" - self._have_sent_user_started_speaking = False - return - - logger.debug(f"Completed user turn: `{self._user_speaking_buffer}`") - completed_text = self._user_speaking_buffer - await self._handle_completed_text(completed_text, direction) - await self._handle_user_interruption(UserStoppedSpeakingFrame()) - self._have_sent_user_started_speaking = False - self._user_speaking_buffer = "" - - elif isinstance(frame, InterimTranscriptionFrame): - logger.debug(f"InterimTranscription Detected: `{self._user_speaking_buffer}`") - else: - logger.debug(f"User is not speaking, ignoring text segment: `{text_segment}`") From 5cc5e2668335723091c44138c4267a1d04314938 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 2 Sep 2025 15:15:24 -0400 Subject: [PATCH 35/47] refactor Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 23 ++-- .../server/backchannel_phrases.yaml | 79 ++++++++++++ .../server/bot_websocket_server.py | 5 +- .../voice_agent/server/server_config.yaml | 1 + .../pipecat/services/nemo/legacy_asr.py | 1 + .../pipecat/services/nemo/legacy_diar.py | 2 +- .../pipecat/services/nemo/turn_taking.py | 113 ++++-------------- .../pipecat/services/nemo/utils.py | 1 + 8 files changed, 127 insertions(+), 98 deletions(-) create mode 100644 examples/voice_agent/server/backchannel_phrases.yaml diff --git a/examples/voice_agent/README.md 
b/examples/voice_agent/README.md index 156bdc9c64ae..33e272448984 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -15,11 +15,11 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim ## 💡 Upcoming Next -- More accurate and noise-robust streaming ASR and diarization models. +- More accurate and noise-robust streaming ASR models. - Faster EOU detection and backchannel handling (e.g., bot will not stop speaking when user is saying something like "uhuh", "wow", "i see"). -- Better streaming ASR and diarization pipeline. +- Better streaming ASR and speaker diarization pipeline. - Better TTS model with more natural voice. -- Joint ASR and diarization model. +- Joint ASR and speaker diarization model. - Function calling, RAG, etc. @@ -61,7 +61,7 @@ Alternatively, you can install the dependencies manually in an existing environm ```bash pip install -r requirements.txt ``` -The incompatability errors from pip can be ignored, if any. +The incompatibility errors from pip can be ignored. ### Configure the server @@ -119,13 +119,13 @@ Please refer to the HuggingFace webpage of each model to configure the model par ### 🎤 ASR -We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech. While new models are to be released, we use the existing English models for now: +We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech into text. 
While new models will be released soon, we use the existing English models for now: - [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms) (default) - [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) -### 💬 Diarization +### 💬 Speaker Diarization -We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. Currently supported models are: +Speaker diarization aims to distinguish different speakers in the input speech audio. We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. Currently supported models are: - [nvidia/diar_streaming_sortformer_4spk-v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2) (default) @@ -136,9 +136,16 @@ Please note that in some circumstances, the diarization model might not work wel We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to generate the speech for the LLM response, and it only supports English output. More TTS models will be supported in the future. +### Turn-taking + +As the new turn-taking prediction model is not yet released, we use the VAD-based turn-taking prediction for now. You can set the `vad.stop_secs` to the desired value in `server/server_config.yaml` to control the amount of silence needed to indicate the end of a user's turn. 
+ +Additionally, the voice agent supports ignoring back-channel phrases while the bot is talking, which means that phrases such as "uh-huh", "yeah", "okay" will not interrupt the bot while it's talking. To control the backchannel phrases to be used, you can set the `turn_taking.backchannel_phrases` to the desired list of phrases or a file path to a yaml file containing the list of phrases in `server/server_config.yaml`. Setting it to `null` will disable detecting the backchannel phrases, and the VAD will interrupt the bot immediately when the user starts speaking. + + ## 📝 Notes & FAQ - Only one connection to the server is supported at a time, a new connection will disconnect the previous one, but the context will be preserved. -- If directly loading from HuggingFace and got I/O erros, you can set `llm.model=`, where the model is downloaded via somehing like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. Same for TTS models. +- If you get I/O errors when directly loading from HuggingFace, you can set `llm.model=`, where the model is downloaded using a command like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. Same for TTS models. - The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. - If you see errors like `SyntaxError: Unexpected reserved word` when running `npm run dev`, please update the Node.js version.
diff --git a/examples/voice_agent/server/backchannel_phrases.yaml b/examples/voice_agent/server/backchannel_phrases.yaml new file mode 100644 index 000000000000..38c7523a7153 --- /dev/null +++ b/examples/voice_agent/server/backchannel_phrases.yaml @@ -0,0 +1,79 @@ +- "absolutely" +- "ah" +- "all right" +- "alright" +- "but yeah" +- "cool" +- "definitely" +- "exactly" +- "go ahead" +- "good" +- "great" +- "great thanks" +- "ha ha" +- "hi" +- "hmm" +- "humm" +- "huh" +- "i know" +- "i know right" +- "i see" +- "indeed" +- "interesting" +- "mhmm" +- "mhmm mhmm" +- "mhmm right" +- "mhmm yeah" +- "mhmm yes" +- "mm hmm" +- "mmhmm" +- "nice" +- "of course" +- "oh" +- "oh dear" +- "oh man" +- "oh okay" +- "oh wow" +- "oh yes" +- "ok" +- "ok thanks" +- "okay" +- "okay okay" +- "okay thanks" +- "perfect" +- "really" +- "right" +- "right exactly" +- "right right" +- "right yeah" +- "so yeah" +- "sounds good" +- "sure" +- "sure thing" +- "thank you" +- "thanks" +- "that's awesome" +- "thats right" +- "thats true" +- "true" +- "uh huh" +- "uh-huh" +- "uh-huh yeah" +- "uhhuh" +- "uhhuh okay" +- "um-humm" +- "well" +- "what" +- "wow" +- "yeah" +- "yeah i know" +- "yeah i see" +- "yeah mhmm" +- "yeah okay" +- "yeah right" +- "yeah uh-huh" +- "yeah yeah" +- "yep" +- "yes" +- "yes please" +- "yes yes" diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index 4a28e3a62a15..ba1a74daca37 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -107,6 +107,7 @@ ### Turn taking +TURN_TAKING_BACKCHANNEL_PHRASES = server_config.turn_taking.backchannel_phrases TURN_TAKING_MAX_BUFFER_SIZE = server_config.turn_taking.max_buffer_size TURN_TAKING_BOT_STOP_DELAY = server_config.turn_taking.bot_stop_delay @@ -183,7 +184,8 @@ async def run_bot_websocket_server(): vad_analyzer=vad_analyzer, session_timeout=None, # Disable session timeout 
audio_in_sample_rate=SAMPLE_RATE, - can_create_user_frames=False, + can_create_user_frames=TURN_TAKING_BACKCHANNEL_PHRASES + is None, # if backchannel phrases are disabled, we can use VAD to interrupt the bot immediately audio_out_10ms_chunks=TRANSPORT_AUDIO_OUT_10MS_CHUNKS, ), host="0.0.0.0", # Bind to all interfaces @@ -222,6 +224,7 @@ async def run_bot_websocket_server(): use_diar=USE_DIAR, max_buffer_size=TURN_TAKING_MAX_BUFFER_SIZE, bot_stop_delay=TURN_TAKING_BOT_STOP_DELAY, + backchannel_phrases=TURN_TAKING_BACKCHANNEL_PHRASES, ) logger.info("Turn taking service initialized") diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 9cbb36ec77b4..7e6a622bd7a5 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -29,6 +29,7 @@ diar: frame_len_in_secs: 0.08 # default for FastConformer, do not change turn_taking: + backchannel_phrases: "./server/backchannel_phrases.yaml" # set it to the actual path of the file, or specify a list of backchannel phrases here max_buffer_size: 2 # num of words more than this amount will interrupt the LLM immediately bot_stop_delay: 0.5 # in seconds, a delay between server and client audio output diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py index 118a9567b688..ecb2632254db 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# NOTE: This file will be deprecated in the future, as the new inference pipeline will replace it. 
import math from typing import List diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py index c345c700d073..8ab1c5149918 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +# NOTE: This file will be deprecated in the future, as the new inference pipeline will replace it. from dataclasses import dataclass from typing import Optional, Tuple diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py index f658dda0c190..be012fdf8eb3 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py @@ -13,8 +13,10 @@ # limitations under the License. 
import time -from typing import List +from pathlib import Path +from typing import List, Optional, Union +import yaml from loguru import logger from pipecat.frames.frames import ( BotStartedSpeakingFrame, @@ -35,102 +37,17 @@ from nemo.agents.voice_agent.pipecat.frames.frames import DiarResultFrame -DEFAULT_BACKCHANNEL_PHRASES = [ - "cool", - "huh", - "okay okay", - "mhmm", - "mmhmm", - 'uhhuh', - 'uhhuh okay', - 'sure thing', - 'uh huh', - 'mm hmm', - 'hmm', - 'humm', - 'absolutely', - 'ah', - 'all right', - 'alright', - 'but yeah', - 'definitely', - 'exactly', - 'go ahead', - 'good', - 'great', - 'great thanks', - 'ha ha', - 'hi', - 'i know', - 'i know right', - 'i see', - 'indeed', - 'interesting', - 'mhmm', - 'mhmm mhmm', - 'mhmm right', - 'mhmm yeah', - 'mhmm yes', - 'nice', - 'of course', - 'oh', - 'oh dear', - 'oh man', - 'oh okay', - 'oh wow', - 'oh yes', - 'ok', - 'ok thanks', - 'okay', - 'okay okay', - 'okay thanks', - 'perfect', - 'really', - 'right', - 'right exactly', - 'right right', - 'right yeah', - 'so yeah', - 'sounds good', - 'sure', - 'thank you', - 'thanks', - "that's awesome", - 'thats right', - 'thats true', - 'true', - 'uh-huh', - 'uh-huh yeah', - 'uhhuh', - 'um-humm', - 'well', - 'what', - 'wow', - 'yeah', - 'yeah i know', - 'yeah i see', - 'yeah mhmm', - 'yeah okay', - 'yeah right', - 'yeah uh-huh', - 'yeah yeah', - 'yep', - 'yes', - 'yes please', - 'yes yes', -] - class NeMoTurnTakingService(FrameProcessor): def __init__( self, + backchannel_phrases: Union[str, List[str]] = None, eou_string: str = "", eob_string: str = "", language: Language = Language.EN_US, use_vad: bool = True, use_diar: bool = False, max_buffer_size: int = 3, - backchannel_phrases: List[str] = DEFAULT_BACKCHANNEL_PHRASES, bot_stop_delay: float = 0.5, **kwargs, ): @@ -141,7 +58,8 @@ def __init__( self.use_vad = use_vad self.use_diar = use_diar self.max_buffer_size = max_buffer_size - self.backchannel_phrases = backchannel_phrases + + self.backchannel_phrases = 
self._load_backchannel_phrases(backchannel_phrases) self.backchannel_phrases_nopc = set([self.clean_text(phrase) for phrase in self.backchannel_phrases]) self.bot_stop_delay = bot_stop_delay # internal data @@ -156,6 +74,25 @@ def __init__( # if vad is not used, we assume the user is always speaking self._vad_user_speaking = True + def _load_backchannel_phrases(self, backchannel_phrases: Optional[Union[str, List[str]]] = None): + if not backchannel_phrases: + return [] + + if isinstance(backchannel_phrases, str) and Path(backchannel_phrases).is_file(): + logger.info(f"Loading backchannel phrases from file: {backchannel_phrases}") + if not Path(backchannel_phrases).exists(): + raise FileNotFoundError(f"Backchannel phrases file not found: {backchannel_phrases}") + with open(backchannel_phrases, "r") as f: + backchannel_phrases = yaml.safe_load(f) + if not isinstance(backchannel_phrases, list): + raise ValueError(f"Backchannel phrases must be a list, got {type(backchannel_phrases)}") + logger.info(f"Loaded {len(backchannel_phrases)} backchannel phrases from file: {backchannel_phrases}") + elif isinstance(backchannel_phrases, list): + logger.info(f"Using backchannel phrases from list: {backchannel_phrases}") + else: + raise ValueError(f"Invalid backchannel phrases: {backchannel_phrases}") + return backchannel_phrases + def clean_text(self, text: str) -> str: """ Clean the text so that it can be used for backchannel detection. diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/utils.py b/nemo/agents/voice_agent/pipecat/services/nemo/utils.py index e125d9efd577..421bf9823b5a 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/utils.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# NOTE: This file will be deprecated in the future, as the new inference pipeline will replace it. import math From b22a1509715e49f025b9c93326401d7d4463bcb6 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 3 Sep 2025 21:27:00 -0400 Subject: [PATCH 36/47] update doc Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 7 ++++++- examples/voice_agent/client/vite.config.js | 2 +- examples/voice_agent/server/server.py | 2 ++ examples/voice_agent/server/server_config.yaml | 6 +++--- nemo/agents/voice_agent/pipecat/services/nemo/diar.py | 4 +++- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 33e272448984..f530cc2ef0de 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -98,12 +98,17 @@ npm install npm run dev ``` +There should be a message in terminal showing the address and port of the client. + ### Connect to the client via browser -Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/`. You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. +Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/` (or whatever address and port is shown in the terminal where the client was launched). + +You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. **If using chrome browser, you need to add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`.** +If you want to use a different port for client connection, you can modify `client/vite.config.js` to change the `port` variable. 
## 📑 Supported Models diff --git a/examples/voice_agent/client/vite.config.js b/examples/voice_agent/client/vite.config.js index 936725c03697..16c0f9648ff8 100644 --- a/examples/voice_agent/client/vite.config.js +++ b/examples/voice_agent/client/vite.config.js @@ -9,7 +9,7 @@ export default defineConfig({ proxy: { // Proxy /api requests to the backend server '/connect': { - target: 'http://0.0.0.0:7860', // Replace with your backend URL + target: 'http://0.0.0.0:7860', // Replace with your backend URL if needed changeOrigin: true, }, }, diff --git a/examples/voice_agent/server/server.py b/examples/voice_agent/server/server.py index d3e4df98125f..df6aab6af651 100644 --- a/examples/voice_agent/server/server.py +++ b/examples/voice_agent/server/server.py @@ -72,6 +72,8 @@ async def main(): try: if server_mode == "websocket_server": tasks.append(run_bot_websocket_server()) + else: + raise ValueError(f"Invalid server mode: {server_mode}") config = uvicorn.Config(app, host="0.0.0.0", port=7860) server = uvicorn.Server(config) tasks.append(server.serve()) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 7e6a622bd7a5..1bd0f082b5cc 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -18,15 +18,15 @@ stt: model: "stt_en_fastconformer_hybrid_large_streaming_80ms" device: "cuda" att_context_size: [70, 1] - frame_len_in_secs: 0.08 # default for FastConformer, do not change + frame_len_in_secs: 0.08 # default for FastConformer, do not change unless using other architechtures diar: type: nemo enabled: true # set to false to disable model: "nvidia/diar_streaming_sortformer_4spk-v2" device: "cuda" - threshold: 0.4 - frame_len_in_secs: 0.08 # default for FastConformer, do not change + threshold: 0.4 # threshold value used to determine if a speaker exists or not, setting it to a lower value will increaset the sensitivity of the model + frame_len_in_secs: 
0.08 # default for Sortformer, do not change unless using other architechtures turn_taking: backchannel_phrases: "./server/backchannel_phrases.yaml" # set it to the actual path of the file, or specify a list of backchannel phrases here diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/diar.py index 83c717fdb998..912179fd93e0 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/diar.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/diar.py @@ -39,7 +39,9 @@ class NeMoDiarInputParams(BaseModel): - threshold: Optional[float] = 0.5 + threshold: Optional[float] = ( + 0.4 # threshold value used to determine if a speaker exists or not, setting it to a lower value will increase the sensitivity of the diarization model + ) language: Optional[Language] = Language.EN_US frame_len_in_secs: Optional[float] = 0.08 # 80ms for FastConformer model config_path: Optional[str] = None # path to the Niva ASR config file From 98006c05f53e2d203008f82cb96430064c073b2f Mon Sep 17 00:00:00 2001 From: Weiqing Wang Date: Thu, 4 Sep 2025 09:49:45 -0700 Subject: [PATCH 37/47] remove the unnecessary streaming state conversion and import it from sortformer_modules, remove PostProcessingParams Signed-off-by: Weiqing Wang --- .../pipecat/services/nemo/legacy_diar.py | 112 ++---------------- 1 file changed, 8 insertions(+), 104 deletions(-) diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py index 8ab1c5149918..5d9041463d3d 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -22,23 +22,7 @@ from nemo.agents.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer from nemo.collections.asr.models import SortformerEncLabelModel - -@dataclass -class PostProcessingParams: - """ - Postprocessing parameters for end-to-end speaker diarization 
models. - These parameters can significantly affect DER performance depending on the evaluation style and the dataset. - It is recommended to tune these parameters based on the evaluation style and the dataset - to achieve the desired DER performance. - """ - - onset: float = 0.5 # Onset threshold for detecting the beginning and end of a speech - offset: float = 0.5 # Offset threshold for detecting the end of a speech - pad_onset: float = 0.0 # Adding durations before each speech segment - pad_offset: float = 0.0 # Adding durations after each speech segment - min_duration_on: float = 0.0 # Threshold for short speech segment deletion - min_duration_off: float = 0.0 # Threshold for small non-speech deletion - +from nemo.collections.asr.modules.sortformer_modules import StreamingSortformerState @dataclass class DiarizationConfig: @@ -57,48 +41,6 @@ class DiarizationConfig: chunk_right_context: int = 7 -@dataclass -class SortformerStreamingState: - """ - A dataclass that holds the streaming state for the Sortformer diarization model. - This is based on the streaming state in SortformerEncLabelModel in NeMo. - """ - - spkcache: Optional[Tensor] = None - spkcache_lengths: Optional[Tensor] = None - spkcache_preds: Optional[Tensor] = None - fifo: Optional[Tensor] = None - fifo_lengths: Optional[Tensor] = None - fifo_preds: Optional[Tensor] = None - spk_perm: Optional[Tensor] = None - - def to(self, device): - """ - Move all tensors to the specified device. - - Args: - device: The device to move the tensors to. - - Returns: - SortformerStreamingState: The state with tensors moved to the specified device. 
- """ - if self.spkcache is not None: - self.spkcache = self.spkcache.to(device) - if self.spkcache_lengths is not None: - self.spkcache_lengths = self.spkcache_lengths.to(device) - if self.spkcache_preds is not None: - self.spkcache_preds = self.spkcache_preds.to(device) - if self.fifo is not None: - self.fifo = self.fifo.to(device) - if self.fifo_lengths is not None: - self.fifo_lengths = self.fifo_lengths.to(device) - if self.fifo_preds is not None: - self.fifo_preds = self.fifo_preds.to(device) - if self.spk_perm is not None: - self.spk_perm = self.spk_perm.to(device) - return self - - class NeMoLegacyDiarService: def __init__( self, @@ -191,7 +133,7 @@ def reset_state(self, stream_id: str = "default"): self.streaming_state = self.init_streaming_state(batch_size=1) self.total_preds = torch.zeros((1, 0, self.max_num_speakers), device=self.diarizer.device) - def init_streaming_state(self, batch_size: int = 1) -> SortformerStreamingState: + def init_streaming_state(self, batch_size: int = 1) -> StreamingSortformerState: """ Initialize the streaming state for the diarization model. 
@@ -206,28 +148,17 @@ def init_streaming_state(self, batch_size: int = 1) -> SortformerStreamingState: batch_size=batch_size, async_streaming=self.diarizer.async_streaming, device=self.device ) - # Convert SortformerStreamingState format - state = SortformerStreamingState( - spkcache=nemo_state.spkcache, - spkcache_lengths=nemo_state.spkcache_lengths, - spkcache_preds=nemo_state.spkcache_preds, - fifo=nemo_state.fifo, - fifo_lengths=nemo_state.fifo_lengths, - fifo_preds=nemo_state.fifo_preds, - spk_perm=nemo_state.spk_perm, - ) - - return state + return nemo_state def stream_step( self, processed_signal: Tensor, processed_signal_length: Tensor, - streaming_state: SortformerStreamingState, + streaming_state: StreamingSortformerState, total_preds: Tensor, left_offset: int = 0, right_offset: int = 0, - ) -> Tuple[SortformerStreamingState, Tensor]: + ) -> Tuple[StreamingSortformerState, Tensor]: """ Execute a single streaming step for diarization. @@ -249,25 +180,9 @@ def stream_step( if processed_signal_length.device != self.device: processed_signal_length = processed_signal_length.to(self.device) - # Make sure state is on the correct device - streaming_state = streaming_state.to(self.device) - if total_preds is not None and total_preds.device != self.device: total_preds = total_preds.to(self.device) - # Convert SortformerStreamingState to NeMo's format - class NemoStreamingState: - def __init__(self, state): - self.spkcache = state.spkcache - self.spkcache_lengths = state.spkcache_lengths - self.spkcache_preds = state.spkcache_preds - self.fifo = state.fifo - self.fifo_lengths = state.fifo_lengths - self.fifo_preds = state.fifo_preds - self.spk_perm = state.spk_perm - - nemo_streaming_state = NemoStreamingState(streaming_state) - with ( torch.amp.autocast(device_type=self.device, dtype=self.compute_dtype, enabled=self.use_amp), torch.inference_mode(), @@ -275,10 +190,10 @@ def __init__(self, state): ): try: # Call the model's forward_streaming_step method - 
nemo_streaming_state, diar_pred_out_stream = self.diarizer.forward_streaming_step( + streaming_state, diar_pred_out_stream = self.diarizer.forward_streaming_step( processed_signal=processed_signal, processed_signal_length=processed_signal_length, - streaming_state=nemo_streaming_state, + streaming_state=streaming_state, total_preds=total_preds, left_offset=left_offset, right_offset=right_offset, @@ -292,15 +207,4 @@ def __init__(self, state): # Return the existing state and preds if there's an error return streaming_state, total_preds - # Convert back to SortformerStreamingState format - new_streaming_state = SortformerStreamingState( - spkcache=nemo_streaming_state.spkcache, - spkcache_lengths=nemo_streaming_state.spkcache_lengths, - spkcache_preds=nemo_streaming_state.spkcache_preds, - fifo=nemo_streaming_state.fifo, - fifo_lengths=nemo_streaming_state.fifo_lengths, - fifo_preds=nemo_streaming_state.fifo_preds, - spk_perm=nemo_streaming_state.spk_perm, - ) - - return new_streaming_state, diar_pred_out_stream + return streaming_state, diar_pred_out_stream From e5665b075c5a60c5fbdb18d18d7de698564db235 Mon Sep 17 00:00:00 2001 From: weiqingw4ng Date: Thu, 4 Sep 2025 16:50:42 +0000 Subject: [PATCH 38/47] Apply isort and black reformatting Signed-off-by: weiqingw4ng --- nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py index 5d9041463d3d..965941d671fe 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -24,6 +24,7 @@ from nemo.collections.asr.modules.sortformer_modules import StreamingSortformerState + @dataclass class DiarizationConfig: """Diarization configuration parameters for inference.""" From e23935dcddbdc1c7bd60de48ea926d1601947793 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 4 
Sep 2025 12:53:22 -0400 Subject: [PATCH 39/47] update doc Signed-off-by: stevehuang52 --- .../voice_agent/example_prompts/simple_chatbot_diar.txt | 3 ++- examples/voice_agent/server/server_config.yaml | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/voice_agent/example_prompts/simple_chatbot_diar.txt b/examples/voice_agent/example_prompts/simple_chatbot_diar.txt index abe8db2f3ee3..2c6baa58e7a0 100644 --- a/examples/voice_agent/example_prompts/simple_chatbot_diar.txt +++ b/examples/voice_agent/example_prompts/simple_chatbot_diar.txt @@ -3,4 +3,5 @@ Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. -Do not include the speaker tags in your response, use them only to identify the speaker. \ No newline at end of file +Do not include the speaker tags in your response, use them only to identify the speaker. +If a speaker provides their name, use their name when addressing their requests. 
\ No newline at end of file diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 1bd0f082b5cc..dc72f77ab192 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -9,15 +9,15 @@ transport: vad: type: silero confidence: 0.6 # VAD threshold for detecting speech versus non-speech - start_secs: 0.1 # min amout of speech to trigger UserStartSpeaking - stop_secs: 0.8 # min about of silence to trigger UserStopSpeaking + start_secs: 0.1 # min amount of speech to trigger UserStartSpeaking + stop_secs: 0.8 # min amount of silence to trigger UserStopSpeaking min_volume: 0.4 # Microphone volumn threshold for VAD stt: type: nemo model: "stt_en_fastconformer_hybrid_large_streaming_80ms" device: "cuda" - att_context_size: [70, 1] + att_context_size: [70, 1] # left and right attention context sizes for streaming ASR frame_len_in_secs: 0.08 # default for FastConformer, do not change unless using other architechtures diar: From f752abc9782d7ae1614f14b6b25c7c73bf9d9aa5 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 4 Sep 2025 13:05:46 -0400 Subject: [PATCH 40/47] clean up Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 10 +++++++--- .../voice_agent/pipecat/services/nemo/__init__.py | 1 - .../voice_agent/pipecat/services/nemo/legacy_diar.py | 3 ++- nemo/agents/voice_agent/pipecat/services/nemo/stt.py | 1 - 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index f530cc2ef0de..b2fee90274cb 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -10,13 +10,13 @@ A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the sim - Allow users to talk to most LLMs from HuggingFace with configurable prompts. - Streaming speech recognition with low latency. - FastPitch-HiFiGAN TTS for fast audio response generation. 
-- Speaker diarization up to 4 speakers across different turns. +- Speaker diarization up to 4 speakers across different user turns. - WebSocket server for easy deployment. ## 💡 Upcoming Next - More accurate and noise-robust streaming ASR models. -- Faster EOU detection and backchannel handling (e.g., bot will not stop speaking when user is saying something like "uhuh", "wow", "i see"). +- Faster EOU detection and handling backchannel phrases. - Better streaming ASR and speaker diarization pipeline. - Better TTS model with more natural voice. - Joint ASR and speaker diarization model. @@ -130,7 +130,11 @@ We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) t ### 💬 Speaker Diarization -Speaker diarization aims to distinguish different speakers in the input speech audio. We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. As of now, we only support detecting 1 speaker for a single user turn, but different turns can be from different speakers, with a maximum of 4 speakers in the whole conversation. Currently supported models are: +Speaker diarization aims to distinguish different speakers in the input speech audio. We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. + +As of now, we only support detecting 1 speaker per user turn, but different turns can come from different speakers, with a maximum of 4 speakers in the whole conversation.
+ +Currently supported models are: - [nvidia/diar_streaming_sortformer_4spk-v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2) (default) diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py b/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py index 2830b8a94443..1b96c38c91ce 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from .diar import NemoDiarService from .llm import HuggingFaceLLMService from .stt import NemoSTTService diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py index 965941d671fe..2e08688e0e42 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # NOTE: This file will be deprecated in the future, as the new inference pipeline will replace it. + from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Tuple import numpy as np import torch diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/stt.py b/nemo/agents/voice_agent/pipecat/services/nemo/stt.py index 916f5e2fcd75..63ef595d2b00 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/stt.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/stt.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import asyncio from typing import AsyncGenerator, List, Optional From 1ffddb2a5d7a1f018a9015040a4a9ab96897108c Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 4 Sep 2025 14:29:20 -0400 Subject: [PATCH 41/47] fix for llama-nemotron template, and refactor Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 5 +++- .../server/bot_websocket_server.py | 18 ++++++------- .../voice_agent/server/server_config.yaml | 15 ++++++----- .../voice_agent/pipecat/services/nemo/llm.py | 26 ++++++++++++++++--- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index b2fee90274cb..fe79f89f5503 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -66,7 +66,7 @@ The incompatibility errors from pip can be ignored. ### Configure the server Edit the `server/server_config.yaml` file to configure the server, for example: -- Changing the LLM and prompt you want to use, by either putting a local path to a text file or the whole prompt string. See `example_prompts/` for examples to start with. +- Changing the LLM and system prompt you want to use in `llm.model` and `llm.system_prompt`, by either putting a local path to a text file or the whole prompt string. See `example_prompts/` for examples to start with. - Configure the LLM parameters, such as temperature, max tokens, etc. - Distribute different components to different GPUs if you have more than one. - Adjust VAD parameters for sensitivity and end-of-turn detection timeout. @@ -122,6 +122,9 @@ Most LLMs from HuggingFace are supported. A few examples are: Please refer to the HuggingFace webpage of each model to configure the model parameters `llm.generation_kwargs` and `llm.apply_chat_template_kwargs` in `server/server_config.yaml` as needed. 
+You can change the `llm.system_prompt` in `server/server_config.yaml` to configure the behavior of the LLM, by either putting a local path to a text file or the whole prompt string. See `example_prompts/` for examples to start with. + + ### 🎤 ASR We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech into text. While new models will be released soon, we use the existing English models for now: diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py index ba1a74daca37..cd8fa829dc60 100644 --- a/examples/voice_agent/server/bot_websocket_server.py +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -66,7 +66,7 @@ # Default Configuration SAMPLE_RATE = 16000 # Standard sample rate for speech recognition RAW_AUDIO_FRAME_LEN_IN_SECS = 0.016 # 16ms for websocket transport -BOT_PROMPT = """ +SYSTEM_PROMPT = """ You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. 
@@ -114,13 +114,13 @@ ### LLM SYSTEM_ROLE = server_config.llm.get("system_role", "system") -if server_config.get("bot_prompt", None) is not None: - bot_prompt = server_config.bot_prompt - if os.path.isfile(bot_prompt): - with open(bot_prompt, "r") as f: - bot_prompt = f.read() - BOT_PROMPT = bot_prompt -logger.info(f"BOT_PROMPT: {BOT_PROMPT}") +if server_config.llm.get("system_prompt", None) is not None: + system_prompt = server_config.llm.system_prompt + if os.path.isfile(system_prompt): + with open(system_prompt, "r") as f: + system_prompt = f.read() + SYSTEM_PROMPT = system_prompt +logger.info(f"System prompt: {SYSTEM_PROMPT}") LLM_MODEL = server_config.llm.model LLM_DEVICE = server_config.llm.device @@ -255,7 +255,7 @@ async def run_bot_websocket_server(): [ { "role": SYSTEM_ROLE, - "content": BOT_PROMPT, + "content": SYSTEM_PROMPT, } ], ) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index dc72f77ab192..dc73fd426912 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -1,7 +1,5 @@ - -# bot_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast_bite.txt` -bot_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker." - +# This is an example config for setting up a NeMo Voice Agent server. 
+# Please refer to https://github.com/NVIDIA-NeMo/NeMo/tree/main/examples/voice_agent/README.md for more details transport: audio_out_10ms_chunks: 8 # use 4 as websocket default, but increasing to a larger number might have less glitches in TTS output @@ -30,8 +28,8 @@ diar: turn_taking: backchannel_phrases: "./server/backchannel_phrases.yaml" # set it to the actual path of the file, or specify a list of backchannel phrases here - max_buffer_size: 2 # num of words more than this amount will interrupt the LLM immediately - bot_stop_delay: 0.5 # in seconds, a delay between server and client audio output + max_buffer_size: 2 # num of words more than this amount will interrupt the LLM immediately if they are not backchannel phrases + bot_stop_delay: 0.5 # a delay in seconds allowed between server and client audio output, so that the BotStopSpeaking signal is handled not too far away from the actual time that the user hears all audio output llm: type: hf @@ -39,6 +37,9 @@ llm: model: "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" # model name for HF models, will be used via `AutoModelForCausalLM.from_pretrained()` device: "cuda" system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt + # `system_prompt` is used as the system prompt to the LLM, please refer to the webpage of each LLM for special functions like enabling/disabling thinking + # system_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast_bite.txt` + system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker.
If a speaker provides their name, use their name when addressing their requests." apply_chat_template_kwargs: null # please refer to the model page of each HF LLM model to set them correctly, by default `tokenize=False` and `add_generation_prompt=True` are applied generation_kwargs: # kwargs that will be passed into model.generate() function of HF models temperature: 0.7 # LLM sampling params @@ -52,5 +53,5 @@ tts: fastpitch_model: "nvidia/tts_en_fastpitch" hifigan_model: "nvidia/tts_hifigan" device: "cuda" - extra_separator: null # additional punctuations to chunk LLM response into segments for faster TTS output, e.g., "," + extra_separator: ",?!" # a string of additional punctuations to chunk LLM response into segments for faster TTS output, e.g., ",?!". Set to `null` to use default behavior think_tokens: ["", ""] # specify them to avoid TTS for thinking process, set to `null` to allow thinking out loud diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/llm.py b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py index 8feb62cadc0e..9bf699efdd2f 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/llm.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py @@ -17,6 +17,7 @@ from threading import Thread from typing import AsyncGenerator, List +from jinja2.exceptions import TemplateError from loguru import logger from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam from pipecat.frames.frames import LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMTextFrame @@ -66,13 +67,32 @@ def __init__( print(f"LLM apply_chat_template kwargs: {self.apply_chat_template_kwargs}") + def _maybe_fix_messages(self, messages: List[ChatCompletionMessageParam]) -> List[ChatCompletionMessageParam]: + """ + Some LLMs like "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" requires a user turn after the system prompt, this function is used to add a dummy user turn if the system prompt is followed by an assistant turn. 
+ """ + if messages[0]["role"] == "system" and messages[1]["role"] == "assistant": + message = {"role": "user", "content": "Hi"} + messages.insert(1, message) + return messages + async def generate_stream( self, messages: List[ChatCompletionMessageParam], **kwargs ) -> AsyncGenerator[ChatCompletionChunk, None]: + # Convert messages to prompt format - prompt = self.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs - ) + try: + prompt = self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs + ) + except TemplateError as e: + messages = self._maybe_fix_messages(messages) + prompt = self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs + ) + logger.warning( + f"Got TemplateError: {e}. Tried to fix by adding a dummy user message. New messages: {messages}" + ) logger.debug(f"LLM prompt: {prompt}") From 500b3967393de97493a0ac0361813f754949679c Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 4 Sep 2025 14:57:57 -0400 Subject: [PATCH 42/47] fix tts separator Signed-off-by: stevehuang52 --- examples/voice_agent/server/server_config.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index dc73fd426912..42e51757689e 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -53,5 +53,8 @@ tts: fastpitch_model: "nvidia/tts_en_fastpitch" hifigan_model: "nvidia/tts_hifigan" device: "cuda" - extra_separator: ",?!" # a string of additional punctuations to chunk LLM response into segments for faster TTS output, e.g., ",?!". Set to `null` to use default behavior + extra_separator: # a list of additional punctuations to chunk LLM response into segments for faster TTS output, e.g., ",". 
Set to `null` to use default behavior + - "," + - "?" + - "!" think_tokens: ["", ""] # specify them to avoid TTS for thinking process, set to `null` to allow thinking out loud From a59733c3e1ff18dadd72f230ca2a2ab7d2ecf874 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 5 Sep 2025 11:19:47 -0400 Subject: [PATCH 43/47] fix for llama-nemotron Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 1 + .../voice_agent/pipecat/services/nemo/llm.py | 90 ++++++++++++++++--- 2 files changed, 78 insertions(+), 13 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index fe79f89f5503..0f6499a6f67f 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -162,6 +162,7 @@ Additionally, the voice agent support ignoring back-channel phrases while the bo - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. - If you see errors like `SyntaxError: Unexpected reserved word` when running `npm run dev`, please update the Node.js version. - If you see the error `Error connecting: Cannot read properties of undefined (reading 'enumerateDevices')`, it usually means the browser is not allowed to access the microphone. Please check the browser settings and add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list, e.g., via `chrome://flags/#unsafely-treat-insecure-origin-as-secure` for chrome browser. +- If you see something like `node:internal/errors:496` when running `npm run dev`, remove the `client/node_modules` folder and run `npm install` again, then run `npm run dev` again. 
diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/llm.py b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py index 9bf699efdd2f..fdc9084e9431 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/llm.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py @@ -46,7 +46,7 @@ def __init__( self.dtype = dtype self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForCausalLM.from_pretrained( - model, device_map=device, torch_dtype=dtype + model, device_map=device, torch_dtype=dtype, trust_remote_code=True ) # type: AutoModelForCausalLM self.generation_kwargs = generation_kwargs if generation_kwargs else DEFAULT_GENERATION_KWARGS @@ -67,32 +67,92 @@ def __init__( print(f"LLM apply_chat_template kwargs: {self.apply_chat_template_kwargs}") - def _maybe_fix_messages(self, messages: List[ChatCompletionMessageParam]) -> List[ChatCompletionMessageParam]: + def _maybe_add_user_message(self, messages: List[ChatCompletionMessageParam]) -> List[ChatCompletionMessageParam]: """ Some LLMs like "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" requires a user turn after the system prompt, this function is used to add a dummy user turn if the system prompt is followed by an assistant turn. """ - if messages[0]["role"] == "system" and messages[1]["role"] == "assistant": + if len(messages) > 1 and messages[0]["role"] == "system" and messages[1]["role"] == "assistant": message = {"role": "user", "content": "Hi"} messages.insert(1, message) return messages - async def generate_stream( - self, messages: List[ChatCompletionMessageParam], **kwargs - ) -> AsyncGenerator[ChatCompletionChunk, None]: + def _maybe_merge_consecutive_turns( + self, messages: List[ChatCompletionMessageParam] + ) -> List[ChatCompletionMessageParam]: + """ + Merge consecutive turns of the same role into a single turn, since some LLMs like "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" do not support consecutive turns of the same role. 
+ """ + if not messages: + return messages - # Convert messages to prompt format + merged_messages = [] + current_role = None + current_content = "" + + for message in messages: + role = message["role"] + content = message["content"] + + if role == current_role: + # Merge with previous message of same role + current_content += "; " + content + else: + # Save previous message if exists + if current_role is not None: + merged_messages.append({"role": current_role, "content": current_content}) + + # Start new message + current_role = role + current_content = content + + # Add the last message + if current_role is not None: + merged_messages.append({"role": current_role, "content": current_content}) + + return merged_messages + + def _get_prompt_from_messages(self, messages: List[ChatCompletionMessageParam]): try: prompt = self.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs ) + return prompt except TemplateError as e: - messages = self._maybe_fix_messages(messages) + logger.warning(f"Got TemplateError: {e}.") + + logger.debug(f"Input LLM messages: {messages}") + if len(messages) > 1 and messages[0]["role"] == "system" and messages[1]["role"] == "assistant": + logger.warning("Trying to fix by adding dummy user message after system prompt...") + try: + messages = self._maybe_add_user_message(messages) + logger.debug(f"LLM messages after adding dummy user message: {messages}") + prompt = self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs + ) + return prompt + except TemplateError as e: + logger.warning(f"Got TemplateError: {e}. 
Trying to fix by merging consecutive turns if possible.") + + try: + new_messages = self._maybe_merge_consecutive_turns(messages) + logger.debug(f"LLM messages after merging consecutive user turns: {new_messages}") prompt = self.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs - ) - logger.warning( - f"Got TemplateError: {e}. Tried to fix by adding a dummy user message. New messages: {messages}" + new_messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs ) + # Update the messages in place if successful + messages.clear() + messages.extend(new_messages) + return prompt + except Exception as e: + logger.warning(f"Got Exception: {e}, messages: {messages}") + raise e + + async def generate_stream( + self, messages: List[ChatCompletionMessageParam], **kwargs + ) -> AsyncGenerator[ChatCompletionChunk, None]: + + # Convert messages to prompt format + prompt = self._get_prompt_from_messages(messages) logger.debug(f"LLM prompt: {prompt}") @@ -161,7 +221,7 @@ async def _process_context(self, context: OpenAILLMContext): and other information needed for the LLM interaction. 
""" await self.push_frame(LLMFullResponseStartFrame()) - + cumulative_text = "" try: await self.start_ttfb_metrics() messages = context.get_messages() @@ -169,12 +229,16 @@ async def _process_context(self, context: OpenAILLMContext): if chunk.choices[0].delta.content: await self.stop_ttfb_metrics() text = chunk.choices[0].delta.content + cumulative_text += text frame = LLMTextFrame(text) await self.push_frame(frame) except Exception as e: logger.error(f"Error in _process_context: {e}", exc_info=True) raise finally: + cumulative_text = " ".join(cumulative_text.split()).strip() + if not cumulative_text: + logger.warning(f"LLM response is empty for context: {context}") await self.push_frame(LLMFullResponseEndFrame()) async def get_chat_completions( From 90cbfc18a485f21ca27b623402bab9f88a6e97e8 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 5 Sep 2025 11:31:14 -0400 Subject: [PATCH 44/47] update cfg Signed-off-by: stevehuang52 --- examples/voice_agent/server/server_config.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 42e51757689e..4f1495ec0033 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -43,8 +43,10 @@ llm: apply_chat_template_kwargs: null # please refer to the model page of each HF LLM model to set them correctly, by default `tokenize=False` and `add_generation_prompt=True` are applied generation_kwargs: # kwargs that will be passed into model.generate() function of HF models temperature: 0.7 # LLM sampling params + top_k: 20 # LLM sampling params top_p: 0.9 # LLM sampling params - max_new_tokens: 128 # max num of output tokens from LLM + min_p: 0.0 # LLM sampling params + max_new_tokens: 256 # max num of output tokens from LLM do_sample: true # enable sampling tts: From 30a55bc8dd904ef8ea84f2e2c40119e6135bf830 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: 
Fri, 5 Sep 2025 12:33:11 -0400 Subject: [PATCH 45/47] refactor and update doc Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 5 ++- .../voice_agent/server/server_config.yaml | 6 ++- .../voice_agent/pipecat/services/nemo/llm.py | 44 +++++++++---------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 0f6499a6f67f..1dee0d6d9c25 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -1,6 +1,8 @@ # NeMo Voice Agent -A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is open-source and deployed locally so you can have your own voice agent. Feel free to explore the code and see how different speech technologies can be integrated with LLMs to create a seamless conversation experience. As of now, we only support English input and output, but more languages will be supported in the future. +A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is open-source and deployed locally so you can have your own voice agent. Feel free to explore the code and see how different speech technologies can be integrated with LLMs to create a seamless conversation experience. + +As of now, we only support English input and output, but more languages will be supported in the future. @@ -116,7 +118,6 @@ If you want to use a different port for client connection, you can modify `clien Most LLMs from HuggingFace are supported. 
A few examples are: - [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) (default) -- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) - [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) - [nvidia/Nemotron-Mini-4B-Instruct](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 4f1495ec0033..010cecc9d8d5 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -2,7 +2,7 @@ transport: - audio_out_10ms_chunks: 8 # use 4 as websocket default, but increasing to a larger number might have less glitches in TTS output + audio_out_10ms_chunks: 10 # use 4 as websocket default, but increasing to a larger number might have less glitches in TTS output vad: type: silero @@ -40,7 +40,9 @@ llm: # `system_prompt` is used as the system prompt to the LLM, please refer to the webpage of each LLM for special functions like enabling/disabling thinking # system_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast_bite.txt` system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. If a speaker provides their name, use their name when addressing their requests."
- apply_chat_template_kwargs: null # please refer to the model page of each HF LLM model to set them correctly, by default `tokenize=False` and `add_generation_prompt=True` are applied + apply_chat_template_kwargs: # please refer to the model page of each HF LLM model to set them correctly. + add_generation_prompt: true # This is required in most cases, do not change unless you're sure of it + tokenize: false # This is required, do not change generation_kwargs: # kwargs that will be passed into model.generate() function of HF models temperature: 0.7 # LLM sampling params top_k: 20 # LLM sampling params diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/llm.py b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py index fdc9084e9431..2b3f07434ca6 100644 --- a/nemo/agents/voice_agent/pipecat/services/nemo/llm.py +++ b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py @@ -50,22 +50,17 @@ def __init__( ) # type: AutoModelForCausalLM self.generation_kwargs = generation_kwargs if generation_kwargs else DEFAULT_GENERATION_KWARGS - self.apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs else {} - print(f"LLM generation kwargs: {self.generation_kwargs}") + logger.debug(f"LLM generation kwargs: {self.generation_kwargs}") + self.apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs else {} if "tokenize" in self.apply_chat_template_kwargs: - logger.warning( - f"`tokenize` is not configurable in apply_chat_template_kwargs, it will be ignored and forced to False" - ) + if self.apply_chat_template_kwargs["tokenize"] is not False: + logger.warning( + f"Found `tokenize=True` in apply_chat_template_kwargs, it will be ignored and forced to `False`" + ) self.apply_chat_template_kwargs.pop("tokenize") - if "add_generation_prompt" in self.apply_chat_template_kwargs: - logger.warning( - f"`add_generation_prompt` is not configurable in apply_chat_template_kwargs, it will be ignored and forced to True" - ) - 
self.apply_chat_template_kwargs.pop("add_generation_prompt") - - print(f"LLM apply_chat_template kwargs: {self.apply_chat_template_kwargs}") + logger.debug(f"LLM apply_chat_template kwargs: {self.apply_chat_template_kwargs}") def _maybe_add_user_message(self, messages: List[ChatCompletionMessageParam]) -> List[ChatCompletionMessageParam]: """ @@ -111,11 +106,20 @@ def _maybe_merge_consecutive_turns( return merged_messages - def _get_prompt_from_messages(self, messages: List[ChatCompletionMessageParam]): + def _apply_chat_template(self, messages: List[ChatCompletionMessageParam]) -> str: + """ + Apply the chat template to the messages. + """ + return self.tokenizer.apply_chat_template(messages, tokenize=False, **self.apply_chat_template_kwargs) + + def _get_prompt_from_messages(self, messages: List[ChatCompletionMessageParam]) -> str: + """ + Get the formatted prompt from the conversation history messages. + This function also tries to fix the messages if the LLM cannot handle consecutive turns of the same role, + or requires a user turn after the system prompt. + """ try: - prompt = self.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs - ) + prompt = self._apply_chat_template(messages) return prompt except TemplateError as e: logger.warning(f"Got TemplateError: {e}.") @@ -126,9 +130,7 @@ def _get_prompt_from_messages(self, messages: List[ChatCompletionMessageParam]): try: messages = self._maybe_add_user_message(messages) logger.debug(f"LLM messages after adding dummy user message: {messages}") - prompt = self.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs - ) + prompt = self._apply_chat_template(messages) return prompt except TemplateError as e: logger.warning(f"Got TemplateError: {e}. 
Trying to fix by merging consecutive turns if possible.") @@ -136,9 +138,7 @@ def _get_prompt_from_messages(self, messages: List[ChatCompletionMessageParam]): try: new_messages = self._maybe_merge_consecutive_turns(messages) logger.debug(f"LLM messages after merging consecutive user turns: {new_messages}") - prompt = self.tokenizer.apply_chat_template( - new_messages, tokenize=False, add_generation_prompt=True, **self.apply_chat_template_kwargs - ) + prompt = self._apply_chat_template(new_messages) # Update the messages in place if successful messages.clear() messages.extend(new_messages) From de148aea0ca58c309f3ee5ca1e766bddb78c14fa Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 5 Sep 2025 15:52:53 -0400 Subject: [PATCH 46/47] change default llm to qwen Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 7 ++++--- examples/voice_agent/server/server_config.yaml | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 1dee0d6d9c25..3f0eb5d29c0e 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -117,8 +117,9 @@ If you want to use a different port for client connection, you can modify `clien ### 🤖 LLM Most LLMs from HuggingFace are supported. 
A few examples are: -- [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) (default) -- [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) +- [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) (default) +- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) +- [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) - [nvidia/Nemotron-Mini-4B-Instruct](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct) Please refer to the HuggingFace webpage of each model to configure the model parameters `llm.generation_kwargs` and `llm.apply_chat_template_kwargs` in `server/server_config.yaml` as needed. @@ -158,7 +159,7 @@ Additionally, the voice agent support ignoring back-channel phrases while the bo ## 📝 Notes & FAQ - Only one connection to the server is supported at a time, a new connection will disconnect the previous one, but the context will be preserved. -- If directly loading from HuggingFace and got I/O erros, you can set `llm.model=`, where the model is downloaded using a command like `huggingface-cli download Qwen/Qwen3-8B --local-dir `. Same for TTS models. +- If directly loading from HuggingFace and got I/O errors, you can set `llm.model=`, where the model is downloaded using a command like `huggingface-cli download Qwen/Qwen2.5-7B-Instruct --local-dir `. Same for TTS models. - The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. - The diarization model works best with speakers that have much more different voices from each other, while it might not work well on some accents due to the limited training data. - If you see errors like `SyntaxError: Unexpected reserved word` when running `npm run dev`, please update the Node.js version.
diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 010cecc9d8d5..2c0e9870ed48 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -34,7 +34,7 @@ turn_taking: llm: type: hf dtype: bfloat16 # torch.dtype for LLM - model: "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" # model name for HF models, will be used via `AutoModelForCausalLM.from_pretrained()` + model: "Qwen/Qwen2.5-7B-Instruct" # model name for HF models, will be used via `AutoModelForCausalLM.from_pretrained()` device: "cuda" system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt # `system_prompt` is used as the sytem prompt to the LLM, please refer to differnt LLM webpage for spcial functions like enabling/disabling thinking From f3572f71f3d01d2c208ffd1f3e42f13a42021731 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 5 Sep 2025 16:06:04 -0400 Subject: [PATCH 47/47] update doc Signed-off-by: stevehuang52 --- examples/voice_agent/README.md | 1 + examples/voice_agent/server/server_config.yaml | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md index 3f0eb5d29c0e..c50e9283ab87 100644 --- a/examples/voice_agent/README.md +++ b/examples/voice_agent/README.md @@ -118,6 +118,7 @@ If you want to use a different port for client connection, you can modify `clien Most LLMs from HuggingFace are supported. 
A few examples are: - [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) (default) +- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) - [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) - [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) - [nvidia/Nemotron-Mini-4B-Instruct](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml index 2c0e9870ed48..1181ccf08380 100644 --- a/examples/voice_agent/server/server_config.yaml +++ b/examples/voice_agent/server/server_config.yaml @@ -40,7 +40,8 @@ llm: # `system_prompt` is used as the system prompt to the LLM, please refer to the webpage of each LLM for special functions like enabling/disabling thinking # system_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast_bite.txt` system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (, , etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. If a speaker provides their name, use their name when addressing their requests." - apply_chat_template_kwargs: # please refer to the model page of each HF LLM model to set them correctly. + # Please refer to the model page of each HF LLM model to set the following params properly.
+ apply_chat_template_kwargs: # kwargs that will be passed into tokenizer.apply_chat_template() function add_generation_prompt: true # This is required in most cases, do not change unless you're sure of it tokenize: false # This is required, do not change generation_kwargs: # kwargs that will be passed into model.generate() function of HF models