diff --git a/.gitignore b/.gitignore index 1aa5ef00de5e..6184f338cb60 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,6 @@ examples/neural_graphs/*.yml nemo_experiments/ slurm*.out + +node_modules/ +.vite/ \ No newline at end of file diff --git a/examples/voice_agent/README.md b/examples/voice_agent/README.md new file mode 100644 index 000000000000..c50e9283ab87 --- /dev/null +++ b/examples/voice_agent/README.md @@ -0,0 +1,186 @@ +# NeMo Voice Agent + +A [Pipecat](https://github.com/pipecat-ai/pipecat) example demonstrating the simplest way to create a voice agent using NVIDIA NeMo STT/TTS service and HuggingFace LLM. Everything is open-source and deployed locally so you can have your own voice agent. Feel free to explore the code and see how different speech technologies can be integrated with LLMs to create a seamless conversation experience. + +As of now, we only support English input and output, but more languages will be supported in the future. + + + +## ✨ Key Features + +- Open-source, local deployment, and flexible customization. +- Allow users to talk to most LLMs from HuggingFace with configurable prompts. +- Streaming speech recognition with low latency. +- FastPitch-HiFiGAN TTS for fast audio response generation. +- Speaker diarization up to 4 speakers in different userturns. +- WebSocket server for easy deployment. + + +## 💡 Upcoming Next +- More accurate and noise-robust streaming ASR models. +- Faster EOU detection and handling backchannel phrases. +- Better streaming ASR and speaker diarization pipeline. +- Better TTS model with more natural voice. +- Joint ASR and speaker diarization model. +- Function calling, RAG, etc. + + + +## 🚀 Quick Start + +### Hardware requirements + +- A computer with at least one GPU. At least 18GB VRAM is recommended for using 8B LLMs, and 10GB VRAM for 4B LLMs. +- A microphone connected to the computer. +- A speaker connected to the computer. 
+ +### Install dependencies + +First, install or update the npm and node.js to the latest version, for example: + +```bash +sudo apt-get update +sudo apt-get install -y npm nodejs +``` + +or: + +```bash +curl -fsSL https://fnm.vercel.app/install | bash +. ~/.bashrc +fnm use --install-if-missing 20 +``` + +Second, create a new conda environment with the dependencies: + +```bash +conda env create -f environment.yml +``` + +Then you can activate the environment via `conda activate nemo-voice`. + +Alternatively, you can install the dependencies manually in an existing environment via: +```bash +pip install -r requirements.txt +``` +The incompatibility errors from pip can be ignored. + +### Configure the server + +Edit the `server/server_config.yaml` file to configure the server, for example: +- Changing the LLM and system prompt you want to use in `llm.model` and `llm.system_prompt`, by either putting a local path to a text file or the whole prompt string. See `example_prompts/` for examples to start with. +- Configure the LLM parameters, such as temperature, max tokens, etc. +- Distribute different components to different GPUs if you have more than one. +- Adjust VAD parameters for sensitivity and end-of-turn detection timeout. + +**If you want to access the server from a different machine, you need to change the `baseUrl` in `client/src/app.ts` to the actual ip address of the server machine.** + + + +### Start the server + +Open a terminal and run the server via: + +```bash +NEMO_PATH=??? # Use your local NeMo path for the latest version +export PYTHONPATH=$NEMO_PATH:$PYTHONPATH + +# export HF_TOKEN="hf_..." # Use your own HuggingFace API token if needed, as some models may require. 
+# export HF_HUB_CACHE="/path/to/your/huggingface/cache" # change where HF cache is stored if you don't want to use the default cache +# export SERVER_CONFIG_PATH="/path/to/your/server_config.yaml" # change where the server config is stored if you have a couple of different configs +python ./server/server.py +``` + +### Launch the client +In another terminal on the server machine, start the client via: + +```bash +cd client +npm install +npm run dev +``` + +There should be a message in terminal showing the address and port of the client. + +### Connect to the client via browser + +Open the client via browser: `http://[YOUR MACHINE IP ADDRESS]:5173/` (or whatever address and port is shown in the terminal where the client was launched). + +You can mute/unmute your microphone via the "Mute" button, and reset the LLM context history and speaker cache by clicking the "Reset" button. + +**If using chrome browser, you need to add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`.** + +If you want to use a different port for client connection, you can modify `client/vite.config.js` to change the `port` variable. + +## 📑 Supported Models + +### 🤖 LLM + +Most LLMs from HuggingFace are supported. A few examples are: +- [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) (default) +- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) +- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) +- [nvidia/Llama-3.1-Nemotron-Nano-8B-v1](https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1) +- [nvidia/Nemotron-Mini-4B-Instruct](https://huggingface.co/nvidia/Nemotron-Mini-4B-Instruct) + +Please refer to the HuggingFace webpage of each model to configure the model parameters `llm.generation_kwargs` and `llm.apply_chat_template_kwargs` in `server/server_config.yaml` as needed. 
+ +You can change the `llm.system_prompt` in `server/server_config.yaml` to configure the behavior of the LLM, by either putting a local path to a text file or the whole prompt string. See `example_prompts/` for examples to start with. + + +### 🎤 ASR + +We use [cache-aware streaming FastConformer](https://arxiv.org/abs/2312.17279) to transcribe the user's speech into text. While new models will be released soon, we use the existing English models for now: +- [stt_en_fastconformer_hybrid_large_streaming_80ms](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms) (default) +- [nvidia/stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) + +### 💬 Speaker Diarization + +Speaker diarization aims to distinguish different speakers in the input speech audio. We use [streaming Sortformer](http://arxiv.org/abs/2507.18446) to detect the speaker for each user turn. + +As of now, we only support detecting 1 speaker per user turn, but different turns come from different speakers, with a maximum of 4 speakers in the whole conversation. + +Currently supported models are: + - [nvidia/diar_streaming_sortformer_4spk-v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2) (default) + + +Please note that in some circumstances, the diarization model might not work well in noisy environments, or it may confuse the speakers. In this case, you can disable the diarization by setting `diar.enabled` to `false` in `server/server_config.yaml`. + +### 🔉 TTS + +We use [FastPitch-HiFiGAN](https://huggingface.co/nvidia/tts_en_fastpitch) to generate the speech for the LLM response, and it only supports English output. More TTS models will be supported in the future. + + +### Turn-taking + +As the new turn-taking prediction model is not yet released, we use the VAD-based turn-taking prediction for now. 
You can set the `vad.stop_secs` to the desired value in `server/server_config.yaml` to control the amount of silence needed to indicate the end of a user's turn. + +Additionally, the voice agent supports ignoring back-channel phrases while the bot is talking, which means phrases such as "uh-huh", "yeah", "okay" will not interrupt the bot while it's talking. To control the backchannel phrases to be used, you can set the `turn_taking.backchannel_phrases` to the desired list of phrases or a file path to a yaml file containing the list of phrases in `server/server_config.yaml`. Setting it to `null` will disable detecting the backchannel phrases, and the VAD will interrupt the bot immediately when the user starts speaking. + + +## 📝 Notes & FAQ +- Only one connection to the server is supported at a time; a new connection will disconnect the previous one, but the context will be preserved. +- If you get I/O errors when directly loading from HuggingFace, you can set `llm.model=<local_dir>`, where the model is downloaded using a command like `huggingface-cli download Qwen/Qwen2.5-7B-Instruct --local-dir <local_dir>`. Same for TTS models. +- The current ASR and diarization models are not noise-robust, you might need to use a noise-cancelling microphone or a quiet environment. But we will release better models soon. +- The diarization model works best with speakers whose voices are clearly distinct from each other, while it might not work well on some accents due to the limited training data. +- If you see errors like `SyntaxError: Unexpected reserved word` when running `npm run dev`, please update the Node.js version. +- If you see the error `Error connecting: Cannot read properties of undefined (reading 'enumerateDevices')`, it usually means the browser is not allowed to access the microphone. Please check the browser settings and add `http://[YOUR MACHINE IP ADDRESS]:5173/` to the allow list, e.g., via `chrome://flags/#unsafely-treat-insecure-origin-as-secure` for chrome browser. 
+- If you see something like `node:internal/errors:496` when running `npm run dev`, remove the `client/node_modules` folder and run `npm install` again, then run `npm run dev` again. + + + +## ☁️ NVIDIA NIM Services + +NVIDIA also provides a variety of [NIM](https://developer.nvidia.com/nim?sortBy=developer_learning_library%2Fsort%2Ffeatured_in.nim%3Adesc%2Ctitle%3Aasc&hitsPerPage=12) services for better ASR, TTS and LLM performance with more efficient deployment on either cloud or local servers. + +You can also modify the `server/bot_websocket_server.py` to use NVIDIA NIM services for better LLM, ASR and TTS performance, by referring to these Pipecat services: +- [NVIDIA NIM LLM Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/nim/llm.py) +- [NVIDIA Riva ASR Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py) +- [NVIDIA Riva TTS Service](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/tts.py) + +For details of available NVIDIA NIM services, please refer to: +- [NVIDIA NIM LLM Service](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html) +- [NVIDIA Riva ASR NIM Service](https://docs.nvidia.com/nim/riva/asr/latest/overview.html) +- [NVIDIA Riva TTS NIM Service](https://docs.nvidia.com/nim/riva/tts/latest/overview.html) + + diff --git a/examples/voice_agent/client/index.html b/examples/voice_agent/client/index.html new file mode 100644 index 000000000000..c347fa972b0b --- /dev/null +++ b/examples/voice_agent/client/index.html @@ -0,0 +1,85 @@ + + + + + + + AI Chatbot + + + + +
+
+
+ Transport: Disconnected +
+
+ + +
+
+ + + + +
+
+ +
+
Microphone Volume:
+
+
+
+
0%
+
+ + + +
+

Debug Info

+
+
+
+ + + + + + diff --git a/examples/voice_agent/client/package-lock.json b/examples/voice_agent/client/package-lock.json new file mode 100644 index 000000000000..c3d6301a2843 --- /dev/null +++ b/examples/voice_agent/client/package-lock.json @@ -0,0 +1,1672 @@ +{ + "name": "client", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "client", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "@pipecat-ai/client-js": "^0.4.0", + "@pipecat-ai/websocket-transport": "^0.4.1", + "protobufjs": "^7.4.0" + }, + "devDependencies": { + "@types/node": "^22.15.30", + "@types/protobufjs": "^6.0.0", + "@vitejs/plugin-react-swc": "^3.10.1", + "typescript": "^5.8.3", + "vite": "^6.3.5" + } + }, + "node_modules/@babel/runtime": { + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz", + "integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@bufbuild/protobuf": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.5.2.tgz", + "integrity": "sha512-foZ7qr0IsUBjzWIq+SuBLfdQCpJ1j8cTuNNT4owngTHoN5KsJb8L9t65fzz7SCeSWzescoOil/0ldqiL041ABg==" + }, + "node_modules/@bufbuild/protoplugin": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/@bufbuild/protoplugin/-/protoplugin-2.5.2.tgz", + "integrity": "sha512-7d/NUae/ugs/qgHEYOwkVWGDE3Bf/xjuGviVFs38+MLRdwiHNTiuvzPVwuIPo/1wuZCZn3Nax1cg1owLuY72xw==", + "dependencies": { + "@bufbuild/protobuf": "2.5.2", + "@typescript/vfs": "^1.5.2", + "typescript": "5.4.5" + } + }, + "node_modules/@bufbuild/protoplugin/node_modules/typescript": { + "version": "5.4.5", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.5.tgz", + "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==", + "bin": { + "tsc": 
"bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/@daily-co/daily-js": { + "version": "0.79.0", + "resolved": "https://registry.npmjs.org/@daily-co/daily-js/-/daily-js-0.79.0.tgz", + "integrity": "sha512-Ii/Zi6cfTl2EZBpX8msRPNkkCHcajA+ErXpbN2Xe2KySd1Nb4IzC/QWJlSl9VA9pIlYPQicRTDoZnoym/0uEAw==", + "dependencies": { + "@babel/runtime": "^7.12.5", + "@sentry/browser": "^8.33.1", + "bowser": "^2.8.1", + "dequal": "^2.0.3", + "events": "^3.1.0" + }, + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.5.tgz", + "integrity": "sha512-9o3TMmpmftaCMepOdA5k/yDw8SfInyzWWTjYTFCX3kPSDJMROQTb8jg+h9Cnwnmm1vOzvxN7gIfB5V2ewpjtGA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.5.tgz", + "integrity": "sha512-AdJKSPeEHgi7/ZhuIPtcQKr5RQdo6OO2IL87JkianiMYMPbCtot9fxPbrMiBADOWWm3T2si9stAiVsGbTQFkbA==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.5.tgz", + "integrity": "sha512-VGzGhj4lJO+TVGV1v8ntCZWJktV7SGCs3Pn1GRWI1SBFtRALoomm8k5E9Pmwg3HOAal2VDc2F9+PM/rEY6oIDg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.5.tgz", + "integrity": 
"sha512-D2GyJT1kjvO//drbRT3Hib9XPwQeWd9vZoBJn+bu/lVsOZ13cqNdDeqIF/xQ5/VmWvMduP6AmXvylO/PIc2isw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.5.tgz", + "integrity": "sha512-GtaBgammVvdF7aPIgH2jxMDdivezgFu6iKpmT+48+F8Hhg5J/sfnDieg0aeG/jfSvkYQU2/pceFPDKlqZzwnfQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.5.tgz", + "integrity": "sha512-1iT4FVL0dJ76/q1wd7XDsXrSW+oLoquptvh4CLR4kITDtqi2e/xwXwdCVH8hVHU43wgJdsq7Gxuzcs6Iq/7bxQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.5.tgz", + "integrity": "sha512-nk4tGP3JThz4La38Uy/gzyXtpkPW8zSAmoUhK9xKKXdBCzKODMc2adkB2+8om9BDYugz+uGV7sLmpTYzvmz6Sw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.5.tgz", + "integrity": "sha512-PrikaNjiXdR2laW6OIjlbeuCPrPaAl0IwPIaRv+SMV8CiM8i2LqVUHFC1+8eORgWyY7yhQY+2U2fA55mBzReaw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.5.tgz", + "integrity": 
"sha512-cPzojwW2okgh7ZlRpcBEtsX7WBuqbLrNXqLU89GxWbNt6uIg78ET82qifUy3W6OVww6ZWobWub5oqZOVtwolfw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.5.tgz", + "integrity": "sha512-Z9kfb1v6ZlGbWj8EJk9T6czVEjjq2ntSYLY2cw6pAZl4oKtfgQuS4HOq41M/BcoLPzrUbNd+R4BXFyH//nHxVg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.5.tgz", + "integrity": "sha512-sQ7l00M8bSv36GLV95BVAdhJ2QsIbCuCjh/uYrWiMQSUuV+LpXwIqhgJDcvMTj+VsQmqAHL2yYaasENvJ7CDKA==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.5.tgz", + "integrity": "sha512-0ur7ae16hDUC4OL5iEnDb0tZHDxYmuQyhKhsPBV8f99f6Z9KQM02g33f93rNH5A30agMS46u2HP6qTdEt6Q1kg==", + "cpu": [ + "loong64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.5.tgz", + "integrity": "sha512-kB/66P1OsHO5zLz0i6X0RxlQ+3cu0mkxS3TKFvkb5lin6uwZ/ttOkP3Z8lfR9mJOBk14ZwZ9182SIIWFGNmqmg==", + "cpu": [ + "mips64el" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.5.tgz", + "integrity": 
"sha512-UZCmJ7r9X2fe2D6jBmkLBMQetXPXIsZjQJCjgwpVDz+YMcS6oFR27alkgGv3Oqkv07bxdvw7fyB71/olceJhkQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.5.tgz", + "integrity": "sha512-kTxwu4mLyeOlsVIFPfQo+fQJAV9mh24xL+y+Bm6ej067sYANjyEw1dNHmvoqxJUCMnkBdKpvOn0Ahql6+4VyeA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.5.tgz", + "integrity": "sha512-K2dSKTKfmdh78uJ3NcWFiqyRrimfdinS5ErLSn3vluHNeHVnBAFWC8a4X5N+7FgVE1EjXS1QDZbpqZBjfrqMTQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.5.tgz", + "integrity": "sha512-uhj8N2obKTE6pSZ+aMUbqq+1nXxNjZIIjCjGLfsWvVpy7gKCOL6rsY1MhRh9zLtUtAI7vpgLMK6DxjO8Qm9lJw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.5.tgz", + "integrity": "sha512-pwHtMP9viAy1oHPvgxtOv+OkduK5ugofNTVDilIzBLpoWAM16r7b/mxBvfpuQDpRQFMfuVr5aLcn4yveGvBZvw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.5.tgz", + "integrity": 
"sha512-WOb5fKrvVTRMfWFNCroYWWklbnXH0Q5rZppjq0vQIdlsQKuw6mdSihwSo4RV/YdQ5UCKKvBy7/0ZZYLBZKIbwQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.5.tgz", + "integrity": "sha512-7A208+uQKgTxHd0G0uqZO8UjK2R0DDb4fDmERtARjSHWxqMTye4Erz4zZafx7Di9Cv+lNHYuncAkiGFySoD+Mw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.5.tgz", + "integrity": "sha512-G4hE405ErTWraiZ8UiSoesH8DaCsMm0Cay4fsFWOOUcz8b8rC6uCvnagr+gnioEjWn0wC+o1/TAHt+It+MpIMg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.5.tgz", + "integrity": "sha512-l+azKShMy7FxzY0Rj4RCt5VD/q8mG/e+mDivgspo+yL8zW7qEwctQ6YqKX34DTEleFAvCIUviCFX1SDZRSyMQA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.5.tgz", + "integrity": "sha512-O2S7SNZzdcFG7eFKgvwUEZ2VG9D/sn/eIiz8XRZ1Q/DO5a3s76Xv0mdBzVM5j5R639lXQmPmSo0iRpHqUUrsxw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.5.tgz", + "integrity": 
"sha512-onOJ02pqs9h1iMJ1PQphR+VZv8qBMQ77Klcsqv9CNW2w6yLqoURLcgERAIurY6QE63bbLuqgP9ATqajFLK5AMQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.5.tgz", + "integrity": "sha512-TXv6YnJ8ZMVdX+SXWVBo/0p8LTcrUYngpWjvm91TMjjBQii7Oz11Lw5lbDV5Y0TzuhSJHwiH4hEtC1I42mMS0g==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@pipecat-ai/client-js": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/@pipecat-ai/client-js/-/client-js-0.4.1.tgz", + "integrity": "sha512-3jLKRzeryqLxtkqvr4Bvxe2OxoI7mdOFecm6iolZizXnk/BE480SEg2oAKyov3b5oT6+jmPlT+1HRBlTzEtL7A==", + "dependencies": { + "@types/events": "^3.0.3", + "clone-deep": "^4.0.1", + "events": "^3.3.0", + "typed-emitter": "^2.1.0", + "uuid": "^10.0.0" + } + }, + "node_modules/@pipecat-ai/websocket-transport": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/@pipecat-ai/websocket-transport/-/websocket-transport-0.4.2.tgz", + "integrity": "sha512-mOYnw9n60usODrE35D+uhFbJXl0DqXV32pAqSHu1of049s128mex6Qv+W49DBMVr8h5W6pLGrXhm+XDAtN5leg==", + "dependencies": { + "@daily-co/daily-js": "^0.79.0", + "@protobuf-ts/plugin": "^2.11.0", + "@protobuf-ts/runtime": "^2.11.0", + "x-law": "^0.3.1" + }, + "peerDependencies": { + "@pipecat-ai/client-js": "~0.4.0" + } + }, + "node_modules/@protobuf-ts/plugin": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/plugin/-/plugin-2.11.0.tgz", + "integrity": "sha512-Y+p4Axrk3thxws4BVSIO+x4CKWH2c8k3K+QPrp6Oq8agdsXPL/uwsMTIdpTdXIzTaUEZFASJL9LU56pob5GTHg==", + "dependencies": { + "@bufbuild/protobuf": "^2.4.0", + "@bufbuild/protoplugin": "^2.4.0", + "@protobuf-ts/protoc": "^2.11.0", + "@protobuf-ts/runtime": "^2.11.0", + 
"@protobuf-ts/runtime-rpc": "^2.11.0", + "typescript": "^3.9" + }, + "bin": { + "protoc-gen-dump": "bin/protoc-gen-dump", + "protoc-gen-ts": "bin/protoc-gen-ts" + } + }, + "node_modules/@protobuf-ts/plugin/node_modules/typescript": { + "version": "3.9.10", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.9.10.tgz", + "integrity": "sha512-w6fIxVE/H1PkLKcCPsFqKE7Kv7QUwhU8qQY2MueZXWx5cPZdwFupLgKK3vntcK98BtNHZtAF4LA/yl2a7k8R6Q==", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + }, + "node_modules/@protobuf-ts/protoc": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/protoc/-/protoc-2.11.0.tgz", + "integrity": "sha512-GYfmv1rjZ/7MWzUqMszhdXiuoa4Js/j6zCbcxFmeThBBUhbrXdPU42vY+QVCHL9PvAMXO+wEhUfPWYdd1YgnlA==", + "bin": { + "protoc": "protoc.js" + } + }, + "node_modules/@protobuf-ts/runtime": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/runtime/-/runtime-2.11.0.tgz", + "integrity": "sha512-DfpRpUiNvPC3Kj48CmlU4HaIEY1Myh++PIumMmohBAk8/k0d2CkxYxJfPyUAxfuUfl97F4AvuCu1gXmfOG7OJQ==" + }, + "node_modules/@protobuf-ts/runtime-rpc": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@protobuf-ts/runtime-rpc/-/runtime-rpc-2.11.0.tgz", + "integrity": "sha512-g/oMPym5LjVyCc3nlQc6cHer0R3CyleBos4p7CjRNzdKuH/FlRXzfQYo6EN5uv8vLtn7zEK9Cy4YBKvHStIaag==", + "dependencies": { + "@protobuf-ts/runtime": "^2.11.0" + } + }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==" 
+ }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==" + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + 
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==" + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.11", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.11.tgz", + "integrity": "sha512-L/gAA/hyCSuzTF1ftlzUSI/IKr2POHsv1Dd78GfqkR83KMNuswWD61JxGV2L7nRwBBBSDr6R1gCkdTmoN7W4ag==", + "dev": true + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.43.0.tgz", + "integrity": "sha512-Krjy9awJl6rKbruhQDgivNbD1WuLb8xAclM4IR4cN5pHGAs2oIMMQJEiC3IC/9TZJ+QZkmZhlMO/6MBGxPidpw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.43.0.tgz", + "integrity": "sha512-ss4YJwRt5I63454Rpj+mXCXicakdFmKnUNxr1dLK+5rv5FJgAxnN7s31a5VchRYxCFWdmnDWKd0wbAdTr0J5EA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.43.0.tgz", + "integrity": "sha512-eKoL8ykZ7zz8MjgBenEF2OoTNFAPFz1/lyJ5UmmFSz5jW+7XbH1+MAgCVHy72aG59rbuQLcJeiMrP8qP5d/N0A==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.43.0.tgz", + "integrity": "sha512-SYwXJgaBYW33Wi/q4ubN+ldWC4DzQY62S4Ll2dgfr/dbPoF50dlQwEaEHSKrQdSjC6oIe1WgzosoaNoHCdNuMg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + 
"os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.43.0.tgz", + "integrity": "sha512-SV+U5sSo0yujrjzBF7/YidieK2iF6E7MdF6EbYxNz94lA+R0wKl3SiixGyG/9Klab6uNBIqsN7j4Y/Fya7wAjQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.43.0.tgz", + "integrity": "sha512-J7uCsiV13L/VOeHJBo5SjasKiGxJ0g+nQTrBkAsmQBIdil3KhPnSE9GnRon4ejX1XDdsmK/l30IYLiAaQEO0Cg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.43.0.tgz", + "integrity": "sha512-gTJ/JnnjCMc15uwB10TTATBEhK9meBIY+gXP4s0sHD1zHOaIh4Dmy1X9wup18IiY9tTNk5gJc4yx9ctj/fjrIw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.43.0.tgz", + "integrity": "sha512-ZJ3gZynL1LDSIvRfz0qXtTNs56n5DI2Mq+WACWZ7yGHFUEirHBRt7fyIk0NsCKhmRhn7WAcjgSkSVVxKlPNFFw==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.43.0.tgz", + "integrity": "sha512-8FnkipasmOOSSlfucGYEu58U8cxEdhziKjPD2FIa0ONVMxvl/hmONtX/7y4vGjdUhjcTHlKlDhw3H9t98fPvyA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.43.0.tgz", + "integrity": "sha512-KPPyAdlcIZ6S9C3S2cndXDkV0Bb1OSMsX0Eelr2Bay4EsF9yi9u9uzc9RniK3mcUGCLhWY9oLr6er80P5DE6XA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loongarch64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.43.0.tgz", + "integrity": "sha512-HPGDIH0/ZzAZjvtlXj6g+KDQ9ZMHfSP553za7o2Odegb/BEfwJcR0Sw0RLNpQ9nC6Gy8s+3mSS9xjZ0n3rhcYg==", + "cpu": [ + "loong64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.43.0.tgz", + "integrity": "sha512-gEmwbOws4U4GLAJDhhtSPWPXUzDfMRedT3hFMyRAvM9Mrnj+dJIFIeL7otsv2WF3D7GrV0GIewW0y28dOYWkmw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.43.0.tgz", + "integrity": "sha512-XXKvo2e+wFtXZF/9xoWohHg+MuRnvO29TI5Hqe9xwN5uN8NKUYy7tXUG3EZAlfchufNCTHNGjEx7uN78KsBo0g==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.43.0.tgz", + "integrity": "sha512-ruf3hPWhjw6uDFsOAzmbNIvlXFXlBQ4nk57Sec8E8rUxs/AI4HD6xmiiasOOx/3QxS2f5eQMKTAwk7KHwpzr/Q==", + "cpu": [ + "riscv64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + 
}, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.43.0.tgz", + "integrity": "sha512-QmNIAqDiEMEvFV15rsSnjoSmO0+eJLoKRD9EAa9rrYNwO/XRCtOGM3A5A0X+wmG+XRrw9Fxdsw+LnyYiZWWcVw==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.43.0.tgz", + "integrity": "sha512-jAHr/S0iiBtFyzjhOkAics/2SrXE092qyqEg96e90L3t9Op8OTzS6+IX0Fy5wCt2+KqeHAkti+eitV0wvblEoQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.43.0.tgz", + "integrity": "sha512-3yATWgdeXyuHtBhrLt98w+5fKurdqvs8B53LaoKD7P7H7FKOONLsBVMNl9ghPQZQuYcceV5CDyPfyfGpMWD9mQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.43.0.tgz", + "integrity": "sha512-wVzXp2qDSCOpcBCT5WRWLmpJRIzv23valvcTwMHEobkjippNf+C3ys/+wf07poPkeNix0paTNemB2XrHr2TnGw==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.43.0.tgz", + "integrity": "sha512-fYCTEyzf8d+7diCw8b+asvWDCLMjsCEA8alvtAutqJOJp/wL5hs1rWSqJ1vkjgW0L2NB4bsYJrpKkiIPRR9dvw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": 
"4.43.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.43.0.tgz", + "integrity": "sha512-SnGhLiE5rlK0ofq8kzuDkM0g7FN1s5VYY+YSMTibP7CqShxCQvqtNxTARS4xX4PFJfHjG0ZQYX9iGzI3FQh5Aw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@sentry-internal/browser-utils": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/browser-utils/-/browser-utils-8.55.0.tgz", + "integrity": "sha512-ROgqtQfpH/82AQIpESPqPQe0UyWywKJsmVIqi3c5Fh+zkds5LUxnssTj3yNd1x+kxaPDVB023jAP+3ibNgeNDw==", + "dependencies": { + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry-internal/feedback": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/feedback/-/feedback-8.55.0.tgz", + "integrity": "sha512-cP3BD/Q6pquVQ+YL+rwCnorKuTXiS9KXW8HNKu4nmmBAyf7urjs+F6Hr1k9MXP5yQ8W3yK7jRWd09Yu6DHWOiw==", + "dependencies": { + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry-internal/replay": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/replay/-/replay-8.55.0.tgz", + "integrity": "sha512-roCDEGkORwolxBn8xAKedybY+Jlefq3xYmgN2fr3BTnsXjSYOPC7D1/mYqINBat99nDtvgFvNfRcZPiwwZ1hSw==", + "dependencies": { + "@sentry-internal/browser-utils": "8.55.0", + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry-internal/replay-canvas": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry-internal/replay-canvas/-/replay-canvas-8.55.0.tgz", + "integrity": "sha512-nIkfgRWk1091zHdu4NbocQsxZF1rv1f7bbp3tTIlZYbrH62XVZosx5iHAuZG0Zc48AETLE7K4AX9VGjvQj8i9w==", + "dependencies": { + "@sentry-internal/replay": "8.55.0", + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry/browser": { + "version": "8.55.0", + "resolved": 
"https://registry.npmjs.org/@sentry/browser/-/browser-8.55.0.tgz", + "integrity": "sha512-1A31mCEWCjaMxJt6qGUK+aDnLDcK6AwLAZnqpSchNysGni1pSn1RWSmk9TBF8qyTds5FH8B31H480uxMPUJ7Cw==", + "dependencies": { + "@sentry-internal/browser-utils": "8.55.0", + "@sentry-internal/feedback": "8.55.0", + "@sentry-internal/replay": "8.55.0", + "@sentry-internal/replay-canvas": "8.55.0", + "@sentry/core": "8.55.0" + }, + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@sentry/core": { + "version": "8.55.0", + "resolved": "https://registry.npmjs.org/@sentry/core/-/core-8.55.0.tgz", + "integrity": "sha512-6g7jpbefjHYs821Z+EBJ8r4Z7LT5h80YSWRJaylGS4nW5W5Z2KXzpdnyFarv37O7QjauzVC2E+PABmpkw5/JGA==", + "engines": { + "node": ">=14.18" + } + }, + "node_modules/@swc/core": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core/-/core-1.12.1.tgz", + "integrity": "sha512-aKXdDTqxTVFl/bKQZ3EQUjEMBEoF6JBv29moMZq0kbVO43na6u/u+3Vcbhbrh+A2N0X5OL4RaveuWfAjEgOmeA==", + "dev": true, + "hasInstallScript": true, + "dependencies": { + "@swc/counter": "^0.1.3", + "@swc/types": "^0.1.23" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/swc" + }, + "optionalDependencies": { + "@swc/core-darwin-arm64": "1.12.1", + "@swc/core-darwin-x64": "1.12.1", + "@swc/core-linux-arm-gnueabihf": "1.12.1", + "@swc/core-linux-arm64-gnu": "1.12.1", + "@swc/core-linux-arm64-musl": "1.12.1", + "@swc/core-linux-x64-gnu": "1.12.1", + "@swc/core-linux-x64-musl": "1.12.1", + "@swc/core-win32-arm64-msvc": "1.12.1", + "@swc/core-win32-ia32-msvc": "1.12.1", + "@swc/core-win32-x64-msvc": "1.12.1" + }, + "peerDependencies": { + "@swc/helpers": ">=0.5.17" + }, + "peerDependenciesMeta": { + "@swc/helpers": { + "optional": true + } + } + }, + "node_modules/@swc/core-darwin-arm64": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-darwin-arm64/-/core-darwin-arm64-1.12.1.tgz", + "integrity": 
"sha512-nUjWVcJ3YS2N40ZbKwYO2RJ4+o2tWYRzNOcIQp05FqW0+aoUCVMdAUUzQinPDynfgwVshDAXCKemY8X7nN5MaA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-darwin-x64": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-darwin-x64/-/core-darwin-x64-1.12.1.tgz", + "integrity": "sha512-OGm4a4d3OeJn+tRt8H/eiHgTFrJbS6r8mi/Ob65tAEXZGHN900T2kR7c5ALr0V2hBOQ8BfhexwPoQlGQP/B95w==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-arm-gnueabihf": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.12.1.tgz", + "integrity": "sha512-76YeeQKyK0EtNkQiNBZ0nbVGooPf9IucY0WqVXVpaU4wuG7ZyLEE2ZAIgXafIuzODGQoLfetue7I8boMxh1/MA==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-arm64-gnu": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.12.1.tgz", + "integrity": "sha512-BxJDIJPq1+aCh9UsaSAN6wo3tuln8UhNXruOrzTI8/ElIig/3sAueDM6Eq7GvZSGGSA7ljhNATMJ0elD7lFatQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-arm64-musl": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.12.1.tgz", + "integrity": "sha512-NhLdbffSXvY0/FwUSAl4hKBlpe5GHQGXK8DxTo3HHjLsD9sCPYieo3vG0NQoUYAy4ZUY1WeGjyxeq4qZddJzEQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-x64-gnu": { + "version": "1.12.1", + "resolved": 
"https://registry.npmjs.org/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.12.1.tgz", + "integrity": "sha512-CrYnV8SZIgArQ9LKH0xEF95PKXzX9WkRSc5j55arOSBeDCeDUQk1Bg/iKdnDiuj5HC1hZpvzwMzSBJjv+Z70jA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-linux-x64-musl": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.12.1.tgz", + "integrity": "sha512-BQMl3d0HaGB0/h2xcKlGtjk/cGRn2tnbsaChAKcjFdCepblKBCz1pgO/mL7w5iXq3s57wMDUn++71/a5RAkZOA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-win32-arm64-msvc": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.12.1.tgz", + "integrity": "sha512-b7NeGnpqTfmIGtUqXBl0KqoSmOnH64nRZoT5l4BAGdvwY7nxitWR94CqZuwyLPty/bLywmyDA9uO12Kvgb3+gg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-win32-ia32-msvc": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.12.1.tgz", + "integrity": "sha512-iU/29X2D7cHBp1to62cUg/5Xk8K+lyOJiKIGGW5rdzTW/c2zz3d/ehgpzVP/rqC4NVr88MXspqHU4il5gmDajw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=10" + } + }, + "node_modules/@swc/core-win32-x64-msvc": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.12.1.tgz", + "integrity": "sha512-+Zh+JKDwiFqV5N9yAd2DhYVGPORGh9cfenu1ptr9yge+eHAf7vZJcC3rnj6QMR1QJh0Y5VC9+YBjRFjZVA7XDw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=10" + } + }, + 
"node_modules/@swc/counter": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz", + "integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==", + "dev": true + }, + "node_modules/@swc/types": { + "version": "0.1.23", + "resolved": "https://registry.npmjs.org/@swc/types/-/types-0.1.23.tgz", + "integrity": "sha512-u1iIVZV9Q0jxY+yM2vw/hZGDNudsN85bBpTqzAQ9rzkxW9D+e3aEM4Han+ow518gSewkXgjmEK0BD79ZcNVgPw==", + "dev": true, + "dependencies": { + "@swc/counter": "^0.1.3" + } + }, + "node_modules/@types/estree": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", + "integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==", + "dev": true + }, + "node_modules/@types/events": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/events/-/events-3.0.3.tgz", + "integrity": "sha512-trOc4AAUThEz9hapPtSd7wf5tiQKvTtu5b371UxXdTuqzIh0ArcRspRP0i0Viu+LXstIQ1z96t1nsPxT9ol01g==" + }, + "node_modules/@types/node": { + "version": "22.15.32", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.32.tgz", + "integrity": "sha512-3jigKqgSjsH6gYZv2nEsqdXfZqIFGAV36XYYjf9KGZ3PSG+IhLecqPnI310RvjutyMwifE2hhhNEklOUrvx/wA==", + "dev": true, + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/@types/node/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true + }, + "node_modules/@types/protobufjs": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/@types/protobufjs/-/protobufjs-6.0.0.tgz", + "integrity": "sha512-A27RDExpAf3rdDjIrHKiJK6x8kqqJ4CmoChwtipfhVAn1p7+wviQFFP7dppn8FslSbHtQeVPvi8wNKkDjSYjHw==", + "deprecated": "This is a stub 
types definition for protobufjs (https://github.com/dcodeIO/ProtoBuf.js). protobufjs provides its own type definitions, so you don't need @types/protobufjs installed!", + "dev": true, + "dependencies": { + "protobufjs": "*" + } + }, + "node_modules/@typescript/vfs": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/@typescript/vfs/-/vfs-1.6.1.tgz", + "integrity": "sha512-JwoxboBh7Oz1v38tPbkrZ62ZXNHAk9bJ7c9x0eI5zBfBnBYGhURdbnh7Z4smN/MV48Y5OCcZb58n972UtbazsA==", + "dependencies": { + "debug": "^4.1.1" + }, + "peerDependencies": { + "typescript": "*" + } + }, + "node_modules/@vitejs/plugin-react-swc": { + "version": "3.10.2", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react-swc/-/plugin-react-swc-3.10.2.tgz", + "integrity": "sha512-xD3Rdvrt5LgANug7WekBn1KhcvLn1H3jNBfJRL3reeOIua/WnZOEV5qi5qIBq5T8R0jUDmRtxuvk4bPhzGHDWw==", + "dev": true, + "dependencies": { + "@rolldown/pluginutils": "1.0.0-beta.11", + "@swc/core": "^1.11.31" + }, + "peerDependencies": { + "vite": "^4 || ^5 || ^6 || ^7.0.0-beta.0" + } + }, + "node_modules/bowser": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/bowser/-/bowser-2.11.0.tgz", + "integrity": "sha512-AlcaJBi/pqqJBIQ8U9Mcpc9i8Aqxn88Skv5d+xBX006BY5u8N3mGLHa5Lgppa7L/HfwgwLgZ6NYs+Ag6uUmJRA==" + }, + "node_modules/clone-deep": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-4.0.1.tgz", + "integrity": "sha512-neHB9xuzh/wk0dIHweyAXv2aPGZIVk3pLMe+/RNzINf17fe0OG96QroktYAUm7SM1PBnzTabaLboqqxDyMU+SQ==", + "dependencies": { + "is-plain-object": "^2.0.4", + "kind-of": "^6.0.2", + "shallow-clone": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/debug": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz", + "integrity": "sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + 
"peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "engines": { + "node": ">=6" + } + }, + "node_modules/esbuild": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.5.tgz", + "integrity": "sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==", + "dev": true, + "hasInstallScript": true, + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.5", + "@esbuild/android-arm": "0.25.5", + "@esbuild/android-arm64": "0.25.5", + "@esbuild/android-x64": "0.25.5", + "@esbuild/darwin-arm64": "0.25.5", + "@esbuild/darwin-x64": "0.25.5", + "@esbuild/freebsd-arm64": "0.25.5", + "@esbuild/freebsd-x64": "0.25.5", + "@esbuild/linux-arm": "0.25.5", + "@esbuild/linux-arm64": "0.25.5", + "@esbuild/linux-ia32": "0.25.5", + "@esbuild/linux-loong64": "0.25.5", + "@esbuild/linux-mips64el": "0.25.5", + "@esbuild/linux-ppc64": "0.25.5", + "@esbuild/linux-riscv64": "0.25.5", + "@esbuild/linux-s390x": "0.25.5", + "@esbuild/linux-x64": "0.25.5", + "@esbuild/netbsd-arm64": "0.25.5", + "@esbuild/netbsd-x64": "0.25.5", + "@esbuild/openbsd-arm64": "0.25.5", + "@esbuild/openbsd-x64": "0.25.5", + "@esbuild/sunos-x64": "0.25.5", + "@esbuild/win32-arm64": "0.25.5", + "@esbuild/win32-ia32": "0.25.5", + "@esbuild/win32-x64": "0.25.5" + } + }, + "node_modules/events": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", + "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", + "engines": { + "node": ">=0.8.x" + } + }, + "node_modules/fdir": { + "version": "6.4.6", + "resolved": 
"https://registry.npmjs.org/fdir/-/fdir-6.4.6.tgz", + "integrity": "sha512-hiFoqpyZcfNm1yc4u8oWCf9A2c4D3QjCrks3zmoVKVxpQRzmPNar1hUJcBG2RQHvEVGDN+Jm81ZheVLAQMK6+w==", + "dev": true, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/kind-of": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz", + "integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/long": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==" + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": 
"sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true + }, + "node_modules/picomatch": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.2.tgz", + "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/protobufjs": { + "version": "7.5.3", + "resolved": 
"https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.3.tgz", + "integrity": "sha512-sildjKwVqOI2kmFDiXQ6aEB0fjYTafpEvIBs8tOR8qI4spuL9OPROLVu2qZqi/xgCfsHIwVqlaF8JBjWFHnKbw==", + "hasInstallScript": true, + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/node": ">=13.7.0", + "long": "^5.0.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/protobufjs/node_modules/@types/node": { + "version": "24.0.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.3.tgz", + "integrity": "sha512-R4I/kzCYAdRLzfiCabn9hxWfbuHS573x+r0dJMkkzThEa7pbrcDWK+9zu3e7aBOouf+rQAciqPFMnxwr0aWgKg==", + "dependencies": { + "undici-types": "~7.8.0" + } + }, + "node_modules/rollup": { + "version": "4.43.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.43.0.tgz", + "integrity": "sha512-wdN2Kd3Twh8MAEOEJZsuxuLKCsBEo4PVNLK6tQWAn10VhsVewQLzcucMgLolRlhFybGxfclbPeEYBaP6RvUFGg==", + "dev": true, + "dependencies": { + "@types/estree": "1.0.7" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.43.0", + "@rollup/rollup-android-arm64": "4.43.0", + "@rollup/rollup-darwin-arm64": "4.43.0", + "@rollup/rollup-darwin-x64": "4.43.0", + "@rollup/rollup-freebsd-arm64": "4.43.0", + "@rollup/rollup-freebsd-x64": "4.43.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.43.0", + "@rollup/rollup-linux-arm-musleabihf": "4.43.0", + "@rollup/rollup-linux-arm64-gnu": "4.43.0", + "@rollup/rollup-linux-arm64-musl": "4.43.0", + "@rollup/rollup-linux-loongarch64-gnu": "4.43.0", + "@rollup/rollup-linux-powerpc64le-gnu": "4.43.0", + 
"@rollup/rollup-linux-riscv64-gnu": "4.43.0", + "@rollup/rollup-linux-riscv64-musl": "4.43.0", + "@rollup/rollup-linux-s390x-gnu": "4.43.0", + "@rollup/rollup-linux-x64-gnu": "4.43.0", + "@rollup/rollup-linux-x64-musl": "4.43.0", + "@rollup/rollup-win32-arm64-msvc": "4.43.0", + "@rollup/rollup-win32-ia32-msvc": "4.43.0", + "@rollup/rollup-win32-x64-msvc": "4.43.0", + "fsevents": "~2.3.2" + } + }, + "node_modules/rxjs": { + "version": "7.8.2", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.2.tgz", + "integrity": "sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==", + "optional": true, + "dependencies": { + "tslib": "^2.1.0" + } + }, + "node_modules/shallow-clone": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-3.0.1.tgz", + "integrity": "sha512-/6KqX+GVUdqPuPPd2LxDDxzX6CAbjJehAAOKlNpqqUpAqPM6HeL8f+o3a+JsyGjn2lv0WY8UsTgUJjU9Ok55NA==", + "dependencies": { + "kind-of": "^6.0.2" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/tinyglobby": { + "version": "0.2.14", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.14.tgz", + "integrity": "sha512-tX5e7OM1HnYr2+a2C/4V0htOcSQcoSTH9KgJnVvNm5zm/cyEWKJ7j7YutsH9CxMdtOkkLFy2AHrMci9IM8IPZQ==", + "dev": true, + "dependencies": { + "fdir": "^6.4.4", + "picomatch": "^4.0.2" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": 
"sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "optional": true + }, + "node_modules/typed-emitter": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/typed-emitter/-/typed-emitter-2.1.0.tgz", + "integrity": "sha512-g/KzbYKbH5C2vPkaXGu8DJlHrGKHLsM25Zg9WuC9pMGfuvT+X25tZQWo5fK1BjBm8+UrVE9LDCvaY0CQk+fXDA==", + "optionalDependencies": { + "rxjs": "*" + } + }, + "node_modules/typescript": { + "version": "5.8.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", + "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz", + "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==" + }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/vite": { + "version": "6.3.5", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", + "integrity": "sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "dev": true, + "dependencies": { + "esbuild": "^0.25.0", + "fdir": "^6.4.4", + "picomatch": "^4.0.2", + "postcss": "^8.5.3", + "rollup": "^4.34.9", + "tinyglobby": "^0.2.13" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || ^20.0.0 || >=22.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + 
"optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", + "jiti": ">=1.21.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.16.0", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "jiti": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/x-law": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/x-law/-/x-law-0.3.1.tgz", + "integrity": "sha512-Nvo6OKj6UL2LuzAc08uJkwIDkK2PsTEdpLiY82NkwMptuRpAA1V7arUl7ZY12BcgRYNq8uh1pdAu7G6VeQn7Hg==", + "engines": { + "node": ">=18" + } + } + } +} diff --git a/examples/voice_agent/client/package.json b/examples/voice_agent/client/package.json new file mode 100644 index 000000000000..d2df048f50f8 --- /dev/null +++ b/examples/voice_agent/client/package.json @@ -0,0 +1,26 @@ +{ + "name": "client", + "version": "1.0.0", + "main": "index.js", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "preview": "vite preview" + }, + "keywords": [], + "author": "", + "license": "ISC", + "description": "", + "devDependencies": { + "@types/node": "^22.15.30", + "@types/protobufjs": "^6.0.0", + "@vitejs/plugin-react-swc": "^3.10.1", + "typescript": "^5.8.3", + "vite": "^6.3.5" + }, + "dependencies": { + "@pipecat-ai/client-js": "^0.4.0", + "@pipecat-ai/websocket-transport": "^0.4.1", + "protobufjs": "^7.4.0" + } +} diff --git a/examples/voice_agent/client/src/app.ts b/examples/voice_agent/client/src/app.ts new file mode 100644 index 
000000000000..c9809fa69c8a --- /dev/null +++ b/examples/voice_agent/client/src/app.ts @@ -0,0 +1,521 @@ +/** + * Copyright (c) 2024–2025, Daily + * + * SPDX-License-Identifier: BSD 2-Clause License + */ + +/** + * RTVI Client Implementation + * + * This client connects to an RTVI-compatible bot server using WebSocket. + * + * Requirements: + * - A running RTVI bot server (defaults to http://localhost:7860) + */ + +import { + RTVIClient, + RTVIClientOptions, + RTVIEvent, +} from '@pipecat-ai/client-js'; +import { + WebSocketTransport +} from "@pipecat-ai/websocket-transport"; + +class WebsocketClientApp { + private rtviClient: RTVIClient | null = null; + private connectBtn: HTMLButtonElement | null = null; + private disconnectBtn: HTMLButtonElement | null = null; + private muteBtn: HTMLButtonElement | null = null; + private resetBtn: HTMLButtonElement | null = null; + private serverSelect: HTMLSelectElement | null = null; + private statusSpan: HTMLElement | null = null; + private debugLog: HTMLElement | null = null; + private volumeBar: HTMLElement | null = null; + private volumeText: HTMLElement | null = null; + private botAudio: HTMLAudioElement; + private isConnecting: boolean = false; + private isDisconnecting: boolean = false; + private isMuted: boolean = false; + private audioContext: AudioContext | null = null; + private analyser: AnalyserNode | null = null; + private microphone: MediaStreamAudioSourceNode | null = null; + private volumeUpdateInterval: number | null = null; + + // Server configurations + private readonly serverConfigs = { + websocket: { + name: 'WebSocket Server', + baseUrl: 'http://localhost:7860', + port: 8765 + }, + fastapi: { + name: 'FastAPI Server', + baseUrl: 'http://localhost:8000', + port: 8000 + } + }; + + constructor() { + console.log("WebsocketClientApp"); + this.botAudio = document.createElement('audio'); + this.botAudio.autoplay = true; + //this.botAudio.playsInline = true; + document.body.appendChild(this.botAudio); + + 
this.setupDOMElements(); + this.setupEventListeners(); + } + + /** + * Set up references to DOM elements and create necessary media elements + */ + private setupDOMElements(): void { + this.connectBtn = document.getElementById('connect-btn') as HTMLButtonElement; + this.disconnectBtn = document.getElementById('disconnect-btn') as HTMLButtonElement; + this.muteBtn = document.getElementById('mute-btn') as HTMLButtonElement; + this.resetBtn = document.getElementById('reset-btn') as HTMLButtonElement; + this.serverSelect = document.getElementById('server-select') as HTMLSelectElement; + this.statusSpan = document.getElementById('connection-status'); + this.debugLog = document.getElementById('debug-log'); + this.volumeBar = document.getElementById('volume-bar'); + this.volumeText = document.getElementById('volume-text'); + } + + /** + * Set up event listeners for connect/disconnect buttons + */ + private setupEventListeners(): void { + this.connectBtn?.addEventListener('click', () => this.connect()); + this.disconnectBtn?.addEventListener('click', () => this.disconnect()); + this.muteBtn?.addEventListener('click', () => this.toggleMute()); + this.resetBtn?.addEventListener('click', () => this.reset()); + this.serverSelect?.addEventListener('change', () => this.updateServerUrl()); + } + + /** + * Add a timestamped message to the debug log + */ + private log(message: string): void { + if (!this.debugLog) return; + const entry = document.createElement('div'); + entry.textContent = `${new Date().toISOString()} - ${message}`; + if (message.startsWith('User: ')) { + entry.style.color = '#2196F3'; + } else if (message.startsWith('Bot: ')) { + entry.style.color = '#4CAF50'; + } + this.debugLog.appendChild(entry); + this.debugLog.scrollTop = this.debugLog.scrollHeight; + console.log(message); + } + + /** + * Update the connection status display + */ + private updateStatus(status: string): void { + if (this.statusSpan) { + this.statusSpan.textContent = status; + } + 
this.log(`Status: ${status}`); + } + + /** + * Check for available media tracks and set them up if present + * This is called when the bot is ready or when the transport state changes to ready + */ + setupMediaTracks() { + if (!this.rtviClient) return; + const tracks = this.rtviClient.tracks(); + if (tracks.bot?.audio) { + this.setupAudioTrack(tracks.bot.audio); + } + } + + /** + * Set up listeners for track events (start/stop) + * This handles new tracks being added during the session + */ + setupTrackListeners() { + if (!this.rtviClient) { + this.log('Cannot setup track listeners: client is null'); + return; + } + + try { + // Listen for new tracks starting + this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => { + // Only handle non-local (bot) tracks + if (!participant?.local && track.kind === 'audio') { + this.setupAudioTrack(track); + } + }); + + // Listen for tracks stopping + this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => { + this.log(`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`); + }); + } catch (error) { + this.log(`Error setting up track listeners: ${error}`); + } + } + + /** + * Set up an audio track for playback + * Handles both initial setup and track updates + */ + private setupAudioTrack(track: MediaStreamTrack): void { + this.log('Setting up audio track'); + if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) { + const oldTrack = this.botAudio.srcObject.getAudioTracks()[0]; + if (oldTrack?.id === track.id) return; + } + this.botAudio.srcObject = new MediaStream([track]); + } + + /** + * Initialize and connect to the bot + * This sets up the RTVI client, initializes devices, and establishes the connection + */ + public async connect(): Promise<void> { + if (this.isConnecting) { + this.log('Connection already in progress, ignoring...'); + return; + } + + try { + this.isConnecting = true; + const startTime = Date.now(); + + //const transport = new DailyTransport(); + const 
transport = new WebSocketTransport(); + const RTVIConfig: RTVIClientOptions = { + transport, + params: { + // The baseURL and endpoint of your bot server that the client will connect to + baseUrl: this.getSelectedServerConfig().baseUrl, + endpoints: { connect: '/connect' }, + }, + enableMic: true, + enableCam: false, + callbacks: { + onConnected: () => { + this.updateStatus('Connected'); + if (this.connectBtn) this.connectBtn.disabled = true; + if (this.disconnectBtn) this.disconnectBtn.disabled = false; + if (this.muteBtn) { + this.muteBtn.disabled = false; + this.muteBtn.textContent = 'Mute'; + } + if (this.resetBtn) this.resetBtn.disabled = false; + if (this.serverSelect) this.serverSelect.disabled = true; + // Start volume monitoring when connected + if (!this.isMuted) { + this.startVolumeMonitoring(); + } + }, + onDisconnected: () => { + // Only handle disconnect if we're not in the middle of error cleanup + if (!this.isConnecting) { + this.updateStatus('Disconnected'); + if (this.connectBtn) this.connectBtn.disabled = false; + if (this.disconnectBtn) this.disconnectBtn.disabled = true; + if (this.muteBtn) { + this.muteBtn.disabled = true; + this.muteBtn.textContent = 'Mute'; + } + if (this.resetBtn) this.resetBtn.disabled = true; + if (this.serverSelect) this.serverSelect.disabled = false; + // Stop volume monitoring when disconnected + this.stopVolumeMonitoring(); + this.log('Client disconnected'); + } + }, + onBotReady: (data) => { + this.log(`Bot ready: ${JSON.stringify(data)}`); + this.setupMediaTracks(); + }, + onUserTranscript: (data) => { + if (data.final) { + this.log(`User: ${data.text}`); + } + }, + onBotTranscript: (data) => this.log(`Bot: ${data.text}`), + onMessageError: (error) => console.error('Message error:', error), + onError: (error) => console.error('Error:', error), + }, + } + + // Create the client with error handling + try { + this.rtviClient = new RTVIClient(RTVIConfig); + this.setupTrackListeners(); + } catch (clientError) { + 
this.log(`Error creating RTVI client: ${clientError}`); + throw clientError; + } + + this.log('Initializing devices...'); + await this.rtviClient.initDevices(); + + this.log('Connecting to bot...'); + await this.rtviClient.connect(); + + const timeTaken = Date.now() - startTime; + this.log(`Connection complete, timeTaken: ${timeTaken}`); + } catch (error) { + this.log(`Error connecting: ${(error as Error).message}`); + this.updateStatus('Error'); + // Clean up if there's an error + await this.cleanupOnError(); + } finally { + this.isConnecting = false; + } + } + + /** + * Clean up resources when there's an error during connection + */ + private async cleanupOnError(): Promise<void> { + // Set disconnecting flag to prevent onDisconnected callback interference + this.isDisconnecting = true; + + // Store reference to client before it might become null + const client = this.rtviClient; + + if (client) { + try { + // Check if the client is in a state where disconnect can be called + if (typeof client.disconnect === 'function') { + await client.disconnect(); + } + } catch (disconnectError) { + this.log(`Error during cleanup disconnect: ${disconnectError}`); + } finally { + // Always reset the client to null to allow reconnection + this.rtviClient = null; + } + } else { + this.log('Client was already null during cleanup'); + } + + // Reset button states + if (this.connectBtn) this.connectBtn.disabled = false; + if (this.disconnectBtn) this.disconnectBtn.disabled = true; + if (this.muteBtn) { + this.muteBtn.disabled = true; + this.muteBtn.textContent = 'Mute'; + } + if (this.resetBtn) this.resetBtn.disabled = true; + if (this.serverSelect) this.serverSelect.disabled = false; + + // Stop volume monitoring + this.stopVolumeMonitoring(); + + // Reset mute state + this.isMuted = false; + + // Reset disconnecting flag + this.isDisconnecting = false; + } + + /** + * Disconnect from the bot and clean up media resources + */ + public async disconnect(): Promise<void> { + if 
(this.isDisconnecting) { + this.log('Disconnection already in progress, ignoring...'); + return; + } + + this.isDisconnecting = true; + + // Store reference to client before it might become null + const client = this.rtviClient; + + if (client) { + try { + // Check if the client is in a state where disconnect can be called + if (typeof client.disconnect === 'function') { + await client.disconnect(); + } + } catch (error) { + this.log(`Error disconnecting: ${(error as Error).message}`); + } finally { + // Always clean up resources and reset the client + this.rtviClient = null; + if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) { + this.botAudio.srcObject.getAudioTracks().forEach((track) => track.stop()); + this.botAudio.srcObject = null; + } + } + } else { + this.log('Client was already null during disconnect'); + } + + // Stop volume monitoring + this.stopVolumeMonitoring(); + + // Reset mute state + this.isMuted = false; + + this.isDisconnecting = false; + } + + /** + * Toggle microphone mute/unmute + */ + private toggleMute(): void { + if (!this.rtviClient) { + this.log('Cannot toggle mute: client is null'); + return; + } + + this.isMuted = !this.isMuted; + this.rtviClient.enableMic(!this.isMuted); + + // Update button text + if (this.muteBtn) { + this.muteBtn.textContent = this.isMuted ? 'Unmute' : 'Mute'; + } + + // Update volume monitoring + if (this.isMuted) { + this.stopVolumeMonitoring(); + } else { + this.startVolumeMonitoring(); + } + + this.log(this.isMuted ? 
'Microphone muted' : 'Microphone unmuted'); + } + + /** + * Start monitoring microphone volume + */ + private async startVolumeMonitoring(): Promise<void> { + try { + if (!this.audioContext) { + this.audioContext = new AudioContext(); + } + + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + + this.analyser = this.audioContext.createAnalyser(); + this.analyser.fftSize = 256; + this.analyser.smoothingTimeConstant = 0.8; + + this.microphone = this.audioContext.createMediaStreamSource(stream); + this.microphone.connect(this.analyser); + + // Start continuous volume updates + this.volumeUpdateInterval = window.setInterval(() => { + this.updateVolumeDisplay(); + }, 100); // Update every 100ms + + this.log('Volume monitoring started'); + } catch (error) { + this.log(`Error starting volume monitoring: ${error}`); + } + } + + /** + * Stop monitoring microphone volume + */ + private stopVolumeMonitoring(): void { + if (this.volumeUpdateInterval) { + clearInterval(this.volumeUpdateInterval); + this.volumeUpdateInterval = null; + } + + if (this.microphone) { + this.microphone.disconnect(); + this.microphone = null; + } + + // Reset volume display + this.updateVolumeDisplay(0); + this.log('Volume monitoring stopped'); + } + + /** + * Update the volume display + */ + private updateVolumeDisplay(volume?: number): void { + if (!this.volumeBar || !this.volumeText) return; + + if (volume === undefined && this.analyser) { + const dataArray = new Uint8Array(this.analyser.frequencyBinCount); + this.analyser.getByteFrequencyData(dataArray); + + // Calculate average volume + const average = dataArray.reduce((sum, value) => sum + value, 0) / dataArray.length; + volume = (average / 255) * 100; + } + + const displayVolume = volume || 0; + const clampedVolume = Math.min(100, Math.max(0, displayVolume)); + + this.volumeBar.style.width = `${clampedVolume}%`; + this.volumeText.textContent = `${Math.round(clampedVolume)}%`; + + // Update color based on volume level + if 
(clampedVolume < 30) { + this.volumeBar.style.background = '#4caf50'; // Green + } else if (clampedVolume < 70) { + this.volumeBar.style.background = '#ff9800'; // Orange + } else { + this.volumeBar.style.background = '#f44336'; // Red + } + } + + /** + * Reset the conversation context by calling the server action + */ + private async reset(): Promise<void> { + if (!this.rtviClient) { + this.log('Cannot reset: not connected to server'); + return; + } + + try { + this.log('Resetting conversation context...'); + + // Call the reset action on the server + const result = await this.rtviClient.action({ service: 'context', action: 'reset', arguments: [] }); + + if (result) { + this.log('Conversation context reset successfully'); + } else { + this.log('Failed to reset conversation context'); + } + } catch (error) { + this.log(`Error resetting context: ${error}`); + } + } + + private getSelectedServerConfig(): { name: string; baseUrl: string; port: number } { + const selectedValue = this.serverSelect?.value || 'websocket'; + return this.serverConfigs[selectedValue as keyof typeof this.serverConfigs]; + } + + private updateServerUrl(): void { + const selectedConfig = this.getSelectedServerConfig(); + this.log(`Server changed to: ${selectedConfig.name} (${selectedConfig.baseUrl})`); + + // If connected, show a message that they need to reconnect + if (this.rtviClient) { + this.log('Please disconnect and reconnect to use the new server'); + } + } +} + +declare global { + interface Window { + WebsocketClientApp: typeof WebsocketClientApp; + } +} + +window.addEventListener('DOMContentLoaded', () => { + window.WebsocketClientApp = WebsocketClientApp; + new WebsocketClientApp(); +}); diff --git a/examples/voice_agent/client/src/style.css b/examples/voice_agent/client/src/style.css new file mode 100644 index 000000000000..a19b4e0f1bc0 --- /dev/null +++ b/examples/voice_agent/client/src/style.css @@ -0,0 +1,180 @@ +body { + margin: 0; + padding: 20px; + font-family: Arial, sans-serif; + 
background-color: #f0f0f0; +} + +.container { + max-width: 1200px; + margin: 0 auto; +} + +.status-bar { + display: flex; + justify-content: space-between; + align-items: center; + padding: 10px; + background-color: #fff; + border-radius: 8px; + margin-bottom: 20px; +} + +.controls button { + padding: 8px 16px; + margin-left: 10px; + border: none; + border-radius: 4px; + cursor: pointer; +} + +#connect-btn { + background-color: #4caf50; + color: white; +} + +#disconnect-btn { + background-color: #f44336; + color: white; +} + +#mute-btn { + background-color: #ff9800; + color: white; +} + +#mute-btn:disabled { + background-color: #ccc; + color: #666; +} + +button:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.volume-indicator { + display: flex; + align-items: center; + gap: 10px; + padding: 10px; + background-color: #fff; + border-radius: 8px; + margin-bottom: 20px; +} + +.volume-label { + font-weight: bold; + min-width: 120px; +} + +.volume-bar-container { + flex: 1; + height: 20px; + background-color: #e0e0e0; + border-radius: 10px; + overflow: hidden; + position: relative; +} + +.volume-bar { + height: 100%; + background: linear-gradient(90deg, #4caf50, #ff9800, #f44336); + width: 0%; + transition: width 0.1s ease; + border-radius: 10px; +} + +.volume-text { + min-width: 40px; + text-align: right; + font-weight: bold; + font-size: 14px; +} + +.main-content { + background-color: #fff; + border-radius: 8px; + padding: 20px; + margin-bottom: 20px; +} + +.bot-container { + display: flex; + flex-direction: column; + align-items: center; +} + +#bot-video-container { + width: 640px; + height: 360px; + background-color: #e0e0e0; + border-radius: 8px; + margin: 20px auto; + overflow: hidden; + display: flex; + align-items: center; + justify-content: center; +} + +#bot-video-container video { + width: 100%; + height: 100%; + object-fit: cover; +} + +.debug-panel { + background-color: #fff; + border-radius: 8px; + padding: 20px; +} + +.debug-panel h3 { + margin: 0 
0 10px 0; + font-size: 16px; + font-weight: bold; +} + +#debug-log { + height: 500px; + overflow-y: auto; + background-color: #f8f8f8; + padding: 10px; + border-radius: 4px; + font-family: monospace; + font-size: 12px; + line-height: 1.4; +} + +.server-selection { + display: flex; + align-items: center; + gap: 8px; + margin-bottom: 10px; +} + +.server-selection label { + font-weight: bold; + color: #333; +} + +.server-selection select { + padding: 6px 12px; + border: 1px solid #ccc; + border-radius: 4px; + background-color: white; + font-size: 14px; + cursor: pointer; +} + +.server-selection select:focus { + outline: none; + border-color: #2196F3; + box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2); +} + +.server-selection select:disabled { + background-color: #f5f5f5; + cursor: not-allowed; + opacity: 0.6; +} diff --git a/examples/voice_agent/client/tsconfig.json b/examples/voice_agent/client/tsconfig.json new file mode 100644 index 000000000000..c9c555d96f35 --- /dev/null +++ b/examples/voice_agent/client/tsconfig.json @@ -0,0 +1,111 @@ +{ + "compilerOptions": { + /* Visit https://aka.ms/tsconfig to read more about this file */ + + /* Projects */ + // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ + // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ + // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ + // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ + // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ + // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. 
*/ + + /* Language and Environment */ + "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ + // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ + // "jsx": "preserve", /* Specify what JSX code is generated. */ + // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ + // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ + // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ + // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ + // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ + // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ + // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ + // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ + // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ + + /* Modules */ + "module": "commonjs", /* Specify what module code is generated. */ + // "rootDir": "./", /* Specify the root folder within your source files. */ + // "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */ + // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ + // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ + // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. 
*/ + // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ + // "types": [], /* Specify type package names to be included without being referenced in a source file. */ + // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ + // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ + // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */ + // "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */ + // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ + // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ + // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ + // "noUncheckedSideEffectImports": true, /* Check side effect imports. */ + // "resolveJsonModule": true, /* Enable importing .json files. */ + // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ + // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ + + /* JavaScript Support */ + // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ + // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ + // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. 
*/ + + /* Emit */ + // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + // "declarationMap": true, /* Create sourcemaps for d.ts files. */ + // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ + // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ + // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ + // "noEmit": true, /* Disable emitting files from a compilation. */ + // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ + // "outDir": "./", /* Specify an output folder for all emitted files. */ + // "removeComments": true, /* Disable emitting comments. */ + // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ + // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ + // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ + // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ + // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ + // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ + // "newLine": "crlf", /* Set the newline character for emitting files. */ + // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ + // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ + // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. 
*/ + // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ + // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + + /* Interop Constraints */ + // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ + // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ + // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */ + // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ + "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ + // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ + "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ + + /* Type Checking */ + "strict": true, /* Enable all strict type-checking options. */ + // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ + // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ + // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ + // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ + // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. 
*/ + // "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */ + // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ + // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ + // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ + // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ + // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ + // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ + // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ + // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ + // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ + // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ + // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ + // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ + // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ + + /* Completeness */ + // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + "skipLibCheck": true /* Skip type checking all .d.ts files. 
*/ + } +} diff --git a/examples/voice_agent/client/vite.config.js b/examples/voice_agent/client/vite.config.js new file mode 100644 index 000000000000..16c0f9648ff8 --- /dev/null +++ b/examples/voice_agent/client/vite.config.js @@ -0,0 +1,17 @@ +import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react-swc'; + +export default defineConfig({ + plugins: [react()], + server: { + host: '0.0.0.0', // Bind to all interfaces + port: 5173, // Back to default Vite port + proxy: { + // Proxy /api requests to the backend server + '/connect': { + target: 'http://0.0.0.0:7860', // Replace with your backend URL if needed + changeOrigin: true, + }, + }, + }, +}); diff --git a/examples/voice_agent/environment.yml b/examples/voice_agent/environment.yml new file mode 100644 index 000000000000..6589bcb04ab7 --- /dev/null +++ b/examples/voice_agent/environment.yml @@ -0,0 +1,435 @@ +name: nemo-voice +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2025.2.25=h06a4308_0 + - cudatoolkit=11.8.0=h6a678d5_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - libxcb=1.17.0=h9b100fa_0 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.16=h5eee18b_0 + - pip=25.1=pyhc872135_2 + - pthread-stubs=0.3=h0ce48e5_1 + - python=3.10.12=h955ad1f_0 + - readline=8.2=h5eee18b_0 + - setuptools=78.1.1=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tk=8.6.14=h993c535_1 + - wheel=0.45.1=py310h06a4308_0 + - xorg-libx11=1.8.12=h9b100fa_1 + - xorg-libxau=1.0.12=h9b100fa_0 + - xorg-libxdmcp=1.1.5=h9b100fa_0 + - xorg-xorgproto=2024.1=h5eee18b_1 + - xz=5.6.4=h5eee18b_1 + - zlib=1.2.13=h5eee18b_1 + - pip: + - absl-py==2.3.0 + - accelerate==1.7.0 + - accelerated-scan==0.2.0 + - addict==2.4.0 + - aiofiles==24.1.0 + - aiohappyeyeballs==2.6.1 + - aiohttp==3.11.18 + - 
aiosignal==1.3.2 + - alabaster==1.0.0 + - alembic==1.16.2 + - aniso8601==10.0.1 + - annotated-types==0.7.0 + - antlr4-python3-runtime==4.9.3 + - anyio==4.9.0 + - asciitree==0.3.3 + - asttokens==3.0.0 + - async-timeout==5.0.1 + - attrdict==2.0.1 + - attrs==25.3.0 + - audioread==3.0.1 + - av==14.4.0 + - babel==2.17.0 + - backoff==2.2.1 + - bcrypt==4.3.0 + - beautifulsoup4==4.13.1 + - bitsandbytes==0.46.0 + - black==24.10.0 + - blinker==1.9.0 + - boto3==1.38.38 + - botocore==1.38.38 + - braceexpand==0.1.7 + - bracex==2.5.post1 + - catalogue==2.0.10 + - cdifflib==1.2.6 + - certifi==2025.6.15 + - cffi==1.17.1 + - chardet==5.2.0 + - charset-normalizer==3.4.2 + - click==8.2.1 + - clip==0.2.0 + - cloudpickle==3.1.1 + - colorama==0.4.6 + - coloredlogs==15.0.1 + - colorlog==6.9.0 + - contourpy==1.3.2 + - coverage==7.9.1 + - cryptography==42.0.8 + - cycler==0.12.1 + - cytoolz==1.0.1 + - dataclasses-json==0.6.7 + - dataproperty==1.1.0 + - datasets==3.6.0 + - decorator==5.2.1 + - decord==0.6.0 + - defusedxml==0.7.1 + - deprecated==1.2.18 + - diffusers==0.33.1 + - dill==0.3.8 + - distance==0.1.3 + - distro==1.9.0 + - docker==7.1.0 + - docopt==0.6.2 + - docstring-parser==0.16 + - docutils==0.21.2 + - dotenv==0.9.9 + - editdistance==0.8.1 + - einops==0.8.1 + - einops-exts==0.0.4 + - emoji==2.14.1 + - eval-type-backport==0.2.2 + - evaluate==0.4.3 + - exceptiongroup==1.3.0 + - executing==2.2.0 + - fabric==3.2.2 + - faiss-cpu==1.11.0 + - fastapi==0.115.13 + - fasteners==0.19 + - fiddle==0.3.0 + - filelock==3.18.0 + - filetype==1.2.0 + - flashlight==0.1.1 + - flashlight-text==0.0.7 + - flask==3.1.1 + - flask-restful==0.3.10 + - flatbuffers==25.2.10 + - fonttools==4.58.4 + - frozenlist==1.7.0 + - fsspec==2024.12.0 + - ftfy==6.3.1 + - future==1.0.0 + - g2p-en==2.1.0 + - gdown==5.2.0 + - gitdb==4.0.12 + - gitpython==3.1.44 + - glibc==0.6.1 + - greenlet==3.2.3 + - grpcio==1.67.1 + - grpcio-tools==1.67.1 + - h11==0.16.0 + - h5py==3.14.0 + - hf-xet==1.1.4 + - httpcore==1.0.9 + - 
httptools==0.6.4 + - httpx==0.28.1 + - huggingface-hub==0.33.0 + - humanfriendly==10.0 + - hydra-core==1.3.2 + - idna==3.10 + - ijson==3.4.0 + - imageio==2.37.0 + - imagesize==1.4.1 + - immutabledict==4.2.0 + - importlib-metadata==8.7.0 + - indic-numtowords==1.0.2 + - inflect==7.5.0 + - iniconfig==2.1.0 + - inquirerpy==0.3.4 + - intervaltree==3.1.0 + - invoke==2.2.0 + - ipython==8.37.0 + - isort==5.13.2 + - itsdangerous==2.2.0 + - janome==0.5.0 + - jedi==0.19.2 + - jieba==0.42.1 + - jinja2==3.1.6 + - jiter==0.10.0 + - jiwer==3.1.0 + - jmespath==1.0.1 + - joblib==1.5.1 + - jsonlines==4.0.0 + - jsonschema==4.24.0 + - jsonschema-specifications==2025.4.1 + - kaldi-python-io==1.2.2 + - kaldiio==2.18.1 + - kiwisolver==1.4.8 + - kornia==0.8.1 + - kornia-rs==0.1.9 + - langdetect==1.0.9 + - latexcodec==3.0.1 + - lazy-loader==0.4 + - levenshtein==0.27.1 + - lhotse==1.30.3 + - libcst==1.8.2 + - librosa==0.11.0 + - lightning==2.4.0 + - lightning-utilities==0.14.3 + - lilcom==1.8.1 + - livekit==1.0.9 + - livekit-agents==1.1.1 + - livekit-api==1.0.2 + - livekit-plugins-turn-detector==1.1.1 + - livekit-protocol==1.0.3 + - llvmlite==0.44.0 + - loguru==0.7.3 + - lxml==5.4.0 + - mako==1.3.10 + - markdown==3.8 + - markdown-it-py==3.0.0 + - markdown2==2.5.3 + - markupsafe==3.0.2 + - marshmallow==3.26.1 + - matplotlib==3.10.3 + - matplotlib-inline==0.1.7 + - mbstrdecoder==1.1.4 + - mdurl==0.1.2 + - mediapy==1.1.6 + - megatron-core==0.12.1 + - megatron-energon==5.2.0 + - ml-dtypes==0.5.1 + - more-itertools==10.7.0 + - mpmath==1.3.0 + - msgpack==1.1.1 + - multi-storage-client==0.23.0 + - multidict==6.5.0 + - multiprocess==0.70.16 + - mypy-extensions==1.1.0 + - nemo-run==0.4.0 + - nemo-text-processing==1.1.0 + - nemo-toolkit==2.4.0rc2 + - nerfacc==0.5.3 + - nest-asyncio==1.6.0 + - networkx==3.4.2 + - ninja==1.11.1.4 + - nltk==3.9.1 + - num2words==0.5.14 + - numba==0.61.2 + - numcodecs==0.13.1 + - numexpr==2.11.0 + - numpy==1.26.4 + - nvidia-cublas-cu12==12.6.4.1 + - 
nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-lm-eval==25.5 + - nvidia-ml-py==12.575.51 + - nvidia-modelopt==0.31.0 + - nvidia-modelopt-core==0.31.0 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - nvidia-resiliency-ext==0.4.0 + - nvidia-riva-client==2.21.0 + - nvtx==0.2.12 + - omegaconf==2.3.0 + - onnx==1.17.0 + - onnxruntime==1.22.0 + - open-clip-torch==2.24.0 + - openai==1.74.0 + - opencc==1.1.9 + - opencc-python-reimplemented==0.1.7 + - opentelemetry-api==1.34.1 + - optuna==4.4.0 + - packaging==24.2 + - pandas==2.3.0 + - pangu==4.0.6.1 + - parameterized==0.9.0 + - paramiko==3.5.1 + - parso==0.8.4 + - pathspec==0.12.1 + - pathvalidate==3.3.1 + - peft==0.15.2 + - pesq==0.0.4 + - pexpect==4.9.0 + - pfzy==0.3.4 + - pillow==11.1.0 + - pipecat-ai==0.0.76 + - plac==1.4.5 + - platformdirs==4.3.8 + - pluggy==1.6.0 + - pooch==1.8.2 + - portalocker==3.2.0 + - prettytable==3.16.0 + - progress==1.6 + - prompt-toolkit==3.0.51 + - propcache==0.3.2 + - protobuf==5.29.5 + - psutil==7.0.0 + - ptyprocess==0.7.0 + - pulp==3.2.1 + - pure-eval==0.2.3 + - pyannote-core==5.0.0 + - pyannote-database==5.1.3 + - pyannote-metrics==3.2.1 + - pyarrow==20.0.0 + - pybind11==2.13.6 + - pybtex==0.24.0 + - pybtex-docutils==1.0.3 + - pycparser==2.22 + - pydantic==2.10.6 + - pydantic-core==2.27.2 + - pydub==0.25.1 + - pygments==2.19.1 + - pyjwt==2.10.1 + - pyloudnorm==0.1.1 + - pynacl==1.5.0 + - pynini==2.1.6.post1 + - pynvml==12.0.0 + - pyparsing==3.2.3 + - pypdf==5.6.0 + - pypinyin==0.54.0 + - pypinyin-dict==0.9.0 + - pyre-extensions==0.0.32 + - pysocks==1.7.1 + - pystoi==0.4.1 + - pytablewriter==1.2.1 + - pytest==8.4.0 + - pytest-cov==6.2.1 + - 
pytest-httpserver==1.1.3 + - pytest-mock==3.14.1 + - pytest-random-order==1.1.1 + - pytest-runner==6.0.1 + - python-dateutil==2.9.0.post0 + - python-dotenv==1.1.0 + - graphviz==0.21 + - python-iso639==2025.2.18 + - python-magic==0.4.27 + - pytorch-lightning==2.5.1.post0 + - pytz==2025.2 + - pyyaml==6.0.2 + - qwen-vl-utils==0.0.11 + - rapidfuzz==3.13.0 + - referencing==0.36.2 + - regex==2024.11.6 + - requests==2.32.4 + - requests-toolbelt==1.0.0 + - resampy==0.4.3 + - rich==14.0.0 + - rouge-score==0.1.2 + - rpds-py==0.25.1 + - ruamel-yaml==0.18.14 + - ruamel-yaml-clib==0.2.12 + - s3fs==0.4.2 + - s3transfer==0.13.0 + - sacrebleu==2.5.1 + - sacremoses==0.1.1 + - safetensors==0.5.3 + - sanic==0.7.0 + - scikit-learn==1.7.0 + - scipy==1.15.3 + - seaborn==0.13.2 + - sentence-transformers==4.1.0 + - sentencepiece==0.2.0 + - sentry-sdk==2.30.0 + - setproctitle==1.3.6 + - shellingham==1.5.4 + - six==1.17.0 + - smmap==5.0.2 + - sniffio==1.3.1 + - snowballstemmer==3.0.1 + - sortedcontainers==2.4.0 + - sounddevice==0.5.2 + - soundfile==0.13.1 + - soupsieve==2.7 + - sox==1.5.0 + - soxr==0.5.0.post1 + - sphinx==8.1.3 + - sphinxcontrib-applehelp==2.0.0 + - sphinxcontrib-bibtex==2.6.4 + - sphinxcontrib-devhelp==2.0.0 + - sphinxcontrib-htmlhelp==2.1.0 + - sphinxcontrib-jsmath==1.0.1 + - sphinxcontrib-qthelp==2.0.0 + - sphinxcontrib-serializinghtml==2.0.0 + - sqlalchemy==2.0.41 + - stack-data==0.6.3 + - starlette==0.46.2 + - structlog==25.4.0 + - sympy==1.14.0 + - tabledata==1.3.4 + - tabulate==0.9.0 + - taming-transformers==0.0.1 + - tcolorpy==0.1.7 + - tenacity==9.1.2 + - tensorboard==2.19.0 + - tensorboard-data-server==0.7.2 + - tensorstore==0.1.71 + - termcolor==3.1.0 + - text-unidecode==1.3 + - textdistance==4.6.3 + - texterrors==0.5.1 + - threadpoolctl==3.6.0 + - tiktoken==0.7.0 + - timm==1.0.15 + - tokenizers==0.21.1 + - tomli==2.2.1 + - toolz==1.0.0 + - torch==2.7.1 + - torchaudio==2.7.1 + - torchdiffeq==0.2.5 + - torchmetrics==1.7.3 + - torchprofile==0.0.4 + - 
torchsde==0.2.6 + - torchvision==0.22.1 + - torchx==0.7.0 + - tqdm==4.67.1 + - tqdm-multiprocess==0.0.11 + - traitlets==5.14.3 + - trampoline==0.1.2 + - transformers==4.51.3 + - tree-sitter==0.24.0 + - tree-sitter-python==0.23.6 + - trimesh==4.6.12 + - triton==3.3.1 + - typeguard==4.4.3 + - typepy==1.3.4 + - typer==0.16.0 + - types-protobuf==4.25.0.20240417 + - typing-extensions==4.14.0 + - typing-inspect==0.9.0 + - typing-inspection==0.4.1 + - tzdata==2025.2 + - ujson==5.10.0 + - unstructured + - unstructured-client + - urllib3==1.26.20 + - uvicorn==0.34.3 + - uvloop==0.21.0 + - wandb==0.20.1 + - watchfiles==1.1.0 + - wcmatch==10.0 + - wcwidth==0.2.13 + - webdataset==0.2.111 + - websockets==15.0.1 + - werkzeug==3.1.3 + - wget==3.2 + - whisper-normalizer==0.1.12 + - word2number==1.1 + - wrapt==1.17.2 + - xattr==1.1.4 + - xxhash==3.5.0 + - yarl==1.20.1 + - zarr==2.18.3 + - zipp==3.23.0 + - zstandard==0.23.0 diff --git a/examples/voice_agent/example_prompts/fast-bite.txt b/examples/voice_agent/example_prompts/fast-bite.txt new file mode 100644 index 000000000000..7170000fdfed --- /dev/null +++ b/examples/voice_agent/example_prompts/fast-bite.txt @@ -0,0 +1,51 @@ +Fast Bites Lunch Menu + +Burgers and Sandwiches +1. Classic Cheeseburger – $5.99 + Juicy beef patty, cheddar cheese, pickles, ketchup & mustard on a toasted bun. + - Make it a double cheeseburger by adding another patty - $1.50 +2. Crispy Chicken Sandwich – $6.49 + Fried chicken filet, lettuce, mayo, and pickles on a brioche bun. +3. Veggie Wrap – $5.49 + Grilled vegetables, hummus, lettuce, and tomato in a spinach wrap. + +Combo Deals (includes small fries and fountain soda) +4. Cheeseburger Combo – $8.99 +5. Chicken Sandwich Combo – $9.49 +6. Veggie Wrap Combo – $8.49 + +Sides +7. French Fries + - Small - $2.49 + - Medium - $3.49 + - Large - $4.49 +8. Chicken Nuggets + - 4 pcs - $3.29 + - 8 pcs - $5.99 + - 12 pcs - $8.99 +9. Side Salad - $2.99 + +Drinks +10. 
Fountain Soda (16 oz, choices: Coke, Diet Coke, Sprite, Fanta) – $1.99 +11. Iced Tea or Lemonade – $2.29 +12. Bottled Water – $1.49 + +You are a helpful assistant named Lisa that helps customers order food from the lunch menu. +Start by greeting the user warmly and introducing yourself within one sentence "Hi welcome to Fast Bites! I'm Lisa, what can I help you with?". +Your answer should be concise and to the point. +Do not include the whole lunch menu in your response, only include the items that are relevant to the user's question. +If the user asks about a specific item, you should include the price of that item. +If the user asks about the menu, you should include the entire lunch menu. +If the user asks about the prices, you should include the prices of the items. +If the user asks about the location, you should include the location of the restaurant (123 Main St, Anytown, USA). +If the user asks about the hours, you should include the hours of the restaurant (11:00 AM - 9:00 PM). +When a user asks for the total price of the order, you should include the total price of the order. +When the conversation is done, you should say "Thank you for your order! Your total is <total>. Please come back soon!", where <total> is the total price of the orders of all speakers. +If a speaker finishes their order and you don't know their name, you should ask them for their name and associate it with their order. +When introducing an item from the menu, you should include the name of the item and the price. +Stick strictly to the lunch menu and do not make up any items. +You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. +You should respond to the user based on the speaker tag and the context of that speaker. +Do not include the speaker tags in your response, use them only to identify the speaker. +If there are multiple speakers, you should handle the order of each speaker separately and not mix up the speakers. 
+Do not respond only with "Hi" or "Hi there", you should focus on the task of taking the order and not just greeting the user. diff --git a/examples/voice_agent/example_prompts/simple_chatbot.txt b/examples/voice_agent/example_prompts/simple_chatbot.txt new file mode 100644 index 000000000000..9ded9d0e841d --- /dev/null +++ b/examples/voice_agent/example_prompts/simple_chatbot.txt @@ -0,0 +1,3 @@ +You are a helpful AI agent named Lisa. +Start by greeting the user warmly and introducing yourself within one sentence. +Your answer should be concise and to the point. \ No newline at end of file diff --git a/examples/voice_agent/example_prompts/simple_chatbot_diar.txt b/examples/voice_agent/example_prompts/simple_chatbot_diar.txt new file mode 100644 index 000000000000..2c6baa58e7a0 --- /dev/null +++ b/examples/voice_agent/example_prompts/simple_chatbot_diar.txt @@ -0,0 +1,7 @@ +You are a helpful AI agent named Lisa. +Start by greeting the user warmly and introducing yourself within one sentence. +Your answer should be concise and to the point. +You might also see speaker tags (, , etc.) in the user context. +You should respond to the user based on the speaker tag and the context of that speaker. +Do not include the speaker tags in your response, use them only to identify the speaker. +If a speaker provides their name, use their name when addressing their requests. 
\ No newline at end of file diff --git a/examples/voice_agent/requirements.txt b/examples/voice_agent/requirements.txt new file mode 100644 index 000000000000..c78bc299602d --- /dev/null +++ b/examples/voice_agent/requirements.txt @@ -0,0 +1,8 @@ +fastapi[all] +huggingface-hub +nemo-toolkit +onnxruntime +pipecat-ai +python-dotenv +uvicorn +websockets diff --git a/examples/voice_agent/server/backchannel_phrases.yaml b/examples/voice_agent/server/backchannel_phrases.yaml new file mode 100644 index 000000000000..38c7523a7153 --- /dev/null +++ b/examples/voice_agent/server/backchannel_phrases.yaml @@ -0,0 +1,79 @@ +- "absolutely" +- "ah" +- "all right" +- "alright" +- "but yeah" +- "cool" +- "definitely" +- "exactly" +- "go ahead" +- "good" +- "great" +- "great thanks" +- "ha ha" +- "hi" +- "hmm" +- "humm" +- "huh" +- "i know" +- "i know right" +- "i see" +- "indeed" +- "interesting" +- "mhmm" +- "mhmm mhmm" +- "mhmm right" +- "mhmm yeah" +- "mhmm yes" +- "mm hmm" +- "mmhmm" +- "nice" +- "of course" +- "oh" +- "oh dear" +- "oh man" +- "oh okay" +- "oh wow" +- "oh yes" +- "ok" +- "ok thanks" +- "okay" +- "okay okay" +- "okay thanks" +- "perfect" +- "really" +- "right" +- "right exactly" +- "right right" +- "right yeah" +- "so yeah" +- "sounds good" +- "sure" +- "sure thing" +- "thank you" +- "thanks" +- "that's awesome" +- "thats right" +- "thats true" +- "true" +- "uh huh" +- "uh-huh" +- "uh-huh yeah" +- "uhhuh" +- "uhhuh okay" +- "um-humm" +- "well" +- "what" +- "wow" +- "yeah" +- "yeah i know" +- "yeah i see" +- "yeah mhmm" +- "yeah okay" +- "yeah right" +- "yeah uh-huh" +- "yeah yeah" +- "yep" +- "yes" +- "yes please" +- "yes yes" diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py new file mode 100644 index 000000000000..cd8fa829dc60 --- /dev/null +++ b/examples/voice_agent/server/bot_websocket_server.py @@ -0,0 +1,390 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import asyncio +import copy +import os +import signal +import sys + +from loguru import logger +from omegaconf import OmegaConf + +# Configure loguru to output to both console and file +logger.remove() # Remove default handler +logger.add( + sys.stderr, + format="{time:YYYY-MM-DD HH:mm:ss.SSSS} | {level: <8} | {name}:{function}:{line} - {message}", + level="DEBUG", +) + +logger.add("bot_server.log", rotation="1 day", level="DEBUG") + +# Global flag for graceful shutdown +shutdown_event = asyncio.Event() + +from pipecat.audio.vad.silero import SileroVADAnalyzer, VADParams +from pipecat.frames.frames import EndTaskFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserver, RTVIProcessor +from pipecat.serializers.protobuf import ProtobufFrameSerializer + +from nemo.agents.voice_agent.pipecat.services.nemo.diar import NeMoDiarInputParams, NemoDiarService +from nemo.agents.voice_agent.pipecat.services.nemo.llm import HuggingFaceLLMService +from nemo.agents.voice_agent.pipecat.services.nemo.stt import NeMoSTTInputParams, NemoSTTService +from nemo.agents.voice_agent.pipecat.services.nemo.tts import NeMoFastPitchHiFiGANTTSService +from 
nemo.agents.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService +from nemo.agents.voice_agent.pipecat.transports.network.websocket_server import ( + WebsocketServerParams, + WebsocketServerTransport, +) +from nemo.agents.voice_agent.pipecat.utils.text.simple_text_aggregator import SimpleSegmentedTextAggregator + +SERVER_CONFIG_PATH = os.environ.get( + "SERVER_CONFIG_PATH", f"{os.path.dirname(os.path.abspath(__file__))}/server_config.yaml" +) + +server_config = OmegaConf.load(SERVER_CONFIG_PATH) + +logger.info(f"Server config: {server_config}") + +# Default Configuration +SAMPLE_RATE = 16000 # Standard sample rate for speech recognition +RAW_AUDIO_FRAME_LEN_IN_SECS = 0.016 # 16ms for websocket transport +SYSTEM_PROMPT = """ +You are a helpful AI agent named Lisa. +Start by greeting the user warmly and introducing yourself within one sentence. +Your answer should be concise and to the point. +""" + +################ Start of Configuration ################# + +### Transport +TRANSPORT_AUDIO_OUT_10MS_CHUNKS = server_config.transport.audio_out_10ms_chunks + + +### VAD +vad_params = VADParams( + confidence=server_config.vad.confidence, + start_secs=server_config.vad.start_secs, + stop_secs=server_config.vad.stop_secs, + min_volume=server_config.vad.min_volume, +) + + +### STT +STT_MODEL_PATH = server_config.stt.model +STT_DEVICE = server_config.stt.device +stt_params = NeMoSTTInputParams( + att_context_size=server_config.stt.att_context_size, + frame_len_in_secs=server_config.stt.frame_len_in_secs, + raw_audio_frame_len_in_secs=RAW_AUDIO_FRAME_LEN_IN_SECS, +) + + +### Diarization +DIAR_MODEL = server_config.diar.model +USE_DIAR = server_config.diar.enabled +diar_params = NeMoDiarInputParams( + frame_len_in_secs=server_config.diar.frame_len_in_secs, + threshold=server_config.diar.threshold, +) + + +### Turn taking +TURN_TAKING_BACKCHANNEL_PHRASES = server_config.turn_taking.backchannel_phrases +TURN_TAKING_MAX_BUFFER_SIZE = 
server_config.turn_taking.max_buffer_size +TURN_TAKING_BOT_STOP_DELAY = server_config.turn_taking.bot_stop_delay + + +### LLM +SYSTEM_ROLE = server_config.llm.get("system_role", "system") +if server_config.llm.get("system_prompt", None) is not None: + system_prompt = server_config.llm.system_prompt + if os.path.isfile(system_prompt): + with open(system_prompt, "r") as f: + system_prompt = f.read() + SYSTEM_PROMPT = system_prompt +logger.info(f"System prompt: {SYSTEM_PROMPT}") + +LLM_MODEL = server_config.llm.model +LLM_DEVICE = server_config.llm.device +LLM_DTYPE = server_config.llm.dtype +LLM_GENERATION_KWARGS = server_config.llm.get("generation_kwargs", {}) +if LLM_GENERATION_KWARGS is not None: + LLM_GENERATION_KWARGS = OmegaConf.to_container(LLM_GENERATION_KWARGS) +LLM_APPLY_CHAT_TEMPLATE_KWARGS = server_config.llm.get("apply_chat_template_kwargs", None) +if LLM_APPLY_CHAT_TEMPLATE_KWARGS is not None: + LLM_APPLY_CHAT_TEMPLATE_KWARGS = OmegaConf.to_container(LLM_APPLY_CHAT_TEMPLATE_KWARGS) + + +### TTS +TTS_FASTPITCH_MODEL = server_config.tts.fastpitch_model +TTS_HIFIGAN_MODEL = server_config.tts.hifigan_model +TTS_DEVICE = server_config.tts.device +TTS_THINK_TOKENS = server_config.tts.get("think_tokens", None) +if TTS_THINK_TOKENS is not None: + TTS_THINK_TOKENS = OmegaConf.to_container(TTS_THINK_TOKENS) +TTS_EXTRA_SEPARATOR = server_config.tts.get("extra_separator", None) +if TTS_EXTRA_SEPARATOR is not None: + TTS_EXTRA_SEPARATOR = OmegaConf.to_container(TTS_EXTRA_SEPARATOR) + +################ End of Configuration ################# + + +def signal_handler(signum, frame): + """Handle shutdown signals gracefully""" + logger.info(f"Received signal {signum}, initiating graceful shutdown...") + shutdown_event.set() + + +async def run_bot_websocket_server(): + # Set up signal handlers for graceful shutdown + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + logger.info("Initializing WebSocket server transport...") + 
logger.info("Server configured to run indefinitely with no timeouts") + + """ + NO-TIMEOUT CONFIGURATION: + - session_timeout=None: Disables WebSocket session timeout + - idle_timeout=None: Disables pipeline idle timeout + - asyncio.wait_for(timeout=None): No timeout on pipeline runner + - Server will run indefinitely until manually stopped (Ctrl+C) + """ + + vad_analyzer = SileroVADAnalyzer( + sample_rate=SAMPLE_RATE, + params=vad_params, + ) + logger.info("VAD analyzer initialized") + + ws_transport = WebsocketServerTransport( + params=WebsocketServerParams( + serializer=ProtobufFrameSerializer(), + audio_in_enabled=True, + audio_out_enabled=True, + add_wav_header=False, + vad_analyzer=vad_analyzer, + session_timeout=None, # Disable session timeout + audio_in_sample_rate=SAMPLE_RATE, + can_create_user_frames=TURN_TAKING_BACKCHANNEL_PHRASES + is None, # if backchannel phrases are disabled, we can use VAD to interrupt the bot immediately + audio_out_10ms_chunks=TRANSPORT_AUDIO_OUT_10MS_CHUNKS, + ), + host="0.0.0.0", # Bind to all interfaces + port=8765, + ) + + logger.info("Initializing STT service...") + + stt = NemoSTTService( + model=STT_MODEL_PATH, + device=STT_DEVICE, + params=stt_params, + sample_rate=SAMPLE_RATE, + audio_passthrough=True, + has_turn_taking=True, + backend="legacy", + decoder_type="rnnt", + ) + logger.info("STT service initialized") + + if USE_DIAR: + diar = NemoDiarService( + model=DIAR_MODEL, + device=STT_DEVICE, + params=diar_params, + sample_rate=SAMPLE_RATE, + backend="legacy", + enabled=USE_DIAR, + ) + logger.info("Diarization service initialized") + else: + diar = None + + turn_taking = NeMoTurnTakingService( + use_vad=True, + use_diar=USE_DIAR, + max_buffer_size=TURN_TAKING_MAX_BUFFER_SIZE, + bot_stop_delay=TURN_TAKING_BOT_STOP_DELAY, + backchannel_phrases=TURN_TAKING_BACKCHANNEL_PHRASES, + ) + logger.info("Turn taking service initialized") + + logger.info("Initializing LLM service...") + + llm = HuggingFaceLLMService( + 
model=LLM_MODEL, + device=LLM_DEVICE, + dtype=LLM_DTYPE, + generation_kwargs=LLM_GENERATION_KWARGS, + apply_chat_template_kwargs=LLM_APPLY_CHAT_TEMPLATE_KWARGS, + ) + logger.info("LLM service initialized") + + text_aggregator = SimpleSegmentedTextAggregator(punctuation_marks=TTS_EXTRA_SEPARATOR) + + tts = NeMoFastPitchHiFiGANTTSService( + fastpitch_model=TTS_FASTPITCH_MODEL, + hifigan_model=TTS_HIFIGAN_MODEL, + device=TTS_DEVICE, + text_aggregator=text_aggregator, + think_tokens=TTS_THINK_TOKENS, + ) + + logger.info("TTS service initialized") + + context = OpenAILLMContext( + [ + { + "role": SYSTEM_ROLE, + "content": SYSTEM_PROMPT, + } + ], + ) + + original_messages = copy.deepcopy(context.get_messages()) + original_context = copy.deepcopy(context) + original_context.set_llm_adapter(llm.get_llm_adapter()) + + context_aggregator = llm.create_context_aggregator(context) + user_context_aggregator = context_aggregator.user() + assistant_context_aggregator = context_aggregator.assistant() + + # RTVI events for Pipecat client UI + rtvi = RTVIProcessor(config=RTVIConfig(config=[])) + + # Add reset action to RTVI processor + async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arguments: dict[str, any]) -> bool: + """Reset both user and assistant context aggregators""" + logger.info("Resetting conversation context...") + try: + user_context_aggregator.reset() + assistant_context_aggregator.reset() + user_context_aggregator.set_messages(copy.deepcopy(original_messages)) + assistant_context_aggregator.set_messages(copy.deepcopy(original_messages)) + + logger.info("Conversation context reset successfully") + return True + except Exception as e: + logger.error(f"Error resetting context: {e}") + return False + + reset_action = RTVIAction( + service="context", + action="reset", + result="bool", + arguments=[], + handler=reset_context_handler, + ) + rtvi.register_action(reset_action) + + logger.info("Setting up pipeline...") + + pipeline = [ + 
ws_transport.input(), + rtvi, + stt, + ] + + if USE_DIAR: + pipeline.append(diar) + + pipeline.extend( + [turn_taking, user_context_aggregator, llm, tts, ws_transport.output(), assistant_context_aggregator] + ) + + pipeline = Pipeline(pipeline) + + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=False, + enable_usage_metrics=False, + send_initial_empty_metrics=True, + report_only_initial_ttfb=True, + idle_timeout=None, # Disable idle timeout + ), + observers=[RTVIObserver(rtvi)], + idle_timeout_secs=None, + cancel_on_idle_timeout=False, + ) + + # Track task state + task_running = True + + @rtvi.event_handler("on_client_ready") + async def on_client_ready(rtvi: RTVIProcessor): + logger.info("Pipecat client ready.") + await rtvi.set_bot_ready() + # Kick off the conversation. + try: + await task.queue_frames([user_context_aggregator.get_context_frame()]) + except Exception as e: + logger.error(f"Error queuing context frame: {e}") + + @ws_transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Pipecat Client connected from {client.remote_address}") + # Reset RTVI state for new connection + rtvi._client_ready = False + rtvi._bot_ready = False + + @ws_transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Pipecat Client disconnected from {client.remote_address}") + # Don't cancel the task immediately - let it handle the disconnection gracefully + # The task will continue running and can accept new connections + # Only send an EndTaskFrame to clean up the current session + if task_running: + try: + await task.queue_frames([EndTaskFrame()]) + except Exception as e: + # Don't log warnings for normal connection closures + if "ConnectionClosedOK" not in str(e) and "1005" not in str(e): + logger.warning(f"Error sending EndTaskFrame: {e}") + else: + logger.debug(f"Normal connection closure: {e}") + + 
@ws_transport.event_handler("on_session_timeout") + async def on_session_timeout(transport, client): + logger.info(f"Session timeout for {client.remote_address}") + # Don't cancel the task - keep server running indefinitely + logger.info("Session timeout occurred but keeping server running") + # Note: With session_timeout=None, this handler should never be called + + logger.info("Starting pipeline runner...") + + try: + runner = PipelineRunner() + # Run the task until shutdown is requested + await asyncio.wait_for(runner.run(task), timeout=None) # No timeout - run indefinitely + except asyncio.TimeoutError: + logger.info("Pipeline runner timeout (should not happen with no timeout)") + except Exception as e: + logger.error(f"Pipeline runner error: {e}") + task_running = False + finally: + logger.info("Pipeline runner stopped") + + +if __name__ == "__main__": + asyncio.run(run_bot_websocket_server()) diff --git a/examples/voice_agent/server/env.example b/examples/voice_agent/server/env.example new file mode 100644 index 000000000000..65caf95bdd81 --- /dev/null +++ b/examples/voice_agent/server/env.example @@ -0,0 +1,2 @@ +HF_TOKEN= # Your HuggingFace API key +WEBSOCKET_SERVER='websocket_server' # Options: 'fast_api' or 'websocket_server' \ No newline at end of file diff --git a/examples/voice_agent/server/server.py b/examples/voice_agent/server/server.py new file mode 100644 index 000000000000..df6aab6af651 --- /dev/null +++ b/examples/voice_agent/server/server.py @@ -0,0 +1,87 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +from contextlib import asynccontextmanager +from typing import Any, Dict + +import uvicorn +from dotenv import load_dotenv +from fastapi import FastAPI, Request, WebSocket +from fastapi.middleware.cors import CORSMiddleware + +# Load environment variables +load_dotenv(override=True) + +from bot_websocket_server import run_bot_websocket_server + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Handles FastAPI startup and shutdown.""" + yield # Run app + + +# Initialize FastAPI app with lifespan manager +app = FastAPI(lifespan=lifespan) + +# Configure CORS to allow requests from any origin +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.websocket("/ws") +async def websocket_endpoint(websocket: WebSocket, path: str = "/ws"): + raise NotImplementedError("FastAPI websocket endpoint is not implemented") + + +@app.post("/connect") +async def bot_connect(request: Request) -> Dict[Any, Any]: + print("Received /connect request") + server_mode = os.getenv("WEBSOCKET_SERVER", "websocket_server") + if server_mode == "websocket_server": + # Use the host that the client connected to (from the request) + server_host = request.url.hostname or request.headers.get("host", "").split(":")[0] + ws_url = f"ws://{server_host}:8765" + else: + ws_url = "ws://localhost:7860/ws" + print(f"Returning WebSocket URL: {ws_url}") + return {"ws_url": ws_url} + + +async def main(): + server_mode = os.getenv("WEBSOCKET_SERVER", "websocket_server") + 
tasks = [] + try: + if server_mode == "websocket_server": + tasks.append(run_bot_websocket_server()) + else: + raise ValueError(f"Invalid server mode: {server_mode}") + config = uvicorn.Config(app, host="0.0.0.0", port=7860) + server = uvicorn.Server(config) + tasks.append(server.serve()) + + await asyncio.gather(*tasks) + except asyncio.CancelledError: + print("Tasks cancelled (probably due to shutdown).") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/voice_agent/server/server_config.yaml b/examples/voice_agent/server/server_config.yaml new file mode 100644 index 000000000000..1181ccf08380 --- /dev/null +++ b/examples/voice_agent/server/server_config.yaml @@ -0,0 +1,65 @@ +# This is an example config for setting up a NeMo Voice Agent server. +# Please refer to https://github.com/NVIDIA-NeMo/NeMo/tree/main/examples/voice_agent/README.md for more details + +transport: + audio_out_10ms_chunks: 10 # use 4 as websocket default, but increasing to a larger number might have less glitches in TTS output + +vad: + type: silero + confidence: 0.6 # VAD threshold for detecting speech versus non-speech + start_secs: 0.1 # min amount of speech to trigger UserStartSpeaking + stop_secs: 0.8 # min amount of silence to trigger UserStopSpeaking + min_volume: 0.4 # Microphone volume threshold for VAD + +stt: + type: nemo + model: "stt_en_fastconformer_hybrid_large_streaming_80ms" + device: "cuda" + att_context_size: [70, 1] # left and right attention context sizes for streaming ASR + frame_len_in_secs: 0.08 # default for FastConformer, do not change unless using other architectures + +diar: + type: nemo + enabled: true # set to false to disable + model: "nvidia/diar_streaming_sortformer_4spk-v2" + device: "cuda" + threshold: 0.4 # threshold value used to determine if a speaker exists or not, setting it to a lower value will increase the sensitivity of the model + frame_len_in_secs: 0.08 # default for Sortformer, do not change unless using other 
architectures + +turn_taking: + backchannel_phrases: "./server/backchannel_phrases.yaml" # set it to the actual path of the file, or specify a list of backchannel phrases here + max_buffer_size: 2 # num of words more than this amount will interrupt the LLM immediately if not backchannel phrases + bot_stop_delay: 0.5 # a delay in seconds allowed between server and client audio output, so that the BotStopSpeaking signal is handled not too far away from the actual time that the user hears all audio output + +llm: + type: hf + dtype: bfloat16 # torch.dtype for LLM + model: "Qwen/Qwen2.5-7B-Instruct" # model name for HF models, will be used via `AutoModelForCausalLM.from_pretrained()` + device: "cuda" + system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt + # `system_prompt` is used as the system prompt to the LLM, please refer to different LLM webpages for special functions like enabling/disabling thinking + # system_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast-bite.txt` + system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. If a speaker provides their name, use their name when addressing their requests." + # Please refer to the model page of each HF LLM model to set following params properly. 
+ apply_chat_template_kwargs: # kwargs that will be passed into tokenizer.apply_chat_template() function + add_generation_prompt: true # This is required in most cases, do not change unless you're sure of it + tokenize: false # This is required, do not change + generation_kwargs: # kwargs that will be passed into model.generate() function of HF models + temperature: 0.7 # LLM sampling params + top_k: 20 # LLM sampling params + top_p: 0.9 # LLM sampling params + min_p: 0.0 # LLM sampling params + max_new_tokens: 256 # max num of output tokens from LLM + do_sample: true # enable sampling + +tts: + type: nemo + model: fastpitch-hifigan + fastpitch_model: "nvidia/tts_en_fastpitch" + hifigan_model: "nvidia/tts_hifigan" + device: "cuda" + extra_separator: # a list of additional punctuations to chunk LLM response into segments for faster TTS output, e.g., ",". Set to `null` to use default behavior + - "," + - "?" + - "!" + think_tokens: ["<think>", "</think>"] # specify them to avoid TTS for thinking process, set to `null` to allow thinking out loud diff --git a/nemo/agents/__init__.py b/nemo/agents/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo/agents/voice_agent/__init__.py b/nemo/agents/voice_agent/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/agents/voice_agent/pipecat/__init__.py b/nemo/agents/voice_agent/pipecat/__init__.py new file mode 100644 index 000000000000..55fb128340af --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import pipecat +except ImportError: + raise ImportError("pipecat is not installed. 
Please install it with `pip install pipecat-ai`.") diff --git a/nemo/agents/voice_agent/pipecat/frames/__init__.py b/nemo/agents/voice_agent/pipecat/frames/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/frames/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/agents/voice_agent/pipecat/frames/frames.py b/nemo/agents/voice_agent/pipecat/frames/frames.py new file mode 100644 index 000000000000..df5f1c2c6fef --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/frames/frames.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from dataclasses import dataclass +import numpy as np +from pipecat.frames.frames import DataFrame + + +@dataclass +class DiarResultFrame(DataFrame): + """Diarization frame.""" + + diar_result: np.ndarray | int + stream_id: str = "default" diff --git a/nemo/agents/voice_agent/pipecat/services/__init__.py b/nemo/agents/voice_agent/pipecat/services/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py b/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py new file mode 100644 index 000000000000..1b96c38c91ce --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/nemo/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .diar import NemoDiarService +from .llm import HuggingFaceLLMService +from .stt import NemoSTTService +from .tts import NeMoFastPitchHiFiGANTTSService +from .turn_taking import NeMoTurnTakingService diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/diar.py new file mode 100644 index 000000000000..912179fd93e0 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/nemo/diar.py @@ -0,0 +1,359 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import asyncio +from typing import AsyncGenerator, Optional + +import numpy as np +from loguru import logger +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + ErrorFrame, + Frame, + StartFrame, + VADUserStartedSpeakingFrame, + VADUserStoppedSpeakingFrame, +) +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.stt_service import STTService +from pipecat.transcriptions.language import Language +from pipecat.utils.time import time_now_iso8601 +from pipecat.utils.tracing.service_decorators import traced_stt +from pydantic import BaseModel + +from nemo.agents.voice_agent.pipecat.frames.frames import DiarResultFrame +from nemo.agents.voice_agent.pipecat.services.nemo.legacy_diar import DiarizationConfig, NeMoLegacyDiarService + + +class NeMoDiarInputParams(BaseModel): + threshold: Optional[float] = ( + 0.4 # threshold value used to determine if a speaker exists or not, setting it to a lower value will increase the sensitivity of the diarization model + ) + language: Optional[Language] = Language.EN_US + frame_len_in_secs: Optional[float] = 0.08 # 80ms for FastConformer model + config_path: Optional[str] = None # path to the Niva ASR config file + raw_audio_frame_len_in_secs: Optional[float] = 0.016 # 16ms for websocket transport + buffer_size: Optional[int] = ( + 30 # number of audio frames to buffer, 1 frame is 16ms, streaming Sortformer was trained with 6*0.08=0.48s chunks + ) + + +class NemoDiarService(STTService): + def __init__( + self, + *, + model: Optional[str] = "", + device: Optional[str] = "cuda:0", + sample_rate: Optional[int] = 16000, + params: Optional[NeMoDiarInputParams] = None, + use_vad: bool = True, + audio_passthrough: bool = True, + backend: Optional[str] = "legacy", + enabled: bool = True, + **kwargs, + ): + super().__init__(audio_passthrough=audio_passthrough, **kwargs) + + self._enabled = enabled + self._queue = asyncio.Queue() + self._response_queue = asyncio.Queue() # Add response queue + 
self._processing_task = None # Add processing task + self._response_task = None # Add response task + self._device = device + self._sample_rate = sample_rate + self._audio_passthrough = audio_passthrough + params.buffer_size = params.frame_len_in_secs // params.raw_audio_frame_len_in_secs + self._params = params + self._model_name = model + self._use_vad = use_vad + self._backend = backend + if not params: + raise ValueError("params is required") + + self._load_model() + + self._vad_user_speaking = False + self._audio_buffer = [] + self._current_speaker_id = None + self._processing_running = False + + if not self._use_vad: + self._vad_user_speaking = True + + def _load_model(self): + if not self._enabled or not self._model_name: + self._model = None + self._enabled = False + return + + if self._backend == "legacy": + cfg = DiarizationConfig() + cfg.device = self._device + self._model = NeMoLegacyDiarService( + cfg, self._model_name, frame_len_in_secs=self._params.frame_len_in_secs, sample_rate=self.sample_rate + ) + else: + raise ValueError(f"Invalid backend: {self._backend}") + logger.info(f"Diarization service initialized on device: {self._device}") + + def can_generate_metrics(self) -> bool: + """Indicates whether this service can generate metrics. + + Returns: + bool: True, as this service supports metric generation. 
+ """ + return True + + async def start(self, frame: StartFrame): + """Handle service start.""" + await super().start(frame) + + # Initialize the model if not already done + if not hasattr(self, "_model"): + self._load_model() + + # Start background processing task + if not self._processing_task: + self._processing_task = self.create_task(self._processing_task_handler()) + + # Start response handling task + if not self._response_task: + self._response_task = self.create_task(self._response_task_handler()) + + async def stop(self, frame: EndFrame): + """Handle service stop.""" + await super().stop(frame) + await self._stop_tasks() + + async def cancel(self, frame: CancelFrame): + """Handle service cancellation.""" + await super().cancel(frame) + await self._stop_tasks() + + async def _stop_tasks(self): + """Stop background processing tasks.""" + await self._queue.put(None) # Signal to stop processing + if self._processing_task: + await self.cancel_task(self._processing_task) + self._processing_task = None + + if self._response_task: + await self.cancel_task(self._response_task) + self._response_task = None + + def _diarization_processor(self): + """Background processor that handles diarization calls.""" + try: + while self._processing_running: + try: + # Get audio from queue - blocking call that will be interrupted by cancellation + future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop()) + audio = future.result() + + if audio is None: # Stop signal + logger.debug("Received stop signal in background processor") + break + + # Process diarization + diar_result = self._model.diarize(audio) + + # Send result back to async loop + asyncio.run_coroutine_threadsafe(self._response_queue.put(diar_result), self.get_event_loop()) + + except Exception as e: + logger.error(f"Error in background diarization processor: {e}") + # Send error back to async loop + asyncio.run_coroutine_threadsafe(self._response_queue.put(('error', e)), self.get_event_loop()) 
+ + except Exception as e: + logger.error(f"Background diarization processor fatal error: {e}") + finally: + logger.debug("Background diarization processor stopped") + + async def _processing_task_handler(self): + """Handler for background processing task.""" + try: + self._processing_running = True + logger.debug("Starting background processing task") + await asyncio.to_thread(self._diarization_processor) + except asyncio.CancelledError: + logger.debug("Background processing task cancelled") + self._processing_running = False + raise + finally: + self._processing_running = False + + async def _handle_diarization_result(self, diar_result): + """Handle diarization result from background processing.""" + try: + if diar_result is None: + return + dominant_speaker_id = self._get_dominant_speaker_id(diar_result) + # logger.debug(f"Dominant speaker ID: {dominant_speaker_id}") + if dominant_speaker_id is not None and dominant_speaker_id != self._current_speaker_id: + self._current_speaker_id = dominant_speaker_id + logger.debug(f"Pushing DiarResultFrame with speaker {dominant_speaker_id}") + await self.push_frame(DiarResultFrame(dominant_speaker_id, stream_id="default")) + except Exception as e: + logger.error(f"Error handling diarization result: {e}") + await self.push_frame( + ErrorFrame( + str(e), + time_now_iso8601(), + ) + ) + + async def _response_task_handler(self): + """Handler for processing diarization results.""" + logger.debug("Response task handler started") + try: + while True: + try: + result = await self._response_queue.get() + + if isinstance(result, tuple) and result[0] == 'error': + # Handle error from background processing + error = result[1] + logger.error(f"Error in NeMo Diarization processing: {error}") + await self.push_frame( + ErrorFrame( + str(error), + time_now_iso8601(), + ) + ) + else: + # Handle successful diarization result + await self._handle_diarization_result(result) + + except Exception as e: + logger.error(f"Error in response task 
handler: {e}") + except asyncio.CancelledError: + logger.debug("Response task handler cancelled") + raise + + async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: + """Process audio data and generate transcription frames. + + Args: + audio: Raw audio bytes to transcribe + + Yields: + Frame: Transcription frames containing the results + """ + if self._vad_user_speaking and self._enabled: + self._audio_buffer.append(audio) + if len(self._audio_buffer) >= self._params.buffer_size: + await self.start_ttfb_metrics() + await self.start_processing_metrics() + audio = b"".join(self._audio_buffer) + self._audio_buffer = [] + # Queue audio for background processing + await self._queue.put(audio) + yield None + + @traced_stt + async def _handle_transcription(self, transcript: str, is_final: bool, language: Optional[str] = None): + """Handle a transcription result. + + Args: + transcript: The transcribed text + is_final: Whether this is a final transcription + language: The language of the transcription + """ + pass # Base implementation - can be extended for specific handling needs + + async def set_language(self, language: Language): + """Update the service's recognition language. + + Args: + language: New language for recognition + """ + if self._params: + self._params.language = language + else: + self._params = NeMoDiarInputParams(language=language) + + logger.info(f"Switching STT language to: {language}") + + async def set_model(self, model: str): + """Update the service's model. + + Args: + model: New model name/path to use + """ + await super().set_model(model) + self._model_name = model + self._load_model() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + """Process audio data and generate transcription frames. 
+ + Args: + audio: Raw audio bytes to transcribe + + Yields: + Frame: Transcription frames containing the results + """ + if not self._enabled: + # if diarization is disabled, just pass the frame through + await self.push_frame(frame, direction) + return + + await super().process_frame(frame, direction) + if isinstance(frame, VADUserStartedSpeakingFrame): + self._vad_user_speaking = True + self._audio_buffer = [] + logger.debug("VADUserStartedSpeakingFrame received") + elif isinstance(frame, VADUserStoppedSpeakingFrame): + self._vad_user_speaking = False + logger.debug("VADUserStoppedSpeakingFrame received") + self._current_speaker_id = None + self._audio_buffer = [] + + def reset(self): + self._current_speaker_id = None + self._audio_buffer = [] + self._vad_user_speaking = False + self._model.reset_state() + + def _get_dominant_speaker_id(self, spk_pred: np.ndarray): + spk_pred = (spk_pred > self._params.threshold).astype(int) + dominant_speaker_id = None + if spk_pred.sum() > 0: + # get the dominant speaker id + # Filter to only keep frames that have any speaker probability > 0.0 + valid_frame_mask = spk_pred.sum(axis=1) > 0 + + # Filter diar_result to only keep valid frames + filtered_diar_result = spk_pred[valid_frame_mask] # ndarray of shape [num_valid_frames, num_speakers] + + # Get the primary speaker for each valid frame + primary_spk = np.argmax(filtered_diar_result, axis=1) # ndarray of shape [num_valid_frames] + # logger.debug(f"Primary speaker for valid frames: {primary_spk}") + + # count the number of different speakers in the primary speaker sequence + num_speakers = len(np.unique(primary_spk)) + # logger.debug(f"Number of different speakers: {num_speakers}") + + # If there are multiple speakers, get the dominant one + if num_speakers > 1: + # Count occurrences of each speaker + speaker_counts = np.bincount(primary_spk) + dominant_speaker_id = np.argmax(speaker_counts) + else: + # Only one speaker, return that speaker ID + dominant_speaker_id = 
primary_spk[0] + return dominant_speaker_id diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py new file mode 100644 index 000000000000..ecb2632254db --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_asr.py @@ -0,0 +1,260 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# NOTE: This file will be deprecated in the future, as the new inference pipeline will replace it. 
+ +import math +from typing import List + +import numpy as np +import torch +from omegaconf import open_dict + +import nemo.collections.asr as nemo_asr +from nemo.agents.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer + + +class NemoLegacyASRService: + def __init__( + self, + model: str = "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi", + att_context_size: List[int] = [70, 1], + device: str = "cuda", + eou_string: str = "", + eob_string: str = "", + decoder_type: str = None, + chunk_size: int = -1, + shift_size: int = -1, + left_chunks: int = 2, + sample_rate: int = 16000, + frame_len_in_secs: float = 0.08, + use_amp: bool = False, + chunk_size_in_secs: float = 0.08, + ): + self.model = model + self.eou_string = eou_string + self.eob_string = eob_string + self.device = device + self.att_context_size = att_context_size + self.decoder_type = decoder_type + self.chunk_size = chunk_size + self.shift_size = shift_size + self.left_chunks = left_chunks + self.asr_model = self._load_model(model) + self.tokenizer = self.asr_model.tokenizer # type: SentencePieceTokenizer + self.use_amp = use_amp + self.pad_and_drop_preencoded = False + self.blank_id = self.get_blank_id() + self.chunk_size_in_secs = chunk_size_in_secs + + print("NemoLegacyASRService initialized") + + assert len(self.att_context_size) == 2, "Att context size must be a list of two integers" + assert ( + self.att_context_size[0] >= 0 + ), f"Left att context size must be greater than 0: {self.att_context_size[0]}" + assert ( + self.att_context_size[1] >= 0 + ), f"Right att context size must be greater than 0: {self.att_context_size[1]}" + + window_stride_in_secs = self.asr_model.cfg.preprocessor.window_stride + model_stride = self.asr_model.cfg.encoder.subsampling_factor + self.model_chunk_size = 
self.asr_model.encoder.streaming_cfg.chunk_size + if isinstance(self.model_chunk_size, list): + self.model_chunk_size = self.model_chunk_size[1] + self.pre_encode_cache_size = self.asr_model.encoder.streaming_cfg.pre_encode_cache_size + if isinstance(self.pre_encode_cache_size, list): + self.pre_encode_cache_size = self.pre_encode_cache_size[1] + self.pre_encode_cache_size_in_secs = self.pre_encode_cache_size * window_stride_in_secs + + self.tokens_per_frame = math.ceil(np.trunc(self.chunk_size_in_secs / window_stride_in_secs) / model_stride) + # overwrite the encoder streaming params with proper shift size for cache aware streaming + self.asr_model.encoder.setup_streaming_params( + chunk_size=self.model_chunk_size // model_stride, shift_size=self.tokens_per_frame + ) + + model_chunk_size_in_secs = self.model_chunk_size * window_stride_in_secs + + self.buffer_size_in_secs = self.pre_encode_cache_size_in_secs + model_chunk_size_in_secs + + self._audio_buffer = CacheFeatureBufferer( + sample_rate=sample_rate, + buffer_size_in_secs=self.buffer_size_in_secs, + chunk_size_in_secs=self.chunk_size_in_secs, + preprocessor_cfg=self.asr_model.cfg.preprocessor, + device=self.device, + ) + self._reset_cache() + self._previous_hypotheses = self._get_blank_hypothesis() + + def _reset_cache(self): + ( + self._cache_last_channel, # [17, B, 70, 512] + self._cache_last_time, # [17, B, 512, 8] + self._cache_last_channel_len, # B + ) = self.asr_model.encoder.get_initial_cache_state( + 1 + ) # batch size is 1 + + def _get_blank_hypothesis(self) -> List[Hypothesis]: + blank_hypothesis = Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestamp=[], last_token=None) + return [blank_hypothesis] + + @property + def drop_extra_pre_encoded(self): + return self.asr_model.encoder.streaming_cfg.drop_extra_pre_encoded + + def get_blank_id(self): + return len(self.tokenizer.vocab) + + def get_text_from_tokens(self, tokens: List[int]) -> str: + sep = "\u2581" # '▁' + tokens = [int(t) for t in 
tokens if t != self.blank_id] + if tokens: + pieces = self.tokenizer.ids_to_tokens(tokens) + text = "".join([p.replace(sep, ' ') if p.startswith(sep) else p for p in pieces]) + else: + text = "" + return text + + def _load_model(self, model: str): + if model.endswith(".nemo"): + asr_model = nemo_asr.models.ASRModel.restore_from(model, map_location=torch.device(self.device)) + else: + asr_model = nemo_asr.models.ASRModel.from_pretrained(model, map_location=torch.device(self.device)) + + if self.decoder_type is not None and hasattr(asr_model, "cur_decoder"): + asr_model.change_decoding_strategy(decoder_type=self.decoder_type) + elif isinstance(asr_model, nemo_asr.models.EncDecCTCModel): + self.decoder_type = "ctc" + elif isinstance(asr_model, nemo_asr.models.EncDecRNNTModel): + self.decoder_type = "rnnt" + else: + raise ValueError("Decoder type not supported for this model.") + + if self.att_context_size is not None: + if hasattr(asr_model.encoder, "set_default_att_context_size"): + asr_model.encoder.set_default_att_context_size(att_context_size=self.att_context_size) + else: + raise ValueError("Model does not support multiple lookaheads.") + else: + self.att_context_size = asr_model.cfg.encoder.att_context_size + + decoding_cfg = asr_model.cfg.decoding + with open_dict(decoding_cfg): + decoding_cfg.strategy = "greedy" + decoding_cfg.compute_timestamps = False + decoding_cfg.preserve_alignments = True + if hasattr(asr_model, 'joint'): # if an RNNT model + decoding_cfg.greedy.max_symbols = 10 + decoding_cfg.fused_batch_size = -1 + asr_model.change_decoding_strategy(decoding_cfg) + + if hasattr(asr_model.encoder, "set_default_att_context_size"): + asr_model.encoder.set_default_att_context_size(att_context_size=self.att_context_size) + + # chunk_size is set automatically for models trained for streaming. + # For models trained for offline mode with full context, we need to pass the chunk_size explicitly. 
+ if self.chunk_size > 0: + if self.shift_size < 0: + shift_size = self.chunk_size + else: + shift_size = self.shift_size + asr_model.encoder.setup_streaming_params( + chunk_size=self.chunk_size, left_chunks=self.left_chunks, shift_size=shift_size + ) + + asr_model.eval() + return asr_model + + def _get_best_hypothesis(self, encoded, encoded_len, partial_hypotheses=None): + if self.decoder_type == "ctc": + best_hyp = self.asr_model.decoding.ctc_decoder_predictions_tensor( + encoded, + encoded_len, + return_hypotheses=True, + ) + elif self.decoder_type == "rnnt": + best_hyp = self.asr_model.decoding.rnnt_decoder_predictions_tensor( + encoded, encoded_len, return_hypotheses=True, partial_hypotheses=partial_hypotheses + ) + else: + raise ValueError("Decoder type not supported for this model.") + return best_hyp + + def _get_tokens_from_alignments(self, alignments): + tokens = [] + if self.decoder_type == "ctc": + tokens = alignments[1] + tokens = [int(t) for t in tokens if t != self.blank_id] + elif self.decoder_type == "rnnt": + for t in range(len(alignments)): + for u in range(len(alignments[t])): + logprob, token_id = alignments[t][u] # (logprob, token_id) + token_id = int(token_id) + if token_id != self.blank_id: + tokens.append(token_id) + else: + raise ValueError("Decoder type not supported for this model.") + return tokens + + def transcribe(self, audio: bytes, stream_id: str = "default") -> str: + # Convert bytes to numpy array + audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0 + + self._audio_buffer.update(audio_array) + + features = self._audio_buffer.get_feature_buffer() + feature_lengths = torch.tensor([features.shape[1]], device=self.device) + features = features.unsqueeze(0) # Add batch dimension + + with torch.no_grad(): + ( + encoded, + encoded_len, + cache_last_channel, + cache_last_time, + cache_last_channel_len, + ) = self.asr_model.encoder.cache_aware_stream_step( + processed_signal=features, + 
processed_signal_length=feature_lengths, + cache_last_channel=self._cache_last_channel, + cache_last_time=self._cache_last_time, + cache_last_channel_len=self._cache_last_channel_len, + keep_all_outputs=False, + drop_extra_pre_encoded=self.drop_extra_pre_encoded, + ) + + best_hyp = self._get_best_hypothesis(encoded, encoded_len, partial_hypotheses=self._previous_hypotheses) + + self._previous_hypotheses = best_hyp + self._cache_last_channel = cache_last_channel + self._cache_last_time = cache_last_time + self._cache_last_channel_len = cache_last_channel_len + + tokens = self._get_tokens_from_alignments(best_hyp[0].alignments) + + text = self.get_text_from_tokens(tokens) + + is_final = False + if self.eou_string in text or self.eob_string in text: + is_final = True + self.reset_state(stream_id=stream_id) + return text, is_final + + def reset_state(self, stream_id: str = "default"): + self._audio_buffer.reset() + self._reset_cache() + self._previous_hypotheses = self._get_blank_hypothesis() diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py new file mode 100644 index 000000000000..2e08688e0e42 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/nemo/legacy_diar.py @@ -0,0 +1,212 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# NOTE: This file will be deprecated in the future, as the new inference pipeline will replace it. + +from dataclasses import dataclass +from typing import Tuple + +import numpy as np +import torch +from torch import Tensor + +from nemo.agents.voice_agent.pipecat.services.nemo.utils import CacheFeatureBufferer +from nemo.collections.asr.models import SortformerEncLabelModel + +from nemo.collections.asr.modules.sortformer_modules import StreamingSortformerState + + +@dataclass +class DiarizationConfig: + """Diarization configuration parameters for inference.""" + + model_path: str = "nvidia/diar_sortformer_4spk-v1" + device: str = "cuda" + + log: bool = False # If True, log will be printed + max_num_speakers: int = 4 + spkcache_len: int = 188 + spkcache_refresh_rate: int = 144 + fifo_len: int = 188 + chunk_len: int = 6 + chunk_left_context: int = 1 + chunk_right_context: int = 7 + + +class NeMoLegacyDiarService: + def __init__( + self, + cfg: DiarizationConfig, + model: str, + frame_len_in_secs: float = 0.08, + sample_rate: int = 16000, + left_offset: int = 8, + right_offset: int = 8, + use_amp: bool = False, + compute_dtype: torch.dtype = torch.float32, + ): + self.model = model + self.cfg = cfg + self.cfg.model_path = model + self.diarizer = self.build_diarizer() + self.device = cfg.device + self.use_amp = use_amp + self.compute_dtype = compute_dtype + self.frame_len_in_secs = frame_len_in_secs + self.left_offset = left_offset + self.right_offset = right_offset + self.chunk_size = self.cfg.chunk_len + self.buffer_size_in_secs = ( + self.cfg.chunk_len * self.frame_len_in_secs + (self.left_offset + self.right_offset) * 0.01 + ) + self.max_num_speakers = self.cfg.max_num_speakers + + self.feature_bufferer = CacheFeatureBufferer( + sample_rate=sample_rate, + buffer_size_in_secs=self.buffer_size_in_secs, + chunk_size_in_secs=self.cfg.chunk_len * self.frame_len_in_secs, + preprocessor_cfg=self.diarizer.cfg.preprocessor, + device=self.device, + ) + self.streaming_state 
= self.init_streaming_state(batch_size=1) + self.total_preds = torch.zeros((1, 0, self.max_num_speakers), device=self.diarizer.device) + + print("NeMoLegacyDiarService initialized") + + def build_diarizer(self): + if self.cfg.model_path.endswith(".nemo"): + diar_model = SortformerEncLabelModel.restore_from(self.cfg.model_path, map_location=self.cfg.device) + else: + diar_model = SortformerEncLabelModel.from_pretrained(self.cfg.model_path, map_location=self.cfg.device) + + # Steaming mode setup + diar_model.sortformer_modules.chunk_len = self.cfg.chunk_len + diar_model.sortformer_modules.spkcache_len = self.cfg.spkcache_len + diar_model.sortformer_modules.chunk_left_context = self.cfg.chunk_left_context + diar_model.sortformer_modules.chunk_right_context = self.cfg.chunk_right_context + diar_model.sortformer_modules.fifo_len = self.cfg.fifo_len + diar_model.sortformer_modules.log = self.cfg.log + diar_model.sortformer_modules.spkcache_refresh_rate = self.cfg.spkcache_refresh_rate + diar_model.eval() + + return diar_model + + def print_diar_result(self, diar_result: np.ndarray): + for t in range(diar_result.shape[0]): + spk_probs = "" + for s in range(diar_result.shape[1]): + spk_probs += f"{diar_result[t, s]:.2f} " + print(f"Time {t}: {spk_probs}") + + def diarize(self, audio: bytes, stream_id: str = "default") -> str: + + audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0 + + self.feature_bufferer.update(audio_array) + + features = self.feature_bufferer.get_feature_buffer() + feature_buffers = features.unsqueeze(0) # add batch dimension + feature_buffers = feature_buffers.transpose(1, 2) # [batch, feature, time] -> [batch, time, feature] + feature_buffer_lens = torch.tensor([feature_buffers.shape[1]], device=self.device) + self.streaming_state, chunk_preds = self.stream_step( + processed_signal=feature_buffers, + processed_signal_length=feature_buffer_lens, + streaming_state=self.streaming_state, + total_preds=self.total_preds, + 
left_offset=self.left_offset, + right_offset=self.right_offset, + ) + self.total_preds = chunk_preds + diar_result = chunk_preds[:, -self.chunk_size :, :].clone().cpu().numpy() + return diar_result[0] # tensor of shape [6, 4] + + def reset_state(self, stream_id: str = "default"): + self.feature_bufferer.reset() + self.streaming_state = self.init_streaming_state(batch_size=1) + self.total_preds = torch.zeros((1, 0, self.max_num_speakers), device=self.diarizer.device) + + def init_streaming_state(self, batch_size: int = 1) -> StreamingSortformerState: + """ + Initialize the streaming state for the diarization model. + + Args: + batch_size: The batch size to use. + + Returns: + SortformerStreamingState: The initialized streaming state. + """ + # Use the model's init_streaming_state method but convert to SortformerStreamingState format + nemo_state = self.diarizer.sortformer_modules.init_streaming_state( + batch_size=batch_size, async_streaming=self.diarizer.async_streaming, device=self.device + ) + + return nemo_state + + def stream_step( + self, + processed_signal: Tensor, + processed_signal_length: Tensor, + streaming_state: StreamingSortformerState, + total_preds: Tensor, + left_offset: int = 0, + right_offset: int = 0, + ) -> Tuple[StreamingSortformerState, Tensor]: + """ + Execute a single streaming step for diarization. + + Args: + processed_signal: The processed audio signal. + processed_signal_length: The length of the processed signal. + streaming_state: The current streaming state. + total_preds: The total predictions so far. + left_offset: The left offset for the current chunk. + right_offset: The right offset for the current chunk. + + Returns: + Tuple[SortformerStreamingState, Tensor]: The updated streaming state and predictions. 
+ """ + # Move tensors to correct device + if processed_signal.device != self.device: + processed_signal = processed_signal.to(self.device) + + if processed_signal_length.device != self.device: + processed_signal_length = processed_signal_length.to(self.device) + + if total_preds is not None and total_preds.device != self.device: + total_preds = total_preds.to(self.device) + + with ( + torch.amp.autocast(device_type=self.device, dtype=self.compute_dtype, enabled=self.use_amp), + torch.inference_mode(), + torch.no_grad(), + ): + try: + # Call the model's forward_streaming_step method + streaming_state, diar_pred_out_stream = self.diarizer.forward_streaming_step( + processed_signal=processed_signal, + processed_signal_length=processed_signal_length, + streaming_state=streaming_state, + total_preds=total_preds, + left_offset=left_offset, + right_offset=right_offset, + ) + except Exception as e: + print(f"Error in diarizer streaming step: {e}") + # print the stack trace + import traceback + + traceback.print_exc() + # Return the existing state and preds if there's an error + return streaming_state, total_preds + + return streaming_state, diar_pred_out_stream diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/llm.py b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py new file mode 100644 index 000000000000..2b3f07434ca6 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/nemo/llm.py @@ -0,0 +1,266 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import uuid
from threading import Thread
from typing import AsyncGenerator, List

from jinja2.exceptions import TemplateError
from loguru import logger
from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
from pipecat.frames.frames import LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMTextFrame
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai.llm import OpenAILLMService
from transformers import AsyncTextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer

# Default HuggingFace `generate()` sampling parameters, used when the caller
# does not supply `generation_kwargs`.
DEFAULT_GENERATION_KWARGS = {
    "max_new_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,
}


class HuggingFaceLLMLocalService:
    """Runs a local HuggingFace causal LM and streams generated text as
    OpenAI-style ``ChatCompletionChunk`` objects."""

    def __init__(
        self,
        model: str = "meta-llama/Meta-Llama-3-8B-Instruct",
        device: str = "cuda:0",
        dtype: str = "bfloat16",
        generation_kwargs: dict = None,
        apply_chat_template_kwargs: dict = None,
    ):
        self.device = device
        self.dtype = dtype
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForCausalLM.from_pretrained(
            model, device_map=device, torch_dtype=dtype, trust_remote_code=True
        )  # type: AutoModelForCausalLM

        self.generation_kwargs = generation_kwargs if generation_kwargs else DEFAULT_GENERATION_KWARGS
        logger.debug(f"LLM generation kwargs: {self.generation_kwargs}")

        self.apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs else {}
        # `tokenize` must stay False: generate_stream tokenizes the prompt itself.
        if "tokenize" in self.apply_chat_template_kwargs:
            if self.apply_chat_template_kwargs["tokenize"] is not False:
                logger.warning(
                    "Found `tokenize=True` in apply_chat_template_kwargs, it will be ignored and forced to `False`"
                )
            self.apply_chat_template_kwargs.pop("tokenize")

        logger.debug(f"LLM apply_chat_template kwargs: {self.apply_chat_template_kwargs}")

    def _maybe_add_user_message(
        self, messages: List[ChatCompletionMessageParam]
    ) -> List[ChatCompletionMessageParam]:
        """
        Some LLMs like "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" require a user turn
        after the system prompt; this inserts a dummy user turn when the system
        prompt is directly followed by an assistant turn. Mutates and returns
        `messages`.
        """
        if len(messages) > 1 and messages[0]["role"] == "system" and messages[1]["role"] == "assistant":
            messages.insert(1, {"role": "user", "content": "Hi"})
        return messages

    def _maybe_merge_consecutive_turns(
        self, messages: List[ChatCompletionMessageParam]
    ) -> List[ChatCompletionMessageParam]:
        """
        Merge consecutive turns of the same role into a single turn, since some
        LLMs like "nvidia/Llama-3.1-Nemotron-Nano-8B-v1" do not support
        consecutive turns of the same role. Returns a new list.
        """
        if not messages:
            return messages

        merged_messages = []
        current_role = None
        current_content = ""

        for message in messages:
            role = message["role"]
            content = message["content"]

            if role == current_role:
                # Merge with previous message of same role
                current_content += "; " + content
            else:
                # Flush the previous accumulated message, then start a new one
                if current_role is not None:
                    merged_messages.append({"role": current_role, "content": current_content})
                current_role = role
                current_content = content

        # Add the last message
        if current_role is not None:
            merged_messages.append({"role": current_role, "content": current_content})

        return merged_messages

    def _apply_chat_template(self, messages: List[ChatCompletionMessageParam]) -> str:
        """Apply the tokenizer's chat template to the messages (never tokenized here)."""
        return self.tokenizer.apply_chat_template(messages, tokenize=False, **self.apply_chat_template_kwargs)

    def _get_prompt_from_messages(self, messages: List[ChatCompletionMessageParam]) -> str:
        """
        Get the formatted prompt from the conversation history messages.

        Tries to repair the messages if the model's chat template rejects them:
        first by adding a dummy user turn after the system prompt, then by
        merging consecutive turns of the same role (updating `messages` in place
        on success).
        """
        try:
            return self._apply_chat_template(messages)
        except TemplateError as e:
            logger.warning(f"Got TemplateError: {e}.")

        logger.debug(f"Input LLM messages: {messages}")
        if len(messages) > 1 and messages[0]["role"] == "system" and messages[1]["role"] == "assistant":
            logger.warning("Trying to fix by adding dummy user message after system prompt...")
            try:
                messages = self._maybe_add_user_message(messages)
                logger.debug(f"LLM messages after adding dummy user message: {messages}")
                return self._apply_chat_template(messages)
            except TemplateError as e:
                logger.warning(f"Got TemplateError: {e}. Trying to fix by merging consecutive turns if possible.")

        try:
            new_messages = self._maybe_merge_consecutive_turns(messages)
            logger.debug(f"LLM messages after merging consecutive user turns: {new_messages}")
            prompt = self._apply_chat_template(new_messages)
            # Update the caller's list in place if the repair succeeded.
            messages.clear()
            messages.extend(new_messages)
            return prompt
        except Exception as e:
            logger.warning(f"Got Exception: {e}, messages: {messages}")
            # bare raise preserves the original traceback (raise e would truncate it)
            raise

    async def generate_stream(
        self, messages: List[ChatCompletionMessageParam], **kwargs
    ) -> AsyncGenerator[ChatCompletionChunk, None]:
        """Stream a completion for `messages` as ChatCompletionChunk objects.

        Extra **kwargs are accepted for OpenAI-API compatibility; the actual
        sampling parameters come from `self.generation_kwargs`.
        """
        # Convert messages to prompt format
        prompt = self._get_prompt_from_messages(messages)

        logger.debug(f"LLM prompt: {prompt}")

        inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(self.device)

        # Generate with streaming: HF `generate` runs in a worker thread and
        # pushes decoded text into the async streamer.
        streamer = AsyncTextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            **self.generation_kwargs,
        }

        # Start generation in background
        Thread(target=self.model.generate, kwargs=generation_kwargs).start()

        # Stream the output
        async for text in streamer:
            yield ChatCompletionChunk(
                id="hf-" + str(uuid.uuid4()),
                choices=[{"delta": {"content": text}, "finish_reason": None, "index": 0}],
                created=int(time.time()),
                model=self.model.config._name_or_path,
                object="chat.completion.chunk",
            )


class HuggingFaceLLMService(OpenAILLMService):
    """Pipecat LLM service backed by a local HuggingFace model instead of the
    OpenAI HTTP API."""

    def __init__(
        self,
        *,
        model: str = "google/gemma-7b-it",
        device: str = "cuda",
        dtype: str = "bfloat16",
        generation_kwargs: dict = None,
        apply_chat_template_kwargs: dict = None,
        **kwargs,
    ):
        self._model_name = model
        self._device = device
        self._dtype = dtype
        self._generation_kwargs = generation_kwargs if generation_kwargs is not None else DEFAULT_GENERATION_KWARGS
        self._apply_chat_template_kwargs = apply_chat_template_kwargs if apply_chat_template_kwargs is not None else {}
        super().__init__(model=model, **kwargs)

    def create_client(self, api_key=None, base_url=None, **kwargs):
        """Create the local HF client. `api_key`/`base_url` are accepted for
        interface compatibility with OpenAILLMService but are unused."""
        return HuggingFaceLLMLocalService(
            model=self._model_name,
            device=self._device,
            dtype=self._dtype,
            generation_kwargs=self._generation_kwargs,
            apply_chat_template_kwargs=self._apply_chat_template_kwargs,
        )

    async def _process_context(self, context: OpenAILLMContext):
        """Process a context through the LLM and push text frames.

        Args:
            context (OpenAILLMContext): The context to process, containing messages
                and other information needed for the LLM interaction.
        """
        await self.push_frame(LLMFullResponseStartFrame())
        cumulative_text = ""
        try:
            await self.start_ttfb_metrics()
            messages = context.get_messages()
            async for chunk in self._client.generate_stream(messages):
                if chunk.choices[0].delta.content:
                    await self.stop_ttfb_metrics()
                    text = chunk.choices[0].delta.content
                    cumulative_text += text
                    await self.push_frame(LLMTextFrame(text))
        except Exception as e:
            # loguru has no `exc_info` kwarg; logger.exception() attaches the traceback.
            logger.exception(f"Error in _process_context: {e}")
            raise
        finally:
            cumulative_text = " ".join(cumulative_text.split()).strip()
            if not cumulative_text:
                logger.warning(f"LLM response is empty for context: {context}")
            await self.push_frame(LLMFullResponseEndFrame())

    async def get_chat_completions(
        self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
    ) -> AsyncGenerator[ChatCompletionChunk, None]:
        """Create a streaming chat completion using the local HuggingFace model.

        Args:
            context (OpenAILLMContext): The context object containing tools configuration
                and other settings for the chat completion.
            messages (List[ChatCompletionMessageParam]): The list of messages comprising
                the conversation history and current request.

        Returns:
            AsyncGenerator[ChatCompletionChunk]: A streaming response of chat completion
            chunks that can be processed asynchronously. Note: the forwarded
            OpenAI-style params are accepted by the local client for
            compatibility but sampling uses its `generation_kwargs`.
        """
        params = {
            "max_tokens": self._settings["max_tokens"],
            "temperature": self._settings["temperature"],
            "top_p": self._settings["top_p"],
        }
        params.update(self._settings["extra"])

        return self._client.generate_stream(messages, **params)

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
from typing import AsyncGenerator, List, Optional

from loguru import logger
from pipecat.frames.frames import (
    CancelFrame,
    EndFrame,
    ErrorFrame,
    Frame,
    InterimTranscriptionFrame,
    StartFrame,
    TranscriptionFrame,
    VADUserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.stt_service import STTService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_stt
from pydantic import BaseModel

from nemo.agents.voice_agent.pipecat.services.nemo.legacy_asr import NemoLegacyASRService

try:
    # disable nemo logging
    from nemo.utils import logging

    # remember the original level, then silence NeMo's logger
    level = logging.getEffectiveLevel()
    logging.setLevel(logging.CRITICAL)


except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error('In order to use NVIDIA NeMo STT, you need to `pip install "nemo_toolkit[all]"`.')
    raise Exception(f"Missing module: {e}")


class NeMoSTTInputParams(BaseModel):
    """Tunable parameters for the NeMo STT service."""

    language: Optional[Language] = Language.EN_US
    att_context_size: Optional[List] = [70, 1]
    frame_len_in_secs: Optional[float] = 0.08  # 80ms for FastConformer model
    config_path: Optional[str] = None  # path to the ASR config file
    raw_audio_frame_len_in_secs: Optional[float] = 0.016  # 16ms for websocket transport
    buffer_size: Optional[int] = 5  # number of audio frames to buffer, 1 frame is 16ms


class NemoSTTService(STTService):
    """Pipecat STT service backed by a local NeMo streaming ASR model."""

    def __init__(
        self,
        *,
        model: Optional[str] = "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi",
        device: Optional[str] = "cuda:0",
        sample_rate: Optional[int] = 16000,
        params: Optional[NeMoSTTInputParams] = None,
        has_turn_taking: bool = False,
        backend: Optional[str] = "legacy",
        decoder_type: Optional[str] = "rnnt",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Validate params before touching its attributes (previously
        # params.buffer_size was assigned first, raising AttributeError instead
        # of the intended ValueError when params was None).
        if not params:
            raise ValueError("params is required")

        self._queue = asyncio.Queue()
        self._sample_rate = sample_rate
        # Number of raw transport frames that make up one ASR frame. Use round()
        # on a true division: float floor-division (0.08 // 0.016) is subject to
        # binary rounding error and also yields a float for an int field.
        params.buffer_size = round(params.frame_len_in_secs / params.raw_audio_frame_len_in_secs)
        self._params = params
        self._model_name = model
        self._has_turn_taking = has_turn_taking
        self._backend = backend
        self._decoder_type = decoder_type
        self._device = device

        self._load_model()

        self.audio_buffer = []

    def _load_model(self):
        """Instantiate the ASR backend; only the "legacy" backend is supported."""
        if self._backend == "legacy":
            self._model = NemoLegacyASRService(self._model_name, device=self._device, decoder_type=self._decoder_type)
        else:
            raise ValueError(f"Invalid ASR backend: {self._backend}")

    def can_generate_metrics(self) -> bool:
        """Indicates whether this service can generate metrics.

        Returns:
            bool: True, as this service supports metric generation.
        """
        return True

    async def start(self, frame: StartFrame):
        """Handle service start.

        Args:
            frame: StartFrame containing initial configuration
        """
        await super().start(frame)

        # Initialize the model if not already done
        if not hasattr(self, "_model"):
            self._load_model()

    async def stop(self, frame: EndFrame):
        """Handle service stop.

        Args:
            frame: EndFrame that triggered this method
        """
        await super().stop(frame)
        # Clear any internal state if needed
        await self._queue.put(None)  # Signal to stop processing

    async def cancel(self, frame: CancelFrame):
        """Handle service cancellation.

        Args:
            frame: CancelFrame that triggered this method
        """
        await super().cancel(frame)
        # Clear any internal state
        await self._queue.put(None)  # Signal to stop processing
        self._queue = asyncio.Queue()  # Reset the queue

    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
        """Process audio data and generate transcription frames.

        Audio is buffered until `buffer_size` raw frames are collected, then
        transcribed in one call to the backend.

        Args:
            audio: Raw audio bytes to transcribe

        Yields:
            Frame: Transcription frames containing the results
        """
        await self.start_ttfb_metrics()
        await self.start_processing_metrics()

        try:
            is_final = False
            transcription = None
            self.audio_buffer.append(audio)
            if len(self.audio_buffer) >= self._params.buffer_size:
                audio = b"".join(self.audio_buffer)
                self.audio_buffer = []

                transcription, is_final = self._model.transcribe(audio)
                await self.stop_ttfb_metrics()
                await self.stop_processing_metrics()

            if transcription:
                logger.debug(f"Transcription (is_final={is_final}): `{transcription}`")

                # Get the language from params or default to EN_US
                language = self._params.language if self._params else Language.EN_US

                # With turn taking enabled, final segmentation is decided
                # downstream, so everything is pushed as interim.
                if self._has_turn_taking or not is_final:
                    frame_type = InterimTranscriptionFrame
                else:
                    frame_type = TranscriptionFrame
                await self.push_frame(
                    frame_type(
                        transcription,
                        "",  # No speaker ID in this implementation
                        time_now_iso8601(),
                        language,
                        result={"text": transcription},
                    )
                )

                # Handle the transcription
                await self._handle_transcription(
                    transcript=transcription,
                    is_final=is_final,
                    language=language,
                )

            yield None

        except Exception as e:
            logger.error(f"Error in NeMo STT processing: {e}")
            await self.push_frame(
                ErrorFrame(
                    str(e),
                    time_now_iso8601(),
                )
            )
            yield None

    @traced_stt
    async def _handle_transcription(self, transcript: str, is_final: bool, language: Optional[str] = None):
        """Handle a transcription result.

        Args:
            transcript: The transcribed text
            is_final: Whether this is a final transcription
            language: The language of the transcription
        """
        pass  # Base implementation - can be extended for specific handling needs

    async def set_language(self, language: Language):
        """Update the service's recognition language.

        Args:
            language: New language for recognition
        """
        if self._params:
            self._params.language = language
        else:
            self._params = NeMoSTTInputParams(language=language)

        logger.info(f"Switching STT language to: {language}")

    async def set_model(self, model: str):
        """Update the service's model.

        Args:
            model: New model name/path to use
        """
        await super().set_model(model)
        self._model_name = model
        self._load_model()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Intercept VAD end-of-utterance events to reset the ASR state."""
        if isinstance(frame, VADUserStoppedSpeakingFrame) and isinstance(self._model, NemoLegacyASRService):
            # manually reset the state of the model when end of utterance is detected by VAD
            logger.debug("Resetting state of the model due to VADUserStoppedSpeakingFrame")
            self._model.reset_state()
        await super().process_frame(frame, direction)

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import inspect
import uuid
from collections.abc import AsyncGenerator
from typing import Iterator, List, Optional

import numpy as np
import torch
from loguru import logger
from pipecat.frames.frames import (
    CancelFrame,
    EndFrame,
    ErrorFrame,
    Frame,
    StartFrame,
    TTSAudioRawFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
)
from pipecat.services.tts_service import TTSService

from nemo.collections.tts.models import FastPitchModel, HifiGanModel


class BaseNemoTTSService(TTSService):
    """Text-to-Speech service using Nemo TTS models.

    This service works with any TTS model that exposes a generate(text) method
    that returns audio data. The TTS generation runs in a dedicated background thread to
    avoid blocking the main asyncio event loop, following the same pattern as NemoDiarService.

    Args:
        model: TTS model instance with a generate(text) method
        sample_rate: Audio sample rate in Hz (defaults to 22050)
        think_tokens: Optional [start, end] marker pair delimiting LLM "thinking"
            text that must not be spoken.
        **kwargs: Additional arguments passed to TTSService
    """

    def __init__(
        self,
        *,
        model,
        device: str = "cuda",
        sample_rate: int = 22050,
        think_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        super().__init__(sample_rate=sample_rate, **kwargs)
        self._model_name = model
        self._device = device
        self._model = self._setup_model()
        self._think_tokens = think_tokens
        if think_tokens is not None:
            assert (
                isinstance(think_tokens, list) and len(think_tokens) == 2
            ), f"think_tokens must be a list of two strings: {think_tokens}"

        # Background processing infrastructure - no response handler needed
        self._tts_queue = asyncio.Queue()
        self._processing_task = None
        self._processing_running = False

        # Track pending requests with their response queues
        self._pending_requests = {}
        self._have_seen_think_tokens = False

    def _setup_model(self):
        """Build and return the underlying TTS model(s)."""
        raise NotImplementedError("Subclass must implement _setup_model")

    def _generate_audio(self, text: str) -> Iterator[np.ndarray]:
        """Yield audio arrays synthesized from `text`."""
        raise NotImplementedError("Subclass must implement _generate_audio")

    def can_generate_metrics(self) -> bool:
        return True

    async def start(self, frame: StartFrame):
        """Handle service start."""
        await super().start(frame)

        # Initialize the model if not already done
        if not hasattr(self, "_model") or self._model is None:
            self._model = self._setup_model()

        # Only start background processing task - no response handler needed
        if not self._processing_task:
            self._processing_task = self.create_task(self._processing_task_handler())

    async def stop(self, frame: EndFrame):
        """Handle service stop."""
        await super().stop(frame)
        await self._stop_tasks()

    async def cancel(self, frame: CancelFrame):
        """Handle service cancellation."""
        await super().cancel(frame)
        await self._stop_tasks()

    async def _stop_tasks(self):
        """Stop background processing tasks."""
        self._processing_running = False
        await self._tts_queue.put(None)  # Signal to stop processing

        if self._processing_task:
            await self.cancel_task(self._processing_task)
            self._processing_task = None

    def _tts_processor(self):
        """Background processor that handles TTS generation calls.

        Runs in a worker thread; communicates with the event loop via
        run_coroutine_threadsafe.
        """
        try:
            while self._processing_running:
                try:
                    future = asyncio.run_coroutine_threadsafe(self._tts_queue.get(), self.get_event_loop())
                    request = future.result()

                    if request is None:  # Stop signal
                        logger.debug("Received stop signal in TTS background processor")
                        break

                    text, request_id = request
                    logger.debug(f"Processing TTS request for text: [{text}]")

                    # Get the response queue for this request
                    future = asyncio.run_coroutine_threadsafe(
                        self._get_response_queue(request_id), self.get_event_loop()
                    )
                    response_queue = future.result()

                    if response_queue is None:
                        logger.warning(f"No response queue found for request {request_id}")
                        continue

                    # Process TTS generation
                    try:
                        audio_result = self._generate_audio(text)

                        # Send result directly to the waiting request
                        asyncio.run_coroutine_threadsafe(
                            response_queue.put(('success', audio_result)), self.get_event_loop()
                        )
                    except Exception as e:
                        logger.error(f"Error in TTS generation: {e}")
                        # Send error directly to the waiting request
                        asyncio.run_coroutine_threadsafe(response_queue.put(('error', e)), self.get_event_loop())

                except Exception as e:
                    logger.error(f"Error in background TTS processor: {e}")

        except Exception as e:
            logger.error(f"Background TTS processor fatal error: {e}")
        finally:
            logger.debug("Background TTS processor stopped")

    async def _get_response_queue(self, request_id: str):
        """Get the response queue for a specific request."""
        return self._pending_requests.get(request_id)

    async def _processing_task_handler(self):
        """Handler for background processing task."""
        try:
            self._processing_running = True
            logger.debug("Starting background TTS processing task")
            await asyncio.to_thread(self._tts_processor)
        except asyncio.CancelledError:
            logger.debug("Background TTS processing task cancelled")
            self._processing_running = False
            raise
        finally:
            self._processing_running = False

    def _handle_think_tokens(self, text: str) -> Optional[str]:
        """
        Handle the thinking tokens for TTS.

        If the thinking tokens are not provided, return the text as is.
        If the thinking tokens are provided, and the LLM is thinking, return None.
        If the thinking tokens are provided, and the LLM is done thinking, return the text after the end of thinking tokens.
        If the thinking tokens are provided, and the LLM starts thinking, return the text before the start of thinking tokens.
        If the thinking tokens are provided, and the LLM is not thinking, return the text as is.
        """
        if not self._think_tokens:
            return text
        elif self._have_seen_think_tokens:
            # LLM is thinking
            if self._think_tokens[1] not in text:
                # LLM is still thinking
                return None
            else:
                # LLM is done thinking
                idx = text.index(self._think_tokens[1])
                # only return the text after the end of thinking tokens
                text = text[idx + len(self._think_tokens[1]) :]
                self._have_seen_think_tokens = False
                return text
        elif self._think_tokens[0] in text:
            # LLM now starts thinking
            self._have_seen_think_tokens = True
            # return text before the start of thinking tokens
            idx = text.index(self._think_tokens[0])
            return text[:idx]
        else:
            # LLM is not thinking
            return text

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using the Nemo TTS model."""
        text = self._handle_think_tokens(text)

        if not text:
            yield None
            return

        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_ttfb_metrics()
            yield TTSStartedFrame()

            # Generate unique request ID (uuid is imported at module level)
            request_id = str(uuid.uuid4())

            # Create response queue for this specific request
            request_queue = asyncio.Queue()
            self._pending_requests[request_id] = request_queue

            try:
                # Queue the TTS request for background processing
                await self._tts_queue.put((text, request_id))

                # Wait for the result directly from our request queue
                status, data = await request_queue.get()

                if status == 'error':
                    logger.error(f"{self} TTS generation error: {data}")
                    yield ErrorFrame(error=f"TTS generation error: {str(data)}")
                    return

                audio_result = data
                if audio_result is None:
                    logger.error(f"{self} TTS model returned None for text: [{text}]")
                    yield ErrorFrame(error="TTS generation failed - no audio returned")
                    return

                await self.start_tts_usage_metrics(text)

                # Stream either a generator of chunks or a single array.
                # (Parentheses make the `or`/`and` grouping explicit.)
                if inspect.isgenerator(audio_result) or (
                    hasattr(audio_result, '__iter__') and hasattr(audio_result, '__next__')
                ):
                    # Handle generator case
                    first_chunk = True
                    for audio_chunk in audio_result:
                        if first_chunk:
                            await self.stop_ttfb_metrics()
                            first_chunk = False

                        if audio_chunk is None:
                            break

                        audio_bytes = self._convert_to_bytes(audio_chunk)
                        chunk_size = self.chunk_size
                        for i in range(0, len(audio_bytes), chunk_size):
                            audio_chunk_bytes = audio_bytes[i : i + chunk_size]
                            if not audio_chunk_bytes:
                                break

                            yield TTSAudioRawFrame(
                                audio=audio_chunk_bytes, sample_rate=self.sample_rate, num_channels=1
                            )
                else:
                    # Handle single result case
                    await self.stop_ttfb_metrics()
                    audio_bytes = self._convert_to_bytes(audio_result)

                    chunk_size = self.chunk_size
                    for i in range(0, len(audio_bytes), chunk_size):
                        chunk = audio_bytes[i : i + chunk_size]
                        if not chunk:
                            break

                        yield TTSAudioRawFrame(audio=chunk, sample_rate=self.sample_rate, num_channels=1)

                yield TTSStoppedFrame()

            finally:
                # Clean up the pending request
                self._pending_requests.pop(request_id, None)

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            yield ErrorFrame(error=f"TTS generation error: {str(e)}")

    def _convert_to_bytes(self, audio_data) -> bytes:
        """Convert audio data (bytes, numpy array, or buffer-like) to 16-bit PCM bytes."""
        if isinstance(audio_data, (bytes, bytearray)):
            return bytes(audio_data)

        # numpy is imported at module level, so no import-fallback is needed here.
        if isinstance(audio_data, np.ndarray):
            # Ensure it's in the right format (16-bit PCM)
            if audio_data.dtype in (np.float32, np.float64):
                # Convert float [-1, 1] to int16 [-32768, 32767]
                audio_data = np.clip(audio_data, -1.0, 1.0)  # Ensure values are in range
                audio_data = (audio_data * 32767).astype(np.int16)
            elif audio_data.dtype != np.int16:
                # Convert other integer types to int16
                audio_data = audio_data.astype(np.int16)
            return audio_data.tobytes()

        if hasattr(audio_data, 'tobytes'):
            return audio_data.tobytes()
        return bytes(audio_data)


class NeMoFastPitchHiFiGANTTSService(BaseNemoTTSService):
    """FastPitch (spectrogram) + HiFi-GAN (vocoder) TTS pipeline."""

    def __init__(
        self,
        fastpitch_model: str = "nvidia/tts_en_fastpitch",
        hifigan_model: str = "nvidia/tts_hifigan",
        device: str = "cuda",
        **kwargs,
    ):
        model_name = f"{fastpitch_model}+{hifigan_model}"
        self._fastpitch_model_name = fastpitch_model
        self._hifigan_model_name = hifigan_model
        super().__init__(model=model_name, device=device, **kwargs)

    def _setup_model(self):
        print("Loading model...")
        self._fastpitch_model = self._setup_fastpitch_model(self._fastpitch_model_name)
        self._hifigan_model = self._setup_hifigan_model(self._hifigan_model_name)
        return self._fastpitch_model, self._hifigan_model

    def _setup_fastpitch_model(self, model_name: str):
        """Load FastPitch from a local .nemo file or a pretrained name."""
        if model_name.endswith(".nemo"):
            fastpitch_model = FastPitchModel.restore_from(model_name, map_location=torch.device(self._device))
        else:
            fastpitch_model = FastPitchModel.from_pretrained(model_name, map_location=torch.device(self._device))
        fastpitch_model.eval()
        return fastpitch_model

    def _setup_hifigan_model(self, model_name: str):
        """Load HiFi-GAN from a local .nemo file or a pretrained name."""
        if model_name.endswith(".nemo"):
            hifigan_model = HifiGanModel.restore_from(model_name, map_location=torch.device(self._device))
        else:
            hifigan_model = HifiGanModel.from_pretrained(model_name, map_location=torch.device(self._device))
        hifigan_model.eval()
        return hifigan_model

    def _generate_audio(self, text: str) -> Iterator[np.ndarray]:
        """Synthesize `text`: FastPitch -> spectrogram -> HiFi-GAN -> waveform."""
        with torch.no_grad():
            parsed = self._fastpitch_model.parse(text)
            spectrogram = self._fastpitch_model.generate_spectrogram(tokens=parsed)
            audio = self._hifigan_model.convert_spectrogram_to_audio(spec=spectrogram)
            audio = audio.detach().view(-1).cpu().numpy()
            yield audio
audio.detach().view(-1).cpu().numpy() + yield audio diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py new file mode 100644 index 000000000000..be012fdf8eb3 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/nemo/turn_taking.py @@ -0,0 +1,360 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from pathlib import Path +from typing import List, Optional, Union + +import yaml +from loguru import logger +from pipecat.frames.frames import ( + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame, + Frame, + InterimTranscriptionFrame, + StartInterruptionFrame, + StopInterruptionFrame, + TranscriptionFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, + VADUserStartedSpeakingFrame, + VADUserStoppedSpeakingFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.transcriptions.language import Language +from pipecat.utils.time import time_now_iso8601 + +from nemo.agents.voice_agent.pipecat.frames.frames import DiarResultFrame + + +class NeMoTurnTakingService(FrameProcessor): + def __init__( + self, + backchannel_phrases: Union[str, List[str]] = None, + eou_string: str = "", + eob_string: str = "", + language: Language = Language.EN_US, + use_vad: bool = True, + use_diar: bool = False, + max_buffer_size: int = 3, + bot_stop_delay: float = 0.5, + 
**kwargs, + ): + super().__init__(**kwargs) + self.eou_string = eou_string + self.eob_string = eob_string + self.language = language + self.use_vad = use_vad + self.use_diar = use_diar + self.max_buffer_size = max_buffer_size + + self.backchannel_phrases = self._load_backchannel_phrases(backchannel_phrases) + self.backchannel_phrases_nopc = set([self.clean_text(phrase) for phrase in self.backchannel_phrases]) + self.bot_stop_delay = bot_stop_delay + # internal data + self._current_speaker_id = None + self._prev_speaker_id = None + self._bot_stop_time = None + self._bot_speaking = False + self._vad_user_speaking = False + self._have_sent_user_started_speaking = False + self._user_speaking_buffer = "" + if not self.use_vad: + # if vad is not used, we assume the user is always speaking + self._vad_user_speaking = True + + def _load_backchannel_phrases(self, backchannel_phrases: Optional[Union[str, List[str]]] = None): + if not backchannel_phrases: + return [] + + if isinstance(backchannel_phrases, str) and Path(backchannel_phrases).is_file(): + logger.info(f"Loading backchannel phrases from file: {backchannel_phrases}") + if not Path(backchannel_phrases).exists(): + raise FileNotFoundError(f"Backchannel phrases file not found: {backchannel_phrases}") + with open(backchannel_phrases, "r") as f: + backchannel_phrases = yaml.safe_load(f) + if not isinstance(backchannel_phrases, list): + raise ValueError(f"Backchannel phrases must be a list, got {type(backchannel_phrases)}") + logger.info(f"Loaded {len(backchannel_phrases)} backchannel phrases from file: {backchannel_phrases}") + elif isinstance(backchannel_phrases, list): + logger.info(f"Using backchannel phrases from list: {backchannel_phrases}") + else: + raise ValueError(f"Invalid backchannel phrases: {backchannel_phrases}") + return backchannel_phrases + + def clean_text(self, text: str) -> str: + """ + Clean the text so that it can be used for backchannel detection. 
+ """ + if self.language != Language.EN_US: + raise ValueError(f"Language {self.language} not supported, currently only English is supported.") + for eou_string in [self.eou_string, self.eob_string]: + if text.endswith(eou_string): + text = text[: -len(eou_string)].strip() + text = text.lower() + valid_chars = "abcdefghijklmnopqrstuvwxyz'" + text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + return " ".join(text.split()).strip() + + def is_backchannel(self, text: str) -> bool: + """ + Check if the text is a backchannel phrase. + """ + if text.startswith("") :] + text = self.clean_text(text) + return text in self.backchannel_phrases_nopc + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if self._bot_stop_time is not None: + # check if the bot has stopped speaking for more than the delay + if time.time() - self._bot_stop_time > self.bot_stop_delay: + # set the _bot_speaking flag to False to actually consider the bot as stopped speaking + logger.debug( + f"Bot stopped speaking for more than {self.bot_stop_delay} seconds, setting _bot_speaking to False" + ) + self._bot_stop_time = None + self._bot_speaking = False + + if isinstance(frame, (TranscriptionFrame, InterimTranscriptionFrame)): + await self._handle_transcription(frame, direction) + elif isinstance(frame, VADUserStartedSpeakingFrame): + await self._handle_user_started_speaking(frame, direction) + elif isinstance(frame, VADUserStoppedSpeakingFrame): + await self._handle_user_stopped_speaking(frame, direction) + elif isinstance(frame, BotStartedSpeakingFrame): + logger.debug("BotStartedSpeakingFrame received") + self._bot_speaking = True + elif isinstance(frame, BotStoppedSpeakingFrame): + logger.debug("BotStoppedSpeakingFrame received") + self._bot_stop_time = time.time() + if self.bot_stop_delay is None or self.bot_stop_delay <= 0: + # only set the flag if the delay is not set or is 0 + 
self._bot_speaking = False + logger.debug(f"Setting _bot_speaking to False") + elif isinstance(frame, DiarResultFrame): + logger.debug("DiarResultFrame received") + await self._handle_diar_result(frame, direction) + else: + await self.push_frame(frame, direction) + + async def _handle_transcription( + self, frame: TranscriptionFrame | InterimTranscriptionFrame, direction: FrameDirection + ): + text_segment = frame.text + if self._vad_user_speaking: + self._user_speaking_buffer += text_segment + has_eou = self._user_speaking_buffer.endswith(self.eou_string) + has_eob = self._user_speaking_buffer.endswith(self.eob_string) + if has_eou: + # EOU detected, we assume the user is done speaking, so we push the completed text and interrupt the bot + logger.debug(f" Detected: `{self._user_speaking_buffer}`") + completed_text = self._user_speaking_buffer[: -len(self.eou_string)].strip() + self._user_speaking_buffer = "" + if self._bot_speaking and self.is_backchannel(completed_text): + logger.debug(f" detected for a backchannel phrase while bot is speaking: `{completed_text}`") + else: + await self._handle_completed_text(completed_text, direction) + await self._handle_user_interruption(UserStoppedSpeakingFrame()) + self._have_sent_user_started_speaking = False # user is done speaking, so we reset the flag + elif has_eob and self._bot_speaking: + # ignore the backchannel string while bot is speaking + logger.debug(f"Ignoring backchannel string while bot is speaking: `{self._user_speaking_buffer}`") + # push the backchannel string upstream, not downstream + await self.push_frame( + TranscriptionFrame( + text=f"({self._user_speaking_buffer})", + user_id="", + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": f"Backchannel detected: {self._user_speaking_buffer}"}, + ), + direction=FrameDirection.UPSTREAM, + ) + self._have_sent_user_started_speaking = False # treat it as if the user is not speaking + 
self._user_speaking_buffer = "" # discard backchannel string and reset the buffer + else: + # if bot is not speaking, the backchannel string is not considered a backchannel phrase + # user is still speaking, so we append the text segment to the buffer + logger.debug(f"User is speaking: `{self._user_speaking_buffer}`") + if has_eob: + logger.debug( + f"{self.eob_string} detected but ignored because bot is NOT speaking: `{self._user_speaking_buffer}`" + ) + self._user_speaking_buffer = self._user_speaking_buffer[: -len(self.eob_string)].strip() + completed_words = self._user_speaking_buffer.strip().split()[ + :-1 + ] # assume the last word is not completed + if len(completed_words) >= self.max_buffer_size: + completed_text = " ".join(completed_words) + await self._handle_completed_text(completed_text, direction, is_final=False) + else: + # if vad is not detecting user speaking + logger.debug( + f"VAD is not detecting user speaking, but still received text segment from STT: `{text_segment}`" + ) + is_backchannel = self.is_backchannel(text_segment) + if text_segment.endswith(self.eob_string): + is_backchannel = True + logger.debug(f"Dropping EOB token: `{text_segment}`") + text_segment = text_segment[: -len(self.eob_string)].strip() + elif text_segment.endswith(self.eou_string): + logger.debug(f"Dropping EOU token: `{text_segment}`") + text_segment = text_segment[: -len(self.eou_string)].strip() + + if not text_segment.strip(): + return + if is_backchannel and self._bot_speaking: + logger.debug(f"Backchannel detected while bot is speaking: `{text_segment}`") + # push the backchannel string upstream, not downstream + curr_text = str(self._user_speaking_buffer + text_segment) + self._user_speaking_buffer = "" + await self.push_frame( + TranscriptionFrame( + text=f"({curr_text})", + user_id="", + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": f"Backchannel detected: 
{self._user_speaking_buffer+text_segment}"}, + ), + direction=FrameDirection.UPSTREAM, + ) + else: + # if the text segment is not empty and have non-space characters, we append it to the buffer + self._user_speaking_buffer += text_segment + if self.is_backchannel(self._user_speaking_buffer): + logger.debug(f"Backchannel detected: `{self._user_speaking_buffer}`") + self._user_speaking_buffer = "" + self._have_sent_user_started_speaking = False + return + logger.debug(f"Appending text segment to user speaking buffer: `{self._user_speaking_buffer}`") + + async def _handle_completed_text(self, completed_text: str, direction: FrameDirection, is_final: bool = True): + if not self._have_sent_user_started_speaking: + # if we haven't sent the user started speaking frame, we send it now + # so that the bot can be interrupted and be ready to respond to the new user turn + await self._handle_user_interruption(UserStartedSpeakingFrame()) + self._have_sent_user_started_speaking = True + + completed_text = completed_text.strip() + completed_text = completed_text.replace(self.eou_string, "").replace(self.eob_string, "") + + if self.use_diar and not completed_text.startswith(" {completed_text}" + + frame_type = TranscriptionFrame if is_final else InterimTranscriptionFrame + text_frame = frame_type( + text=completed_text, + user_id="", # No speaker ID in this implementation + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": completed_text}, + ) + logger.debug(f"Pushing text frame: {text_frame}") + await self.push_frame(text_frame, direction) + + async def _handle_user_started_speaking(self, frame: VADUserStartedSpeakingFrame, direction: FrameDirection): + self._vad_user_speaking = True + logger.debug("NeMoTurnTakingService: VADUserStartedSpeakingFrame") + await self.push_frame(frame, direction) + + def _contains_only_speaker_tags(self, text: str) -> bool: + """ + Check if the text contains only speaker tags. 
+ """ + return text.strip().startswith("") + + async def _handle_user_stopped_speaking(self, frame: VADUserStoppedSpeakingFrame, direction: FrameDirection): + """ + Handle the user stopped speaking frame. + If the buffer is not empty: + If the bot is not speaking, we push the completed text frame regardless of whether it is a backchannel string. + If the bot is speaking, we ignore the backchannel string if it is a backchannel string. + If the buffer is empty, we do nothing. + """ + if self.use_vad: + self._vad_user_speaking = False + logger.debug("NeMoTurnTakingService: VADUserStoppedSpeakingFrame") + await self.push_frame(frame, direction) + + # if user buffer only contains speaker tags, we don't push the completed text frame + if self._contains_only_speaker_tags(self._user_speaking_buffer): + logger.debug(f"User buffer only contains speaker tags: `{self._user_speaking_buffer}`, ignoring") + return + + is_backchannel = self.is_backchannel(self._user_speaking_buffer) + if not self._user_speaking_buffer: + return + if not self._bot_speaking or not is_backchannel: + logger.debug(f"Bot talking: {self._bot_speaking}, backchannel: {is_backchannel}") + logger.debug(f"Pushing completed text frame for VAD user stopped speaking: {self._user_speaking_buffer}") + await self._handle_completed_text(self._user_speaking_buffer, direction) + self._user_speaking_buffer = "" + if self._have_sent_user_started_speaking: + await self._handle_user_interruption(UserStoppedSpeakingFrame()) + self._have_sent_user_started_speaking = False + elif is_backchannel: + logger.debug(f"Backchannel detected: `{self._user_speaking_buffer}`") + # push the backchannel string upstream, not downstream + await self.push_frame( + TranscriptionFrame( + text=f"({self._user_speaking_buffer})", + user_id="", + timestamp=time_now_iso8601(), + language=self.language if self.language else Language.EN_US, + result={"text": f"Backchannel detected: {self._user_speaking_buffer}"}, + ), + 
direction=FrameDirection.UPSTREAM, + ) + self._user_speaking_buffer = "" + self._have_sent_user_started_speaking = False + + async def _handle_user_interruption(self, frame: Frame): + # Adapted from BaseInputTransport._handle_user_interruption + if isinstance(frame, UserStartedSpeakingFrame): + logger.debug("User started speaking") + await self.push_frame(frame) + await self.push_frame(StartInterruptionFrame(), direction=FrameDirection.DOWNSTREAM) + elif isinstance(frame, UserStoppedSpeakingFrame): + logger.debug("User stopped speaking") + await self.push_frame(frame) + if self.interruptions_allowed: + await self.push_frame(StopInterruptionFrame(), direction=FrameDirection.DOWNSTREAM) + else: + logger.debug(f"Unknown frame type for _handle_user_interruption: {type(frame)}") + + async def _handle_diar_result(self, frame: DiarResultFrame, direction: FrameDirection): + if not self.use_diar: + logger.debug("Diarization is disabled, skipping") + return + + new_speaker_id = frame.diar_result # speaker id of the dominant speaker + + # logger.debug(f"Dominant speaker ID: {dominant_speaker_id}") + self._prev_speaker_id = self._current_speaker_id + last_speaker_id = self._current_speaker_id + + if not self._user_speaking_buffer.startswith(" to the beginning of the current utterance + self._user_speaking_buffer = f" {self._user_speaking_buffer}" + elif last_speaker_id != new_speaker_id: + # change the speaker tag to the dominant speaker id + self._user_speaking_buffer = self._user_speaking_buffer[len("") :] + self._user_speaking_buffer = f" {self._user_speaking_buffer}" + logger.debug(f"Speaker changed from {last_speaker_id} to {new_speaker_id}") + self._current_speaker_id = new_speaker_id diff --git a/nemo/agents/voice_agent/pipecat/services/nemo/utils.py b/nemo/agents/voice_agent/pipecat/services/nemo/utils.py new file mode 100644 index 000000000000..421bf9823b5a --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/services/nemo/utils.py @@ -0,0 +1,197 @@ +# Copyright (c) 
2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# NOTE: This file will be deprecated in the future, as the new inference pipeline will replace it. + +import math + +import numpy as np +import torch +from omegaconf import DictConfig + +import nemo.collections.asr as nemo_asr + +LOG_MEL_ZERO = -16.635 + + +class AudioBufferer: + def __init__(self, sample_rate: int, buffer_size_in_secs: float): + self.buffer_size = int(buffer_size_in_secs * sample_rate) + self.sample_buffer = torch.zeros(self.buffer_size, dtype=torch.float32) + + def reset(self) -> None: + """ + Reset the buffer to zero + """ + self.sample_buffer.zero_() + + def update(self, audio: np.ndarray) -> None: + """ + Update the buffer with the new frame + Args: + frame (Frame): frame to update the buffer with + """ + if not isinstance(audio, torch.Tensor): + audio = torch.from_numpy(audio) + + audio_size = audio.shape[0] + if audio_size > self.buffer_size: + raise ValueError(f"Frame size ({audio_size}) exceeds buffer size ({self.buffer_size})") + + shift = audio_size + self.sample_buffer[:-shift] = self.sample_buffer[shift:].clone() + self.sample_buffer[-shift:] = audio.clone() + + def get_buffer(self) -> torch.Tensor: + """ + Get the current buffer + Returns: + torch.Tensor: current state of the buffer + """ + return self.sample_buffer.clone() + + def is_buffer_empty(self) -> bool: + """ + Check if the buffer is empty + Returns: + bool: True if the buffer is 
empty, False otherwise + """ + return self.sample_buffer.sum() == 0 + + +class CacheFeatureBufferer: + def __init__( + self, + sample_rate: int, + buffer_size_in_secs: float, + chunk_size_in_secs: float, + preprocessor_cfg: DictConfig, + device: torch.device, + fill_value: float = LOG_MEL_ZERO, + ): + + if buffer_size_in_secs < chunk_size_in_secs: + raise ValueError( + f"Buffer size ({buffer_size_in_secs}s) should be no less than chunk size ({chunk_size_in_secs}s)" + ) + + self.sample_rate = sample_rate + self.buffer_size_in_secs = buffer_size_in_secs + self.chunk_size_in_secs = chunk_size_in_secs + self.device = device + + if hasattr(preprocessor_cfg, 'log') and preprocessor_cfg.log: + self.ZERO_LEVEL_SPEC_DB_VAL = LOG_MEL_ZERO # Log-Mel spectrogram value for zero signals + else: + self.ZERO_LEVEL_SPEC_DB_VAL = fill_value + + self.n_feat = preprocessor_cfg.features + self.timestep_duration = preprocessor_cfg.window_stride + self.n_chunk_look_back = int(self.timestep_duration * self.sample_rate) + self.chunk_size = int(self.chunk_size_in_secs * self.sample_rate) + self.sample_buffer = AudioBufferer(sample_rate, buffer_size_in_secs) + + self.feature_buffer_len = int(buffer_size_in_secs / self.timestep_duration) + self.feature_chunk_len = int(chunk_size_in_secs / self.timestep_duration) + self.feature_buffer = torch.full( + [self.n_feat, self.feature_buffer_len], + self.ZERO_LEVEL_SPEC_DB_VAL, + dtype=torch.float32, + device=self.device, + ) + + self.preprocessor = nemo_asr.models.ASRModel.from_config_dict(preprocessor_cfg) + self.preprocessor.to(self.device) + + def is_buffer_empty(self) -> bool: + """ + Check if the buffer is empty + Returns: + bool: True if the buffer is empty, False otherwise + """ + return self.sample_buffer.is_buffer_empty() + + def reset(self) -> None: + """ + Reset the buffer to zero + """ + self.sample_buffer.reset() + self.feature_buffer.fill_(self.ZERO_LEVEL_SPEC_DB_VAL) + + def _update_feature_buffer(self, feat_chunk: torch.Tensor) -> 
None: + """ + Add an extracted feature to `feature_buffer` + """ + self.feature_buffer[:, : -self.feature_chunk_len] = self.feature_buffer[:, self.feature_chunk_len :].clone() + self.feature_buffer[:, -self.feature_chunk_len :] = feat_chunk.clone() + + def preprocess(self, audio_signal: torch.Tensor) -> torch.Tensor: + """ + Preprocess the audio signal using the preprocessor + Args: + audio_signal (torch.Tensor): audio signal + Returns: + torch.Tensor: preprocessed features + """ + audio_signal = audio_signal.unsqueeze_(0).to(self.device) + audio_signal_len = torch.tensor([audio_signal.shape[1]], device=self.device) + features, _ = self.preprocessor( + input_signal=audio_signal, + length=audio_signal_len, + ) + features = features.squeeze() + return features + + def update(self, audio: np.ndarray) -> None: + """ + Update the sample anf feature buffers with the new frame + Args: + frame (Frame): frame to update the buffer with + """ + + # Update the sample buffer with the new frame + self.sample_buffer.update(audio) + + if math.isclose(self.buffer_size_in_secs, self.chunk_size_in_secs): + # If the buffer size is equal to the chunk size, just take the whole buffer + samples = self.sample_buffer.sample_buffer.clone() + else: + # Add look_back to have context for the first feature + samples = self.sample_buffer.sample_buffer[-(self.n_chunk_look_back + self.chunk_size) :] + + # Get the mel spectrogram + features = self.preprocess(samples) + + # If the features are longer than supposed to be, drop the last frames + # Drop the last diff frames because they might be incomplete + if (diff := features.shape[1] - self.feature_chunk_len - 1) > 0: + features = features[:, :-diff] + + # Update the feature buffer with the new features + self._update_feature_buffer(features[:, -self.feature_chunk_len :]) + + def get_buffer(self) -> torch.Tensor: + """ + Get the current sample buffer + Returns: + torch.Tensor: current state of the buffer + """ + return 
self.sample_buffer.get_buffer() + + def get_feature_buffer(self) -> torch.Tensor: + """ + Get the current feature buffer + Returns: + torch.Tensor: current state of the feature buffer + """ + return self.feature_buffer.clone() diff --git a/nemo/agents/voice_agent/pipecat/transports/__init__.py b/nemo/agents/voice_agent/pipecat/transports/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/transports/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/agents/voice_agent/pipecat/transports/base_input.py b/nemo/agents/voice_agent/pipecat/transports/base_input.py new file mode 100644 index 000000000000..79a477ad3416 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/transports/base_input.py @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from loguru import logger +from pipecat.audio.vad.vad_analyzer import VADState +from pipecat.frames.frames import ( + InputAudioRawFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, + VADUserStartedSpeakingFrame, + VADUserStoppedSpeakingFrame, +) +from pipecat.transports.base_input import BaseInputTransport as _BaseInputTransport + + +class BaseInputTransport(_BaseInputTransport): + async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState): + """Handle Voice Activity Detection results and generate appropriate frames.""" + new_vad_state = await self._vad_analyze(audio_frame) + if new_vad_state != vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING: + frame = None + # If the turn analyser is enabled, this will prevent: + # - Creating the UserStoppedSpeakingFrame + # - Creating the UserStartedSpeakingFrame multiple times + can_create_user_frames = ( + self._params.turn_analyzer is None or not self._params.turn_analyzer.speech_triggered + ) and self._params.can_create_user_frames + + if new_vad_state == VADState.SPEAKING: + await self.push_frame(VADUserStartedSpeakingFrame()) + if can_create_user_frames: + frame = UserStartedSpeakingFrame() + else: + logger.debug("base_input: VAD state changed to SPEAKING but can_create_user_frames is False") + elif new_vad_state == VADState.QUIET: + await self.push_frame(VADUserStoppedSpeakingFrame()) + if can_create_user_frames: + frame = UserStoppedSpeakingFrame() + else: + logger.debug("base_input: VAD state changed to QUIET but can_create_user_frames is False") + + if frame: + await self._handle_user_interruption(frame) + + vad_state = new_vad_state + return vad_state diff --git a/nemo/agents/voice_agent/pipecat/transports/base_transport.py b/nemo/agents/voice_agent/pipecat/transports/base_transport.py new file mode 100644 index 
000000000000..eb57024611b6 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/transports/base_transport.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from pipecat.transports.base_transport import TransportParams as _TransportParams + + +class TransportParams(_TransportParams): + can_create_user_frames: bool = True diff --git a/nemo/agents/voice_agent/pipecat/transports/network/__init__.py b/nemo/agents/voice_agent/pipecat/transports/network/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/transports/network/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo/agents/voice_agent/pipecat/transports/network/websocket_server.py b/nemo/agents/voice_agent/pipecat/transports/network/websocket_server.py new file mode 100644 index 000000000000..800d9ddb860e --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/transports/network/websocket_server.py @@ -0,0 +1,304 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import asyncio +from typing import Optional + +from loguru import logger +from pipecat.frames.frames import CancelFrame, EndFrame, InputAudioRawFrame, StartFrame +from pipecat.serializers.base_serializer import FrameSerializer +from pipecat.transports.base_transport import BaseTransport +from pipecat.transports.network.websocket_server import ( + WebsocketServerCallbacks, + WebsocketServerOutputTransport, + WebsocketServerParams, +) + +from nemo.agents.voice_agent.pipecat.transports.base_input import BaseInputTransport +from nemo.agents.voice_agent.pipecat.transports.base_transport import TransportParams + +try: + import websockets +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use websockets, you need to `pip install pipecat-ai[websocket]`.") + raise Exception(f"Missing module: {e}") + + +class WebsocketServerParams(TransportParams): + """Configuration parameters for WebSocket server transport. + + Parameters: + add_wav_header: Whether to add WAV headers to audio frames. 
+ serializer: Frame serializer for message encoding/decoding. + session_timeout: Timeout in seconds for client sessions. + """ + + add_wav_header: bool = False + serializer: Optional[FrameSerializer] = None + session_timeout: Optional[int] = None + + +class WebsocketServerInputTransport(BaseInputTransport): + """WebSocket server input transport for receiving client data. + + Handles incoming WebSocket connections, message processing, and client + session management including timeout monitoring and connection lifecycle. + """ + + def __init__( + self, + transport: BaseTransport, + host: str, + port: int, + params: WebsocketServerParams, + callbacks: WebsocketServerCallbacks, + **kwargs, + ): + """Initialize the WebSocket server input transport. + + Args: + transport: The parent transport instance. + host: Host address to bind the WebSocket server to. + port: Port number to bind the WebSocket server to. + params: WebSocket server configuration parameters. + callbacks: Callback functions for WebSocket events. + **kwargs: Additional arguments passed to parent class. + """ + super().__init__(params, **kwargs) + + self._transport = transport + self._host = host + self._port = port + self._params = params + self._callbacks = callbacks + + self._websocket: Optional[websockets.WebSocketServerProtocol] = None + + self._server_task = None + + # This task will monitor the websocket connection periodically. + self._monitor_task = None + + self._stop_server_event = asyncio.Event() + + # Whether we have seen a StartFrame already. + self._initialized = False + + async def start(self, frame: StartFrame): + """Start the WebSocket server and initialize components. + + Args: + frame: The start frame containing initialization parameters. 
+ """ + await super().start(frame) + + if self._initialized: + return + + self._initialized = True + + if self._params.serializer: + await self._params.serializer.setup(frame) + if not self._server_task: + self._server_task = self.create_task(self._server_task_handler()) + await self.set_transport_ready(frame) + + async def stop(self, frame: EndFrame): + """Stop the WebSocket server and cleanup resources. + + Args: + frame: The end frame signaling transport shutdown. + """ + await super().stop(frame) + self._stop_server_event.set() + if self._monitor_task: + await self.cancel_task(self._monitor_task) + self._monitor_task = None + if self._server_task: + await self.wait_for_task(self._server_task) + self._server_task = None + + async def cancel(self, frame: CancelFrame): + """Cancel the WebSocket server and stop all processing. + + Args: + frame: The cancel frame signaling immediate cancellation. + """ + await super().cancel(frame) + if self._monitor_task: + await self.cancel_task(self._monitor_task) + self._monitor_task = None + if self._server_task: + await self.cancel_task(self._server_task) + self._server_task = None + + async def cleanup(self): + """Cleanup resources and parent transport.""" + await super().cleanup() + await self._transport.cleanup() + + async def _server_task_handler(self): + """Handle WebSocket server startup and client connections.""" + logger.info(f"Starting websocket server on {self._host}:{self._port}") + async with websockets.serve(self._client_handler, self._host, self._port) as server: + await self._callbacks.on_websocket_ready() + await self._stop_server_event.wait() + + async def _client_handler(self, websocket: websockets.WebSocketServerProtocol, path: Optional[str] = None): + """Handle individual client connections and message processing.""" + logger.info(f"New client connection from {websocket.remote_address}") + if self._websocket: + await self._websocket.close() + logger.warning("Only one client connected, using new connection") 
+ + self._websocket = websocket + + # Notify + await self._callbacks.on_client_connected(websocket) + + # Create a task to monitor the websocket connection + if not self._monitor_task and self._params.session_timeout: + self._monitor_task = self.create_task(self._monitor_websocket(websocket, self._params.session_timeout)) + + # Handle incoming messages + try: + async for message in websocket: + if not self._params.serializer: + continue + + frame = await self._params.serializer.deserialize(message) + + if not frame: + continue + + if isinstance(frame, InputAudioRawFrame): + await self.push_audio_frame(frame) + else: + await self.push_frame(frame) + except Exception as e: + logger.error(f"{self} exception receiving data: {e.__class__.__name__} ({e})") + + # Notify disconnection + await self._callbacks.on_client_disconnected(websocket) + + await self._websocket.close() + self._websocket = None + + logger.info(f"Client {websocket.remote_address} disconnected") + + async def _monitor_websocket(self, websocket: websockets.WebSocketServerProtocol, session_timeout: int): + """Monitor WebSocket connection for session timeout.""" + try: + await asyncio.sleep(session_timeout) + if not websocket.closed: + await self._callbacks.on_session_timeout(websocket) + except asyncio.CancelledError: + logger.info(f"Monitoring task cancelled for: {websocket.remote_address}") + raise + + +class WebsocketServerTransport(BaseTransport): + """WebSocket server transport for bidirectional real-time communication. + + Provides a complete WebSocket server implementation with separate input and + output transports, client connection management, and event handling for + real-time audio and data streaming applications. + """ + + def __init__( + self, + params: WebsocketServerParams, + host: str = "localhost", + port: int = 8765, + input_name: Optional[str] = None, + output_name: Optional[str] = None, + ): + """Initialize the WebSocket server transport. 
+
+        Args:
+            params: WebSocket server configuration parameters.
+            host: Host address to bind the server to. Defaults to "localhost".
+            port: Port number to bind the server to. Defaults to 8765.
+            input_name: Optional name for the input processor.
+            output_name: Optional name for the output processor.
+        """
+        super().__init__(input_name=input_name, output_name=output_name)
+        self._host = host
+        self._port = port
+        self._params = params
+
+        # Bridge internal transport events to this class's private handlers,
+        # which in turn invoke any user-registered event handlers.
+        self._callbacks = WebsocketServerCallbacks(
+            on_client_connected=self._on_client_connected,
+            on_client_disconnected=self._on_client_disconnected,
+            on_session_timeout=self._on_session_timeout,
+            on_websocket_ready=self._on_websocket_ready,
+        )
+        self._input: Optional[WebsocketServerInputTransport] = None
+        self._output: Optional[WebsocketServerOutputTransport] = None
+        self._websocket: Optional[websockets.WebSocketServerProtocol] = None
+
+        # Register supported handlers. The user will only be able to register
+        # these handlers.
+        self._register_event_handler("on_client_connected")
+        self._register_event_handler("on_client_disconnected")
+        self._register_event_handler("on_session_timeout")
+        self._register_event_handler("on_websocket_ready")
+
+    def input(self) -> WebsocketServerInputTransport:
+        """Get the input transport for receiving client data.
+
+        The input transport is created lazily on first access and cached.
+
+        Returns:
+            The WebSocket server input transport instance.
+        """
+        if not self._input:
+            self._input = WebsocketServerInputTransport(
+                self, self._host, self._port, self._params, self._callbacks, name=self._input_name
+            )
+        return self._input
+
+    def output(self) -> WebsocketServerOutputTransport:
+        """Get the output transport for sending data to clients.
+
+        The output transport is created lazily on first access and cached.
+
+        Returns:
+            The WebSocket server output transport instance.
+ """ + if not self._output: + self._output = WebsocketServerOutputTransport(self, self._params, name=self._output_name) + return self._output + + async def _on_client_connected(self, websocket): + """Handle client connection events.""" + if self._output: + await self._output.set_client_connection(websocket) + await self._call_event_handler("on_client_connected", websocket) + else: + logger.error("A WebsocketServerTransport output is missing in the pipeline") + + async def _on_client_disconnected(self, websocket): + """Handle client disconnection events.""" + if self._output: + await self._output.set_client_connection(None) + await self._call_event_handler("on_client_disconnected", websocket) + else: + logger.error("A WebsocketServerTransport output is missing in the pipeline") + + async def _on_session_timeout(self, websocket): + """Handle client session timeout events.""" + await self._call_event_handler("on_session_timeout", websocket) + + async def _on_websocket_ready(self): + """Handle WebSocket server ready events.""" + await self._call_event_handler("on_websocket_ready") diff --git a/nemo/agents/voice_agent/pipecat/utils/__init__.py b/nemo/agents/voice_agent/pipecat/utils/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo/agents/voice_agent/pipecat/utils/text/__init__.py b/nemo/agents/voice_agent/pipecat/utils/text/__init__.py new file mode 100644 index 000000000000..341a77c5bc66 --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/utils/text/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/agents/voice_agent/pipecat/utils/text/simple_text_aggregator.py b/nemo/agents/voice_agent/pipecat/utils/text/simple_text_aggregator.py new file mode 100644 index 000000000000..ada66aef6dec --- /dev/null +++ b/nemo/agents/voice_agent/pipecat/utils/text/simple_text_aggregator.py @@ -0,0 +1,52 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Optional

from pipecat.utils.string import match_endofsentence
from pipecat.utils.text.simple_text_aggregator import SimpleTextAggregator


class SimpleSegmentedTextAggregator(SimpleTextAggregator):
    """Text aggregator that emits a segment at a full end of sentence OR, when
    none is found yet, at a weaker punctuation mark (default ",!?"), so
    downstream TTS can start synthesizing before a long sentence completes.
    """

    def __init__(self, punctuation_marks: str | list[str] = ",!?", **kwargs):
        """Initialize the aggregator.

        Args:
            punctuation_marks: Characters treated as segment boundaries when no
                end of sentence is detected. Pass an empty value to disable the
                fallback and split on sentence boundaries only.
        """
        super().__init__(**kwargs)
        self._punctuation_marks = set(punctuation_marks) if punctuation_marks else set()

    def _find_segment_end(self, text: str) -> Optional[int]:
        """Return the exclusive end index of the earliest segment boundary.

        The returned index includes the punctuation character itself, matching
        the semantics of ``match_endofsentence`` (exclusive end position).
        Returns None when no configured punctuation mark occurs in ``text``.

        Fixes over the previous implementation: iterating a ``set`` returned
        the first occurrence of an *arbitrary* mark (non-deterministic
        segmentation) instead of the earliest boundary; the punctuation was
        excluded from the emitted segment and leaked into the next one; and a
        mark at index 0 was dropped by the caller's truthiness check.
        """
        positions = [text.find(punc) for punc in self._punctuation_marks]
        found = [pos for pos in positions if pos != -1]
        if not found:
            return None
        # +1 keeps the punctuation with the emitted segment.
        return min(found) + 1

    async def aggregate(self, text: str) -> Optional[str]:
        """Accumulate ``text`` and return a completed segment, if any.

        Args:
            text: The next chunk of (typically streamed LLM) text.

        Returns:
            The next complete segment, or None if more text is needed.
        """
        self._text += text

        # Strip markdown emphasis markers so TTS does not read asterisks.
        self._text = self._text.replace("*", "")

        # Prefer a proper end of sentence; fall back to weaker punctuation.
        end = match_endofsentence(self._text)
        if not end:
            end = self._find_segment_end(self._text)

        if not end:
            return None

        result = self._text[:end]
        self._text = self._text[end:]
        return result