From 2bd59f08c31b3c20512ecf6eadc4dc65c91dd051 Mon Sep 17 00:00:00 2001 From: rootflo-hardik Date: Wed, 11 Feb 2026 11:53:37 +0530 Subject: [PATCH 1/2] sarvam integration for stt and tts --- .../client/src/config/voice-providers.ts | 93 ++++++++++++++++++- .../stt-configs/CreateSttConfigDialog.tsx | 2 +- .../stt-configs/EditSttConfigDialog.tsx | 2 +- .../tts-configs/CreateTtsConfigDialog.tsx | 2 +- .../tts-configs/EditTtsConfigDialog.tsx | 2 +- wavefront/client/src/types/stt-config.ts | 10 +- wavefront/client/src/types/tts-config.ts | 13 ++- .../call_processing/services/stt_service.py | 56 +++++++++++ .../call_processing/services/tts_service.py | 55 +++++++++++ .../apps/call_processing/pyproject.toml | 2 +- .../voice_agents_module/models/stt_schemas.py | 1 + .../voice_agents_module/models/tts_schemas.py | 1 + .../services/tts_generator_service.py | 80 ++++++++++++++++ .../utils/language_validation.py | 18 ++++ wavefront/server/uv.lock | 24 ++++- 15 files changed, 350 insertions(+), 11 deletions(-) diff --git a/wavefront/client/src/config/voice-providers.ts b/wavefront/client/src/config/voice-providers.ts index 1fa3be1d..053f4068 100644 --- a/wavefront/client/src/config/voice-providers.ts +++ b/wavefront/client/src/config/voice-providers.ts @@ -44,7 +44,7 @@ export interface VoiceProvidersConfig { */ export const VOICE_PROVIDERS_CONFIG: VoiceProvidersConfig = { tts: { - providers: ['elevenlabs', 'deepgram', 'cartesia'] as const, + providers: ['elevenlabs', 'deepgram', 'cartesia', 'sarvam'] as const, configs: { elevenlabs: { name: 'ElevenLabs', @@ -159,10 +159,68 @@ export const VOICE_PROVIDERS_CONFIG: VoiceProvidersConfig = { }, }, }, + sarvam: { + name: 'Sarvam', + badge: { + bg: 'bg-orange-100', + text: 'text-orange-800', + }, + parameters: { + model: { + type: 'string', + default: 'bulbul:v2', + options: ['bulbul:v2', 'bulbul:v3'], + description: 'Sarvam TTS model', + }, + language: { + type: 'string', + default: '', + description: 'Language code', + placeholder: 
'hi', + }, + pitch: { + type: 'number', + default: 0.0, + min: -0.75, + max: 0.75, + step: 0.05, + description: 'Voice pitch (-0.75 to 0.75)', + }, + pace: { + type: 'number', + default: 1.0, + min: 0.3, + max: 3.0, + step: 0.1, + description: 'Speech pace (0.3-3.0)', + }, + loudness: { + type: 'number', + default: 1.0, + min: 0.1, + max: 3.0, + step: 0.1, + description: 'Volume (0.1-3.0)', + }, + enable_preprocessing: { + type: 'boolean', + default: false, + description: 'Enable text preprocessing', + }, + temperature: { + type: 'number', + default: 0.6, + min: 0.01, + max: 1.0, + step: 0.05, + description: 'Randomness for bulbul v3 (0.01-1.0)', + }, + }, + }, }, }, stt: { - providers: ['deepgram'] as const, + providers: ['deepgram', 'sarvam'] as const, configs: { deepgram: { name: 'Deepgram', @@ -236,6 +294,37 @@ export const VOICE_PROVIDERS_CONFIG: VoiceProvidersConfig = { }, }, }, + sarvam: { + name: 'Sarvam', + badge: { + bg: 'bg-orange-100', + text: 'text-orange-800', + }, + parameters: { + model: { + type: 'string', + default: 'saarika:v2.5', + options: ['saarika:v2.5', 'saaras:v2'], + description: 'Sarvam STT model', + }, + language: { + type: 'string', + default: '', + description: 'Language code', + placeholder: 'hi', + }, + vad_signals: { + type: 'boolean', + default: true, + description: 'Enable VAD signals', + }, + high_vad_sensitivity: { + type: 'boolean', + default: false, + description: 'High VAD sensitivity', + }, + }, + }, }, }, }; diff --git a/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/CreateSttConfigDialog.tsx b/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/CreateSttConfigDialog.tsx index 5d233c91..2d3a76d4 100644 --- a/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/CreateSttConfigDialog.tsx +++ b/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/CreateSttConfigDialog.tsx @@ -32,7 +32,7 @@ import { z } from 'zod'; const createSttConfigSchema = z.object({ display_name: 
z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'), description: z.string().max(500, 'Description must be 500 characters or less').optional(), - provider: z.enum(['deepgram'] as [string, ...string[]]), + provider: z.enum(['deepgram', 'sarvam'] as [string, ...string[]]), api_key: z.string().min(1, 'API key is required'), }); diff --git a/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/EditSttConfigDialog.tsx b/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/EditSttConfigDialog.tsx index bd91c6c1..4a4c7eb5 100644 --- a/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/EditSttConfigDialog.tsx +++ b/wavefront/client/src/pages/apps/[appId]/voice-agents/stt-configs/EditSttConfigDialog.tsx @@ -33,7 +33,7 @@ import { z } from 'zod'; const updateSttConfigSchema = z.object({ display_name: z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'), description: z.string().max(500, 'Description must be 500 characters or less').optional(), - provider: z.enum(['deepgram'] as [string, ...string[]]), + provider: z.enum(['deepgram', 'sarvam'] as [string, ...string[]]), api_key: z.string().optional(), }); diff --git a/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/CreateTtsConfigDialog.tsx b/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/CreateTtsConfigDialog.tsx index d2389860..77d3fad0 100644 --- a/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/CreateTtsConfigDialog.tsx +++ b/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/CreateTtsConfigDialog.tsx @@ -32,7 +32,7 @@ import { z } from 'zod'; const createTtsConfigSchema = z.object({ display_name: z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'), description: z.string().max(500, 'Description must be 500 characters or less').optional(), - provider: z.enum(['elevenlabs', 
'deepgram', 'cartesia'] as [string, ...string[]]), + provider: z.enum(['elevenlabs', 'deepgram', 'cartesia', 'sarvam'] as [string, ...string[]]), api_key: z.string().min(1, 'API key is required'), }); diff --git a/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/EditTtsConfigDialog.tsx b/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/EditTtsConfigDialog.tsx index 0a27fff3..dc6df2e1 100644 --- a/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/EditTtsConfigDialog.tsx +++ b/wavefront/client/src/pages/apps/[appId]/voice-agents/tts-configs/EditTtsConfigDialog.tsx @@ -33,7 +33,7 @@ import { z } from 'zod'; const updateTtsConfigSchema = z.object({ display_name: z.string().min(1, 'Display name is required').max(100, 'Display name must be 100 characters or less'), description: z.string().max(500, 'Description must be 500 characters or less').optional(), - provider: z.enum(['elevenlabs', 'deepgram', 'cartesia'] as [string, ...string[]]), + provider: z.enum(['elevenlabs', 'deepgram', 'cartesia', 'sarvam'] as [string, ...string[]]), api_key: z.string().optional(), }); diff --git a/wavefront/client/src/types/stt-config.ts b/wavefront/client/src/types/stt-config.ts index 101c0330..13935869 100644 --- a/wavefront/client/src/types/stt-config.ts +++ b/wavefront/client/src/types/stt-config.ts @@ -1,6 +1,6 @@ import { IApiResponse } from '@app/lib/axios'; -export type SttProvider = 'deepgram'; +export type SttProvider = 'deepgram' | 'sarvam'; export interface SttConfig { id: string; @@ -52,3 +52,11 @@ export interface DeepgramSttParameters { profanity_filter?: boolean; vad_events?: boolean; } + +// Sarvam STT specific parameters +export interface SarvamSttParameters { + model?: string; // default: 'saarika:v2.5' + language?: string; + vad_signals?: boolean; + high_vad_sensitivity?: boolean; +} diff --git a/wavefront/client/src/types/tts-config.ts b/wavefront/client/src/types/tts-config.ts index 24e93155..bef161b4 100644 --- 
a/wavefront/client/src/types/tts-config.ts +++ b/wavefront/client/src/types/tts-config.ts @@ -1,6 +1,6 @@ import { IApiResponse } from '@app/lib/axios'; -export type TtsProvider = 'elevenlabs' | 'deepgram' | 'cartesia'; +export type TtsProvider = 'elevenlabs' | 'deepgram' | 'cartesia' | 'sarvam'; export interface TtsConfig { id: string; @@ -62,3 +62,14 @@ export interface CartesiaParameters { language?: string; // Language enum speed?: number; } + +// Sarvam TTS specific parameters +export interface SarvamTtsParameters { + model?: string; // default: 'bulbul:v2' + language?: string; + pitch?: number; // -0.75 to 0.75 + pace?: number; // 0.3 to 3.0 + loudness?: number; // 0.1 to 3.0 + enable_preprocessing?: boolean; + temperature?: number; // 0.01 to 1.0 +} diff --git a/wavefront/server/apps/call_processing/call_processing/services/stt_service.py b/wavefront/server/apps/call_processing/call_processing/services/stt_service.py index 117b2ede..6a81ee73 100644 --- a/wavefront/server/apps/call_processing/call_processing/services/stt_service.py +++ b/wavefront/server/apps/call_processing/call_processing/services/stt_service.py @@ -9,6 +9,10 @@ # Pipecat STT services from pipecat.services.deepgram.stt import DeepgramSTTService +from pipecat.services.sarvam.stt import SarvamSTTService + +# Pipecat language enum +from pipecat.transcriptions.language import Language # Deepgram options from deepgram import LiveOptions @@ -51,6 +55,8 @@ def create_stt_service(stt_config: Dict[str, Any]): if provider == 'deepgram': return STTServiceFactory._create_deepgram_stt(api_key, parameters) + elif provider == 'sarvam': + return STTServiceFactory._create_sarvam_stt(api_key, parameters) elif provider == 'assemblyai': return STTServiceFactory._create_assemblyai_stt(api_key, parameters) elif provider == 'whisper': @@ -106,6 +112,56 @@ def _create_deepgram_stt(api_key: str, parameters: Dict[str, Any]): return DeepgramSTTService(api_key=api_key, live_options=live_options) + # Mapping of short 
language codes to pipecat Language enum for Sarvam + SARVAM_LANGUAGE_MAP = { + 'bn': Language.BN_IN, + 'en': Language.EN_IN, + 'gu': Language.GU_IN, + 'hi': Language.HI_IN, + 'kn': Language.KN_IN, + 'ml': Language.ML_IN, + 'mr': Language.MR_IN, + 'or': Language.OR_IN, + 'pa': Language.PA_IN, + 'ta': Language.TA_IN, + 'te': Language.TE_IN, + } + + @staticmethod + def _create_sarvam_stt(api_key: str, parameters: Dict[str, Any]): + """Create Sarvam STT service""" + params_dict = {} + + # Map language code to pipecat Language enum + if 'language' in parameters and parameters['language']: + lang_code = parameters['language'] + lang_enum = STTServiceFactory.SARVAM_LANGUAGE_MAP.get(lang_code) + if lang_enum: + params_dict['language'] = lang_enum + else: + logger.warning(f"Unknown Sarvam language '{lang_code}', skipping") + + if 'vad_signals' in parameters: + params_dict['vad_signals'] = parameters['vad_signals'] + if 'high_vad_sensitivity' in parameters: + params_dict['high_vad_sensitivity'] = parameters['high_vad_sensitivity'] + + model = parameters.get('model', 'saarika:v2.5') + sample_rate = parameters.get('sample_rate', 8000) + + input_params = ( + SarvamSTTService.InputParams(**params_dict) if params_dict else None + ) + + logger.info(f'Sarvam STT config: model={model}, sample_rate={sample_rate}') + + return SarvamSTTService( + api_key=api_key, + model=model, + sample_rate=sample_rate, + params=input_params, + ) + @staticmethod def _create_assemblyai_stt(api_key: str, parameters: Dict[str, Any]): """Create AssemblyAI STT service""" diff --git a/wavefront/server/apps/call_processing/call_processing/services/tts_service.py b/wavefront/server/apps/call_processing/call_processing/services/tts_service.py index 9e0a960d..0b6e6d1c 100644 --- a/wavefront/server/apps/call_processing/call_processing/services/tts_service.py +++ b/wavefront/server/apps/call_processing/call_processing/services/tts_service.py @@ -11,6 +11,7 @@ from pipecat.services.elevenlabs.tts import 
ElevenLabsTTSService from pipecat.services.deepgram.tts import DeepgramTTSService from pipecat.services.cartesia.tts import CartesiaTTSService +from pipecat.services.sarvam.tts import SarvamTTSService # Language for params from pipecat.transcriptions.language import Language @@ -62,6 +63,8 @@ def create_tts_service(tts_config: Dict[str, Any]): return TTSServiceFactory._create_deepgram_tts(api_key, voice_id, parameters) elif provider == 'cartesia': return TTSServiceFactory._create_cartesia_tts(api_key, voice_id, parameters) + elif provider == 'sarvam': + return TTSServiceFactory._create_sarvam_tts(api_key, voice_id, parameters) else: raise ValueError(f'Unsupported TTS provider: {provider}') @@ -162,3 +165,55 @@ def _create_cartesia_tts(api_key: str, voice_id: str, parameters: Dict[str, Any] return CartesiaTTSService( api_key=api_key, voice_id=voice_id, model=model, params=input_params ) + + # Mapping of short language codes to pipecat Language enum for Sarvam + SARVAM_LANGUAGE_MAP = { + 'bn': Language.BN_IN, + 'en': Language.EN_IN, + 'gu': Language.GU_IN, + 'hi': Language.HI_IN, + 'kn': Language.KN_IN, + 'ml': Language.ML_IN, + 'mr': Language.MR_IN, + 'or': Language.OR_IN, + 'pa': Language.PA_IN, + 'ta': Language.TA_IN, + 'te': Language.TE_IN, + } + + @staticmethod + def _create_sarvam_tts(api_key: str, voice_id: str, parameters: Dict[str, Any]): + """Create Sarvam TTS service (WebSocket-based streaming)""" + model = parameters.get('model', 'bulbul:v2') + + # Build InputParams from the parameters dict + params_dict = {} + + if 'language' in parameters and parameters['language']: + lang_code = parameters['language'] + lang_enum = TTSServiceFactory.SARVAM_LANGUAGE_MAP.get(lang_code) + if lang_enum: + params_dict['language'] = lang_enum + else: + logger.warning(f"Unknown Sarvam language '{lang_code}', skipping") + + if 'pitch' in parameters: + params_dict['pitch'] = parameters['pitch'] + if 'pace' in parameters: + params_dict['pace'] = parameters['pace'] + if 
'loudness' in parameters: + params_dict['loudness'] = parameters['loudness'] + if 'enable_preprocessing' in parameters: + params_dict['enable_preprocessing'] = parameters['enable_preprocessing'] + if 'temperature' in parameters: + params_dict['temperature'] = parameters['temperature'] + + input_params = ( + SarvamTTSService.InputParams(**params_dict) if params_dict else None + ) + + logger.info(f'Sarvam TTS config: voice={voice_id}, model={model}') + + return SarvamTTSService( + api_key=api_key, voice_id=voice_id, model=model, params=input_params + ) diff --git a/wavefront/server/apps/call_processing/pyproject.toml b/wavefront/server/apps/call_processing/pyproject.toml index a0a776bf..49a38829 100644 --- a/wavefront/server/apps/call_processing/pyproject.toml +++ b/wavefront/server/apps/call_processing/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "redis>=5.0.0", "tenacity>=8.0.0", # Pipecat and voice processing - "pipecat-ai[websocket,cartesia,google,silero,deepgram,groq,runner,azure,local-smart-turn-v3]==0.0.100", + "pipecat-ai[websocket,cartesia,google,silero,deepgram,groq,runner,azure,local-smart-turn-v3,sarvam]==0.0.100", # Twilio "twilio>=8.0.0", ] diff --git a/wavefront/server/modules/voice_agents_module/voice_agents_module/models/stt_schemas.py b/wavefront/server/modules/voice_agents_module/voice_agents_module/models/stt_schemas.py index 219011e2..298931f6 100644 --- a/wavefront/server/modules/voice_agents_module/voice_agents_module/models/stt_schemas.py +++ b/wavefront/server/modules/voice_agents_module/voice_agents_module/models/stt_schemas.py @@ -14,6 +14,7 @@ class SttProvider(str, Enum): WHISPER = 'whisper' GOOGLE = 'google' AZURE = 'azure' + SARVAM = 'sarvam' class CreateSttConfigPayload(BaseModel): diff --git a/wavefront/server/modules/voice_agents_module/voice_agents_module/models/tts_schemas.py b/wavefront/server/modules/voice_agents_module/voice_agents_module/models/tts_schemas.py index f0e702da..58ee2952 100644 --- 
a/wavefront/server/modules/voice_agents_module/voice_agents_module/models/tts_schemas.py +++ b/wavefront/server/modules/voice_agents_module/voice_agents_module/models/tts_schemas.py @@ -15,6 +15,7 @@ class TtsProvider(str, Enum): AZURE = 'azure' GOOGLE = 'google' AWS = 'aws' + SARVAM = 'sarvam' class CreateTtsConfigPayload(BaseModel): diff --git a/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py b/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py index 445d60eb..08ee2f76 100644 --- a/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py +++ b/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py @@ -5,6 +5,7 @@ This service is used to pre-generate welcome message audio files. """ +import base64 import httpx from typing import Dict, Any from common_module.log.logger import logger @@ -44,6 +45,8 @@ async def generate_audio(self, text: str, tts_config: Dict[str, Any]) -> bytes: return await self._generate_deepgram(text, api_key, voice_id, parameters) elif provider == 'cartesia': return await self._generate_cartesia(text, api_key, voice_id, parameters) + elif provider == 'sarvam': + return await self._generate_sarvam(text, api_key, voice_id, parameters) else: raise ValueError( f'Unsupported TTS provider for audio generation: {provider}' @@ -226,3 +229,80 @@ async def _generate_cartesia( except Exception as e: logger.error(f'Cartesia request failed: {str(e)}') raise Exception(f'Cartesia TTS generation failed: {str(e)}') + + # Mapping of short language codes to Sarvam API format + SARVAM_LANGUAGE_CODE_MAP = { + 'bn': 'bn-IN', + 'en': 'en-IN', + 'gu': 'gu-IN', + 'hi': 'hi-IN', + 'kn': 'kn-IN', + 'ml': 'ml-IN', + 'mr': 'mr-IN', + 'or': 'od-IN', + 'pa': 'pa-IN', + 'ta': 'ta-IN', + 'te': 'te-IN', + } + + async def _generate_sarvam( + self, text: str, api_key: str, voice_id: str, 
parameters: Dict[str, Any] + ) -> bytes: + """ + Generate audio using Sarvam AI REST API. + + API Docs: https://docs.sarvam.ai/api-reference-docs/endpoints/text-to-speech + """ + url = 'https://api.sarvam.ai/text-to-speech' + + headers = { + 'api-subscription-key': api_key, + 'Content-Type': 'application/json', + } + + # Map short language code to Sarvam format; '' or None falls back to 'hi' + lang = parameters.get('language') or 'hi' + target_language_code = self.SARVAM_LANGUAGE_CODE_MAP.get(lang, f'{lang}-IN') + + body = { + 'text': text, + 'target_language_code': target_language_code, + 'speaker': voice_id, + 'model': parameters.get('model', 'bulbul:v2'), + } + + # Optional parameters + if 'pitch' in parameters: + body['pitch'] = parameters['pitch'] + if 'pace' in parameters: + body['pace'] = parameters['pace'] + if 'loudness' in parameters: + body['loudness'] = parameters['loudness'] + if 'sample_rate' in parameters: + body['sample_rate'] = parameters['sample_rate'] + if 'enable_preprocessing' in parameters: + body['enable_preprocessing'] = parameters['enable_preprocessing'] + + try: + async with httpx.AsyncClient(timeout=self.timeout) as client: + response = await client.post(url, headers=headers, json=body) + response.raise_for_status() + + # Sarvam returns JSON with audios[] array of base64-encoded audio + data = response.json() + audio_b64 = data['audios'][0] + audio_bytes = base64.b64decode(audio_b64) + + logger.info( + f'Sarvam audio generated successfully, size: {len(audio_bytes)} bytes' + ) + return audio_bytes + + except httpx.HTTPStatusError as e: + logger.error( + f'Sarvam API error: {e.response.status_code} - {e.response.text}' + ) + raise Exception(f'Sarvam TTS generation failed: {e.response.text}') + except Exception as e: + logger.error(f'Sarvam request failed: {str(e)}') + raise Exception(f'Sarvam TTS generation failed: {str(e)}') diff --git a/wavefront/server/modules/voice_agents_module/voice_agents_module/utils/language_validation.py 
b/wavefront/server/modules/voice_agents_module/voice_agents_module/utils/language_validation.py index fe509d48..2fe4a60c 100644 --- a/wavefront/server/modules/voice_agents_module/voice_agents_module/utils/language_validation.py +++ b/wavefront/server/modules/voice_agents_module/voice_agents_module/utils/language_validation.py @@ -131,6 +131,20 @@ } +SARVAM_LANGUAGES: Set[str] = { + 'bn', + 'en', + 'gu', + 'hi', + 'kn', + 'ml', + 'mr', + 'or', + 'pa', + 'ta', + 'te', +} + # Deepgram STT supports 40+ languages DEEPGRAM_STT_LANGUAGES: Set[str] = { 'ar', 'bg', @@ -194,6 +208,8 @@ def get_tts_supported_languages(provider: str) -> Set[str]: # Deepgram TTS: language is implicit in voice_id, no explicit language param # Return empty set to indicate validation should be skipped return set() + elif provider == 'sarvam': + return SARVAM_LANGUAGES elif provider in ['azure', 'google', 'aws']: # For providers not yet fully implemented, skip validation return set() @@ -218,6 +234,8 @@ def get_stt_supported_languages(provider: str) -> Set[str]: if provider == 'deepgram': return DEEPGRAM_STT_LANGUAGES + elif provider == 'sarvam': + return SARVAM_LANGUAGES elif provider in ['assemblyai', 'whisper', 'google', 'azure']: # For providers not yet fully implemented, skip validation return set() diff --git a/wavefront/server/uv.lock b/wavefront/server/uv.lock index 632120cb..afad6982 100644 --- a/wavefront/server/uv.lock +++ b/wavefront/server/uv.lock @@ -625,7 +625,7 @@ dependencies = [ { name = "dependency-injector" }, { name = "fastapi" }, { name = "httpx" }, - { name = "pipecat-ai", extra = ["azure", "cartesia", "deepgram", "google", "groq", "local-smart-turn-v3", "runner", "silero", "websocket"] }, + { name = "pipecat-ai", extra = ["azure", "cartesia", "deepgram", "google", "groq", "local-smart-turn-v3", "runner", "sarvam", "silero", "websocket"] }, { name = "pydantic" }, { name = "python-dotenv" }, { name = "python-multipart" }, @@ -640,7 +640,7 @@ requires-dist = [ { name = 
"dependency-injector", specifier = ">=4.46.0,<5.0.0" }, { name = "fastapi", specifier = ">=0.115.2,<1.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, - { name = "pipecat-ai", extras = ["websocket", "cartesia", "google", "silero", "deepgram", "groq", "runner", "azure", "local-smart-turn-v3"], specifier = "==0.0.100" }, + { name = "pipecat-ai", extras = ["websocket", "cartesia", "google", "silero", "deepgram", "groq", "runner", "azure", "local-smart-turn-v3", "sarvam"], specifier = "==0.0.100" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "python-dotenv", specifier = ">=1.1.0,<2.0.0" }, { name = "python-multipart", specifier = ">=0.0.9" }, @@ -3954,6 +3954,10 @@ runner = [ { name = "python-dotenv" }, { name = "uvicorn" }, ] +sarvam = [ + { name = "sarvamai" }, + { name = "websockets" }, +] silero = [ { name = "onnxruntime" }, ] @@ -5069,6 +5073,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, ] +[[package]] +name = "sarvamai" +version = "0.1.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/08/e5efcb30818ed220b818319255c22fd91e379489ebaa93efd6f444fb4987/sarvamai-0.1.21.tar.gz", hash = "sha256:865065635b2b99d40f5519308832954015627938e06a6333b5f62ae9c36278bb", size = 87386, upload-time = "2025-10-07T07:37:47.085Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/4e/b9933f72681b7aed91b86913337dd3981fad97027881fbc66c3c5eb03568/sarvamai-0.1.21-py3-none-any.whl", hash = "sha256:daa4e5d16635fe434f5f270cee416849249285369141d77132a17f0bf670f120", size = 
175204, upload-time = "2025-10-07T07:37:46.024Z" }, +] + [[package]] name = "scipy" version = "1.16.2" From 415180a3849b7e818675ffa01870a2b549d4b21a Mon Sep 17 00:00:00 2001 From: rootflo-hardik Date: Wed, 11 Feb 2026 12:03:45 +0530 Subject: [PATCH 2/2] resolved review comment --- .../voice_agents_module/services/tts_generator_service.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py b/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py index 08ee2f76..36b2f673 100644 --- a/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py +++ b/wavefront/server/modules/voice_agents_module/voice_agents_module/services/tts_generator_service.py @@ -290,7 +290,10 @@ async def _generate_sarvam( # Sarvam returns JSON with audios[] array of base64-encoded audio data = response.json() - audio_b64 = data['audios'][0] + audios = data.get('audios') + if not audios: + raise ValueError(f'Sarvam API returned no audio data: {data}') + audio_b64 = audios[0] audio_bytes = base64.b64decode(audio_b64) logger.info(