diff --git a/.changeset/deepgram-flux-general-multi.md b/.changeset/deepgram-flux-general-multi.md new file mode 100644 index 000000000..04a80b631 --- /dev/null +++ b/.changeset/deepgram-flux-general-multi.md @@ -0,0 +1,6 @@ +--- +'@livekit/agents-plugin-deepgram': patch +'@livekit/agents': patch +--- + +Add Deepgram `flux-general-multi` STTv2 model support with multi-language detection. Introduces a new `languageHint` option for biasing the model toward specific languages (only used by `flux-general-multi`), and adds a new `sourceLanguages` field on `SpeechData` that carries all detected languages sorted by prevalence. For multi-language detection, the dominant language is set on `language` while `sourceLanguages` retains the full list. diff --git a/agents/src/stt/stt.ts b/agents/src/stt/stt.ts index 69ebd1418..48ceff5ba 100644 --- a/agents/src/stt/stt.ts +++ b/agents/src/stt/stt.ts @@ -64,6 +64,18 @@ export interface SpeechData { words?: TimedString[]; /** Speaker identifier when the provider supports diarization. */ speakerId?: string | null; + /** + * The source languages spoken by the user. + * + * Populated by STT services that support translation, where `language` holds the + * target language and `sourceLanguages` holds the original spoken language(s), + * or by multi-language detection services where `language` holds the dominant + * language and `sourceLanguages` holds all detected languages sorted by prevalence. + * + * May contain multiple entries when a single utterance spans multiple source languages. 
+ */ + // Ref: python livekit-agents/livekit/agents/stt/stt.py - lines 62-68 + sourceLanguages?: LanguageCode[]; } export interface RecognitionUsage { diff --git a/plugins/deepgram/src/models.ts b/plugins/deepgram/src/models.ts index 11099893c..5a808965e 100644 --- a/plugins/deepgram/src/models.ts +++ b/plugins/deepgram/src/models.ts @@ -18,7 +18,8 @@ export type TTSModels = export type TTSEncoding = 'linear16' | 'mulaw' | 'alaw' | 'mp3' | 'opus' | 'flac' | 'aac'; -export type V2Models = 'flux-general-en'; +// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/models.py - line 38 +export type V2Models = 'flux-general-en' | 'flux-general-multi'; export type STTModels = | 'nova-general' diff --git a/plugins/deepgram/src/stt_v2.ts b/plugins/deepgram/src/stt_v2.ts index 0c6c20dc4..2558bc787 100644 --- a/plugins/deepgram/src/stt_v2.ts +++ b/plugins/deepgram/src/stt_v2.ts @@ -36,6 +36,12 @@ export interface STTv2Options { eotTimeoutMs?: number; mipOptOut?: boolean; tags?: string[]; + /** + * List of language hints to bias the model for improved accuracy. + * Only usable with `flux-general-multi`. + */ + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - line 61 + languageHint?: string[]; } const defaultSTTv2Options: Omit<STTv2Options, 'apiKey'> = { @@ -104,6 +110,8 @@ export class STTv2 extends stt.STT { * @param opts.eotTimeoutMs - End-of-turn timeout in ms (default: 3000) * @param opts.keyterms - List of key terms to improve recognition * @param opts.tags - Tags for usage reporting (max 128 chars each) + * @param opts.languageHint - List of language hints to bias the model for improved accuracy. + * Only usable with `flux-general-multi`. 
 * * @throws Error if no API key is provided */ @@ -129,6 +137,18 @@ export class STTv2 extends stt.STT { if (this.#opts.tags) { this.#opts.tags = validateTags(this.#opts.tags); } + + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - lines 134-138 + if ( + this.#opts.languageHint && + this.#opts.languageHint.length > 0 && + this.#opts.model !== 'flux-general-multi' + ) { + this.#logger.warn( + { model: this.#opts.model }, + '`languageHint` is only supported by `flux-general-multi` and will be ignored for this model', + ); + } } /** The model being used for transcription */ @@ -165,8 +185,24 @@ export class STTv2 extends stt.STT { * @param opts - Partial options to update */ updateOptions(opts: Partial<STTv2Options>) { - this.#opts = { ...this.#opts, ...opts }; + this.#opts = { + ...this.#opts, + ...opts, + language: + opts.language !== undefined ? normalizeLanguage(opts.language) : this.#opts.language, + }; if (opts.tags) this.#opts.tags = validateTags(opts.tags); + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - lines 244-249 + if ( + this.#opts.languageHint && + this.#opts.languageHint.length > 0 && + this.#opts.model !== 'flux-general-multi' + ) { + this.#logger.warn( + { model: this.#opts.model }, + '`languageHint` is only supported by `flux-general-multi` and will be ignored for this model', + ); + } this.#logger.debug('Updated STTv2 options'); } } @@ -456,6 +492,11 @@ class SpeechStreamv2 extends stt.SpeechStream { if (this.#opts.keyterms.length > 0) params.keyterm = this.#opts.keyterms; if (this.#opts.tags && this.#opts.tags.length > 0) params.tag = this.#opts.tags; + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - lines 480-481 + if (this.#opts.languageHint && this.#opts.languageHint.length > 0) { + params.language_hint = this.#opts.languageHint; + } + const baseUrl = this.#opts.endpointUrl.replace(/^http/, 'ws'); const qs = 
queryString.stringify(params); return `${baseUrl}?${qs}`; } @@ -487,12 +528,20 @@ function parseTranscription( confidence = sum / wordsData.length; } + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - lines 587-591 + const detectedLanguagesRaw = Array.isArray(data.languages) ? (data.languages as string[]) : []; + const detectedLanguages = detectedLanguagesRaw.map((lang) => normalizeLanguage(lang)); + const primaryLanguage = + detectedLanguages.length > 0 ? detectedLanguages[0]! : normalizeLanguage(language); + const sd: stt.SpeechData = { - language: normalizeLanguage(language), + language: primaryLanguage, startTime: ((data.audio_window_start as number) || 0) + startTimeOffset, endTime: ((data.audio_window_end as number) || 0) + startTimeOffset, confidence: confidence, text: transcript || '', + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - line 598 + sourceLanguages: detectedLanguages.length > 0 ? detectedLanguages : undefined, // Note: Deepgram V2 (Flux) API does not provide word-level timing (start/end). // Words only contain 'word' and 'confidence' fields, so startTime/endTime will be 0. // See: https://developers.deepgram.com/docs/flux/nova-3-migration