From e18df61759b988e4d7e8a24172bfa3f6b771a92a Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 01:55:21 +0000 Subject: [PATCH 1/3] feat(deepgram): add flux-general-multi STTv2 support Port of livekit/agents#5486. - Add `flux-general-multi` to Deepgram V2Models. - Add `languageHint` option on STTv2 to bias the multi-language model (ignored with a warning when used with a non-multi model). - Propagate detected languages from Deepgram responses into `SpeechData.sourceLanguages`; set the dominant detected language as the primary `language` on each transcript alternative. - Add `sourceLanguages?: LanguageCode[]` to core `SpeechData`, mirroring the Python `source_languages` field used by translation-capable and multi-language-detection STT providers. --- .changeset/deepgram-flux-general-multi.md | 6 +++ agents/src/stt/stt.ts | 12 ++++++ plugins/deepgram/src/models.ts | 3 +- plugins/deepgram/src/stt_v2.ts | 46 ++++++++++++++++++++++- 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 .changeset/deepgram-flux-general-multi.md diff --git a/.changeset/deepgram-flux-general-multi.md b/.changeset/deepgram-flux-general-multi.md new file mode 100644 index 000000000..3315ed156 --- /dev/null +++ b/.changeset/deepgram-flux-general-multi.md @@ -0,0 +1,6 @@ +--- +'@livekit/agents-plugin-deepgram': minor +'@livekit/agents': minor +--- + +Add Deepgram `flux-general-multi` STTv2 model support with multi-language detection. Introduces a new `languageHint` option for biasing the model toward specific languages (only used by `flux-general-multi`), and adds a new `sourceLanguages` field on `SpeechData` that carries all detected languages sorted by prevalence. For multi-language detection, the dominant language is set on `language` while `sourceLanguages` retains the full list. 
diff --git a/agents/src/stt/stt.ts b/agents/src/stt/stt.ts index 69ebd1418..48ceff5ba 100644 --- a/agents/src/stt/stt.ts +++ b/agents/src/stt/stt.ts @@ -64,6 +64,18 @@ export interface SpeechData { words?: TimedString[]; /** Speaker identifier when the provider supports diarization. */ speakerId?: string | null; + /** + * The source languages spoken by the user. + * + * Populated by STT services that support translation, where `language` holds the + * target language and `sourceLanguages` holds the original spoken language(s), + * or by multi-language detection services where `language` holds the dominant + * language and `sourceLanguages` holds all detected languages sorted by prevalence. + * + * May contain multiple entries when a single utterance spans multiple source languages. + */ + // Ref: python livekit-agents/livekit/agents/stt/stt.py - 62-68 lines + sourceLanguages?: LanguageCode[]; } export interface RecognitionUsage { diff --git a/plugins/deepgram/src/models.ts b/plugins/deepgram/src/models.ts index 11099893c..5a808965e 100644 --- a/plugins/deepgram/src/models.ts +++ b/plugins/deepgram/src/models.ts @@ -18,7 +18,8 @@ export type TTSModels = export type TTSEncoding = 'linear16' | 'mulaw' | 'alaw' | 'mp3' | 'opus' | 'flac' | 'aac'; -export type V2Models = 'flux-general-en'; +// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/models.py - 38 line +export type V2Models = 'flux-general-en' | 'flux-general-multi'; export type STTModels = | 'nova-general' diff --git a/plugins/deepgram/src/stt_v2.ts b/plugins/deepgram/src/stt_v2.ts index 0c6c20dc4..682240e62 100644 --- a/plugins/deepgram/src/stt_v2.ts +++ b/plugins/deepgram/src/stt_v2.ts @@ -36,6 +36,12 @@ export interface STTv2Options { eotTimeoutMs?: number; mipOptOut?: boolean; tags?: string[]; + /** + * List of language hints to bias the model for improved accuracy. + * Only usable with `flux-general-multi`. 
+ */ + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 61 line + languageHint?: string[]; } const defaultSTTv2Options: Omit = { @@ -104,6 +110,8 @@ export class STTv2 extends stt.STT { * @param opts.eotTimeoutMs - End-of-turn timeout in ms (default: 3000) * @param opts.keyterms - List of key terms to improve recognition * @param opts.tags - Tags for usage reporting (max 128 chars each) + * @param opts.languageHint - List of language hints to bias the model for improved accuracy. + * Only usable with `flux-general-multi`. * * @throws Error if no API key is provided */ @@ -129,6 +137,18 @@ export class STTv2 extends stt.STT { if (this.#opts.tags) { this.#opts.tags = validateTags(this.#opts.tags); } + + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 134-138 lines + if ( + this.#opts.languageHint && + this.#opts.languageHint.length > 0 && + this.#opts.model !== 'flux-general-multi' + ) { + this.#logger.warn( + { model: this.#opts.model }, + '`languageHint` is only supported by `flux-general-multi` and will be ignored for this model', + ); + } } /** The model being used for transcription */ @@ -167,6 +187,17 @@ export class STTv2 extends stt.STT { updateOptions(opts: Partial<STTv2Options>) { this.#opts = { ...this.#opts, ...opts }; if (opts.tags) this.#opts.tags = validateTags(opts.tags); + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 244-249 lines + if ( + opts.languageHint && + opts.languageHint.length > 0 && + this.#opts.model !== 'flux-general-multi' + ) { + this.#logger.warn( + { model: this.#opts.model }, + '`languageHint` is only supported by `flux-general-multi` and will be ignored for this model', + ); + } this.#logger.debug('Updated STTv2 options'); } } @@ -456,6 +487,11 @@ class SpeechStreamv2 extends stt.SpeechStream { if (this.#opts.keyterms.length > 0) params.keyterm = this.#opts.keyterms; if (this.#opts.tags && 
this.#opts.tags.length > 0) params.tag = this.#opts.tags; + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 480-481 lines + if (this.#opts.languageHint && this.#opts.languageHint.length > 0) { + params.language_hint = this.#opts.languageHint; + } + const baseUrl = this.#opts.endpointUrl.replace(/^http/, 'ws'); const qs = queryString.stringify(params); return `${baseUrl}?${qs}`; @@ -487,12 +523,20 @@ function parseTranscription( confidence = sum / wordsData.length; } + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 587-591 lines + const detectedLanguagesRaw = Array.isArray(data.languages) ? (data.languages as string[]) : []; + const detectedLanguages = detectedLanguagesRaw.map((lang) => normalizeLanguage(lang)); + const primaryLanguage = + detectedLanguages.length > 0 ? detectedLanguages[0]! : normalizeLanguage(language); + const sd: stt.SpeechData = { - language: normalizeLanguage(language), + language: primaryLanguage, startTime: ((data.audio_window_start as number) || 0) + startTimeOffset, endTime: ((data.audio_window_end as number) || 0) + startTimeOffset, confidence: confidence, text: transcript || '', + // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 598 line + sourceLanguages: detectedLanguages.length > 0 ? detectedLanguages : undefined, // Note: Deepgram V2 (Flux) API does not provide word-level timing (start/end). // Words only contain 'word' and 'confidence' fields, so startTime/endTime will be 0. 
// See: https://developers.deepgram.com/docs/flux/nova-3-migration From fbc5162b4d9a888279f37bfe1548479ed919004f Mon Sep 17 00:00:00 2001 From: Tina Nguyen <72938484+tinalenguyen@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:00:50 -0400 Subject: [PATCH 2/3] update changeset to patch --- .changeset/deepgram-flux-general-multi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.changeset/deepgram-flux-general-multi.md b/.changeset/deepgram-flux-general-multi.md index 3315ed156..04a80b631 100644 --- a/.changeset/deepgram-flux-general-multi.md +++ b/.changeset/deepgram-flux-general-multi.md @@ -1,6 +1,6 @@ --- -'@livekit/agents-plugin-deepgram': minor -'@livekit/agents': minor +'@livekit/agents-plugin-deepgram': patch +'@livekit/agents': patch --- Add Deepgram `flux-general-multi` STTv2 model support with multi-language detection. Introduces a new `languageHint` option for biasing the model toward specific languages (only used by `flux-general-multi`), and adds a new `sourceLanguages` field on `SpeechData` that carries all detected languages sorted by prevalence. For multi-language detection, the dominant language is set on `language` while `sourceLanguages` retains the full list. From 11dd9130e45e629d4c4e6393788b13077bec7cd6 Mon Sep 17 00:00:00 2001 From: Tina Nguyen Date: Sun, 19 Apr 2026 22:58:45 -0400 Subject: [PATCH 3/3] addr comments --- plugins/deepgram/src/stt_v2.ts | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/plugins/deepgram/src/stt_v2.ts b/plugins/deepgram/src/stt_v2.ts index 682240e62..2558bc787 100644 --- a/plugins/deepgram/src/stt_v2.ts +++ b/plugins/deepgram/src/stt_v2.ts @@ -185,12 +185,17 @@ export class STTv2 extends stt.STT { * @param opts - Partial options to update */ updateOptions(opts: Partial<STTv2Options>) { - this.#opts = { ...this.#opts, ...opts }; + this.#opts = { + ...this.#opts, + ...opts, + language: + opts.language !== undefined ? 
normalizeLanguage(opts.language) : this.#opts.language, + }; if (opts.tags) this.#opts.tags = validateTags(opts.tags); // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 244-249 lines if ( - opts.languageHint && - opts.languageHint.length > 0 && + this.#opts.languageHint && + this.#opts.languageHint.length > 0 && this.#opts.model !== 'flux-general-multi' ) { this.#logger.warn(