Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/deepgram-flux-general-multi.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
'@livekit/agents-plugin-deepgram': patch
'@livekit/agents': patch
---

Add Deepgram `flux-general-multi` STTv2 model support with multi-language detection. Introduces a new `languageHint` option for biasing the model toward specific languages (only used by `flux-general-multi`), and adds a new `sourceLanguages` field on `SpeechData` that carries all detected languages sorted by prevalence. For multi-language detection, the dominant language is set on `language` while `sourceLanguages` retains the full list.
12 changes: 12 additions & 0 deletions agents/src/stt/stt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ export interface SpeechData {
words?: TimedString[];
/** Speaker identifier when the provider supports diarization. */
speakerId?: string | null;
/**
* The source languages spoken by the user.
*
* Populated by STT services that support translation, where `language` holds the
* target language and `sourceLanguages` holds the original spoken language(s),
* or by multi-language detection services where `language` holds the dominant
* language and `sourceLanguages` holds all detected languages sorted by prevalence.
*
* May contain multiple entries when a single utterance spans multiple source languages.
*/
// Ref: python livekit-agents/livekit/agents/stt/stt.py - 62-68 lines
sourceLanguages?: LanguageCode[];
}

export interface RecognitionUsage {
Expand Down
3 changes: 2 additions & 1 deletion plugins/deepgram/src/models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ export type TTSModels =

/** String-literal union of the encoding names a `TTSEncoding` value may take. */
export type TTSEncoding =
  | 'linear16'
  | 'mulaw'
  | 'alaw'
  | 'mp3'
  | 'opus'
  | 'flac'
  | 'aac';

/**
 * Model names selectable for the Deepgram V2 (Flux) STT plugin.
 *
 * `flux-general-multi` is the only model for which the `languageHint` option is supported.
 */
// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/models.py - 38 line
export type V2Models = 'flux-general-en' | 'flux-general-multi';

export type STTModels =
| 'nova-general'
Expand Down
53 changes: 51 additions & 2 deletions plugins/deepgram/src/stt_v2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ export interface STTv2Options {
eotTimeoutMs?: number;
mipOptOut?: boolean;
tags?: string[];
/**
* List of language hints to bias the model for improved accuracy.
* Only usable with `flux-general-multi`.
*/
// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 61 line
languageHint?: string[];
}

const defaultSTTv2Options: Omit<STTv2Options, 'apiKey'> = {
Expand Down Expand Up @@ -104,6 +110,8 @@ export class STTv2 extends stt.STT {
* @param opts.eotTimeoutMs - End-of-turn timeout in ms (default: 3000)
* @param opts.keyterms - List of key terms to improve recognition
* @param opts.tags - Tags for usage reporting (max 128 chars each)
* @param opts.languageHint - List of language hints to bias the model for improved accuracy.
* Only usable with `flux-general-multi`.
*
* @throws Error if no API key is provided
*/
Expand All @@ -129,6 +137,18 @@ export class STTv2 extends stt.STT {
if (this.#opts.tags) {
this.#opts.tags = validateTags(this.#opts.tags);
}

// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 134-138 lines
if (
this.#opts.languageHint &&
this.#opts.languageHint.length > 0 &&
this.#opts.model !== 'flux-general-multi'
) {
this.#logger.warn(
{ model: this.#opts.model },
'`languageHint` is only supported by `flux-general-multi` and will be ignored for this model',
);
}
}

/** The model being used for transcription */
Expand Down Expand Up @@ -165,8 +185,24 @@ export class STTv2 extends stt.STT {
* @param opts - Partial options to update
*/
updateOptions(opts: Partial<STTv2Options>) {
  // Merge the partial update over the current options in a single step. `language` is
  // listed after the spreads so it always wins: a provided value is normalized, while an
  // absent (or explicitly `undefined`) value keeps the current language instead of
  // clearing it.
  this.#opts = {
    ...this.#opts,
    ...opts,
    language:
      opts.language !== undefined ? normalizeLanguage(opts.language) : this.#opts.language,
  };

  // Tags are re-validated only when the caller supplies a new list.
  if (opts.tags) this.#opts.tags = validateTags(opts.tags);

  // Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 244-249 lines
  // Evaluated against the merged options, so the warning is emitted whenever hints are
  // present alongside a model other than `flux-general-multi`, regardless of which of
  // the two this update touched.
  if (
    this.#opts.languageHint &&
    this.#opts.languageHint.length > 0 &&
    this.#opts.model !== 'flux-general-multi'
  ) {
    this.#logger.warn(
      { model: this.#opts.model },
      '`languageHint` is only supported by `flux-general-multi` and will be ignored for this model',
    );
  }
  this.#logger.debug('Updated STTv2 options');
}
}
Expand Down Expand Up @@ -456,6 +492,11 @@ class SpeechStreamv2 extends stt.SpeechStream {
if (this.#opts.keyterms.length > 0) params.keyterm = this.#opts.keyterms;
if (this.#opts.tags && this.#opts.tags.length > 0) params.tag = this.#opts.tags;

// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 480-481 lines
if (this.#opts.languageHint && this.#opts.languageHint.length > 0) {
params.language_hint = this.#opts.languageHint;
}

const baseUrl = this.#opts.endpointUrl.replace(/^http/, 'ws');
const qs = queryString.stringify(params);
return `${baseUrl}?${qs}`;
Expand Down Expand Up @@ -487,12 +528,20 @@ function parseTranscription(
confidence = sum / wordsData.length;
}

// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 587-591 lines
const detectedLanguagesRaw = Array.isArray(data.languages) ? (data.languages as string[]) : [];
const detectedLanguages = detectedLanguagesRaw.map((lang) => normalizeLanguage(lang));
const primaryLanguage =
detectedLanguages.length > 0 ? detectedLanguages[0]! : normalizeLanguage(language);

const sd: stt.SpeechData = {
language: normalizeLanguage(language),
language: primaryLanguage,
startTime: ((data.audio_window_start as number) || 0) + startTimeOffset,
endTime: ((data.audio_window_end as number) || 0) + startTimeOffset,
confidence: confidence,
text: transcript || '',
// Ref: python livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt_v2.py - 598 line
sourceLanguages: detectedLanguages.length > 0 ? detectedLanguages : undefined,
// Note: Deepgram V2 (Flux) API does not provide word-level timing (start/end).
// Words only contain 'word' and 'confidence' fields, so startTime/endTime will be 0.
// See: https://developers.deepgram.com/docs/flux/nova-3-migration
Expand Down
Loading