Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion dashscope/audio/qwen_omni/omni_realtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import platform
import threading
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum, unique
from typing import List, Optional
Expand All @@ -29,6 +30,26 @@ def on_event(self, message: str) -> None:
pass


@dataclass
class TranslationParams:
    """Session parameters for live-translation realtime models
    (qwen3-livetranslate-flash-realtime and later). Do not set for
    other models.
    """
    # Target translation language; None lets the service pick its default.
    # Optional[str] replaces the original `str = field(default=None)`,
    # which contradicted its own annotation and used `field` redundantly.
    language: Optional[str] = None


@dataclass
class TranscriptionParams:
    """Session parameters for realtime transcription models
    (qwen3-asr-flash-realtime and later). Do not set for other models.
    """
    # Language of the incoming audio; None lets the service auto-detect.
    language: Optional[str] = None
    # Input audio sample rate in Hz.
    sample_rate: int = 16000
    # Input audio encoding, e.g. "pcm".
    input_audio_format: str = "pcm"
    # Pre-built corpus object for recognition biasing; mutually exclusive
    # with corpus_text at the call site (corpus_text wins when both set).
    corpus: Optional[dict] = None
    # Plain-text corpus; the caller wraps it as {"text": corpus_text}.
    corpus_text: Optional[str] = None

Comment thread
songguocola marked this conversation as resolved.

@unique
class AudioFormat(Enum):
# format, sample_rate, channels, bit_rate, name
Expand Down Expand Up @@ -171,7 +192,7 @@ def __send_str(self, data: str, enable_log: bool = True):

def update_session(self,
output_modalities: List[MultiModality],
voice: str,
voice: str = None,
input_audio_format: AudioFormat = AudioFormat.
PCM_16000HZ_MONO_16BIT,
output_audio_format: AudioFormat = AudioFormat.
Expand All @@ -184,6 +205,8 @@ def update_session(self,
turn_detection_threshold: float = 0.2,
turn_detection_silence_duration_ms: int = 800,
turn_detection_param: dict = None,
translation_params: TranslationParams = None,
transcription_params: TranscriptionParams = None,
**kwargs) -> None:
'''
update session configuration, should be used before create response
Expand All @@ -206,6 +229,13 @@ def update_session(self,
In a quiet environment, it may be necessary to decrease the threshold to improve sensitivity
turn_detection_silence_duration_ms: int
duration of silence in milliseconds to detect turn, range [200, 6000]
translation_params: TranslationParams
translation params, include language. Only effective with qwen3-livetranslate-flash-realtime model or
further models. Do not set this parameter for other models.
transcription_params: TranscriptionParams
transcription params, include language, sample_rate, input_audio_format, corpus.
Only effective with qwen3-asr-flash-realtime model or
further models. Do not set this parameter for other models.
'''
self.config = {
'modalities': [m.value for m in output_modalities],
Expand All @@ -230,6 +260,20 @@ def update_session(self,
self.config['turn_detection'].update(turn_detection_param)
else:
self.config['turn_detection'] = None
if translation_params is not None:
self.config['translation'] = {
'language': translation_params.language
}
if transcription_params is not None:
self.config['language'] = transcription_params.language
if transcription_params.corpus is not None:
self.config['corpus'] = transcription_params.corpus
if transcription_params.corpus_text is not None:
self.config['corpus'] = {
"text": transcription_params.corpus_text
}
Comment thread
songguocola marked this conversation as resolved.
self.config['input_audio_format'] = transcription_params.input_audio_format
self.config['sample_rate']= transcription_params.sample_rate
Comment thread
songguocola marked this conversation as resolved.
self.config.update(kwargs)
self.__send_str(
json.dumps({
Expand Down
13 changes: 9 additions & 4 deletions dashscope/audio/tts_v2/enrollment.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,20 +68,25 @@ def __call_with_input(self, input):
logger.debug('>>>>recv', response)
return response

def create_voice(self, target_model: str, prefix: str, url: str) -> str:
def create_voice(self, target_model: str, prefix: str, url: str, language_hints: List[str] = None) -> str:
'''
创建新克隆音色
param: target_model 克隆音色对应的语音合成模型版本
param: prefix 音色自定义前缀,仅允许数字和小写字母,小于十个字符。
param: url 用于克隆的音频文件url
param: language_hints 克隆音色目标语言
return: voice_id
'''
response = self.__call_with_input(input={

input_params = {
'action': 'create_voice',
'target_model': target_model,
'prefix': prefix,
'url': url,
}, )
'url': url
}
if language_hints is not None:
input_params['language_hints'] = language_hints
response = self.__call_with_input(input_params)
self._last_request_id = response.request_id
if response.status_code == 200:
return response.output['voice_id']
Expand Down
34 changes: 32 additions & 2 deletions dashscope/multimodal/multimodal_request_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,24 +72,54 @@ def to_dict(self):
"directive": self.directive,
"dialog_id": self.dialog_id
}
@dataclass
class AsrPostProcessing:
    """ASR post-processing options: a list of word-replacement rules
    applied to recognition results.
    """
    # List of ReplaceWord rules; None or [] means no post-processing.
    replace_words: list = None

    def to_dict(self):
        """Return the wire-format dict, or None when there are no rules
        (callers then omit the field from the payload entirely).
        """
        # Truthiness check covers both None and the empty list, replacing
        # the original pair of separate `is None` / `len() == 0` returns.
        if not self.replace_words:
            return None
        return {
            "replace_words": [word.to_dict() for word in self.replace_words]
        }

@dataclass
class ReplaceWord:
    """A single ASR replacement rule: substitute `source` with `target`
    under the given match mode.
    """
    source: str = field(default=None)
    target: str = field(default=None)
    match_mode: str = field(default=None)

    def to_dict(self):
        """Serialize this rule to its wire-format dict."""
        payload = {}
        for key in ("source", "target", "match_mode"):
            payload[key] = getattr(self, key)
        return payload

@dataclass
class Upstream:
    """Upstream (client -> server) channel configuration."""
    # Uplink audio format; "pcm" (default) or "opus".
    audio_format: str = field(default="pcm")
    # Uplink type: "AudioOnly" for voice-only calls; "AudioAndVideo"
    # to also upload video.
    type: str = field(default="AudioOnly")
    # Client interaction mode: push2talk / tap2talk / duplex.
    mode: str = field(default="tap2talk")
    # Uplink audio sample rate in Hz.
    sample_rate: int = field(default=16000)
    # Optional hot-word vocabulary id; included in the payload even
    # when None (service-side contract — TODO confirm it tolerates null).
    vocabulary_id: str = field(default=None)
    # Optional ASR post-processing rules. Quoted forward reference keeps
    # the annotation lazy, so this class no longer depends on
    # AsrPostProcessing being defined first.
    asr_post_processing: "AsrPostProcessing" = field(default=None)
    # Extra provider-specific keys merged verbatim into the payload.
    pass_through_params: dict = field(default=None)

    def to_dict(self):
        """Serialize to the wire-format dict expected by the service."""
        upstream: dict = {
            "type": self.type,
            "mode": self.mode,
            "audio_format": self.audio_format,
            "sample_rate": self.sample_rate,
            "vocabulary_id": self.vocabulary_id,
        }
        if self.asr_post_processing is not None:
            upstream["asr_post_processing"] = self.asr_post_processing.to_dict()
        if self.pass_through_params is not None:
            # Merged last: pass-through keys can override the fixed ones.
            upstream.update(self.pass_through_params)
        return upstream
Expand Down