From 62a8df0533972ecce972e101035368e1795e1c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=A5=E4=BB=99?= Date: Wed, 22 Oct 2025 16:17:04 +0800 Subject: [PATCH 1/3] feat(app/multimodal-dialog): add upstream.asr_post_processing --- .../multimodal/multimodal_request_params.py | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/dashscope/multimodal/multimodal_request_params.py b/dashscope/multimodal/multimodal_request_params.py index ce97540..49d9b7d 100644 --- a/dashscope/multimodal/multimodal_request_params.py +++ b/dashscope/multimodal/multimodal_request_params.py @@ -72,7 +72,31 @@ def to_dict(self): "directive": self.directive, "dialog_id": self.dialog_id } +@dataclass +class AsrPostProcessing: + replace_words: list = field(default=None) + + def to_dict(self): + if self.replace_words is None: + return None + if len(self.replace_words) == 0: + return None + return { + "replace_words": [word.to_dict() for word in self.replace_words] + } + +@dataclass +class ReplaceWord: + source: str = field(default=None) + target: str = field(default=None) + match_mode: str = field(default=None) + def to_dict(self): + return { + "source": self.source, + "target": self.target, + "match_mode": self.match_mode + } @dataclass class Upstream: @@ -80,7 +104,9 @@ class Upstream: audio_format: str = field(default="pcm") # 上行语音格式,默认pcm.支持pcm/opus type: str = field(default="AudioOnly") # 上行类型:AudioOnly 仅语音通话; AudioAndVideo 上传视频 mode: str = field(default="tap2talk") # 客户端交互模式 push2talk/tap2talk/duplex - # sample_rate: int # 合成音频采样率 + sample_rate: int = field(default=16000) # 音频采样率 + vocabulary_id: str = field(default=None) + asr_post_processing: AsrPostProcessing = field(default=None) pass_through_params: dict = field(default=None) def to_dict(self): @@ -88,8 +114,12 @@ def to_dict(self): "type": self.type, "mode": self.mode, "audio_format": self.audio_format, - # "sample_rate": self.sample_rate + "sample_rate": self.sample_rate, + "vocabulary_id": self.vocabulary_id, } + if self.asr_post_processing is not None: + upstream["asr_post_processing"] = self.asr_post_processing.to_dict() + if self.pass_through_params is not None: upstream.update(self.pass_through_params) return upstream From 3c650ce7b4fcc4b5cad50262def195f0bcc92d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=A5=E4=BB=99?= Date: Thu, 23 Oct 2025 14:40:03 +0800 Subject: [PATCH 2/3] feat(model/cosyvoice-v3):add language hints to voice clone --- dashscope/audio/tts_v2/enrollment.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/dashscope/audio/tts_v2/enrollment.py b/dashscope/audio/tts_v2/enrollment.py index 7c763b6..a254c49 100644 --- a/dashscope/audio/tts_v2/enrollment.py +++ b/dashscope/audio/tts_v2/enrollment.py @@ -68,20 +68,25 @@ def __call_with_input(self, input): logger.debug('>>>>recv', response) return response - def create_voice(self, target_model: str, prefix: str, url: str) -> str: + def create_voice(self, target_model: str, prefix: str, url: str, language_hints: List[str] = None) -> str: ''' 创建新克隆音色 param: target_model 克隆音色对应的语音合成模型版本 param: prefix 音色自定义前缀,仅允许数字和小写字母,小于十个字符。 param: url 用于克隆的音频文件url + param: language_hints 克隆音色目标语言 return: voice_id ''' - response = self.__call_with_input(input={ + + input_params = { 'action': 'create_voice', 'target_model': target_model, 'prefix': prefix, - 'url': url, - }, ) + 'url': url + } + if language_hints is not None: + input_params['language_hints'] = language_hints + response = self.__call_with_input(input_params) self._last_request_id = response.request_id if response.status_code == 200: return response.output['voice_id'] From e78a2be8121f33dc87fbf5bc094f91a26a4fa420 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=A5=E4=BB=99?= Date: Thu, 23 Oct 2025 14:41:55 +0800 Subject: [PATCH 3/3] feat(model/qwen3-livetranslate&asr-realtime):add input params --- dashscope/audio/qwen_omni/omni_realtime.py | 46 +++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/dashscope/audio/qwen_omni/omni_realtime.py b/dashscope/audio/qwen_omni/omni_realtime.py index 75f9ddc..e08c6f3 100644 --- a/dashscope/audio/qwen_omni/omni_realtime.py +++ b/dashscope/audio/qwen_omni/omni_realtime.py @@ -4,6 +4,7 @@ import platform import threading import time +from dataclasses import field, dataclass from typing import List import uuid from enum import Enum, unique @@ -29,6 +30,26 @@ def on_event(self, message: str) -> None: pass +@dataclass +class TranslationParams: + """ + TranslationParams + """ + language: str = field(default=None) + + +@dataclass +class TranscriptionParams: + """ + TranscriptionParams + """ + language: str = field(default=None) + sample_rate: int = field(default=16000) + input_audio_format: str = field(default="pcm") + corpus: dict = field(default=None) + corpus_text: str = field(default=None) + + @unique class AudioFormat(Enum): # format, sample_rate, channels, bit_rate, name @@ -171,7 +192,7 @@ def __send_str(self, data: str, enable_log: bool = True): def update_session(self, output_modalities: List[MultiModality], - voice: str, + voice: str = None, input_audio_format: AudioFormat = AudioFormat. PCM_16000HZ_MONO_16BIT, output_audio_format: AudioFormat = AudioFormat. @@ -184,6 +205,8 @@ def update_session(self, turn_detection_threshold: float = 0.2, turn_detection_silence_duration_ms: int = 800, turn_detection_param: dict = None, + translation_params: TranslationParams = None, + transcription_params: TranscriptionParams = None, **kwargs) -> None: ''' update session configuration, should be used before create response @@ -206,6 +229,13 @@ def update_session(self, In a quiet environment, it may be necessary to decrease the threshold to improve sensitivity turn_detection_silence_duration_ms: int duration of silence in milliseconds to detect turn, range [200, 6000] + translation_params: TranslationParams + translation params, include language. Only effective with qwen3-livetranslate-flash-realtime model or + further models. Do not set this parameter for other models. + transcription_params: TranscriptionParams + transcription params, include language, sample_rate, input_audio_format, corpus. + Only effective with qwen3-asr-flash-realtime model or + further models. Do not set this parameter for other models. ''' self.config = { 'modalities': [m.value for m in output_modalities], @@ -230,6 +260,20 @@ def update_session(self, self.config['turn_detection'].update(turn_detection_param) else: self.config['turn_detection'] = None + if translation_params is not None: + self.config['translation'] = { + 'language': translation_params.language + } + if transcription_params is not None: + self.config['language'] = transcription_params.language + if transcription_params.corpus is not None: + self.config['corpus'] = transcription_params.corpus + if transcription_params.corpus_text is not None: + self.config['corpus'] = { + "text": transcription_params.corpus_text + } + self.config['input_audio_format'] = transcription_params.input_audio_format + self.config['sample_rate']= transcription_params.sample_rate self.config.update(kwargs) self.__send_str( json.dumps({