From 4a723e92f473f3a8f1963b2b4d211d5a23082e9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=81=A5=E4=BB=99?=
Date: Wed, 17 Sep 2025 20:07:16 +0800
Subject: [PATCH] feat(model/qwen-tts): add param language_type

---
 dashscope/aigc/multimodal_conversation.py       | 22 +++++++++++++++----
 .../qwen_tts_realtime/qwen_tts_realtime.py      |  5 +++++
 samples/test_qwen_tts.py                        | 21 ++++++++++++------
 3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/dashscope/aigc/multimodal_conversation.py b/dashscope/aigc/multimodal_conversation.py
index 587fea1..b6b4136 100644
--- a/dashscope/aigc/multimodal_conversation.py
+++ b/dashscope/aigc/multimodal_conversation.py
@@ -28,6 +28,8 @@ def call(
         api_key: str = None,
         workspace: str = None,
         text: str = None,
+        voice: str = None,
+        language_type: str = None,
         **kwargs
     ) -> Union[MultiModalConversationResponse, Generator[
         MultiModalConversationResponse, None, None]]:
@@ -57,6 +59,9 @@ def call(
             [1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501
         workspace (str): The dashscope workspace id.
         text (str): The text to generate.
+        voice (str): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
+            you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
+        language_type (str): The synthesized language type, default is 'auto', useful for [qwen3-tts].
         **kwargs:
             stream(bool, `optional`): Enable server-sent events
                 (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
@@ -70,8 +75,6 @@ def call(
                 tokens with top_p probability mass. So 0.1 means
                 only the tokens comprising the
                 top 10% probability mass are considered[qwen-turbo,bailian-v1].
-            voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
-                you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
             top_k(float, `optional`):
@@ -99,6 +102,10 @@ def call(
         if text is not None and text:
             input.update({'text': text})
+        if voice is not None and voice:
+            input.update({'voice': voice})
+        if language_type is not None and language_type:
+            input.update({'language_type': language_type})
         if msg_copy is not None:
             input.update({'messages': msg_copy})
         response = super().call(model=model,
@@ -160,6 +167,8 @@ async def call(
         api_key: str = None,
         workspace: str = None,
         text: str = None,
+        voice: str = None,
+        language_type: str = None,
         **kwargs
     ) -> Union[MultiModalConversationResponse, Generator[
         MultiModalConversationResponse, None, None]]:
@@ -189,6 +198,9 @@ async def call(
             [1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501
         workspace (str): The dashscope workspace id.
         text (str): The text to generate.
+        voice (str): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
+            you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
+        language_type (str): The synthesized language type, default is 'auto', useful for [qwen3-tts].
         **kwargs:
             stream(bool, `optional`): Enable server-sent events
                 (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
@@ -202,8 +214,6 @@ async def call(
                 tokens with top_p probability mass. So 0.1 means
                 only the tokens comprising the
                 top 10% probability mass are considered[qwen-turbo,bailian-v1].
-            voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
-                you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
             top_k(float, `optional`):

     Raises:
@@ -230,6 +240,10 @@ async def call(
         if text is not None and text:
             input.update({'text': text})
+        if voice is not None and voice:
+            input.update({'voice': voice})
+        if language_type is not None and language_type:
+            input.update({'language_type': language_type})
         if msg_copy is not None:
             input.update({'messages': msg_copy})
         response = await super().call(model=model,
diff --git a/dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py b/dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py
index beccb3f..4acc8a9 100644
--- a/dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py
+++ b/dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py
@@ -158,6 +158,7 @@ def update_session(self,
                        response_format: AudioFormat = AudioFormat.
                        PCM_24000HZ_MONO_16BIT,
                        mode: str = 'server_commit',
+                       language_type: str = None,
                        **kwargs) -> None:
         '''
         update session configuration, should be used before create response
@@ -170,6 +171,8 @@ def update_session(self,
             output audio format
         mode: str
             response mode, server_commit or commit
+        language_type: str
+            language type for synthesized audio, default is 'auto'
         '''
         self.config = {
             'voice': voice,
@@ -177,6 +180,8 @@ def update_session(self,
             'response_format': response_format.format,
             'sample_rate': response_format.sample_rate,
         }
+        if language_type is not None:
+            self.config['language_type'] = language_type
         self.config.update(kwargs)
         self.__send_str(
             json.dumps({
diff --git a/samples/test_qwen_tts.py b/samples/test_qwen_tts.py
index c570cf0..d2049e6 100644
--- a/samples/test_qwen_tts.py
+++ b/samples/test_qwen_tts.py
@@ -20,19 +20,26 @@
 response = dashscope.MultiModalConversation.call(
     api_key=os.getenv('DASHSCOPE_API_KEY'),
-    model="qwen-tts",
+    model="qwen3-tts-flash",
     text="Today is a wonderful day to build something people love!",
     voice="Cherry",
-    stream=use_stream
+    stream=use_stream,
+    language_type="zh"
 )

 if use_stream:
     # print the audio data in stream mode
     for chunk in response:
+        if chunk.output is None:
+            print(f"error: {chunk}")
+            break
         audio = chunk.output.audio
-        print("base64 audio data is: {}", chunk.output.audio.data)
+        print(f"base64 audio data is: {chunk.output.audio.data}")
         if chunk.output.finish_reason == "stop":
-            print("finish at: {} ", chunk.output.audio.expires_at)
+            print(f"finish at: {chunk.output.audio.expires_at}")
 else:
-    # print the audio url in non-stream mode
-    print("synthesized audio url is: {}", response.output.audio.url)
-    print("finish at: {} ", response.output.audio.expires_at)
+    if response.output is None:
+        print(f"error: {response}")
+    else:
+        # print the audio url in non-stream mode
+        print(f"synthesized audio url is: {response.output.audio.url}")
+        print(f"finish at: {response.output.audio.expires_at}")