From ba0a774e178c41fc2f7744cc78db6196b1358b09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=A5=E4=BB=99?= Date: Mon, 15 Sep 2025 15:00:37 +0800 Subject: [PATCH] feat(model/qwen-tts) interface change to multimodal_conversation --- dashscope/aigc/multimodal_conversation.py | 61 +++++++++++++------- dashscope/api_entities/dashscope_response.py | 34 ++++++++++- samples/test_qwen_tts.py | 38 ++++++++++++ 3 files changed, 110 insertions(+), 23 deletions(-) create mode 100644 samples/test_qwen_tts.py diff --git a/dashscope/aigc/multimodal_conversation.py b/dashscope/aigc/multimodal_conversation.py index f3ed6c5..587fea1 100644 --- a/dashscope/aigc/multimodal_conversation.py +++ b/dashscope/aigc/multimodal_conversation.py @@ -24,9 +24,10 @@ class Models: def call( cls, model: str, - messages: List, + messages: List = None, api_key: str = None, workspace: str = None, + text: str = None, **kwargs ) -> Union[MultiModalConversationResponse, Generator[ MultiModalConversationResponse, None, None]]: @@ -55,6 +56,7 @@ def call( if None, will retrieve by rule [1]. [1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501 workspace (str): The dashscope workspace id. + text (str): The text to generate. **kwargs: stream(bool, `optional`): Enable server-sent events (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501 @@ -68,8 +70,11 @@ def call( tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered[qwen-turbo,bailian-v1]. + voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on, + you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts. top_k(float, `optional`): + Raises: InvalidInput: The history and auto_history are mutually exclusive. 
@@ -78,18 +83,24 @@ def call( Generator[MultiModalConversationResponse, None, None]]: If stream is True, return Generator, otherwise MultiModalConversationResponse. """ - if (messages is None or not messages): - raise InputRequired('prompt or messages is required!') if model is None or not model: raise ModelRequired('Model is required!') task_group, _ = _get_task_group_and_task(__name__) - msg_copy = copy.deepcopy(messages) - has_upload = cls._preprocess_messages(model, msg_copy, api_key) - if has_upload: - headers = kwargs.pop('headers', {}) - headers['X-DashScope-OssResourceResolve'] = 'enable' - kwargs['headers'] = headers - input = {'messages': msg_copy} + input = {} + msg_copy = None + + if messages is not None and messages: + msg_copy = copy.deepcopy(messages) + has_upload = cls._preprocess_messages(model, msg_copy, api_key) + if has_upload: + headers = kwargs.pop('headers', {}) + headers['X-DashScope-OssResourceResolve'] = 'enable' + kwargs['headers'] = headers + + if text is not None and text: + input.update({'text': text}) + if msg_copy is not None: + input.update({'messages': msg_copy}) response = super().call(model=model, task_group=task_group, task=MultiModalConversation.task, @@ -145,9 +156,10 @@ class Models: async def call( cls, model: str, - messages: List, + messages: List = None, api_key: str = None, workspace: str = None, + text: str = None, **kwargs ) -> Union[MultiModalConversationResponse, Generator[ MultiModalConversationResponse, None, None]]: @@ -176,6 +188,7 @@ async def call( if None, will retrieve by rule [1]. [1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501 workspace (str): The dashscope workspace id. + text (str): The text to generate. **kwargs: stream(bool, `optional`): Enable server-sent events (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501 @@ -189,6 +202,8 @@ async def call( tokens with top_p probability mass. 
So 0.1 means only the tokens comprising the top 10% probability mass are considered[qwen-turbo,bailian-v1]. + voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on, + you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts. top_k(float, `optional`): Raises: @@ -199,18 +214,24 @@ async def call( Generator[MultiModalConversationResponse, None, None]]: If stream is True, return Generator, otherwise MultiModalConversationResponse. """ - if (messages is None or not messages): - raise InputRequired('prompt or messages is required!') if model is None or not model: raise ModelRequired('Model is required!') task_group, _ = _get_task_group_and_task(__name__) - msg_copy = copy.deepcopy(messages) - has_upload = cls._preprocess_messages(model, msg_copy, api_key) - if has_upload: - headers = kwargs.pop('headers', {}) - headers['X-DashScope-OssResourceResolve'] = 'enable' - kwargs['headers'] = headers - input = {'messages': msg_copy} + input = {} + msg_copy = None + + if messages is not None and messages: + msg_copy = copy.deepcopy(messages) + has_upload = cls._preprocess_messages(model, msg_copy, api_key) + if has_upload: + headers = kwargs.pop('headers', {}) + headers['X-DashScope-OssResourceResolve'] = 'enable' + kwargs['headers'] = headers + + if text is not None and text: + input.update({'text': text}) + if msg_copy is not None: + input.update({'messages': msg_copy}) response = await super().call(model=model, task_group=task_group, task=AioMultiModalConversation.task, diff --git a/dashscope/api_entities/dashscope_response.py b/dashscope/api_entities/dashscope_response.py index c5e1f88..ba735bb 100644 --- a/dashscope/api_entities/dashscope_response.py +++ b/dashscope/api_entities/dashscope_response.py @@ -152,6 +152,26 @@ def __init__(self, **kwargs) +@dataclass(init=False) +class Audio(DictMixin): + data: str + url: str + id: str + expires_at: int + + def __init__(self, + data: str = None, + url: 
str = None, + id: str = None, + expires_at: int = None, + **kwargs): + super().__init__(data=data, + url=url, + id=id, + expires_at=expires_at, + **kwargs) + + @dataclass(init=False) class GenerationOutput(DictMixin): text: str @@ -217,20 +237,25 @@ def from_api_response(api_response: DashScopeAPIResponse): @dataclass(init=False) class MultiModalConversationOutput(DictMixin): choices: List[Choice] + audio: Audio def __init__(self, text: str = None, finish_reason: str = None, choices: List[Choice] = None, + audio: Audio = None, **kwargs): chs = None if choices is not None: chs = [] for choice in choices: chs.append(Choice(**choice)) + if audio is not None: + audio = Audio(**audio) super().__init__(text=text, finish_reason=finish_reason, choices=chs, + audio=audio, **kwargs) @@ -238,15 +263,18 @@ def __init__(self, class MultiModalConversationUsage(DictMixin): input_tokens: int output_tokens: int + characters: int # TODO add image usage info. def __init__(self, input_tokens: int = 0, output_tokens: int = 0, + characters: int = 0, **kwargs): super().__init__(input_tokens=input_tokens, output_tokens=output_tokens, + characters=characters, **kwargs) @@ -378,7 +406,7 @@ def is_sentence_end(sentence: Dict[str, Any]) -> bool: """ result = False if sentence is not None and 'end_time' in sentence and sentence[ - 'end_time'] is not None: + 'end_time'] is not None: result = True return result @@ -445,8 +473,8 @@ class ImageSynthesisOutput(DictMixin): results: List[ImageSynthesisResult] def __init__(self, - task_id: str = None, - task_status: str = None, + task_id: str = None, + task_status: str = None, results: List[ImageSynthesisResult] = [], **kwargs): res = [] diff --git a/samples/test_qwen_tts.py b/samples/test_qwen_tts.py new file mode 100644 index 0000000..c570cf0 --- /dev/null +++ b/samples/test_qwen_tts.py @@ -0,0 +1,38 @@ +import os + +import dashscope +import logging + +logger = logging.getLogger('dashscope') +logger.setLevel(logging.DEBUG) +console_handler = 
logging.StreamHandler() +# create formatter +formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') +# add the formatter to the console handler +console_handler.setFormatter(formatter) + +# attach the console handler to the logger +logger.addHandler(console_handler) + +# switch stream or non-stream mode +use_stream = True + +response = dashscope.MultiModalConversation.call( + api_key=os.getenv('DASHSCOPE_API_KEY'), + model="qwen-tts", + text="Today is a wonderful day to build something people love!", + voice="Cherry", + stream=use_stream +) +if use_stream: + # print the audio data in stream mode + for chunk in response: + audio = chunk.output.audio + print("base64 audio data is: {}", chunk.output.audio.data) + if chunk.output.finish_reason == "stop": + print("finish at: {} ", chunk.output.audio.expires_at) +else: + # print the audio url in non-stream mode + print("synthesized audio url is: {}", response.output.audio.url) + print("finish at: {} ", response.output.audio.expires_at)