Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion dashscope/audio/qwen_omni/omni_realtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import platform
import threading
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum, unique
from typing import List, Optional
Expand All @@ -29,6 +30,26 @@ def on_event(self, message: str) -> None:
pass


@dataclass
class TranslationParams:
    """Session parameters for live-translation realtime models
    (qwen3-livetranslate-flash-realtime and later). Do not set for
    other models.
    """
    # Target translation language; None lets the service pick its default.
    # Optional[str] replaces the original `str = field(default=None)`,
    # which contradicted its own annotation and used `field` redundantly.
    language: Optional[str] = None


@dataclass
class TranscriptionParams:
    """Session parameters for realtime transcription models
    (qwen3-asr-flash-realtime and later). Do not set for other models.
    """
    # Language of the incoming audio; None lets the service auto-detect.
    language: Optional[str] = None
    # Input audio sample rate in Hz.
    sample_rate: int = 16000
    # Input audio encoding, e.g. "pcm".
    input_audio_format: str = "pcm"
    # Pre-built corpus object for recognition biasing; mutually exclusive
    # with corpus_text at the call site (corpus_text wins when both set).
    corpus: Optional[dict] = None
    # Plain-text corpus; the caller wraps it as {"text": corpus_text}.
    corpus_text: Optional[str] = None

Comment thread
songguocola marked this conversation as resolved.

@unique
class AudioFormat(Enum):
# format, sample_rate, channels, bit_rate, name
Expand Down Expand Up @@ -171,7 +192,7 @@ def __send_str(self, data: str, enable_log: bool = True):

def update_session(self,
output_modalities: List[MultiModality],
voice: str,
voice: str = None,
input_audio_format: AudioFormat = AudioFormat.
PCM_16000HZ_MONO_16BIT,
output_audio_format: AudioFormat = AudioFormat.
Expand All @@ -184,6 +205,8 @@ def update_session(self,
turn_detection_threshold: float = 0.2,
turn_detection_silence_duration_ms: int = 800,
turn_detection_param: dict = None,
translation_params: TranslationParams = None,
transcription_params: TranscriptionParams = None,
**kwargs) -> None:
'''
update session configuration, should be used before create response
Expand All @@ -206,6 +229,13 @@ def update_session(self,
In a quiet environment, it may be necessary to decrease the threshold to improve sensitivity
turn_detection_silence_duration_ms: int
duration of silence in milliseconds to detect turn, range [200, 6000]
translation_params: TranslationParams
translation params, include language. Only effective with qwen3-livetranslate-flash-realtime model or
further models. Do not set this parameter for other models.
transcription_params: TranscriptionParams
transcription params, include language, sample_rate, input_audio_format, corpus.
Only effective with qwen3-asr-flash-realtime model or
further models. Do not set this parameter for other models.
'''
self.config = {
'modalities': [m.value for m in output_modalities],
Expand All @@ -230,6 +260,20 @@ def update_session(self,
self.config['turn_detection'].update(turn_detection_param)
else:
self.config['turn_detection'] = None
if translation_params is not None:
self.config['translation'] = {
'language': translation_params.language
}
if transcription_params is not None:
self.config['language'] = transcription_params.language
if transcription_params.corpus is not None:
self.config['corpus'] = transcription_params.corpus
if transcription_params.corpus_text is not None:
self.config['corpus'] = {
"text": transcription_params.corpus_text
}
Comment thread
songguocola marked this conversation as resolved.
self.config['input_audio_format'] = transcription_params.input_audio_format
self.config['sample_rate']= transcription_params.sample_rate
Comment thread
songguocola marked this conversation as resolved.
self.config.update(kwargs)
self.__send_str(
json.dumps({
Expand Down
13 changes: 9 additions & 4 deletions dashscope/audio/tts_v2/enrollment.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,20 +68,25 @@ def __call_with_input(self, input):
logger.debug('>>>>recv', response)
return response

def create_voice(self, target_model: str, prefix: str, url: str) -> str:
def create_voice(self, target_model: str, prefix: str, url: str, language_hints: List[str] = None) -> str:
'''
创建新克隆音色
param: target_model 克隆音色对应的语音合成模型版本
param: prefix 音色自定义前缀,仅允许数字和小写字母,小于十个字符。
param: url 用于克隆的音频文件url
param: language_hints 克隆音色目标语言
return: voice_id
'''
response = self.__call_with_input(input={

input_params = {
'action': 'create_voice',
'target_model': target_model,
'prefix': prefix,
'url': url,
}, )
'url': url
}
if language_hints is not None:
input_params['language_hints'] = language_hints
response = self.__call_with_input(input_params)
self._last_request_id = response.request_id
if response.status_code == 200:
return response.output['voice_id']
Expand Down
34 changes: 32 additions & 2 deletions dashscope/multimodal/multimodal_request_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,24 +72,54 @@ def to_dict(self):
"directive": self.directive,
"dialog_id": self.dialog_id
}
@dataclass
class AsrPostProcessing:
    """ASR post-processing options: a list of word-replacement rules
    applied to recognition results.
    """
    # List of ReplaceWord rules; None or [] means no post-processing.
    replace_words: list = None

    def to_dict(self):
        """Return the wire-format dict, or None when there are no rules
        (callers then omit the field from the payload entirely).
        """
        # Truthiness check covers both None and the empty list, replacing
        # the original pair of separate `is None` / `len() == 0` returns.
        if not self.replace_words:
            return None
        return {
            "replace_words": [word.to_dict() for word in self.replace_words]
        }

@dataclass
class ReplaceWord:
    """A single ASR replacement rule: substitute `source` with `target`
    under the given match mode.
    """
    source: str = field(default=None)
    target: str = field(default=None)
    match_mode: str = field(default=None)

    def to_dict(self):
        """Serialize this rule to its wire-format dict."""
        payload = {}
        for key in ("source", "target", "match_mode"):
            payload[key] = getattr(self, key)
        return payload

@dataclass
class Upstream:
    """Upstream (client -> server) channel configuration."""
    # Uplink audio format; "pcm" (default) or "opus".
    audio_format: str = field(default="pcm")
    # Uplink type: "AudioOnly" for voice-only calls; "AudioAndVideo"
    # to also upload video.
    type: str = field(default="AudioOnly")
    # Client interaction mode: push2talk / tap2talk / duplex.
    mode: str = field(default="tap2talk")
    # Uplink audio sample rate in Hz.
    sample_rate: int = field(default=16000)
    # Optional hot-word vocabulary id; included in the payload even
    # when None (service-side contract — TODO confirm it tolerates null).
    vocabulary_id: str = field(default=None)
    # Optional ASR post-processing rules. Quoted forward reference keeps
    # the annotation lazy, so this class no longer depends on
    # AsrPostProcessing being defined first.
    asr_post_processing: "AsrPostProcessing" = field(default=None)
    # Extra provider-specific keys merged verbatim into the payload.
    pass_through_params: dict = field(default=None)

    def to_dict(self):
        """Serialize to the wire-format dict expected by the service."""
        upstream: dict = {
            "type": self.type,
            "mode": self.mode,
            "audio_format": self.audio_format,
            "sample_rate": self.sample_rate,
            "vocabulary_id": self.vocabulary_id,
        }
        if self.asr_post_processing is not None:
            upstream["asr_post_processing"] = self.asr_post_processing.to_dict()
        if self.pass_through_params is not None:
            # Merged last: pass-through keys can override the fixed ones.
            upstream.update(self.pass_through_params)
        return upstream
Expand Down