Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions src/engine/openvino/qwen3_tts/qwen3_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,19 +123,20 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
CP_MAX_POS, CP_HEAD_DIM, CP_ROPE_THETA,
)

self._text_model_c = core.compile_model(str(p / "text_model.xml"), device)
self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device)
_hint = {"PERFORMANCE_HINT": "LATENCY"}
self._text_model_c = core.compile_model(str(p / "text_model.xml"), device, _hint)
self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device, _hint)
# Code predictor: many tiny inferences per frame; CPU avoids GPU launch/transfer overhead.
self._cp_codec_emb_c = core.compile_model(str(p / "cp_codec_embedding.xml"), "CPU")
self._cp_codec_emb_c = core.compile_model(str(p / "cp_codec_embedding.xml"), "CPU", _hint)
# Speech decoder: single-shot vocoding; CPU fits typical sequence lengths without GPU overhead.
self._decoder_c = core.compile_model(
str(p / "speech_tokenizer" / "speech_decoder.xml"), "CPU",
str(p / "speech_tokenizer" / "speech_decoder.xml"), "CPU", _hint,
)
self._decoder_input_name = self._decoder_c.input(0).get_any_name()

talker_c = core.compile_model(str(p / "talker.xml"), device)
talker_c = core.compile_model(str(p / "talker.xml"), device, _hint)
self._talker_req = talker_c.create_infer_request()
cp_c = core.compile_model(str(p / "code_predictor.xml"), "CPU")
cp_c = core.compile_model(str(p / "code_predictor.xml"), "CPU", _hint)
self._cp_req = cp_c.create_infer_request()
if "GPU" in device:
logger.info(
Expand All @@ -147,10 +148,10 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
self._speech_enc_c = None
if load_config.model_type == ModelType.QWEN3_TTS_VOICE_CLONE:
self._speaker_enc_c = core.compile_model(
str(p / "speaker_encoder.xml"), device,
str(p / "speaker_encoder.xml"), device, _hint,
)
self._speech_enc_c = core.compile_model(
str(p / "speech_tokenizer" / "speech_encoder.xml"), device,
str(p / "speech_tokenizer" / "speech_encoder.xml"), device, _hint,
)

self._loaded = True
Expand Down
7 changes: 5 additions & 2 deletions src/server/models/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ class OV_Qwen3TTSGenConfig(BaseModel):
subtalker_top_p: float = Field(default=1.0, description="Nucleus filter for code predictor.")
subtalker_temperature: float = Field(default=0.9, description="Temperature for code predictor.")
# --- streaming (HTTP: audio/L16 chunked response when stream=True) ---

# defaults taken from https://github.com/QwenLM/Qwen3-TTS/blob/022e286b98fbec7e1e916cb940cdf532cd9f488e/qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py#L886
# These apply only to the 12.5 Hz tokenizer model.
stream: bool = Field(default=True, description="Enable streaming audio output (chunked PCM).")
stream_chunk_frames: int = Field(default=50, description="Codec frames per streaming chunk.")
stream_left_context: int = Field(default=25, description="Left context frames for chunk boundary continuity.")
stream_chunk_frames: int = Field(default=300, description="Codec frames per streaming chunk. Audio codebooks are autoregressive — each set depends on the previous — so coherent chunks require enough frames for stable prosody.")
stream_left_context: int = Field(default=25, description="Left context frames for chunk boundary continuity (matches upstream Qwen3-TTS left_context_size=25).")