diff --git a/src/engine/openvino/qwen3_tts/qwen3_tts.py b/src/engine/openvino/qwen3_tts/qwen3_tts.py index fabcdd0..b7ceaa6 100644 --- a/src/engine/openvino/qwen3_tts/qwen3_tts.py +++ b/src/engine/openvino/qwen3_tts/qwen3_tts.py @@ -123,19 +123,20 @@ def load_model(self, load_config: ModelLoadConfig) -> None: CP_MAX_POS, CP_HEAD_DIM, CP_ROPE_THETA, ) - self._text_model_c = core.compile_model(str(p / "text_model.xml"), device) - self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device) + _hint = {"PERFORMANCE_HINT": "LATENCY"} + self._text_model_c = core.compile_model(str(p / "text_model.xml"), device, _hint) + self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device, _hint) # Code predictor: many tiny inferences per frame; CPU avoids GPU launch/transfer overhead. - self._cp_codec_emb_c = core.compile_model(str(p / "cp_codec_embedding.xml"), "CPU") + self._cp_codec_emb_c = core.compile_model(str(p / "cp_codec_embedding.xml"), "CPU", _hint) # Speech decoder: single-shot vocoding; CPU fits typical sequence lengths without GPU overhead. 
self._decoder_c = core.compile_model( - str(p / "speech_tokenizer" / "speech_decoder.xml"), "CPU", + str(p / "speech_tokenizer" / "speech_decoder.xml"), "CPU", _hint, ) self._decoder_input_name = self._decoder_c.input(0).get_any_name() - talker_c = core.compile_model(str(p / "talker.xml"), device) + talker_c = core.compile_model(str(p / "talker.xml"), device, _hint) self._talker_req = talker_c.create_infer_request() - cp_c = core.compile_model(str(p / "code_predictor.xml"), "CPU") + cp_c = core.compile_model(str(p / "code_predictor.xml"), "CPU", _hint) self._cp_req = cp_c.create_infer_request() if "GPU" in device: logger.info( @@ -147,10 +148,10 @@ def load_model(self, load_config: ModelLoadConfig) -> None: self._speech_enc_c = None if load_config.model_type == ModelType.QWEN3_TTS_VOICE_CLONE: self._speaker_enc_c = core.compile_model( - str(p / "speaker_encoder.xml"), device, + str(p / "speaker_encoder.xml"), device, _hint, ) self._speech_enc_c = core.compile_model( - str(p / "speech_tokenizer" / "speech_encoder.xml"), device, + str(p / "speech_tokenizer" / "speech_encoder.xml"), device, _hint, ) self._loaded = True diff --git a/src/server/models/openvino.py b/src/server/models/openvino.py index e789eed..ba265e9 100644 --- a/src/server/models/openvino.py +++ b/src/server/models/openvino.py @@ -162,6 +162,9 @@ class OV_Qwen3TTSGenConfig(BaseModel): subtalker_top_p: float = Field(default=1.0, description="Nucleus filter for code predictor.") subtalker_temperature: float = Field(default=0.9, description="Temperature for code predictor.") # --- streaming (HTTP: audio/L16 chunked response when stream=True) --- + + # Defaults taken from https://github.com/QwenLM/Qwen3-TTS/blob/022e286b98fbec7e1e916cb940cdf532cd9f488e/qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py#L886 + # These defaults apply only to the 12.5 Hz tokenizer model.
stream: bool = Field(default=True, description="Enable streaming audio output (chunked PCM).") - stream_chunk_frames: int = Field(default=50, description="Codec frames per streaming chunk.") - stream_left_context: int = Field(default=25, description="Left context frames for chunk boundary continuity.") + stream_chunk_frames: int = Field(default=300, description="Codec frames per streaming chunk. Audio codebooks are autoregressive — each frame's set depends on the previous one — so coherent chunks require enough frames for stable prosody.") + stream_left_context: int = Field(default=25, description="Left context frames for chunk boundary continuity (matches upstream Qwen3-TTS left_context_size=25).")