Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions src/engine/openvino/qwen3_tts/qwen3_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,19 +123,20 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
CP_MAX_POS, CP_HEAD_DIM, CP_ROPE_THETA,
)

self._text_model_c = core.compile_model(str(p / "text_model.xml"), device)
self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device)
_hint = {"PERFORMANCE_HINT": "LATENCY"}
self._text_model_c = core.compile_model(str(p / "text_model.xml"), device, _hint)
self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device, _hint)
# Code predictor: many tiny inferences per frame; CPU avoids GPU launch/transfer overhead.
self._cp_codec_emb_c = core.compile_model(str(p / "cp_codec_embedding.xml"), "CPU")
self._cp_codec_emb_c = core.compile_model(str(p / "cp_codec_embedding.xml"), "CPU", _hint)
# Speech decoder: single-shot vocoding; CPU fits typical sequence lengths without GPU overhead.
self._decoder_c = core.compile_model(
str(p / "speech_tokenizer" / "speech_decoder.xml"), "CPU",
str(p / "speech_tokenizer" / "speech_decoder.xml"), "CPU", _hint,
)
self._decoder_input_name = self._decoder_c.input(0).get_any_name()

talker_c = core.compile_model(str(p / "talker.xml"), device)
talker_c = core.compile_model(str(p / "talker.xml"), device, _hint)
self._talker_req = talker_c.create_infer_request()
cp_c = core.compile_model(str(p / "code_predictor.xml"), "CPU")
cp_c = core.compile_model(str(p / "code_predictor.xml"), "CPU", _hint)
self._cp_req = cp_c.create_infer_request()
if "GPU" in device:
logger.info(
Expand All @@ -147,10 +148,10 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
self._speech_enc_c = None
if load_config.model_type == ModelType.QWEN3_TTS_VOICE_CLONE:
self._speaker_enc_c = core.compile_model(
str(p / "speaker_encoder.xml"), device,
str(p / "speaker_encoder.xml"), device, _hint,
)
self._speech_enc_c = core.compile_model(
str(p / "speech_tokenizer" / "speech_encoder.xml"), device,
str(p / "speech_tokenizer" / "speech_encoder.xml"), device, _hint,
)

self._loaded = True
Expand Down
7 changes: 5 additions & 2 deletions src/server/models/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ class OV_Qwen3TTSGenConfig(BaseModel):
subtalker_top_p: float = Field(default=1.0, description="Nucleus filter for code predictor.")
subtalker_temperature: float = Field(default=0.9, description="Temperature for code predictor.")
# --- streaming (HTTP: audio/L16 chunked response when stream=True) ---

# defaults taken from https://github.com/QwenLM/Qwen3-TTS/blob/022e286b98fbec7e1e916cb940cdf532cd9f488e/qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py#L886
# These apply only to the 12.5 Hz tokenizer model.
stream: bool = Field(default=True, description="Enable streaming audio output (chunked PCM).")
stream_chunk_frames: int = Field(default=50, description="Codec frames per streaming chunk.")
stream_left_context: int = Field(default=25, description="Left context frames for chunk boundary continuity.")
stream_chunk_frames: int = Field(default=300, description="Codec frames per streaming chunk. Audio codebooks are autoregressive — each set depends on the previous — so coherent chunks require enough frames for stable prosody.")
stream_left_context: int = Field(default=25, description="Left context frames for chunk boundary continuity (matches upstream Qwen3-TTS left_context_size=25).")