diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
index dc85b78c04c..a3adeddf122 100644
--- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
+++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
@@ -69,7 +69,7 @@ def __init__(
             tokenizer=self.tokenizer,
             **processor_kwargs,
         )
-
+        self.image_patch_id = self.processor.image_token_id
         self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
 
     def process_request(self, request, max_model_len=None, **kwargs):
@@ -249,6 +249,16 @@ def process_request_dict(self, request, max_model_len=None):
         # Handle continuation of previous generation by appending existing tokens
         if metadata and metadata.get("generated_token_ids"):
             self.append_generated_tokens(outputs, metadata["generated_token_ids"])
+
+        enable_thinking = False
+        if metadata:
+            enable_thinking = metadata.get("enable_thinking", False)
+
+        if request.get("chat_template_kwargs"):
+            chat_template_kwargs = request.get("chat_template_kwargs")
+            enable_thinking = chat_template_kwargs.get("enable_thinking", False)
+            request["enable_thinking"] = enable_thinking
+
         outputs = self.pack_outputs(outputs)
         request["prompt_token_ids"] = outputs["input_ids"].tolist()
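
For reference, a minimal standalone sketch of the `enable_thinking` resolution added in the second hunk: `metadata` provides a default, and `chat_template_kwargs` overrides it (and writes it back onto the request) when present. The `resolve_enable_thinking` helper and the plain-dict `request`/`metadata` arguments are hypothetical stand-ins for illustration, not FastDeploy APIs.

```python
from typing import Optional


def resolve_enable_thinking(request: dict, metadata: Optional[dict]) -> bool:
    """Mirror the precedence used in the diff."""
    # Default comes from metadata, if any.
    enable_thinking = False
    if metadata:
        enable_thinking = metadata.get("enable_thinking", False)

    # chat_template_kwargs takes precedence when supplied; note that, as in
    # the hunk, request["enable_thinking"] is only set in this branch.
    chat_template_kwargs = request.get("chat_template_kwargs")
    if chat_template_kwargs:
        enable_thinking = chat_template_kwargs.get("enable_thinking", False)
        request["enable_thinking"] = enable_thinking

    return enable_thinking


# Example: chat_template_kwargs overrides the metadata default.
req = {"chat_template_kwargs": {"enable_thinking": True}}
print(resolve_enable_thinking(req, {"enable_thinking": False}))  # True
print(req["enable_thinking"])  # True
```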