Merged
5 changes: 1 addition & 4 deletions fastdeploy/engine/engine.py
@@ -465,10 +465,7 @@ def add_requests(self, task, sampling_params=None, **kwargs):
request.sampling_params = sampling_params
request.preprocess_start_time = time.time()

enable_thinking = None
if kwargs is not None:
enable_thinking = kwargs.get("enable_thinking", None)
request = self.data_processor.process_request(request, self.cfg.max_model_len, enable_thinking=enable_thinking)
request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
request.prompt_token_ids_len = len(request.prompt_token_ids)
request.need_prefill_tokens = request.prompt_token_ids_len
input_ids_len = request.prompt_token_ids_len
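The net effect in `add_requests` is that `enable_thinking` is no longer special-cased: whatever keyword arguments the caller supplies travel to the data processor untouched. A minimal sketch of the new call path, assuming `engine`, `task`, and `sampling_params` are already set up (none of that setup is part of this diff):

```python
# chat_template_kwargs now rides along in **kwargs instead of being unpacked here.
engine.add_requests(
    task,
    sampling_params,
    chat_template_kwargs={"enable_thinking": False},
)
# Inside add_requests this reduces to a single pass-through call:
#   request = self.data_processor.process_request(
#       request, self.cfg.max_model_len, chat_template_kwargs={"enable_thinking": False}
#   )
```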
7 changes: 2 additions & 5 deletions fastdeploy/entrypoints/llm.py
@@ -248,7 +248,7 @@ def _add_request(
self,
prompts,
sampling_params,
chat_template_kwargs: Optional[dict[str, Any]] = None,
**kwargs,
):
"""
Add a request to the LLM Engine and return its request ID.
@@ -289,10 +289,7 @@ def _add_request(
current_sampling_params = sampling_params[i]
else:
current_sampling_params = sampling_params
enable_thinking = None
if chat_template_kwargs is not None:
enable_thinking = chat_template_kwargs.get("enable_thinking", None)
self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking)
self.llm_engine.add_requests(tasks, current_sampling_params, **kwargs)
return req_ids

def _decode_token(self, token_id: int) -> str:
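The `LLM` entrypoint gets the same simplification: `_add_request` no longer peeks inside `chat_template_kwargs`, it simply forwards `**kwargs` to the engine. A hedged sketch of the forwarding chain; the call site shown here is hypothetical, only `_add_request` and `add_requests` appear in this diff:

```python
# Hypothetical call site; `llm`, `prompts`, and `sampling_params` are assumed.
req_ids = llm._add_request(
    prompts,
    sampling_params,
    chat_template_kwargs={"enable_thinking": False},
)
# ...which now reaches the engine unchanged:
#   self.llm_engine.add_requests(tasks, current_sampling_params,
#                                chat_template_kwargs={"enable_thinking": False})
```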
19 changes: 18 additions & 1 deletion fastdeploy/input/ernie_processor.py
@@ -107,7 +107,16 @@ def process_request(self, request, max_model_len=None, **kwargs):
request.prompt_token_ids = token_ids
data_processor_logger.info(f"req_id:{request.request_id}, tokens:{tokens}, token_ids: {token_ids}")
else:
request.prompt_token_ids = self.messages2ids(request.to_dict())
task = request.to_dict()
chat_template_kwargs = kwargs.get("chat_template_kwargs")
if chat_template_kwargs:
if isinstance(chat_template_kwargs, dict):
for k, v in chat_template_kwargs.items():
if k not in task:
task[k] = v
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.prompt_token_ids = self.messages2ids(task)

if len(request.prompt_token_ids) == 0:
raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
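The validate-and-merge block above reappears almost verbatim in the other processors touched by this PR. A hypothetical helper (not part of the PR) that captures the pattern; note that keys already present on the task always win over `chat_template_kwargs`:

```python
def merge_chat_template_kwargs(task: dict, chat_template_kwargs) -> dict:
    """Copy chat_template_kwargs into task without overriding keys the task already has."""
    if not chat_template_kwargs:
        return task
    if not isinstance(chat_template_kwargs, dict):
        raise ValueError("Invalid input: chat_template_kwargs must be a dict")
    for k, v in chat_template_kwargs.items():
        task.setdefault(k, v)  # equivalent to: if k not in task: task[k] = v
    return task
```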
@@ -162,6 +171,14 @@ def process_request_dict(self, request, max_model_len=None):
req_id = request.get("request_id", None)
data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
else:
chat_template_kwargs = request.get("chat_template_kwargs")
if chat_template_kwargs:
if isinstance(chat_template_kwargs, dict):
for k, v in chat_template_kwargs.items():
if k not in request:
request[k] = v
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request["prompt_token_ids"] = self.messages2ids(request)
if len(request["prompt_token_ids"]) == 0:
raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
11 changes: 10 additions & 1 deletion fastdeploy/input/ernie_vl_processor.py
@@ -110,7 +110,7 @@ def set_value(req, key, value):
def process_request(self, request, max_model_len=None, **kwargs):
"""process the input data"""
task = request.to_dict()
task["enable_thinking"] = kwargs.get("enable_thinking", True)
task["chat_template_kwargs"] = kwargs.get("chat_template_kwargs")
self.process_request_dict(task, max_model_len)
request = Request.from_dict(task)
request = self._apply_default_parameters(request)
@@ -217,6 +217,15 @@ def process_request_dict(self, request, max_model_len=None):
elif request.get("messages"):
messages = request["messages"]
self._check_mm_limits(messages)
chat_template_kwargs = request.get("chat_template_kwargs")
if chat_template_kwargs:
if isinstance(chat_template_kwargs, dict):
for k, v in chat_template_kwargs.items():
if k not in request:
request[k] = v
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.setdefault("enable_thinking", True)
outputs = self.ernie_processor.request2ids(request)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
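The ordering above matters: values copied from `chat_template_kwargs` never overwrite keys already present on the request, and `enable_thinking` only falls back to `True` when neither source sets it. A plain-dict sketch of that precedence, with no FastDeploy imports:

```python
# Mirrors the merge loop and the setdefault above.
request = {
    "messages": [{"role": "user", "content": "hi"}],
    "chat_template_kwargs": {"enable_thinking": False},
}
for k, v in (request.get("chat_template_kwargs") or {}).items():
    if k not in request:
        request[k] = v
request.setdefault("enable_thinking", True)
assert request["enable_thinking"] is False  # the caller's kwarg survives, not the default
```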
20 changes: 18 additions & 2 deletions fastdeploy/input/text_processor.py
@@ -207,7 +207,6 @@ def process_request(self, request, max_model_len=None, **kwargs):
request = self._apply_default_parameters(request)
if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
request.eos_token_ids = self.eos_token_ids

stop_sequences = request.get("stop", [])
if stop_sequences is not None and len(stop_sequences) != 0:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
@@ -221,7 +220,15 @@
if self.tokenizer.chat_template is None:
raise ValueError("This model does not support chat_template.")
task = request.to_dict()
task["enable_thinking"] = kwargs.get("enable_thinking", True)
chat_template_kwargs = kwargs.get("chat_template_kwargs")
if chat_template_kwargs:
if isinstance(chat_template_kwargs, dict):
for k, v in chat_template_kwargs.items():
if k not in task:
task[k] = v
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
task.setdefault("enable_thinking", True)
request.prompt_token_ids = self.messages2ids(task)
else:
raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
@@ -271,6 +278,15 @@ def process_request_dict(self, request, max_model_len=None, **kwargs):
elif "messages" in request:
if self.tokenizer.chat_template is None:
raise ValueError("This model does not support chat_template.")
chat_template_kwargs = request.get("chat_template_kwargs")
if chat_template_kwargs:
if isinstance(chat_template_kwargs, dict):
for k, v in chat_template_kwargs.items():
if k not in request:
request[k] = v
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.setdefault("enable_thinking", True)
request["prompt_token_ids"] = self.messages2ids(request)
else:
raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
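Because the whole dict is merged rather than one hard-coded key, any variable the model's chat template understands can now be threaded through. A sketch under stated assumptions: `processor` is an initialized text processor and the template reads a hypothetical `system_suffix` variable:

```python
request = {
    "request_id": "demo-1",  # illustrative values throughout
    "messages": [{"role": "user", "content": "hello"}],
    "chat_template_kwargs": {"enable_thinking": True, "system_suffix": "Be brief."},
}
processor.process_request_dict(request, max_model_len=8192)
# After the merge, request["system_suffix"] == "Be brief." and is visible to
# messages2ids() when the chat template is rendered.
```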
1 change: 1 addition & 0 deletions test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -508,6 +508,7 @@ def test_chat_with_thinking(openai_client, capsys):
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
assert response.choices[0].message.reasoning_content is None
assert "</think>" not in response.choices[0].message.content

# enable thinking, streaming
reasoning_max_tokens = 3
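The added assertion tightens the disabled-thinking check; the enabled path follows just below in the test (truncated here). A hedged sketch of what the complementary non-streaming check could look like, with placeholder model and messages rather than the test's real fixtures:

```python
# Placeholder model name and messages; the real test reuses its own setup.
response = openai_client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "hello"}],
    extra_body={"chat_template_kwargs": {"enable_thinking": True}},
)
# With thinking enabled, the reasoning channel should be populated.
assert response.choices[0].message.reasoning_content is not None
```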