From 87c0b049416af199281ffdb248d719d478053d37 Mon Sep 17 00:00:00 2001 From: zeroRains Date: Thu, 24 Jul 2025 13:24:08 +0800 Subject: [PATCH 1/2] update flake8 version to support pre-commit in python3.12 --- .flake8 | 2 +- .pre-commit-config.yaml | 4 +- .../python/ops/test_get_padding_offset.py | 8 +- .../cache_manager/prefix_cache_manager.py | 5 +- .../custom_all_reduce/cuda_wrapper.py | 10 +- fastdeploy/entrypoints/openai/protocol.py | 2 +- fastdeploy/entrypoints/openai/serving_chat.py | 24 +-- .../entrypoints/openai/serving_completion.py | 50 +++--- fastdeploy/input/ernie_processor.py | 10 +- fastdeploy/input/mm_processor/process.py | 3 +- fastdeploy/input/text_processor.py | 21 +-- .../cudagraph_piecewise_backend.py | 5 +- .../layers/attention/mla_attention_backend.py | 3 +- .../layers/backends/dcu/__init__.py | 8 +- .../layers/backends/dcu/top_p_sampling.py | 10 +- fastdeploy/model_executor/layers/moe/ep.py | 8 +- .../layers/moe/fused_moe_marlin_backend.py | 27 +++- .../layers/moe/fused_moe_triton_backend.py | 14 +- fastdeploy/model_executor/layers/moe/moe.py | 42 +++-- .../sample/ops/apply_penalty_multi_scores.py | 4 +- .../layers/sample/ops/top_k_top_p_sampling.py | 1 + fastdeploy/model_executor/layers/utils.py | 8 +- .../ops/triton_ops/triton_utils.py | 28 ++-- fastdeploy/rl/rollout_model.py | 44 +++--- pyproject.toml | 2 +- test/ci_use/EB_Lite/test_EB_Lite_serving.py | 91 +++++------ .../EB_VL_Lite/test_EB_VL_Lite_serving.py | 146 +++++++++--------- .../Qwen3-MoE/test_Qwen3-MoE_serving.py | 4 +- 28 files changed, 316 insertions(+), 268 deletions(-) diff --git a/.flake8 b/.flake8 index 869c57d3e61..1656330a998 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,5 @@ [flake8] -ignore = E203, E402, E501, E731, E741, W503, W605, E722 +ignore = E203, E402, E501, E731, E741, W503, W605, E722, E231, W604, E702, E226, E221, E713, E271 max-line-length = 119 # E402: module level import not at top of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5939966c88..8c0fec84a19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_stages: # - manual # Run in CI repos: - repo: https://github.com/psf/black.git - rev: 22.8.0 + rev: 25.1.0 hooks: - id: black files: \.(py|pyi)$ @@ -18,7 +18,7 @@ repos: hooks: - id: isort - repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 + rev: 7.0.0 hooks: - id: flake8 # 代码检查 diff --git a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py index 441912a6d97..614386488a6 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py +++ b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py @@ -29,7 +29,13 @@ ids_len = seq_lens[i, 0] input_ids[i, 0:ids_len] = np.random.randint(1, 10, seq_lens[i, 0], "int64") -(x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k,) = get_padding_offset( +( + x_remove_padding, + cum_offsets_out, + padding_offset, + cu_seqlens_q, + cu_seqlens_k, +) = get_padding_offset( paddle.to_tensor(input_ids), paddle.to_tensor(cum_offset), paddle.to_tensor(token_num), diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 0114914bb33..b403d394462 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -473,7 +473,10 @@ def request_block_ids(self, task, block_size, dec_token_num, *args): current_time = time.time() self._update_matched_node_info(req_id, match_block_node, current_time) # 2. prepare cache - (gpu_recv_block_ids, gpu_extra_block_ids,) = self._prepare_cache( + ( + gpu_recv_block_ids, + gpu_extra_block_ids, + ) = self._prepare_cache( req_id, input_ids, block_size, diff --git a/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py b/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py index ac321a58914..7bec993d953 100644 --- a/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py +++ b/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py @@ -113,10 +113,7 @@ class CudaRTLibrary: Function( "cudaStreamIsCapturing", cudaError_t, - [ - cudaStream_t, - ctypes.POINTER(cudaStreamCaptureStatus) - ] + [cudaStream_t, ctypes.POINTER(cudaStreamCaptureStatus)], ), ] @@ -197,9 +194,8 @@ def cudaIpcOpenMemHandle(self, handle: cudaIpcMemHandle_t) -> ctypes.c_void_p: self.funcs["cudaIpcOpenMemHandle"](ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess) ) return devPtr + def cudaStreamIsCapturing(self, stream: cudaStream_t) -> ctypes.c_int: is_capturing = ctypes.c_int() - self.CUDART_CHECK( - self.funcs["cudaStreamIsCapturing"](stream, is_capturing) - ) + self.CUDART_CHECK(self.funcs["cudaStreamIsCapturing"](stream, is_capturing)) return is_capturing diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index 66076bedf10..e3c759e5739 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -478,7 +478,7 @@ class ChatCompletionRequest(BaseModel): top_p: Optional[float] = None top_k: Optional[int] = None min_p: Optional[float] = None - user: Optional[str] = None + user: Optional[str] = None metadata: Optional[dict] = None extra_body: Optional[dict] = None return_token_ids: Optional[bool] = False diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 611b0cb8de7..d79abdb39dc 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -144,7 +144,9 @@ async def chat_completion_stream_generator( if request.metadata is not None: enable_thinking = request.metadata.get("enable_thinking") include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False) - enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False)) + enable_return_token_ids = request.return_token_ids or ( + request.extra_body is not None and request.extra_body.get("return_token_ids", False) + ) while num_choices > 0: try: raw_data = await asyncio.wait_for(dealer.read(), timeout=10) @@ -186,13 +188,13 @@ async def chat_completion_stream_generator( choice = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage( - role="assistant", - content="", - reasoning_content="", + role="assistant", + content="", + reasoning_content="", tool_calls=None, prompt_token_ids=None, completion_token_ids=None, - ) + ), ) if enable_return_token_ids: choice.delta.prompt_token_ids = list(prompt_token_ids) @@ -231,10 +233,10 @@ async def chat_completion_stream_generator( previous_num_tokens += len(output["token_ids"]) delta_message = DeltaMessage( - content=delta_text, - reasoning_content=output.get("reasoning_content"), \ + content=delta_text, + reasoning_content=output.get("reasoning_content"), prompt_token_ids=None, - completion_token_ids=None, + completion_token_ids=None, tool_calls=output.get("tool_call_content", []), ) @@ -322,7 +324,9 @@ async def chat_completion_full_generator( final_res = None enable_thinking = None include_stop_str_in_output = False - enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False)) + enable_return_token_ids = request.return_token_ids or ( + request.extra_body is not None and request.extra_body.get("return_token_ids", False) + ) try: dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc") dealer.write([b"", request_id.encode("utf-8")]) @@ -396,7 +400,7 @@ async def chat_completion_full_generator( reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), prompt_token_ids=prompt_token_ids if enable_return_token_ids else None, - completion_token_ids=completion_token_ids if enable_return_token_ids else None, + completion_token_ids=(completion_token_ids if enable_return_token_ids else None), ) logprobs_full_res = None if logprob_contents: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 34f71240946..970544555e2 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -229,7 +229,9 @@ async def completion_stream_generator( model=model_name, choices=choices, ) - enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False)) + enable_return_token_ids = request.return_token_ids or ( + request.extra_body is not None and request.extra_body.get("return_token_ids", False) + ) current_waiting_time = 0 while num_choices > 0: try: @@ -258,12 +260,16 @@ async def completion_stream_generator( id=request_id, created=created_time, model=model_name, - choices=[CompletionResponseStreamChoice( - index=idx, - text="", - prompt_token_ids=list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None, - completion_token_ids=None, - )] + choices=[ + CompletionResponseStreamChoice( + index=idx, + text="", + prompt_token_ids=( + list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None + ), + completion_token_ids=None, + ) + ], ) yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" first_iteration[idx] = False @@ -277,15 +283,17 @@ async def completion_stream_generator( output = res["outputs"] - choices.append(CompletionResponseStreamChoice( - index=idx, - text=output["text"], - prompt_token_ids=None, - completion_token_ids=output.get("token_ids") if enable_return_token_ids else None, - tool_calls=output.get("tool_call_content"), - reasoning_content=output.get("reasoning_content"), - arrival_time=arrival_time - )) + choices.append( + CompletionResponseStreamChoice( + index=idx, + text=output["text"], + prompt_token_ids=None, + completion_token_ids=(output.get("token_ids") if enable_return_token_ids else None), + tool_calls=output.get("tool_call_content"), + reasoning_content=output.get("reasoning_content"), + arrival_time=arrival_time, + ) + ) if res["finished"]: if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens: chunk.choices[0].finish_reason = "stop" @@ -344,12 +352,14 @@ def request_output_to_completion_response( created_time: int, model_name: str, prompt_batched_token_ids: list(), - completion_batched_token_ids: list() + completion_batched_token_ids: list(), ) -> CompletionResponse: choices: List[CompletionResponseChoice] = [] num_prompt_tokens = 0 num_generated_tokens = 0 - enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False)) + enable_return_token_ids = request.return_token_ids or ( + request.extra_body is not None and request.extra_body.get("return_token_ids", False) + ) for idx in range(len(final_res_batch)): final_res = final_res_batch[idx] @@ -376,8 +386,8 @@ def request_output_to_completion_response( index=len(choices), text=output_text, prompt_token_ids=prompt_token_ids if enable_return_token_ids else None, - completion_token_ids=completion_token_ids if enable_return_token_ids else None, - reasoning_content=output.get('reasoning_content'), + completion_token_ids=(completion_token_ids if enable_return_token_ids else None), + reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), logprobs=None, finish_reason=None, diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index a56c7f9fb06..06d198db17d 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -99,8 +99,7 @@ def process_request(self, request, max_model_len=None, **kwargs): if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: if request.prompt is None and request.messages is None: - raise ValueError( - f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.") + raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.") if request.prompt is not None: prompt = request.prompt if request.prompt is not None else request.messages[0] prompt = prompt[0] if isinstance(prompt, list) else prompt @@ -164,8 +163,8 @@ def process_request_dict(self, request, max_model_len=None): req_id = request.get("request_id", None) data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") else: - request['prompt_token_ids'] = self.messages2ids(request) - if len(request['prompt_token_ids']) == 0: + request["prompt_token_ids"] = self.messages2ids(request) + if len(request["prompt_token_ids"]) == 0: raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs") # truncate prompts that exceed the length limit @@ -246,8 +245,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): if is_end: full_text = previous_texts + delta_text if enable_thinking and self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content else: diff --git a/fastdeploy/input/mm_processor/process.py b/fastdeploy/input/mm_processor/process.py index 566921a6011..ea2559a0fef 100644 --- a/fastdeploy/input/mm_processor/process.py +++ b/fastdeploy/input/mm_processor/process.py @@ -507,5 +507,6 @@ def apply_chat_template(self, request): tokens = self.tokenizer.tokenize(prompt_token_str) token_ids = self.tokenizer.convert_tokens_to_ids(tokens) data_processor_logger.info( - f"req_id:{request.get('request_id', ''),} tokens: {tokens}, token_ids: {token_ids}") + f"req_id:{request.get('request_id', ''), } tokens: {tokens}, token_ids: {token_ids}" + ) return token_ids diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index a9f8c2c4971..664868a595e 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -239,9 +239,7 @@ def process_request(self, request, max_model_len=None, **kwargs): task["enable_thinking"] = kwargs.get("enable_thinking", True) request.prompt_token_ids = self.messages2ids(task) else: - raise ValueError( - f"The request should have `input_ids`, `text` or `messages`: {request}." - ) + raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") if len(request.prompt_token_ids) == 0: raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs") if request.get("max_tokens") is None: @@ -281,18 +279,16 @@ def process_request_dict(self, request, max_model_len=None, **kwargs): data_processor_logger.info(f"Processing request {request}") # processing prompt_token_ids - if not request.get('prompt_token_ids'): - if 'prompt' in request: - request['prompt_token_ids'] = self.text2ids(request['prompt'], max_model_len).tolist() - elif 'messages' in request: + if not request.get("prompt_token_ids"): + if "prompt" in request: + request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len).tolist() + elif "messages" in request: if self.tokenizer.chat_template is None: raise ValueError("This model does not support chat_template.") request["prompt_token_ids"] = self.messages2ids(request) else: - raise ValueError( - f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}" - ) - if len(request['prompt_token_ids']) == 0: + raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}") + if len(request["prompt_token_ids"]) == 0: raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs") if request.get("max_tokens") is None: request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) @@ -357,8 +353,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): if is_end: full_text = previous_texts + delta_text if enable_thinking and self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content else: diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index c93a3c5a4f5..56dd8d92e9a 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -21,8 +21,8 @@ from paddle.device.cuda import graphs from fastdeploy.config import FDConfig -from fastdeploy.utils import get_logger from fastdeploy.distributed.communication import capture_custom_allreduce +from fastdeploy.utils import get_logger logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log") @@ -99,7 +99,7 @@ def __call__(self, **kwargs): entry.runnable(**kwargs) logger.debug( f"[CUDA GRAPH] Warm up for batch size {padding_batch_size}, " - f"finished ({n+1}/{entry.num_finished_warmup}) times" + f"finished ({n + 1}/{entry.num_finished_warmup}) times" ) # Store input addresses for debug @@ -114,7 +114,6 @@ def __call__(self, **kwargs): new_grpah.capture_begin() output = entry.runnable(**kwargs) new_grpah.capture_end() - # Store output buffer entry.cuda_graph = new_grpah diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py index 6f0581e5710..5863b5c09f8 100644 --- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py @@ -217,8 +217,7 @@ def init_attention_metadata(self, forward_meta: ForwardMeta): self.attention_metadata: AttentionMetadata = metadata forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False) - forward_meta.decoder_tile_ids_per_batch.copy_( - metadata.decoder_tile_ids_per_batch, False) + forward_meta.decoder_tile_ids_per_batch.copy_(metadata.decoder_tile_ids_per_batch, False) def get_attntion_meta(self) -> AttentionMetadata: """get_attntion_meta""" diff --git a/fastdeploy/model_executor/layers/backends/dcu/__init__.py b/fastdeploy/model_executor/layers/backends/dcu/__init__.py index 40faa16644c..803a76010f6 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/__init__.py +++ b/fastdeploy/model_executor/layers/backends/dcu/__init__.py @@ -17,7 +17,11 @@ """ from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod -from .weight_only import DCUWeightOnlyLinearMethod from .top_p_sampling import native_top_p_sampling +from .weight_only import DCUWeightOnlyLinearMethod -__all__ = ["DCUTritonWeightOnlyMoEMethod", "DCUWeightOnlyLinearMethod", "native_top_p_sampling"] +__all__ = [ + "DCUTritonWeightOnlyMoEMethod", + "DCUWeightOnlyLinearMethod", + "native_top_p_sampling", +] diff --git a/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py b/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py index 4a79a951170..1eafe135178 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py +++ b/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import paddle -def native_top_p_sampling( - probs: paddle.Tensor, - top_p: paddle.Tensor -) -> tuple[paddle.Tensor, paddle.Tensor]: +def native_top_p_sampling(probs: paddle.Tensor, top_p: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]: sorted_indices = paddle.argsort(probs, descending=True) sorted_probs = paddle.sort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) @@ -30,7 +28,9 @@ def native_top_p_sampling( sorted_indices = sorted_indices + paddle.arange(probs.shape[0], dtype="int64").unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter( - sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten() + sorted_indices_to_remove.flatten(), + sorted_indices.flatten(), + sorted_indices_to_remove.flatten(), ) condition = paddle.cast(condition, "bool").reshape(probs.shape) diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py index 16a0152045c..acc070309da 100644 --- a/fastdeploy/model_executor/layers/moe/ep.py +++ b/fastdeploy/model_executor/layers/moe/ep.py @@ -143,7 +143,13 @@ def low_latency_dispatch( event: the event after executing the kernel (valid only if `async_finish` is set). hook: the receiving hook function (valid only if `return_recv_hook` is set). """ - (packed_recv_x, recv_expert_count, handle, _, dispatch_hook,) = self.deepep_engine.low_latency_dispatch( + ( + packed_recv_x, + recv_expert_count, + handle, + _, + dispatch_hook, + ) = self.deepep_engine.low_latency_dispatch( hidden_states, topk_idx, expertwise_scale, diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py index d9f2f20d9c6..848f52b9535 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py @@ -21,15 +21,21 @@ from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce from fastdeploy.model_executor.ops.gpu import ( MoeWna16MarlinGemmApi, - tritonmoe_preprocess_func, noaux_tc, + tritonmoe_preprocess_func, ) from ..quantization.quant_base import QuantMethodBase -def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k, - routed_scaling_factor, - e_score_correction_bias) -> paddle.Tensor: + +def get_moe_scores( + gating_output: paddle.Tensor, + n_group, + topk_group, + top_k, + routed_scaling_factor, + e_score_correction_bias, +) -> paddle.Tensor: """ compute moe scores using e_score_correction_bias. """ @@ -45,6 +51,7 @@ def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k, ) return scores + def gptq_marlin_moe_repack( b_q_weight: paddle.Tensor, perm: paddle.Tensor, @@ -226,10 +233,14 @@ def apply( topk_method = layer.topk_method if topk_method == "noaux_tc": - gate_out = get_moe_scores(gate_out, layer.n_group, - layer.topk_group, layer.top_k, - layer.routed_scaling_factor, - layer.gate_correction_bias) + gate_out = get_moe_scores( + gate_out, + layer.n_group, + layer.topk_group, + layer.top_k, + layer.routed_scaling_factor, + layer.gate_correction_bias, + ) topk_weights, topk_ids = paddle.topk(gate_out, k=layer.top_k, axis=-1, sorted=False) else: diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index f24936138d0..352fdbca203 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -609,11 +609,11 @@ def apply( from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess_func sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func( - topk_ids, num_local_experts, config["BLOCK_SIZE_M"]) + topk_ids, num_local_experts, config["BLOCK_SIZE_M"] + ) # cache13 = create_empty_tensor(tuple([token_num * top_k * max(N1, N2)]), x.dtype) cache13 = paddle.empty([token_num * top_k * max(N1, N2)], dtype=x.dtype) - intermediate_cache1 = cache13[:token_num * top_k * N1].view( - [token_num * top_k, N1]) + intermediate_cache1 = cache13[: token_num * top_k * N1].view([token_num * top_k, N1]) max_num_tokens_padded = sorted_token_ids.shape[0] grid = ( @@ -669,11 +669,11 @@ def apply( intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1) - intermediate_cache3 = cache13[:token_num * top_k * N2].view( - [token_num * top_k, N2]) + intermediate_cache3 = cache13[: token_num * top_k * N2].view([token_num * top_k, N2]) - grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * - ceil_div(hidden_size, config["BLOCK_SIZE_N"]), ) + grid = ( + ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]), + ) x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( intermediate_cache2, self.quant_config.weight_block_size[0] diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index b573ccb0def..a1d689961df 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -125,7 +125,7 @@ def __init__( self.init_moe_weights() logger.info( - f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \ + f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset + self.num_local_experts}), \ {top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \ , ep_size={self.ep_size}, \ tp_size={self.tp_size}." @@ -232,17 +232,21 @@ def load_experts_weight( up_gate_proj_expert_weight_key_name = up_gate_proj_expert_weight_key.format(expert_idx) up_gate_proj_weights.append( get_tensor( - state_dict.pop(up_gate_proj_expert_weight_key_name) - if up_gate_proj_expert_weight_key_name in state_dict - else up_gate_proj_expert_weight_key_name, + ( + state_dict.pop(up_gate_proj_expert_weight_key_name) + if up_gate_proj_expert_weight_key_name in state_dict + else up_gate_proj_expert_weight_key_name + ), self.fd_config.parallel_config.model_name_or_path, ) ) down_proj_weights.append( get_tensor( - state_dict.pop(down_proj_expert_weight_key_name) - if down_proj_expert_weight_key_name in state_dict - else down_proj_expert_weight_key_name, + ( + state_dict.pop(down_proj_expert_weight_key_name) + if down_proj_expert_weight_key_name in state_dict + else down_proj_expert_weight_key_name + ), self.fd_config.parallel_config.model_name_or_path, ) ) @@ -255,23 +259,29 @@ def load_experts_weight( up_expert_weight_key_name = up_expert_weight_key.format(expert_idx) down_proj_expert_weight_key_name = down_proj_expert_weight_key.format(expert_idx) gate = get_tensor( - state_dict.pop(gate_expert_weight_key_name) - if gate_expert_weight_key_name in state_dict - else gate_expert_weight_key_name, + ( + state_dict.pop(gate_expert_weight_key_name) + if gate_expert_weight_key_name in state_dict + else gate_expert_weight_key_name + ), self.fd_config.parallel_config.model_name_or_path, ) up = get_tensor( - state_dict.pop(up_expert_weight_key_name) - if up_expert_weight_key_name in state_dict - else up_expert_weight_key_name, + ( + state_dict.pop(up_expert_weight_key_name) + if up_expert_weight_key_name in state_dict + else up_expert_weight_key_name + ), self.fd_config.parallel_config.model_name_or_path, ) up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1)) down_proj_weights.append( get_tensor( - state_dict.pop(down_proj_expert_weight_key_name) - if down_proj_expert_weight_key_name in state_dict - else down_proj_expert_weight_key_name, + ( + state_dict.pop(down_proj_expert_weight_key_name) + if down_proj_expert_weight_key_name in state_dict + else down_proj_expert_weight_key_name + ), self.fd_config.parallel_config.model_name_or_path, ) ) diff --git a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py index 3a25fcd406a..06c7ece76f7 100644 --- a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py +++ b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py @@ -54,8 +54,8 @@ def apply_penalty_multi_scores( eos_token_ids, ) elif current_platform.is_dcu(): - from fastdeploy.model_executor.ops.gpu import \ - get_token_penalty_multi_scores + from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores + logits = get_token_penalty_multi_scores( pre_token_ids, prompt_ids, diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py index acdc59f6f12..bbc431ddeec 100644 --- a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py +++ b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py @@ -81,6 +81,7 @@ def top_k_top_p_sampling( _, ids = gcu_top_p_sampling(x, top_p) elif current_platform.is_dcu(): from fastdeploy.model_executor.layers.backends import native_top_p_sampling + _, ids = native_top_p_sampling(x, top_p) else: _, ids = paddle.tensor.top_p_sampling( diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index 75171982f64..ed7b4369b13 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -300,7 +300,13 @@ def speculate_remove_padding( if current_platform.is_cuda(): cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time) token_num = paddle.sum(seq_lens_this_time) - (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k,) = speculate_get_padding_offset( + ( + ids_remove_padding, + cum_offsets, + padding_offset, + cu_seqlens_q, + cu_seqlens_k, + ) = speculate_get_padding_offset( input_ids, draft_tokens, cum_offsets_now, diff --git a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py index 20663152053..c6ebd27422e 100644 --- a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py +++ b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py @@ -103,9 +103,9 @@ def extract_triton_kernel(kernel, file_name): import textwrap fn = kernel - if type(kernel) == triton.runtime.jit.JITFunction: + if isinstance(kernel, triton.runtime.jit.JITFunction): fn = kernel.fn - elif type(kernel) == triton.runtime.autotuner.Autotuner: + elif isinstance(kernel, triton.runtime.autotuner.Autotuner): fn = kernel.fn.fn else: AssertionError("error occurs") @@ -195,14 +195,14 @@ def get_value_hint(x): """ hint = "" for ele in x: - if type(ele) == int: + if isinstance(ele, int): if ele % 16 == 0 and ele > 0: hint += "i64:16," elif ele == 1: hint += "i64:1," else: hint += "i64," - if type(ele) == float: + if isinstance(ele, float): hint += "fp32," return hint @@ -467,16 +467,16 @@ def rendering_common_template( if arg_defaults[i] is None: input_and_attr += f"paddle::optional & {arg_names[i]}," paddle_input_sig += f"""paddle::Optional("{arg_names[i]}"),""" - elif type(arg_defaults[i]) == float: + elif isinstance(arg_defaults[i], float): input_and_attr += f"float {arg_names[i]}," paddle_attr_sig += f""""{arg_names[i]}: float",""" - elif type(arg_defaults[i]) == bool: + elif isinstance(arg_defaults[i], bool): input_and_attr += f"bool {arg_names[i]}," paddle_attr_sig += f""""{arg_names[i]}: bool",""" - elif type(arg_defaults[i]) == int: + elif isinstance(arg_defaults[i], int): input_and_attr += f"int64_t {arg_names[i]}," paddle_attr_sig += f""""{arg_names[i]}: int64_t",""" - elif type(arg_defaults[i]) == str: + elif isinstance(arg_defaults[i], str): input_and_attr += f"std::string {arg_names[i]}," paddle_attr_sig += f""""{arg_names[i]}: std::string",""" elif arg_names[i] == "config": @@ -629,11 +629,11 @@ def decorator(*args, **kwargs): for i in range(len(all_input)): ele = all_input[i] if ( - type(ele) == paddle.Tensor - or type(ele) == paddle.base.framework.EagerParamBase - or type(ele) == paddle.base.framework.Parameter - or type(ele) == paddle.base.framework.Variable - or type(ele) == paddle.base.libpaddle.pir.Value + isinstance(ele, paddle.Tensor) + or isinstance(ele, paddle.base.framework.EagerParamBase) + or isinstance(ele, paddle.base.framework.Parameter) + or isinstance(ele, paddle.base.framework.Variable) + or isinstance(ele, paddle.base.libpaddle.pir.Value) ): dtypes.append(ele.dtype) modified_arg_exclude_constexpr[i] = f"input_ptrs[{i}]" @@ -668,7 +668,7 @@ def decorator(*args, **kwargs): lanuch_grid = list(self.grid) for i in range(len(lanuch_grid)): ele = lanuch_grid[i] - if type(ele) == str: + if isinstance(ele, str): for key in const_hint_dict.keys(): if key in ele: ele = ele.replace(key, f"{{{key}}}") diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py index 241c76df0e2..0872f3a4f0a 100644 --- a/fastdeploy/rl/rollout_model.py +++ b/fastdeploy/rl/rollout_model.py @@ -153,14 +153,14 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: # Helper function to add layer mappings def _add_layer_mappings(layer_idx: int): # MoE specific mappings - self.infer_to_train_mapping[ - f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight" - ] = f"{base_name}.{layer_idx}.mlp.gate.weight" + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = ( + f"{base_name}.{layer_idx}.mlp.gate.weight" + ) if self.fd_config.model_config.moe_use_aux_free: - self.infer_to_train_mapping[ - f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias" - ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = ( + f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" + ) # MoE experts mappings for expert_idx in range(self.fd_config.model_config.moe_num_experts): @@ -184,7 +184,8 @@ def _add_layer_mappings(layer_idx: int): assert isinstance(self.fd_config.model_config.moe_layer_start_index, int) # Process MoE layers for layer_idx in range( - self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers + self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers, ): _add_layer_mappings(layer_idx) @@ -226,9 +227,9 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: def _add_expert_mappings(layer_idx: int, moe_tag: str, expert_start: int): # MoE specific mappings gate_suffix = "" if moe_tag == "text" else "_1" - self.infer_to_train_mapping[ - f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight" - ] = f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}" + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"] = ( + f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}" + ) if self.fd_config.model_config.moe_use_aux_free: self.infer_to_train_mapping[ @@ -245,7 +246,10 @@ def _generate_ranges(start, end, step=16, take=8): expert_mappings = defaultdict(list) for expert_idx in _generate_ranges( - expert_start, total_moe_num, expert_num_per_rank * 2, expert_num_per_rank + expert_start, + total_moe_num, + expert_num_per_rank * 2, + expert_num_per_rank, ): for ph in place_holders: expert_mappings[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.up_gate_proj_weight"].append( @@ -323,9 +327,9 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: def _add_layer_mappings(layer_idx): # FFN mappings for ph in place_holders: - self.infer_to_train_mapping[ - f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}" - ] = f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}" + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"] = ( + f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}" + ) for layer_idx in range(self.fd_config.model_config.num_hidden_layers): _add_layer_mappings(layer_idx) @@ -368,14 +372,14 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: # Helper function to add layer mappings def _add_layer_mappings(layer_idx: int): # MoE specific mappings - self.infer_to_train_mapping[ - f"{base_name}.{layer_idx}.mlp.gate_weight" - ] = f"{base_name}.{layer_idx}.mlp.gate.weight" + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate_weight"] = ( + f"{base_name}.{layer_idx}.mlp.gate.weight" + ) if self.fd_config.moe_config.moe_use_aux_free: - self.infer_to_train_mapping[ - f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias" - ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = ( + f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" + ) # MoE experts mappings for expert_idx in range(self.fd_config.moe_config.num_experts): diff --git a/pyproject.toml b/pyproject.toml index d6720071377..9b79ec1a4a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ known_third_party = ["paddle"] [tool.black] line-length = 119 target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310'] -exclude = ['.flake8'] +exclude = '.flake8' diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 8d08cbaa29c..fe615e4658b 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -342,10 +342,12 @@ def test_streaming(openai_client, capsys): output.append(chunk.choices[0].text) assert len(output) > 0 + # ========================== # OpenAI Client additional chat/completions test # ========================== + def test_non_streaming_with_stop_str(openai_client): """ Test non-streaming chat functionality with the local service @@ -423,12 +425,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): extra_body={"return_token_ids": True}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'prompt_token_ids') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "prompt_token_ids") assert isinstance(response.choices[0].message.prompt_token_ids, list) - assert hasattr(response.choices[0].message, 'completion_token_ids') + assert hasattr(response.choices[0].message, "completion_token_ids") assert isinstance(response.choices[0].message.completion_token_ids, list) # disable return_token_ids @@ -440,12 +442,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): extra_body={"return_token_ids": False}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'prompt_token_ids') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "prompt_token_ids") assert response.choices[0].message.prompt_token_ids is None - assert hasattr(response.choices[0].message, 'completion_token_ids') + assert hasattr(response.choices[0].message, "completion_token_ids") assert response.choices[0].message.completion_token_ids is None @@ -464,11 +466,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): ) is_first_chunk = True for chunk in response: - assert hasattr(chunk, 'choices') + assert hasattr(chunk, "choices") assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], 'delta') - assert hasattr(chunk.choices[0].delta, 'prompt_token_ids') - assert hasattr(chunk.choices[0].delta, 'completion_token_ids') + assert hasattr(chunk.choices[0], "delta") + assert hasattr(chunk.choices[0].delta, "prompt_token_ids") + assert hasattr(chunk.choices[0].delta, "completion_token_ids") if is_first_chunk: is_first_chunk = False assert isinstance(chunk.choices[0].delta.prompt_token_ids, list) @@ -487,12 +489,12 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): stream=True, ) for chunk in response: - assert hasattr(chunk, 'choices') + assert hasattr(chunk, "choices") assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], 'delta') - assert hasattr(chunk.choices[0].delta, 'prompt_token_ids') + assert hasattr(chunk.choices[0], "delta") + assert hasattr(chunk.choices[0].delta, "prompt_token_ids") assert chunk.choices[0].delta.prompt_token_ids is None - assert hasattr(chunk.choices[0].delta, 'completion_token_ids') + assert hasattr(chunk.choices[0].delta, "completion_token_ids") assert chunk.choices[0].delta.completion_token_ids is None @@ -509,11 +511,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys): extra_body={"return_token_ids": True}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'prompt_token_ids') + assert hasattr(response.choices[0], "prompt_token_ids") assert isinstance(response.choices[0].prompt_token_ids, list) - assert hasattr(response.choices[0], 'completion_token_ids') + assert hasattr(response.choices[0], "completion_token_ids") assert isinstance(response.choices[0].completion_token_ids, list) # disable return_token_ids @@ -525,11 +527,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys): extra_body={"return_token_ids": False}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'prompt_token_ids') + assert hasattr(response.choices[0], "prompt_token_ids") assert response.choices[0].prompt_token_ids is None - assert hasattr(response.choices[0], 'completion_token_ids') + assert hasattr(response.choices[0], "completion_token_ids") assert response.choices[0].completion_token_ids is None @@ -548,10 +550,10 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys): ) is_first_chunk = True for chunk in response: - assert hasattr(chunk, 'choices') + assert hasattr(chunk, "choices") assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], 'prompt_token_ids') - assert hasattr(chunk.choices[0], 'completion_token_ids') + assert hasattr(chunk.choices[0], "prompt_token_ids") + assert hasattr(chunk.choices[0], "completion_token_ids") if is_first_chunk: is_first_chunk = False assert isinstance(chunk.choices[0].prompt_token_ids, list) @@ -570,11 +572,11 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys): stream=True, ) for chunk in response: - assert hasattr(chunk, 'choices') + assert hasattr(chunk, "choices") assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], 'prompt_token_ids') + assert hasattr(chunk.choices[0], "prompt_token_ids") assert chunk.choices[0].prompt_token_ids is None - assert hasattr(chunk.choices[0], 'completion_token_ids') + assert hasattr(chunk.choices[0], "completion_token_ids") assert chunk.choices[0].completion_token_ids is None @@ -587,13 +589,13 @@ def test_non_streaming_chat_with_prompt_token_ids(openai_client, capsys): messages=[], temperature=1, max_tokens=5, - extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]}, + extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response, 'usage') - assert hasattr(response.usage, 'prompt_tokens') + assert hasattr(response, "usage") + assert hasattr(response.usage, "prompt_tokens") assert response.usage.prompt_tokens == 9 @@ -606,17 +608,17 @@ def test_streaming_chat_with_prompt_token_ids(openai_client, capsys): messages=[], temperature=1, max_tokens=5, - extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]}, + extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, stream=True, stream_options={"include_usage": True}, ) for chunk in response: - assert hasattr(chunk, 'choices') - assert hasattr(chunk, 'usage') + assert hasattr(chunk, "choices") + assert hasattr(chunk, "usage") if len(chunk.choices) > 0: assert chunk.usage is None else: - assert hasattr(chunk.usage, 'prompt_tokens') + assert hasattr(chunk.usage, "prompt_tokens") assert chunk.usage.prompt_tokens == 9 @@ -629,13 +631,13 @@ def test_non_streaming_completion_with_prompt_token_ids(openai_client, capsys): prompt="", temperature=1, max_tokens=5, - extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]}, + extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response, 'usage') - assert hasattr(response.usage, 'prompt_tokens') + assert hasattr(response, "usage") + assert hasattr(response.usage, "prompt_tokens") assert response.usage.prompt_tokens == 9 @@ -648,16 +650,15 @@ def test_streaming_completion_with_prompt_token_ids(openai_client, capsys): prompt="", temperature=1, max_tokens=5, - extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]}, + extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, stream=True, stream_options={"include_usage": True}, ) for chunk in response: - assert hasattr(chunk, 'choices') - assert hasattr(chunk, 'usage') + assert hasattr(chunk, "choices") + assert hasattr(chunk, "usage") if len(chunk.choices) > 0: assert chunk.usage is None else: - assert hasattr(chunk.usage, 'prompt_tokens') + assert hasattr(chunk.usage, "prompt_tokens") assert chunk.usage.prompt_tokens == 9 - diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 083f4fc7466..dccc1f55a99 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -325,11 +325,11 @@ def test_streaming_chat(openai_client, capsys): assert len(output) > 2 - # ========================== # OpenAI Client additional chat/completions test # ========================== + def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): """ Test return_token_ids option in non-streaming chat functionality with the local service @@ -340,35 +340,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] - } + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], + }, ], temperature=1, max_tokens=53, extra_body={"return_token_ids": True}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'prompt_token_ids') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "prompt_token_ids") assert isinstance(response.choices[0].message.prompt_token_ids, list) - assert hasattr(response.choices[0].message, 'completion_token_ids') + assert hasattr(response.choices[0].message, "completion_token_ids") assert isinstance(response.choices[0].message.completion_token_ids, list) # 不设定 return_token_ids @@ -377,35 +375,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] - } + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], + }, ], temperature=1, max_tokens=53, extra_body={"return_token_ids": False}, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'prompt_token_ids') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "prompt_token_ids") assert response.choices[0].message.prompt_token_ids is None - assert hasattr(response.choices[0].message, 'completion_token_ids') + assert hasattr(response.choices[0].message, "completion_token_ids") assert response.choices[0].message.completion_token_ids is None @@ -419,23 +415,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] - } + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], + }, ], temperature=1, max_tokens=53, @@ -444,11 +438,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): ) is_first_chunk = True for chunk in response: - assert hasattr(chunk, 'choices') + assert hasattr(chunk, "choices") assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], 'delta') - assert hasattr(chunk.choices[0].delta, 'prompt_token_ids') - assert hasattr(chunk.choices[0].delta, 'completion_token_ids') + assert hasattr(chunk.choices[0], "delta") + assert hasattr(chunk.choices[0].delta, "prompt_token_ids") + assert hasattr(chunk.choices[0].delta, "completion_token_ids") if is_first_chunk: is_first_chunk = False assert isinstance(chunk.choices[0].delta.prompt_token_ids, list) @@ -463,23 +457,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] - } + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], + }, ], temperature=1, max_tokens=53, @@ -487,10 +479,10 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): stream=True, ) for chunk in response: - assert hasattr(chunk, 'choices') + assert hasattr(chunk, "choices") assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], 'delta') - assert hasattr(chunk.choices[0].delta, 'prompt_token_ids') + assert hasattr(chunk.choices[0], "delta") + assert hasattr(chunk.choices[0].delta, "prompt_token_ids") assert chunk.choices[0].delta.prompt_token_ids is None - assert hasattr(chunk.choices[0].delta, 'completion_token_ids') + assert hasattr(chunk.choices[0].delta, "completion_token_ids") assert chunk.choices[0].delta.completion_token_ids is None diff --git a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py index fbe1ea48c70..a4c5048af6a 100644 --- a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py +++ b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py @@ -294,4 +294,6 @@ def test_non_thinking_prompt(api_url, headers): assert False, f"Response is not valid JSON: {e}" content = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").lower() - assert not any(x in content for x in ["根据", "我认为", "推测", "可能"]), "Expected no reasoning in non-thinking response" + assert not any( + x in content for x in ["根据", "我认为", "推测", "可能"] + ), "Expected no reasoning in non-thinking response" From 167441b59f13e3d61e9d797f71a6fbd7a90993a4 Mon Sep 17 00:00:00 2001 From: zeroRains Date: Thu, 24 Jul 2025 15:45:49 +0800 Subject: [PATCH 2/2] polish code --- fastdeploy/engine/args_utils.py | 4 ++-- fastdeploy/entrypoints/engine_client.py | 5 ++--- fastdeploy/entrypoints/openai/api_server.py | 2 +- fastdeploy/entrypoints/openai/serving_chat.py | 3 ++- fastdeploy/entrypoints/openai/serving_completion.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 444f88b7611..613831a76b9 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -559,8 +559,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--ips", type=lambda s: s.split(",") if s else None, default=EngineArgs.ips, - help= - "IP addresses of all nodes participating in distributed inference.") + help="IP addresses of all nodes participating in distributed inference.", + ) # Performance tuning parameters group perf_group = parser.add_argument_group("Performance Tuning") diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index 4cbea336b8a..a7ffe0a1ad3 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -41,7 +41,7 @@ def __init__( mm_processor_kwargs, enable_mm=False, reasoning_parser=None, - data_parallel_size=1 + data_parallel_size=1, ): input_processor = InputPreprocessor( tokenizer, @@ -55,8 +55,7 @@ def __init__( self.data_processor = input_processor.create_processor() self.max_model_len = max_model_len max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 - array_size = min( - max_chips_per_node, tensor_parallel_size * data_parallel_size) + array_size = min(max_chips_per_node, tensor_parallel_size * data_parallel_size) self.worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size], dtype=np.int32) self.worker_healthy_live_signal = IPCSignal( name="worker_healthy_live_signal", diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 895978b327a..61fa5d39655 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -113,7 +113,7 @@ async def lifespan(app: FastAPI): args.mm_processor_kwargs, args.enable_mm, args.reasoning_parser, - args.data_parallel_size + args.data_parallel_size, ) app.state.dynamic_load_weight = args.dynamic_load_weight chat_handler = OpenAIServingChat(engine_client, pid, args.ips) diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 7bcb8146c49..86da7eaea95 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -19,9 +19,10 @@ import traceback import uuid from typing import List, Optional -import numpy as np + import aiozmq import msgpack +import numpy as np from aiozmq import zmq from fastdeploy.entrypoints.openai.protocol import ( diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 2fa26859c04..a7a058858c5 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -18,9 +18,10 @@ import time import uuid from typing import List -import numpy as np + import aiozmq import msgpack +import numpy as np from aiozmq import zmq from fastdeploy.engine.request import RequestOutput @@ -48,7 +49,6 @@ def __init__(self, engine_client, pid, ips): else: self.master_ip = self.master_ip.split(",")[0] - def _check_master(self): if self.master_ip is None: return True