From 87c0b049416af199281ffdb248d719d478053d37 Mon Sep 17 00:00:00 2001
From: zeroRains <linjunlu@zerorains.top>
Date: Thu, 24 Jul 2025 13:24:08 +0800
Subject: [PATCH 1/2] update flake8 version to support pre-commit in python3.12

---
 .flake8                                       |   2 +-
 .pre-commit-config.yaml                       |   4 +-
 .../python/ops/test_get_padding_offset.py     |   8 +-
 .../cache_manager/prefix_cache_manager.py     |   5 +-
 .../custom_all_reduce/cuda_wrapper.py         |  10 +-
 fastdeploy/entrypoints/openai/protocol.py     |   2 +-
 fastdeploy/entrypoints/openai/serving_chat.py |  24 +--
 .../entrypoints/openai/serving_completion.py  |  50 +++---
 fastdeploy/input/ernie_processor.py           |  10 +-
 fastdeploy/input/mm_processor/process.py      |   3 +-
 fastdeploy/input/text_processor.py            |  21 +--
 .../cudagraph_piecewise_backend.py            |   5 +-
 .../layers/attention/mla_attention_backend.py |   3 +-
 .../layers/backends/dcu/__init__.py           |   8 +-
 .../layers/backends/dcu/top_p_sampling.py     |  10 +-
 fastdeploy/model_executor/layers/moe/ep.py    |   8 +-
 .../layers/moe/fused_moe_marlin_backend.py    |  27 +++-
 .../layers/moe/fused_moe_triton_backend.py    |  14 +-
 fastdeploy/model_executor/layers/moe/moe.py   |  42 +++--
 .../sample/ops/apply_penalty_multi_scores.py  |   4 +-
 .../layers/sample/ops/top_k_top_p_sampling.py |   1 +
 fastdeploy/model_executor/layers/utils.py     |   8 +-
 .../ops/triton_ops/triton_utils.py            |  28 ++--
 fastdeploy/rl/rollout_model.py                |  44 +++---
 pyproject.toml                                |   2 +-
 test/ci_use/EB_Lite/test_EB_Lite_serving.py   |  91 +++++------
 .../EB_VL_Lite/test_EB_VL_Lite_serving.py     | 146 +++++++++---------
 .../Qwen3-MoE/test_Qwen3-MoE_serving.py       |   4 +-
 28 files changed, 316 insertions(+), 268 deletions(-)

diff --git a/.flake8 b/.flake8
index 869c57d3e61..1656330a998 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E402, E501, E731, E741, W503, W605, E722
+ignore = E203, E402, E501, E731, E741, W503, W605, E722, E231, W604, E702, E226, E221, E713, E271
 max-line-length = 119
 
 # E402: module level import not at top of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a5939966c88..8c0fec84a19 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,7 +7,7 @@ default_stages:
 #   - manual # Run in CI
 repos:
 -   repo: https://github.com/psf/black.git
-    rev: 22.8.0
+    rev: 25.1.0
     hooks:
     -   id: black
         files: \.(py|pyi)$
@@ -18,7 +18,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 4.0.1
+    rev: 7.0.0
     hooks:
     -   id: flake8
 # 代码检查
diff --git a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py
index 441912a6d97..614386488a6 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py
@@ -29,7 +29,13 @@
     ids_len = seq_lens[i, 0]
     input_ids[i, 0:ids_len] = np.random.randint(1, 10, seq_lens[i, 0], "int64")
 
-(x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k,) = get_padding_offset(
+(
+    x_remove_padding,
+    cum_offsets_out,
+    padding_offset,
+    cu_seqlens_q,
+    cu_seqlens_k,
+) = get_padding_offset(
     paddle.to_tensor(input_ids),
     paddle.to_tensor(cum_offset),
     paddle.to_tensor(token_num),
diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index 0114914bb33..b403d394462 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -473,7 +473,10 @@ def request_block_ids(self, task, block_size, dec_token_num, *args):
                 current_time = time.time()
                 self._update_matched_node_info(req_id, match_block_node, current_time)
                 # 2. prepare cache
-                (gpu_recv_block_ids, gpu_extra_block_ids,) = self._prepare_cache(
+                (
+                    gpu_recv_block_ids,
+                    gpu_extra_block_ids,
+                ) = self._prepare_cache(
                     req_id,
                     input_ids,
                     block_size,
diff --git a/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py b/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py
index ac321a58914..7bec993d953 100644
--- a/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py
+++ b/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py
@@ -113,10 +113,7 @@ class CudaRTLibrary:
         Function(
             "cudaStreamIsCapturing",
             cudaError_t,
-            [
-                cudaStream_t,
-                ctypes.POINTER(cudaStreamCaptureStatus)
-            ]
+            [cudaStream_t, ctypes.POINTER(cudaStreamCaptureStatus)],
         ),
     ]
 
@@ -197,9 +194,8 @@ def cudaIpcOpenMemHandle(self, handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
             self.funcs["cudaIpcOpenMemHandle"](ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess)
         )
         return devPtr
+
     def cudaStreamIsCapturing(self, stream: cudaStream_t) -> ctypes.c_int:
         is_capturing = ctypes.c_int()
-        self.CUDART_CHECK(
-            self.funcs["cudaStreamIsCapturing"](stream, is_capturing)
-        )
+        self.CUDART_CHECK(self.funcs["cudaStreamIsCapturing"](stream, is_capturing))
         return is_capturing
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 66076bedf10..e3c759e5739 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -478,7 +478,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = None
     top_k: Optional[int] = None
     min_p: Optional[float] = None
-    user: Optional[str] = None  
+    user: Optional[str] = None
     metadata: Optional[dict] = None
     extra_body: Optional[dict] = None
     return_token_ids: Optional[bool] = False
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 611b0cb8de7..d79abdb39dc 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -144,7 +144,9 @@ async def chat_completion_stream_generator(
             if request.metadata is not None:
                 enable_thinking = request.metadata.get("enable_thinking")
                 include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False)
-            enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+            enable_return_token_ids = request.return_token_ids or (
+                request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+            )
             while num_choices > 0:
                 try:
                     raw_data = await asyncio.wait_for(dealer.read(), timeout=10)
@@ -186,13 +188,13 @@ async def chat_completion_stream_generator(
                             choice = ChatCompletionResponseStreamChoice(
                                 index=i,
                                 delta=DeltaMessage(
-                                    role="assistant", 
-                                    content="", 
-                                    reasoning_content="", 
+                                    role="assistant",
+                                    content="",
+                                    reasoning_content="",
                                     tool_calls=None,
                                     prompt_token_ids=None,
                                     completion_token_ids=None,
-                                )
+                                ),
                             )
                             if enable_return_token_ids:
                                 choice.delta.prompt_token_ids = list(prompt_token_ids)
@@ -231,10 +233,10 @@ async def chat_completion_stream_generator(
 
                     previous_num_tokens += len(output["token_ids"])
                     delta_message = DeltaMessage(
-                        content=delta_text, 
-                        reasoning_content=output.get("reasoning_content"), \
+                        content=delta_text,
+                        reasoning_content=output.get("reasoning_content"),
                         prompt_token_ids=None,
-                        completion_token_ids=None, 
+                        completion_token_ids=None,
                         tool_calls=output.get("tool_call_content", []),
                     )
 
@@ -322,7 +324,9 @@ async def chat_completion_full_generator(
         final_res = None
         enable_thinking = None
         include_stop_str_in_output = False
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
         try:
             dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
             dealer.write([b"", request_id.encode("utf-8")])
@@ -396,7 +400,7 @@ async def chat_completion_full_generator(
             reasoning_content=output.get("reasoning_content"),
             tool_calls=output.get("tool_call_content"),
             prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
-            completion_token_ids=completion_token_ids if enable_return_token_ids else None,
+            completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
         )
         logprobs_full_res = None
         if logprob_contents:
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 34f71240946..970544555e2 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -229,7 +229,9 @@ async def completion_stream_generator(
                 model=model_name,
                 choices=choices,
             )
-            enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+            enable_return_token_ids = request.return_token_ids or (
+                request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+            )
             current_waiting_time = 0
             while num_choices > 0:
                 try:
@@ -258,12 +260,16 @@ async def completion_stream_generator(
                                 id=request_id,
                                 created=created_time,
                                 model=model_name,
-                                choices=[CompletionResponseStreamChoice(
-                                    index=idx,
-                                    text="",
-                                    prompt_token_ids=list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None,
-                                    completion_token_ids=None,
-                                )]
+                                choices=[
+                                    CompletionResponseStreamChoice(
+                                        index=idx,
+                                        text="",
+                                        prompt_token_ids=(
+                                            list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None
+                                        ),
+                                        completion_token_ids=None,
+                                    )
+                                ],
                             )
                             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
                         first_iteration[idx] = False
@@ -277,15 +283,17 @@ async def completion_stream_generator(
 
                     output = res["outputs"]
 
-                    choices.append(CompletionResponseStreamChoice(
-                        index=idx,
-                        text=output["text"],
-                        prompt_token_ids=None,
-                        completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
-                        tool_calls=output.get("tool_call_content"),
-                        reasoning_content=output.get("reasoning_content"),
-                        arrival_time=arrival_time
-                    ))
+                    choices.append(
+                        CompletionResponseStreamChoice(
+                            index=idx,
+                            text=output["text"],
+                            prompt_token_ids=None,
+                            completion_token_ids=(output.get("token_ids") if enable_return_token_ids else None),
+                            tool_calls=output.get("tool_call_content"),
+                            reasoning_content=output.get("reasoning_content"),
+                            arrival_time=arrival_time,
+                        )
+                    )
                     if res["finished"]:
                         if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
                             chunk.choices[0].finish_reason = "stop"
@@ -344,12 +352,14 @@ def request_output_to_completion_response(
         created_time: int,
         model_name: str,
         prompt_batched_token_ids: list(),
-        completion_batched_token_ids: list()
+        completion_batched_token_ids: list(),
     ) -> CompletionResponse:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
 
         for idx in range(len(final_res_batch)):
             final_res = final_res_batch[idx]
@@ -376,8 +386,8 @@ def request_output_to_completion_response(
                 index=len(choices),
                 text=output_text,
                 prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
-                completion_token_ids=completion_token_ids if enable_return_token_ids else None,
-                reasoning_content=output.get('reasoning_content'),
+                completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
+                reasoning_content=output.get("reasoning_content"),
                 tool_calls=output.get("tool_call_content"),
                 logprobs=None,
                 finish_reason=None,
diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py
index a56c7f9fb06..06d198db17d 100644
--- a/fastdeploy/input/ernie_processor.py
+++ b/fastdeploy/input/ernie_processor.py
@@ -99,8 +99,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
 
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
             if request.prompt is None and request.messages is None:
-                raise ValueError(
-                    f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
+                raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
             if request.prompt is not None:
                 prompt = request.prompt if request.prompt is not None else request.messages[0]
                 prompt = prompt[0] if isinstance(prompt, list) else prompt
@@ -164,8 +163,8 @@ def process_request_dict(self, request, max_model_len=None):
                 req_id = request.get("request_id", None)
                 data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
             else:
-                request['prompt_token_ids'] = self.messages2ids(request)
-        if len(request['prompt_token_ids']) == 0:
+                request["prompt_token_ids"] = self.messages2ids(request)
+        if len(request["prompt_token_ids"]) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
 
         # truncate prompts that exceed the length limit
@@ -246,8 +245,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
         if is_end:
             full_text = previous_texts + delta_text
             if enable_thinking and self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text, response_dict)
+                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
             else:
diff --git a/fastdeploy/input/mm_processor/process.py b/fastdeploy/input/mm_processor/process.py
index 566921a6011..ea2559a0fef 100644
--- a/fastdeploy/input/mm_processor/process.py
+++ b/fastdeploy/input/mm_processor/process.py
@@ -507,5 +507,6 @@ def apply_chat_template(self, request):
         tokens = self.tokenizer.tokenize(prompt_token_str)
         token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
         data_processor_logger.info(
-            f"req_id:{request.get('request_id', ''),} tokens: {tokens}, token_ids: {token_ids}")
+            f"req_id:{request.get('request_id', ''), } tokens: {tokens}, token_ids: {token_ids}"
+        )
         return token_ids
diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py
index a9f8c2c4971..664868a595e 100644
--- a/fastdeploy/input/text_processor.py
+++ b/fastdeploy/input/text_processor.py
@@ -239,9 +239,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
                 task["enable_thinking"] = kwargs.get("enable_thinking", True)
                 request.prompt_token_ids = self.messages2ids(task)
             else:
-                raise ValueError(
-                    f"The request should have `input_ids`, `text` or `messages`: {request}."
-                )
+                raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
         if len(request.prompt_token_ids) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
         if request.get("max_tokens") is None:
@@ -281,18 +279,16 @@ def process_request_dict(self, request, max_model_len=None, **kwargs):
 
         data_processor_logger.info(f"Processing request {request}")
         # processing prompt_token_ids
-        if not request.get('prompt_token_ids'):
-            if 'prompt' in request:
-                request['prompt_token_ids'] = self.text2ids(request['prompt'], max_model_len).tolist()
-            elif 'messages' in request:
+        if not request.get("prompt_token_ids"):
+            if "prompt" in request:
+                request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len).tolist()
+            elif "messages" in request:
                 if self.tokenizer.chat_template is None:
                     raise ValueError("This model does not support chat_template.")
                 request["prompt_token_ids"] = self.messages2ids(request)
             else:
-                raise ValueError(
-                    f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
-                )
-        if len(request['prompt_token_ids']) == 0:
+                raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
+        if len(request["prompt_token_ids"]) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
         if request.get("max_tokens") is None:
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
@@ -357,8 +353,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
         if is_end:
             full_text = previous_texts + delta_text
             if enable_thinking and self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text, response_dict)
+                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
             else:
diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
index c93a3c5a4f5..56dd8d92e9a 100644
--- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -21,8 +21,8 @@
 from paddle.device.cuda import graphs
 
 from fastdeploy.config import FDConfig
-from fastdeploy.utils import get_logger
 from fastdeploy.distributed.communication import capture_custom_allreduce
+from fastdeploy.utils import get_logger
 
 logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")
 
@@ -99,7 +99,7 @@ def __call__(self, **kwargs):
                 entry.runnable(**kwargs)
                 logger.debug(
                     f"[CUDA GRAPH] Warm up for batch size {padding_batch_size}, "
-                    f"finished ({n+1}/{entry.num_finished_warmup}) times"
+                    f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
 
             # Store input addresses for debug
@@ -114,7 +114,6 @@ def __call__(self, **kwargs):
                 new_grpah.capture_begin()
                 output = entry.runnable(**kwargs)
                 new_grpah.capture_end()
-            
 
             # Store output buffer
             entry.cuda_graph = new_grpah
diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
index 6f0581e5710..5863b5c09f8 100644
--- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
+++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
@@ -217,8 +217,7 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
         self.attention_metadata: AttentionMetadata = metadata
 
         forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False)
-        forward_meta.decoder_tile_ids_per_batch.copy_(
-            metadata.decoder_tile_ids_per_batch, False)
+        forward_meta.decoder_tile_ids_per_batch.copy_(metadata.decoder_tile_ids_per_batch, False)
 
     def get_attntion_meta(self) -> AttentionMetadata:
         """get_attntion_meta"""
diff --git a/fastdeploy/model_executor/layers/backends/dcu/__init__.py b/fastdeploy/model_executor/layers/backends/dcu/__init__.py
index 40faa16644c..803a76010f6 100644
--- a/fastdeploy/model_executor/layers/backends/dcu/__init__.py
+++ b/fastdeploy/model_executor/layers/backends/dcu/__init__.py
@@ -17,7 +17,11 @@
 """
 
 from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod
-from .weight_only import DCUWeightOnlyLinearMethod
 from .top_p_sampling import native_top_p_sampling
+from .weight_only import DCUWeightOnlyLinearMethod
 
-__all__ = ["DCUTritonWeightOnlyMoEMethod", "DCUWeightOnlyLinearMethod", "native_top_p_sampling"]
+__all__ = [
+    "DCUTritonWeightOnlyMoEMethod",
+    "DCUWeightOnlyLinearMethod",
+    "native_top_p_sampling",
+]
diff --git a/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py b/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py
index 4a79a951170..1eafe135178 100644
--- a/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py
+++ b/fastdeploy/model_executor/layers/backends/dcu/top_p_sampling.py
@@ -13,13 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+
 import paddle
 
 
-def native_top_p_sampling(
-    probs: paddle.Tensor,
-    top_p: paddle.Tensor
-) -> tuple[paddle.Tensor, paddle.Tensor]:
+def native_top_p_sampling(probs: paddle.Tensor, top_p: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]:
     sorted_indices = paddle.argsort(probs, descending=True)
     sorted_probs = paddle.sort(probs, descending=True)
     cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)
@@ -30,7 +28,9 @@ def native_top_p_sampling(
     sorted_indices = sorted_indices + paddle.arange(probs.shape[0], dtype="int64").unsqueeze(-1) * probs.shape[-1]
 
     condition = paddle.scatter(
-        sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()
+        sorted_indices_to_remove.flatten(),
+        sorted_indices.flatten(),
+        sorted_indices_to_remove.flatten(),
     )
 
     condition = paddle.cast(condition, "bool").reshape(probs.shape)
diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py
index 16a0152045c..acc070309da 100644
--- a/fastdeploy/model_executor/layers/moe/ep.py
+++ b/fastdeploy/model_executor/layers/moe/ep.py
@@ -143,7 +143,13 @@ def low_latency_dispatch(
             event: the event after executing the kernel (valid only if `async_finish` is set).
             hook: the receiving hook function (valid only if `return_recv_hook` is set).
         """
-        (packed_recv_x, recv_expert_count, handle, _, dispatch_hook,) = self.deepep_engine.low_latency_dispatch(
+        (
+            packed_recv_x,
+            recv_expert_count,
+            handle,
+            _,
+            dispatch_hook,
+        ) = self.deepep_engine.low_latency_dispatch(
             hidden_states,
             topk_idx,
             expertwise_scale,
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py
index d9f2f20d9c6..848f52b9535 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py
@@ -21,15 +21,21 @@
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.ops.gpu import (
     MoeWna16MarlinGemmApi,
-    tritonmoe_preprocess_func,
     noaux_tc,
+    tritonmoe_preprocess_func,
 )
 
 from ..quantization.quant_base import QuantMethodBase
 
-def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k,
-                   routed_scaling_factor,
-                   e_score_correction_bias) -> paddle.Tensor:
+
+def get_moe_scores(
+    gating_output: paddle.Tensor,
+    n_group,
+    topk_group,
+    top_k,
+    routed_scaling_factor,
+    e_score_correction_bias,
+) -> paddle.Tensor:
     """
     compute moe scores using e_score_correction_bias.
     """
@@ -45,6 +51,7 @@ def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k,
     )
     return scores
 
+
 def gptq_marlin_moe_repack(
     b_q_weight: paddle.Tensor,
     perm: paddle.Tensor,
@@ -226,10 +233,14 @@ def apply(
         topk_method = layer.topk_method
 
         if topk_method == "noaux_tc":
-            gate_out = get_moe_scores(gate_out, layer.n_group,
-                            layer.topk_group, layer.top_k,
-                            layer.routed_scaling_factor,
-                            layer.gate_correction_bias)
+            gate_out = get_moe_scores(
+                gate_out,
+                layer.n_group,
+                layer.topk_group,
+                layer.top_k,
+                layer.routed_scaling_factor,
+                layer.gate_correction_bias,
+            )
 
             topk_weights, topk_ids = paddle.topk(gate_out, k=layer.top_k, axis=-1, sorted=False)
         else:
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index f24936138d0..352fdbca203 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -609,11 +609,11 @@ def apply(
         from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess_func
 
         sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
-            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
+            topk_ids, num_local_experts, config["BLOCK_SIZE_M"]
+        )
         # cache13 = create_empty_tensor(tuple([token_num * top_k * max(N1, N2)]), x.dtype)
         cache13 = paddle.empty([token_num * top_k * max(N1, N2)], dtype=x.dtype)
-        intermediate_cache1 = cache13[:token_num * top_k * N1].view(
-            [token_num * top_k, N1])
+        intermediate_cache1 = cache13[: token_num * top_k * N1].view([token_num * top_k, N1])
         max_num_tokens_padded = sorted_token_ids.shape[0]
 
         grid = (
@@ -669,11 +669,11 @@ def apply(
 
         intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1)
 
-        intermediate_cache3 = cache13[:token_num * top_k * N2].view(
-            [token_num * top_k, N2])
+        intermediate_cache3 = cache13[: token_num * top_k * N2].view([token_num * top_k, N2])
 
-        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
-                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
+        grid = (
+            ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),
+        )
 
         x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
             intermediate_cache2, self.quant_config.weight_block_size[0]
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index b573ccb0def..a1d689961df 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -125,7 +125,7 @@ def __init__(
             self.init_moe_weights()
 
         logger.info(
-            f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
+            f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset + self.num_local_experts}), \
         {top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
             , ep_size={self.ep_size}, \
             tp_size={self.tp_size}."
@@ -232,17 +232,21 @@ def load_experts_weight(
                 up_gate_proj_expert_weight_key_name = up_gate_proj_expert_weight_key.format(expert_idx)
                 up_gate_proj_weights.append(
                     get_tensor(
-                        state_dict.pop(up_gate_proj_expert_weight_key_name)
-                        if up_gate_proj_expert_weight_key_name in state_dict
-                        else up_gate_proj_expert_weight_key_name,
+                        (
+                            state_dict.pop(up_gate_proj_expert_weight_key_name)
+                            if up_gate_proj_expert_weight_key_name in state_dict
+                            else up_gate_proj_expert_weight_key_name
+                        ),
                         self.fd_config.parallel_config.model_name_or_path,
                     )
                 )
                 down_proj_weights.append(
                     get_tensor(
-                        state_dict.pop(down_proj_expert_weight_key_name)
-                        if down_proj_expert_weight_key_name in state_dict
-                        else down_proj_expert_weight_key_name,
+                        (
+                            state_dict.pop(down_proj_expert_weight_key_name)
+                            if down_proj_expert_weight_key_name in state_dict
+                            else down_proj_expert_weight_key_name
+                        ),
                         self.fd_config.parallel_config.model_name_or_path,
                     )
                 )
@@ -255,23 +259,29 @@ def load_experts_weight(
                 up_expert_weight_key_name = up_expert_weight_key.format(expert_idx)
                 down_proj_expert_weight_key_name = down_proj_expert_weight_key.format(expert_idx)
                 gate = get_tensor(
-                    state_dict.pop(gate_expert_weight_key_name)
-                    if gate_expert_weight_key_name in state_dict
-                    else gate_expert_weight_key_name,
+                    (
+                        state_dict.pop(gate_expert_weight_key_name)
+                        if gate_expert_weight_key_name in state_dict
+                        else gate_expert_weight_key_name
+                    ),
                     self.fd_config.parallel_config.model_name_or_path,
                 )
                 up = get_tensor(
-                    state_dict.pop(up_expert_weight_key_name)
-                    if up_expert_weight_key_name in state_dict
-                    else up_expert_weight_key_name,
+                    (
+                        state_dict.pop(up_expert_weight_key_name)
+                        if up_expert_weight_key_name in state_dict
+                        else up_expert_weight_key_name
+                    ),
                     self.fd_config.parallel_config.model_name_or_path,
                 )
                 up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
                 down_proj_weights.append(
                     get_tensor(
-                        state_dict.pop(down_proj_expert_weight_key_name)
-                        if down_proj_expert_weight_key_name in state_dict
-                        else down_proj_expert_weight_key_name,
+                        (
+                            state_dict.pop(down_proj_expert_weight_key_name)
+                            if down_proj_expert_weight_key_name in state_dict
+                            else down_proj_expert_weight_key_name
+                        ),
                         self.fd_config.parallel_config.model_name_or_path,
                     )
                 )
diff --git a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
index 3a25fcd406a..06c7ece76f7 100644
--- a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
+++ b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
@@ -54,8 +54,8 @@ def apply_penalty_multi_scores(
             eos_token_ids,
         )
     elif current_platform.is_dcu():
-        from fastdeploy.model_executor.ops.gpu import \
-            get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores
+
         logits = get_token_penalty_multi_scores(
             pre_token_ids,
             prompt_ids,
diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
index acdc59f6f12..bbc431ddeec 100644
--- a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
+++ b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
@@ -81,6 +81,7 @@ def top_k_top_p_sampling(
             _, ids = gcu_top_p_sampling(x, top_p)
         elif current_platform.is_dcu():
             from fastdeploy.model_executor.layers.backends import native_top_p_sampling
+
             _, ids = native_top_p_sampling(x, top_p)
         else:
             _, ids = paddle.tensor.top_p_sampling(
diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py
index 75171982f64..ed7b4369b13 100644
--- a/fastdeploy/model_executor/layers/utils.py
+++ b/fastdeploy/model_executor/layers/utils.py
@@ -300,7 +300,13 @@ def speculate_remove_padding(
     if current_platform.is_cuda():
         cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
         token_num = paddle.sum(seq_lens_this_time)
-        (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k,) = speculate_get_padding_offset(
+        (
+            ids_remove_padding,
+            cum_offsets,
+            padding_offset,
+            cu_seqlens_q,
+            cu_seqlens_k,
+        ) = speculate_get_padding_offset(
             input_ids,
             draft_tokens,
             cum_offsets_now,
diff --git a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py
index 20663152053..c6ebd27422e 100644
--- a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py
+++ b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py
@@ -103,9 +103,9 @@ def extract_triton_kernel(kernel, file_name):
     import textwrap
 
     fn = kernel
-    if type(kernel) == triton.runtime.jit.JITFunction:
+    if isinstance(kernel, triton.runtime.jit.JITFunction):
         fn = kernel.fn
-    elif type(kernel) == triton.runtime.autotuner.Autotuner:
+    elif isinstance(kernel, triton.runtime.autotuner.Autotuner):
         fn = kernel.fn.fn
     else:
         AssertionError("error occurs")
@@ -195,14 +195,14 @@ def get_value_hint(x):
     """
     hint = ""
     for ele in x:
-        if type(ele) == int:
+        if isinstance(ele, int):
             if ele % 16 == 0 and ele > 0:
                 hint += "i64:16,"
             elif ele == 1:
                 hint += "i64:1,"
             else:
                 hint += "i64,"
-        if type(ele) == float:
+        if isinstance(ele, float):
             hint += "fp32,"
     return hint
 
@@ -467,16 +467,16 @@ def rendering_common_template(
         if arg_defaults[i] is None:
             input_and_attr += f"paddle::optional<paddle::Tensor> & {arg_names[i]},"
             paddle_input_sig += f"""paddle::Optional("{arg_names[i]}"),"""
-        elif type(arg_defaults[i]) == float:
+        elif isinstance(arg_defaults[i], float):
             input_and_attr += f"float {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: float","""
-        elif type(arg_defaults[i]) == bool:
+        elif isinstance(arg_defaults[i], bool):
             input_and_attr += f"bool {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: bool","""
-        elif type(arg_defaults[i]) == int:
+        elif isinstance(arg_defaults[i], int):
             input_and_attr += f"int64_t {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: int64_t","""
-        elif type(arg_defaults[i]) == str:
+        elif isinstance(arg_defaults[i], str):
             input_and_attr += f"std::string {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: std::string","""
         elif arg_names[i] == "config":
@@ -629,11 +629,11 @@ def decorator(*args, **kwargs):
             for i in range(len(all_input)):
                 ele = all_input[i]
                 if (
-                    type(ele) == paddle.Tensor
-                    or type(ele) == paddle.base.framework.EagerParamBase
-                    or type(ele) == paddle.base.framework.Parameter
-                    or type(ele) == paddle.base.framework.Variable
-                    or type(ele) == paddle.base.libpaddle.pir.Value
+                    isinstance(ele, paddle.Tensor)
+                    or isinstance(ele, paddle.base.framework.EagerParamBase)
+                    or isinstance(ele, paddle.base.framework.Parameter)
+                    or isinstance(ele, paddle.base.framework.Variable)
+                    or isinstance(ele, paddle.base.libpaddle.pir.Value)
                 ):
                     dtypes.append(ele.dtype)
                     modified_arg_exclude_constexpr[i] = f"input_ptrs[{i}]"
@@ -668,7 +668,7 @@ def decorator(*args, **kwargs):
             lanuch_grid = list(self.grid)
             for i in range(len(lanuch_grid)):
                 ele = lanuch_grid[i]
-                if type(ele) == str:
+                if isinstance(ele, str):
                     for key in const_hint_dict.keys():
                         if key in ele:
                             ele = ele.replace(key, f"{{{key}}}")
diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py
index 241c76df0e2..0872f3a4f0a 100644
--- a/fastdeploy/rl/rollout_model.py
+++ b/fastdeploy/rl/rollout_model.py
@@ -153,14 +153,14 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]:
         # Helper function to add layer mappings
         def _add_layer_mappings(layer_idx: int):
             # MoE specific mappings
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight"
+            )
 
             if self.fd_config.model_config.moe_use_aux_free:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"
-                ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = (
+                    f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                )
 
             # MoE experts mappings
             for expert_idx in range(self.fd_config.model_config.moe_num_experts):
@@ -184,7 +184,8 @@ def _add_layer_mappings(layer_idx: int):
         assert isinstance(self.fd_config.model_config.moe_layer_start_index, int)
         # Process MoE layers
         for layer_idx in range(
-            self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers
+            self.fd_config.model_config.moe_layer_start_index,
+            self.fd_config.model_config.num_hidden_layers,
         ):
             _add_layer_mappings(layer_idx)
 
@@ -226,9 +227,9 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]:
         def _add_expert_mappings(layer_idx: int, moe_tag: str, expert_start: int):
             # MoE specific mappings
             gate_suffix = "" if moe_tag == "text" else "_1"
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}"
+            )
 
             if self.fd_config.model_config.moe_use_aux_free:
                 self.infer_to_train_mapping[
@@ -245,7 +246,10 @@ def _generate_ranges(start, end, step=16, take=8):
 
             expert_mappings = defaultdict(list)
             for expert_idx in _generate_ranges(
-                expert_start, total_moe_num, expert_num_per_rank * 2, expert_num_per_rank
+                expert_start,
+                total_moe_num,
+                expert_num_per_rank * 2,
+                expert_num_per_rank,
             ):
                 for ph in place_holders:
                     expert_mappings[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.up_gate_proj_weight"].append(
@@ -323,9 +327,9 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]:
         def _add_layer_mappings(layer_idx):
             # FFN mappings
             for ph in place_holders:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"
-                ] = f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"] = (
+                    f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}"
+                )
 
         for layer_idx in range(self.fd_config.model_config.num_hidden_layers):
             _add_layer_mappings(layer_idx)
@@ -368,14 +372,14 @@ def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]:
         # Helper function to add layer mappings
         def _add_layer_mappings(layer_idx: int):
             # MoE specific mappings
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight"
+            )
 
             if self.fd_config.moe_config.moe_use_aux_free:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"
-                ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = (
+                    f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                )
 
             # MoE experts mappings
             for expert_idx in range(self.fd_config.moe_config.num_experts):
diff --git a/pyproject.toml b/pyproject.toml
index d6720071377..9b79ec1a4a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ known_third_party = ["paddle"]
 [tool.black]
 line-length = 119
 target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310']
-exclude = ['.flake8']
+exclude = '.flake8'
 
 
 
diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
index 8d08cbaa29c..fe615e4658b 100644
--- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py
+++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
@@ -342,10 +342,12 @@ def test_streaming(openai_client, capsys):
         output.append(chunk.choices[0].text)
     assert len(output) > 0
 
+
 # ==========================
 # OpenAI Client additional chat/completions test
 # ==========================
 
+
 def test_non_streaming_with_stop_str(openai_client):
     """
     Test non-streaming chat functionality with the local service
@@ -423,12 +425,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert isinstance(response.choices[0].message.prompt_token_ids, list)
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert isinstance(response.choices[0].message.completion_token_ids, list)
 
     #  disable return_token_ids
@@ -440,12 +442,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert response.choices[0].message.prompt_token_ids is None
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert response.choices[0].message.completion_token_ids is None
 
 
@@ -464,11 +466,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
@@ -487,12 +489,12 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
         assert chunk.choices[0].delta.prompt_token_ids is None
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         assert chunk.choices[0].delta.completion_token_ids is None
 
 
@@ -509,11 +511,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert hasattr(response.choices[0], "prompt_token_ids")
     assert isinstance(response.choices[0].prompt_token_ids, list)
-    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert hasattr(response.choices[0], "completion_token_ids")
     assert isinstance(response.choices[0].completion_token_ids, list)
 
     # disable return_token_ids
@@ -525,11 +527,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert hasattr(response.choices[0], "prompt_token_ids")
     assert response.choices[0].prompt_token_ids is None
-    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert hasattr(response.choices[0], "completion_token_ids")
     assert response.choices[0].completion_token_ids is None
 
 
@@ -548,10 +550,10 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'prompt_token_ids')
-        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "prompt_token_ids")
+        assert hasattr(chunk.choices[0], "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].prompt_token_ids, list)
@@ -570,11 +572,11 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "prompt_token_ids")
         assert chunk.choices[0].prompt_token_ids is None
-        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "completion_token_ids")
         assert chunk.choices[0].completion_token_ids is None
 
 
@@ -587,13 +589,13 @@ def test_non_streaming_chat_with_prompt_token_ids(openai_client, capsys):
         messages=[],
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response, 'usage')
-    assert hasattr(response.usage, 'prompt_tokens')
+    assert hasattr(response, "usage")
+    assert hasattr(response.usage, "prompt_tokens")
     assert response.usage.prompt_tokens == 9
 
 
@@ -606,17 +608,17 @@ def test_streaming_chat_with_prompt_token_ids(openai_client, capsys):
         messages=[],
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=True,
         stream_options={"include_usage": True},
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
-        assert hasattr(chunk, 'usage')
+        assert hasattr(chunk, "choices")
+        assert hasattr(chunk, "usage")
         if len(chunk.choices) > 0:
             assert chunk.usage is None
         else:
-            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert hasattr(chunk.usage, "prompt_tokens")
             assert chunk.usage.prompt_tokens == 9
 
 
@@ -629,13 +631,13 @@ def test_non_streaming_completion_with_prompt_token_ids(openai_client, capsys):
         prompt="",
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response, 'usage')
-    assert hasattr(response.usage, 'prompt_tokens')
+    assert hasattr(response, "usage")
+    assert hasattr(response.usage, "prompt_tokens")
     assert response.usage.prompt_tokens == 9
 
 
@@ -648,16 +650,15 @@ def test_streaming_completion_with_prompt_token_ids(openai_client, capsys):
         prompt="",
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=True,
         stream_options={"include_usage": True},
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
-        assert hasattr(chunk, 'usage')
+        assert hasattr(chunk, "choices")
+        assert hasattr(chunk, "usage")
         if len(chunk.choices) > 0:
             assert chunk.usage is None
         else:
-            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert hasattr(chunk.usage, "prompt_tokens")
             assert chunk.usage.prompt_tokens == 9
-
diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index 083f4fc7466..dccc1f55a99 100644
--- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -325,11 +325,11 @@ def test_streaming_chat(openai_client, capsys):
     assert len(output) > 2
 
 
-
 # ==========================
 # OpenAI Client additional chat/completions test
 # ==========================
 
+
 def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
     """
     Test return_token_ids option in non-streaming chat functionality with the local service
@@ -340,35 +340,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # system不是必需，可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert isinstance(response.choices[0].message.prompt_token_ids, list)
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert isinstance(response.choices[0].message.completion_token_ids, list)
 
     # 不设定 return_token_ids
@@ -377,35 +375,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # system不是必需，可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert response.choices[0].message.prompt_token_ids is None
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert response.choices[0].message.completion_token_ids is None
 
 
@@ -419,23 +415,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # system不是必需，可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
@@ -444,11 +438,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
@@ -463,23 +457,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # system不是必需，可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
@@ -487,10 +479,10 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
         assert chunk.choices[0].delta.prompt_token_ids is None
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         assert chunk.choices[0].delta.completion_token_ids is None
diff --git a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
index fbe1ea48c70..a4c5048af6a 100644
--- a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
+++ b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
@@ -294,4 +294,6 @@ def test_non_thinking_prompt(api_url, headers):
         assert False, f"Response is not valid JSON: {e}"
 
     content = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").lower()
-    assert not any(x in content for x in ["根据", "我认为", "推测", "可能"]), "Expected no reasoning in non-thinking response"
+    assert not any(
+        x in content for x in ["根据", "我认为", "推测", "可能"]
+    ), "Expected no reasoning in non-thinking response"

From 167441b59f13e3d61e9d797f71a6fbd7a90993a4 Mon Sep 17 00:00:00 2001
From: zeroRains <linjunlu@zerorains.top>
Date: Thu, 24 Jul 2025 15:45:49 +0800
Subject: [PATCH 2/2] polish code

---
 fastdeploy/engine/args_utils.py                     | 4 ++--
 fastdeploy/entrypoints/engine_client.py             | 5 ++---
 fastdeploy/entrypoints/openai/api_server.py         | 2 +-
 fastdeploy/entrypoints/openai/serving_chat.py       | 3 ++-
 fastdeploy/entrypoints/openai/serving_completion.py | 4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 444f88b7611..613831a76b9 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -559,8 +559,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--ips",
             type=lambda s: s.split(",") if s else None,
             default=EngineArgs.ips,
-            help=
-            "IP addresses of all nodes participating in distributed inference.")
+            help="IP addresses of all nodes participating in distributed inference.",
+        )
 
         # Performance tuning parameters group
         perf_group = parser.add_argument_group("Performance Tuning")
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 4cbea336b8a..a7ffe0a1ad3 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -41,7 +41,7 @@ def __init__(
         mm_processor_kwargs,
         enable_mm=False,
         reasoning_parser=None,
-        data_parallel_size=1
+        data_parallel_size=1,
     ):
         input_processor = InputPreprocessor(
             tokenizer,
@@ -55,8 +55,7 @@ def __init__(
         self.data_processor = input_processor.create_processor()
         self.max_model_len = max_model_len
         max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
-        array_size = min(
-            max_chips_per_node, tensor_parallel_size * data_parallel_size)
+        array_size = min(max_chips_per_node, tensor_parallel_size * data_parallel_size)
         self.worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_healthy_live_signal = IPCSignal(
             name="worker_healthy_live_signal",
diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index 895978b327a..61fa5d39655 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -113,7 +113,7 @@ async def lifespan(app: FastAPI):
         args.mm_processor_kwargs,
         args.enable_mm,
         args.reasoning_parser,
-        args.data_parallel_size
+        args.data_parallel_size,
     )
     app.state.dynamic_load_weight = args.dynamic_load_weight
     chat_handler = OpenAIServingChat(engine_client, pid, args.ips)
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 7bcb8146c49..86da7eaea95 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -19,9 +19,10 @@
 import traceback
 import uuid
 from typing import List, Optional
-import numpy as np
+
 import aiozmq
 import msgpack
+import numpy as np
 from aiozmq import zmq
 
 from fastdeploy.entrypoints.openai.protocol import (
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 2fa26859c04..a7a058858c5 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -18,9 +18,10 @@
 import time
 import uuid
 from typing import List
-import numpy as np
+
 import aiozmq
 import msgpack
+import numpy as np
 from aiozmq import zmq
 
 from fastdeploy.engine.request import RequestOutput
@@ -48,7 +49,6 @@ def __init__(self, engine_client, pid, ips):
             else:
                 self.master_ip = self.master_ip.split(",")[0]
 
-
     def _check_master(self):
         if self.master_ip is None:
             return True