PaddlePaddle · yuanlehome · Jan 24, 2026 · Jan 12, 2026 · Jan 12, 2026 · Jan 13, 2026
diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml
@@ -161,7 +161,7 @@ jobs:
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
           --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
 
           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 

diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml
@@ -186,7 +186,7 @@ jobs:
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
           --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
 
           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 

diff --git a/.github/workflows/_build_linux.yml b/.github/workflows/_build_linux.yml
@@ -173,7 +173,7 @@ jobs:
             elif [[ "${PADDLEVERSION}" != "" ]];then
               python -m pip install paddlepaddle-gpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
             else
-              python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+              python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
             fi
 
             pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml
@@ -156,7 +156,7 @@ jobs:
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
           --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
 
           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 
@@ -185,7 +185,7 @@ jobs:
             -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}"
           set +e
           rm -rf ./baseline_output
-          cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output
+          cp -r baseline_24/ERNIE-4.5-0.3B-Paddle ./baseline_output
           LOGPROB_EXIT_CODE=0
           python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions  --case ./cases/demo.yaml  --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$?
           echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env

diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml
@@ -172,7 +172,7 @@ jobs:
           --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
           git config --global --add safe.directory /workspace/FastDeploy
           cd FastDeploy
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
           python -m pip install ${fd_wheel_url}
           bash scripts/run_pre_ce.sh
           '
diff --git a/.github/workflows/_stable_test.yml b/.github/workflows/_stable_test.yml
@@ -164,7 +164,7 @@ jobs:
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
           --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
 
           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 

diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml
@@ -203,7 +203,7 @@ jobs:
           git config --global --add safe.directory /workspace/FastDeploy
           cd FastDeploy
           git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
           pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 
           python -m pip install -r scripts/unittest_requirement.txt

diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py
@@ -120,6 +120,8 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
         Returns:
             Tensor: Output tensor.
         """
+        if self.bias is None and self.quant_scale == -1:
+            return paddle.nn.functional.swiglu(x)
         return fused_bias_act(
             x,
             bias=self.bias,

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
@@ -155,9 +155,10 @@ def apply_ep_prefill(
             topk_ids_hookfunc(topk_ids=topk_idx)
 
         # 2. Dynamic compute blockwise quantization scales
-        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
-            x, self.quant_config.weight_block_size[0]
+        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=False
         )
-        )
+        )
+        # fp8_quant_blockwise may return an extra padded dimension on the scale tensor
+        # when output_scale_transpose=False. Slice by x.shape[0] to keep only the
+        # valid batch entries so that x_scale_tensor matches the layout expected by EP.
-        )
+        )
+        # fp8_quant_blockwise may return an extra padded dimension on the scale tensor
+        # when output_scale_transpose=False. Slice by x.shape[0] to keep only the
+        # valid batch entries so that x_scale_tensor matches the layout expected by EP.
+        x_scale_tensor = x_scale_tensor[: x.shape[0]]
 
         event = deep_ep.Buffer.capture()
         let_another_thread_run()
@@ -225,11 +226,10 @@ def apply_ep_prefill(
             ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None)
 
             # down_proj
-            ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
-                ffn_out, self.quant_config.weight_block_size[0]
+            ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+                ffn_out, using_pow2_scale=False
-                ffn_out, using_pow2_scale=False
+                ffn_out,
+                using_pow2_scale=False,
+                output_scale_transpose=True,
-                ffn_out, using_pow2_scale=False
+                ffn_out,
+                using_pow2_scale=False,
+                output_scale_transpose=False,
-                ffn_out, using_pow2_scale=False
+                ffn_out,
+                using_pow2_scale=False,
+                output_scale_transpose=True,
-                ffn_out, using_pow2_scale=False
+                ffn_out,
+                using_pow2_scale=False,
+                output_scale_transpose=False,
             )
-            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous()
-            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
+            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]]
 
             ffn_out = paddle.empty(
                 (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]),
@@ -381,7 +381,12 @@ def apply_tp(
 
         tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)
 
-        recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128)
+        recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x,
+            using_pow2_scale=False,
+            output_scale_transpose=False,
+        )
+        recv_x_scale = recv_x_scale[: recv_x.shape[0]]
 
         (
             permute_input,
@@ -422,12 +427,10 @@ def apply_tp(
         ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out)
 
         # down_proj
-        ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
-            ffn_out, self.quant_config.weight_block_size[0]
+        ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            ffn_out, using_pow2_scale=False
         )
-
-        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous()
-        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
+        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]]
 
         ffn_out = paddle.empty(
             (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]),

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -1525,7 +1525,10 @@ def apply(
 
         from .triton_moe_kernels import fused_moe_kernel_paddle
 
-        x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, self.quant_config.weight_block_size[0])
+        x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=False
+        )
-        )
+        )
+        # fp8_quant_blockwise may pad the leading dimension of x_q/x_scale to
+        # a multiple of the internal block size (e.g. BLOCK_SIZE_M). Only the
+        # first x.shape[0] entries correspond to real tokens, so we slice here
+        # to match the original token dimension. The padded region is handled
+        # separately via max_num_tokens_padded and related Triton kernel args.
-        )
+        )
+        # fp8_quant_blockwise may pad the leading dimension of x_q/x_scale to
+        # a multiple of the internal block size (e.g. BLOCK_SIZE_M). Only the
+        # first x.shape[0] entries correspond to real tokens, so we slice here
+        # to match the original token dimension. The padded region is handled
+        # separately via max_num_tokens_padded and related Triton kernel args.
+        x_scale = x_scale[: x.shape[0]]
 
         fused_moe_kernel_paddle[grid](
             x_q,
@@ -1578,9 +1581,10 @@ def apply(
             ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),
         )
 
-        x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
-            intermediate_cache2, self.quant_config.weight_block_size[0]
+        x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False
         )
-        )
+        )
+        # Align the activation scale with the quantized activation rows.
+        # fp8_quant_blockwise may return extra scale rows due to block padding,
+        # but the fused Triton kernel expects one scale row per row in x_q.
-        )
+        )
+        # Align the activation scale with the quantized activation rows.
+        # fp8_quant_blockwise may return extra scale rows due to block padding,
+        # but the fused Triton kernel expects one scale row per row in x_q.
+        x_scale = x_scale[: x_q.shape[0]]
 
         fused_moe_kernel_paddle[grid](
             x_q,

diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -18,7 +18,6 @@
 
 import paddle
 
-import fastdeploy
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -226,9 +225,10 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal
         layer.weight_scale_inv.set_value(weight_scale)
 
     def apply(self, layer, x):
-        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
-            x, self.quant_config.weight_block_size[0]
+        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=True
         )
+        x_scale_tensor = x_scale_tensor.T
-        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
-            x, using_pow2_scale=False, output_scale_transpose=True
-        )
-        x_scale_tensor = x_scale_tensor.T
+        # output_scale_transpose=True returns the scale tensor in the layout required by deep_gemm
+        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=True
+        )
-        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
-            x, using_pow2_scale=False, output_scale_transpose=True
-        )
-        x_scale_tensor = x_scale_tensor.T
+        # output_scale_transpose=True returns the scale tensor in the layout required by deep_gemm
+        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=True
+        )
         linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
         from fastdeploy.model_executor.ops.gpu import deep_gemm
 

diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py
@@ -220,6 +220,35 @@ def group_wise_int4_weight_quantize(weight: paddle.Tensor, group_size: int = 128
     return quant_weight.astype(paddle.int8), weight_scale
 
 
+def scale_wrapper(x_amax: paddle.Tensor, eps: float = 0.0) -> paddle.Tensor:
+    """
+    Paddle implementation of CUDA ScaleWrapper logic.
+
+    Computes fp8_max / max(x_amax, eps) element-wise, with fp8_max = 448.0.
+
+    Args:
+        x_amax (paddle.Tensor): amax tensor (float32 recommended)
+        eps (float): lower bound applied to x_amax before the division
+
+    Returns:
+        paddle.Tensor: scale tensor, same shape as x_amax
+    """
+    fp8_max = 448.0
+    float_max = paddle.finfo(paddle.float32).max
+    amax_mod = paddle.maximum(x_amax, paddle.full_like(x_amax, eps))
+    scale = fp8_max / amax_mod
+    # Where amax_mod is exactly zero, use a scale of 1.0 instead of the
+    # result of the division above.
+    scale = paddle.where(amax_mod == 0, paddle.ones_like(scale), scale)
+    # Replace any remaining infinite scale with the largest float32 value.
+    scale = paddle.where(
+        paddle.isinf(scale),
+        paddle.full_like(scale, float_max),
+        scale,
+    )
+    return scale
+
+
 def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Tensor, Tensor]:
     """
     Only used in deep_gemm block wise quant weight.
@@ -244,11 +273,10 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten
 
     x_abs = paddle.abs(x_view).astype(paddle.float32)
     x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True)
-    x_amax = paddle.clip(x_amax, min=1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn)
-
+    scale = scale_wrapper(x_amax)
+    x_scaled = (x_view * scale).astype(paddle.float8_e4m3fn)
     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
-        paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2]))
+        paddle.view(1.0 / scale, (x_view.shape[0], x_view.shape[2]))
     )
 
 

diff --git a/tests/ce/server/test_logprobs.py b/tests/ce/server/test_logprobs.py
@@ -25,10 +25,10 @@ def test_unstream_with_logprobs():
     # 校验返回内容与概率信息
     assert resp_json["choices"][0]["message"]["content"] == "牛顿的"
     assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
-    assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
+    assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
     assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
         "token": "牛顿",
-        "logprob": -0.031025361269712448,
+        "logprob": -0.03113006055355072,
         "bytes": [231, 137, 155, 233, 161, 191],
         "top_logprobs": None,
     }
@@ -102,10 +102,10 @@ def test_stream_with_logprobs():
     # 校验概率字段
     assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
-    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
+    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
         "token": "牛顿",
-        "logprob": -0.031025361269712448,
+        "logprob": -0.03113006055355072,
         "bytes": [231, 137, 155, 233, 161, 191],
     }
 
@@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs():
     # 校验概率字段
     assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
-    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953
+    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
         "token": "牛顿",
-        "logprob": -0.006811376195400953,
+        "logprob": -0.0068125599063932896,
         "bytes": [231, 137, 155, 233, 161, 191],
     }
 

diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
     # base result
     base_path = os.getenv("MODEL_PATH")
     if base_path:
-        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev")
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24")
     else:
-        base_file = "ernie-4_5-vl-base-tp2-dev"
+        base_file = "ernie-4_5-vl-base-tp2-24"
     with open(base_file, "r") as f:
         content2 = f.read()
 

diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -204,9 +204,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
     # base result
     base_path = os.getenv("MODEL_PATH")
     if base_path:
-        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev")
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24")
     else:
-        base_file = "ernie-4_5-vl-base-tp2-dev"
+        base_file = "ernie-4_5-vl-base-tp2-24"
     with open(base_file, "r") as f:
         content2 = f.read()
 

diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py
@@ -151,9 +151,9 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode
     cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/"
     model_path = os.getenv("MODEL_PATH")
     if model_path:
-        baseline_path = os.path.join(model_path, f"R3_BaseLine/routing_replay_output_baseline_{model_name}")
+        baseline_path = os.path.join(model_path, f"R3_BaseLine_24/routing_replay_output_baseline_{model_name}")
     else:
-        baseline_path = f"./R3_BaseLine/routing_replay_output_baseline_{model_name}"
+        baseline_path = f"./R3_BaseLine_24/routing_replay_output_baseline_{model_name}"
     stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream")
 
     nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")

diff --git a/tests/layers/test_activation.py b/tests/layers/test_activation.py
@@ -84,8 +84,11 @@ def test_forward_cuda(self, mock_fused, mock_platform):
         layer = SiluAndMul(fd_config)
         x = paddle.ones([2, 2])
         out = layer.forward(x)
-        self.assertTrue((out.numpy() == 1).all())
-        mock_fused.assert_called_once()
+        if layer.bias is None and layer.quant_scale == -1:
+            self.assertTrue((out.numpy() == 0.73105854).all())
+        else:
+            self.assertTrue((out.numpy() == 1).all())
+            mock_fused.assert_called_once()
 
     # Test forward computation on GCU platform
     @patch(

diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py
@@ -140,7 +140,7 @@ def test_model_against_baseline(
 
     # Get baseline suffix from config
     model_config = hugging_face_model_param_map.get(model_name_or_path, {})
-    baseline_suffix = model_config.get("baseline_suffix", "tp2")
+    baseline_suffix = model_config.get("baseline_suffix", "tp2-24")
     baseline_filename = f"{model_name_or_path}-{baseline_suffix}"
 
     if base_path: