From 9e15dfca3e479b4787f57e88c04b007a224a4dc8 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Mon, 12 Jan 2026 15:23:16 +0800 Subject: [PATCH 01/14] quant stash --- .../layers/moe/fused_moe_deepgemm_backend.py | 31 ++++++++++++++----- .../layers/quantization/block_wise_fp8.py | 9 ++++-- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 881f9a22c4d..8cac63277c5 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -155,9 +155,15 @@ def apply_ep_prefill( topk_ids_hookfunc(topk_ids=topk_idx) # 2. Dynamic compute blockwise quantization scales - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - x, self.quant_config.weight_block_size[0] + # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( + # x, self.quant_config.weight_block_size[0] + # ) + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False, + output_scale_transpose=False ) + x_scale_tensor = x_scale_tensor[:x.shape[0]] event = deep_ep.Buffer.capture() let_another_thread_run() @@ -225,11 +231,15 @@ def apply_ep_prefill( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0] + # ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( + # ffn_out, self.quant_config.weight_block_size[0] + # ) + # ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous().transpose([1, 0]) + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, + using_pow2_scale=False ) - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) + paddle_scale_fp8 = paddle_scale_fp8.T[:ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), @@ -381,7 +391,14 @@ def apply_tp( tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) - recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) + # recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) + recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False, + output_scale_transpose=False, + ) + recv_x_scale = recv_x_scale[:recv_x.shape[0]] + ( permute_input, diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index 59daa238480..59aecd3083f 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -226,9 +226,14 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( - x, self.quant_config.weight_block_size[0] + # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( + # x, self.quant_config.weight_block_size[0] + # ) + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False ) 
+ x_scale_tensor = x_scale_tensor.T[:x.shape[0]] linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) from fastdeploy.model_executor.ops.gpu import deep_gemm From db6202ac4ceb5bf61b14a9c8174137834a0048a0 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Mon, 12 Jan 2026 16:08:31 +0800 Subject: [PATCH 02/14] blockwise_quant --- fastdeploy/model_executor/layers/utils.py | 34 ++++++++++++++--------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index c18f062457e..da71448674d 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -237,19 +237,27 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten dtype=x.dtype, ) x_padded[:m, :n] = x - x_view = paddle.view( - x_padded, - (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), - ) - - x_abs = paddle.abs(x_view).astype(paddle.float32) - x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) - x_amax = paddle.clip(x_amax, min=1e-4) - x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) - - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) - ) + # x_view = paddle.view( + # x_padded, + # (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), + # ) + from paddle.incubate.nn.functional.fp8 import fp8_quant_blockwise + x_q, scale = fp8_quant_blockwise( + x_padded, + quant_method="128x128", + input_transpose=False, + output_scale_transpose=False, + using_pow2_scale=False + ) + return x_q[:m, :n].contiguous(), scale + # x_abs = paddle.abs(x_view).astype(paddle.float32) + # x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) + # x_amax = paddle.clip(x_amax, min=1e-4) + # x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) + + # return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + # paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) + # ) def per_token_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]: From 3bc27694967ab64d1ae63a62f83e055fbb1b18d0 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 10:40:57 +0800 Subject: [PATCH 03/14] rm tensor.cut --- .../model_executor/layers/quantization/block_wise_fp8.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index 59aecd3083f..f4903e2f0ba 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -226,14 +226,7 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( - # x, self.quant_config.weight_block_size[0] - # ) - x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( - x, - using_pow2_scale=False - ) - x_scale_tensor = x_scale_tensor.T[:x.shape[0]] + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(x, using_pow2_scale=False) linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) from fastdeploy.model_executor.ops.gpu import deep_gemm From 4ca6e63ee22cbe3b1aa666f91e63e4137711d9c1 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 
2026 11:01:45 +0800 Subject: [PATCH 04/14] tp ok --- .../model_executor/layers/quantization/block_wise_fp8.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index f4903e2f0ba..24abf681791 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -226,7 +226,10 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(x, using_pow2_scale=False) + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=True + ) + x_scale_tensor = x_scale_tensor.T linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) from fastdeploy.model_executor.ops.gpu import deep_gemm From 2ce15d8ee85244086bf1e5477337974bc293d991 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 15:40:06 +0800 Subject: [PATCH 05/14] add paddle swiglu --- fastdeploy/model_executor/layers/activation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py index 35aa40b77e0..9b038bae62b 100644 --- a/fastdeploy/model_executor/layers/activation.py +++ b/fastdeploy/model_executor/layers/activation.py @@ -120,6 +120,8 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor: Returns: Tensor: Output tensor. """ + if self.bias is None and self.quant_scale == -1: + return paddle.nn.functional.swiglu(x) return fused_bias_act( x, bias=self.bias, From 63dcefa35a1ed4c94daef10e34c0c78028348a41 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 15:58:54 +0800 Subject: [PATCH 06/14] 21B test ok --- .../layers/moe/fused_moe_deepgemm_backend.py | 30 +++++-------------- .../layers/moe/fused_moe_triton_backend.py | 10 +++++-- fastdeploy/model_executor/layers/utils.py | 22 +++----------- 3 files changed, 19 insertions(+), 43 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 8cac63277c5..dc088cf9eb9 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -155,15 +155,10 @@ def apply_ep_prefill( topk_ids_hookfunc(topk_ids=topk_idx) # 2. 
Dynamic compute blockwise quantization scales - # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - # x, self.quant_config.weight_block_size[0] - # ) x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( - x, - using_pow2_scale=False, - output_scale_transpose=False + x, using_pow2_scale=False, output_scale_transpose=False ) - x_scale_tensor = x_scale_tensor[:x.shape[0]] + x_scale_tensor = x_scale_tensor[: x.shape[0]] event = deep_ep.Buffer.capture() let_another_thread_run() @@ -231,15 +226,10 @@ def apply_ep_prefill( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None) # down_proj - # ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - # ffn_out, self.quant_config.weight_block_size[0] - # ) - # ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous().transpose([1, 0]) ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( - ffn_out, - using_pow2_scale=False + ffn_out, using_pow2_scale=False ) - paddle_scale_fp8 = paddle_scale_fp8.T[:ffn_in_x.shape[0]] + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), @@ -391,14 +381,12 @@ def apply_tp( tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) - # recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( x, using_pow2_scale=False, output_scale_transpose=False, ) - recv_x_scale = recv_x_scale[:recv_x.shape[0]] - + recv_x_scale = recv_x_scale[: recv_x.shape[0]] ( permute_input, @@ -439,12 +427,10 @@ def apply_tp( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0] + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, using_pow2_scale=False ) - - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index da705357c12..922729d91bd 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -1525,7 +1525,10 @@ def apply( from .triton_moe_kernels import fused_moe_kernel_paddle - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, self.quant_config.weight_block_size[0]) + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=False + ) + x_scale = x_scale[: x.shape[0]] fused_moe_kernel_paddle[grid]( x_q, @@ -1578,9 +1581,10 @@ def apply( ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]), ) - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( - intermediate_cache2, self.quant_config.weight_block_size[0] + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False ) + x_scale = x_scale[: 
x_q.shape[0]] fused_moe_kernel_paddle[grid]( x_q, diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index da71448674d..63b4fe7ffc4 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -237,27 +237,13 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten dtype=x.dtype, ) x_padded[:m, :n] = x - # x_view = paddle.view( - # x_padded, - # (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), - # ) + from paddle.incubate.nn.functional.fp8 import fp8_quant_blockwise + x_q, scale = fp8_quant_blockwise( - x_padded, - quant_method="128x128", - input_transpose=False, - output_scale_transpose=False, - using_pow2_scale=False - ) + x_padded, quant_method="128x128", input_transpose=False, output_scale_transpose=False, using_pow2_scale=False + ) return x_q[:m, :n].contiguous(), scale - # x_abs = paddle.abs(x_view).astype(paddle.float32) - # x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) - # x_amax = paddle.clip(x_amax, min=1e-4) - # x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) - - # return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - # paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) - # ) def per_token_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]: From 01bed15491beea9c990671ded8c25b250c56430b Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 16:09:11 +0800 Subject: [PATCH 07/14] pre-commit --- fastdeploy/model_executor/layers/quantization/block_wise_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index 24abf681791..c13b429095a 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -18,7 +18,6 @@ import paddle -import fastdeploy from fastdeploy import envs from fastdeploy.model_executor.layers.linear import ( MergedColumnParallelLinear, From 7c1bd99018cc7938ff4d2c514d3fea5bcc38ecf5 Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Thu, 15 Jan 2026 17:21:12 +0800 Subject: [PATCH 08/14] fix ut error --- tests/layers/test_activation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/layers/test_activation.py b/tests/layers/test_activation.py index 70f011d3964..b564c267520 100644 --- a/tests/layers/test_activation.py +++ b/tests/layers/test_activation.py @@ -84,8 +84,11 @@ def test_forward_cuda(self, mock_fused, mock_platform): layer = SiluAndMul(fd_config) x = paddle.ones([2, 2]) out = layer.forward(x) - self.assertTrue((out.numpy() == 1).all()) - mock_fused.assert_called_once() + if layer.bias is None and layer.quant_scale == -1: + self.assertTrue((out.numpy() == 0.73105854).all()) + else: + self.assertTrue((out.numpy() == 1).all()) + mock_fused.assert_called_once() # Test forward computation on GCU platform @patch( From 3261a53c18b9e8c144fd8646d0e6bfe4d1f3ecfc Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Fri, 23 Jan 2026 16:40:58 +0800 Subject: [PATCH 09/14] fix block quant --- fastdeploy/model_executor/layers/utils.py | 44 ++++++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index 63b4fe7ffc4..fd55846aba7 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ 
-220,6 +220,35 @@ def group_wise_int4_weight_quantize(weight: paddle.Tensor, group_size: int = 128 return quant_weight.astype(paddle.int8), weight_scale +def scale_wrapper(x_amax: paddle.Tensor, eps: float = 0.0) -> paddle.Tensor: + """ + Paddle implementation of CUDA ScaleWrapper logic. + Args: + x_amax (paddle.Tensor): amax tensor (float32 recommended) + eps (float): epsilon to avoid division by zero + Returns: + paddle.Tensor: scale tensor, same shape as x_amax + """ + fp8_max = 448.0 + float_max = paddle.finfo(paddle.float32).max + amax_mod = paddle.maximum( + x_amax, + paddle.full_like(x_amax, eps), + ) + scale = fp8_max / amax_mod + scale = paddle.where( + amax_mod == 0, + paddle.ones_like(scale), + scale, + ) + scale = paddle.where( + paddle.isinf(scale), + paddle.full_like(scale, float_max), + scale, + ) + return scale + + def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Tensor, Tensor]: """ Only used in deep_gemm block wise quant weight. @@ -237,13 +266,18 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten dtype=x.dtype, ) x_padded[:m, :n] = x + x_view = paddle.view( + x_padded, + (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), + ) - from paddle.incubate.nn.functional.fp8 import fp8_quant_blockwise - - x_q, scale = fp8_quant_blockwise( - x_padded, quant_method="128x128", input_transpose=False, output_scale_transpose=False, using_pow2_scale=False + x_abs = paddle.abs(x_view).astype(paddle.float32) + x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) + scale = scale_wrapper(x_amax) + x_scaled = (x_view * scale).astype(paddle.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + paddle.view(1.0 / scale, (x_view.shape[0], x_view.shape[2])) ) - return x_q[:m, :n].contiguous(), scale def per_token_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]: From 2e20113d528152b88c1052d31f8dbdff6d4c2145 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Fri, 23 Jan 2026 19:26:16 +0800 Subject: [PATCH 10/14] edit whl --- .github/workflows/_accuracy_test.yml | 2 +- .github/workflows/_base_test.yml | 2 +- .github/workflows/_build_linux.yml | 2 +- .github/workflows/_logprob_test_linux.yml | 2 +- .github/workflows/_pre_ce_test.yml | 2 +- .github/workflows/_stable_test.yml | 2 +- .github/workflows/_unit_test_coverage.yml | 2 +- scripts/run_xpu_ci_pytest.sh | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml index 832d6f266a4..7f969fa7397 100644 --- a/.github/workflows/_accuracy_test.yml +++ b/.github/workflows/_accuracy_test.yml @@ -161,7 +161,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml index 56808b9fd49..377714b05bc 100644 --- a/.github/workflows/_base_test.yml +++ b/.github/workflows/_base_test.yml @@ -186,7 +186,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus 
'"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_build_linux.yml b/.github/workflows/_build_linux.yml index 32c689d1ada..d6bb583d2d0 100644 --- a/.github/workflows/_build_linux.yml +++ b/.github/workflows/_build_linux.yml @@ -173,7 +173,7 @@ jobs: elif [[ "${PADDLEVERSION}" != "" ]];then python -m pip install paddlepaddle-gpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ else - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl fi pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index fd71f57c350..cf29580a05e 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -156,7 +156,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml index 768d73b1c85..72720a6a682 100644 --- a/.github/workflows/_pre_ce_test.yml +++ b/.github/workflows/_pre_ce_test.yml @@ -172,7 +172,7 @@ jobs: --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl python -m pip install ${fd_wheel_url} bash scripts/run_pre_ce.sh ' diff --git a/.github/workflows/_stable_test.yml b/.github/workflows/_stable_test.yml index 175f6288d76..4fd8739c41a 100644 --- a/.github/workflows/_stable_test.yml +++ b/.github/workflows/_stable_test.yml @@ -164,7 +164,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip 
config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 92843fd15bf..146df7e0fa7 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -203,7 +203,7 @@ jobs: git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install -r scripts/unittest_requirement.txt diff --git a/scripts/run_xpu_ci_pytest.sh b/scripts/run_xpu_ci_pytest.sh index f5053988eb3..f57e096f71e 100644 --- a/scripts/run_xpu_ci_pytest.sh +++ b/scripts/run_xpu_ci_pytest.sh @@ -74,7 +74,7 @@ python -m pip uninstall fastdeploy-xpu -y # 安装PaddlePaddle Release分支安装对应的paddle echo "安装release分支PaddlePaddle..." -python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Xpu-P800-SelfBuiltPypiUse/latest/paddlepaddle_xpu-0.0.0-cp310-cp310-linux_x86_64.whl # ============ 编译项目 ============ From b9e180fbd0874f825d25f4f4d19e00a8cb02628e Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Sat, 24 Jan 2026 11:13:46 +0800 Subject: [PATCH 11/14] e baseline --- .github/workflows/_logprob_test_linux.yml | 2 +- tests/ce/server/test_logprobs.py | 12 ++++++------ tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 4 ++-- tests/e2e/test_EB_VL_Lite_serving.py | 4 ++-- tests/model_loader/test_torch_model.py | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index cf29580a05e..3af3b7a6052 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -185,7 +185,7 @@ jobs: -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}" set +e rm -rf ./baseline_output - cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output + cp -r baseline_24/ERNIE-4.5-0.3B-Paddle ./baseline_output LOGPROB_EXIT_CODE=0 python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$? 
echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env diff --git a/tests/ce/server/test_logprobs.py b/tests/ce/server/test_logprobs.py index 83ca89486c9..3674b3a6b96 100644 --- a/tests/ce/server/test_logprobs.py +++ b/tests/ce/server/test_logprobs.py @@ -25,10 +25,10 @@ def test_unstream_with_logprobs(): # 校验返回内容与概率信息 assert resp_json["choices"][0]["message"]["content"] == "牛顿的" assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], "top_logprobs": None, } @@ -102,10 +102,10 @@ def test_stream_with_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], } @@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953 + assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.006811376195400953, + "logprob": -0.0068125599063932896, "bytes": [231, 137, 155, 233, 161, 191], } diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index e51018f201e..8d4d157de8d 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev") + base_file = os.path.join(base_path, " ernie-4_5-vl-base-tp2-24") else: - base_file = "ernie-4_5-vl-base-tp2-dev" + base_file = " ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index f93f355a754..7783b844148 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -204,9 +204,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24") else: - base_file = "ernie-4_5-vl-base-tp2-dev" + base_file = "ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() diff --git a/tests/model_loader/test_torch_model.py 
b/tests/model_loader/test_torch_model.py index bc8252a4427..0170bef1da6 100644 --- a/tests/model_loader/test_torch_model.py +++ b/tests/model_loader/test_torch_model.py @@ -140,7 +140,7 @@ def test_model_against_baseline( # Get baseline suffix from config model_config = hugging_face_model_param_map.get(model_name_or_path, {}) - baseline_suffix = model_config.get("baseline_suffix", "tp2") + baseline_suffix = model_config.get("baseline_suffix", "tp2-24") baseline_filename = f"{model_name_or_path}-{baseline_suffix}" if base_path: From 0ef0a00edabed6e54a9a9fed9f192bdb13ded21f Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Sat, 24 Jan 2026 11:20:20 +0800 Subject: [PATCH 12/14] e baseline 2 --- tests/e2e/utils/rollout_routing_replay_test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py index 499bbbed688..e5ecd4ca33f 100644 --- a/tests/e2e/utils/rollout_routing_replay_test_utils.py +++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py @@ -151,9 +151,9 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/" model_path = os.getenv("MODEL_PATH") if model_path: - baseline_path = os.path.join(model_path, f"R3_BaseLine/routing_replay_output_baseline_{model_name}") + baseline_path = os.path.join(model_path, f"R3_BaseLine_24/routing_replay_output_baseline_{model_name}") else: - baseline_path = f"./R3_BaseLine/routing_replay_output_baseline_{model_name}" + baseline_path = f"./R3_BaseLine_24/routing_replay_output_baseline_{model_name}" stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream") From 7f8c74ebe41f84acefccfce17ea5ae3fd43fd75c Mon Sep 17 00:00:00 2001 From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Date: Sat, 24 Jan 2026 12:47:10 +0800 Subject: [PATCH 13/14] chore: remove extra whitespace in test_EB_VL_Lite_serving.py --- tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 8d4d157de8d..acbf7872e66 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, " ernie-4_5-vl-base-tp2-24") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24") else: - base_file = " ernie-4_5-vl-base-tp2-24" + base_file = "ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() From b72882c23963fa6cc13d3a25d0a2980a3c891846 Mon Sep 17 00:00:00 2001 From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Date: Sat, 24 Jan 2026 14:45:01 +0800 Subject: [PATCH 14/14] chore: keep paddlepaddle-xpu unchanged --- scripts/run_xpu_ci_pytest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_xpu_ci_pytest.sh b/scripts/run_xpu_ci_pytest.sh index f57e096f71e..f5053988eb3 100644 --- a/scripts/run_xpu_ci_pytest.sh +++ b/scripts/run_xpu_ci_pytest.sh @@ -74,7 +74,7 @@ python -m pip uninstall fastdeploy-xpu -y # 安装PaddlePaddle Release分支安装对应的paddle echo "安装release分支PaddlePaddle..." 
-python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Xpu-P800-SelfBuiltPypiUse/latest/paddlepaddle_xpu-0.0.0-cp310-cp310-linux_x86_64.whl +python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ # ============ 编译项目 ============
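A few notes on the series follow.

The recurring pattern in patches 01 through 06, calling paddle.incubate.nn.functional.fp8_quant_blockwise and then slicing the returned scale with scale[: x.shape[0]] (or scale.T[: x.shape[0]] when the scale comes back transposed), is there because the fused op pads the token dimension of the scale tensor. Below is a minimal pure-Paddle sketch of the 1x128 activation quantization being migrated to; ref_per_token_quant is a hypothetical helper written for this note, and only the fp8_quant_blockwise keyword arguments that already appear in the patches are assumed.

import paddle

def ref_per_token_quant(x: paddle.Tensor, block_size: int = 128):
    """Reference 1x128 blockwise FP8 quantization (a sketch, not the fused op)."""
    m, n = x.shape
    assert n % block_size == 0, "hidden size must be a multiple of the block size"
    # Per-block amax over groups of 128 along the hidden dimension.
    x_view = x.reshape([m, n // block_size, block_size]).astype(paddle.float32)
    x_amax = paddle.amax(paddle.abs(x_view), axis=-1, keepdim=True)
    # Zero-amax blocks quantize with scale 1.0, mirroring scale_wrapper in patch 09.
    scale = paddle.where(x_amax == 0, paddle.ones_like(x_amax), 448.0 / x_amax)
    x_q = (x_view * scale).reshape([m, n]).astype(paddle.float8_e4m3fn)
    # Dequant scale of shape [m, n // block_size]; the fused op returns this
    # padded (and optionally transposed), hence the slicing seen in the patches.
    return x_q, (1.0 / scale).reshape([m, n // block_size])

x = paddle.randn([4, 256]).astype(paddle.bfloat16)
x_q_ref, scale_ref = ref_per_token_quant(x)
x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
    x, using_pow2_scale=False, output_scale_transpose=False
)
x_scale = x_scale[: x.shape[0]]  # drop the padded rows
# x_q and x_scale should match the reference up to FP8 rounding.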
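The 0.73105854 baseline asserted in patch 08 follows directly from swiglu's definition: the input is chunked in half along the last axis and the output is silu(x1) * x2, so an all-ones [2, 2] input gives silu(1) = 1 / (1 + e^-1) ≈ 0.7310586 in every output element. A quick check against the incubate op these patches already use:

import paddle

x = paddle.ones([2, 2])
out = paddle.incubate.nn.functional.swiglu(x)  # shape [2, 1]
print(out)  # ≈ 0.7310586 everywhere, matching the updated unit test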
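Finally, the scale_wrapper added in patch 09 reproduces the CUDA ScaleWrapper edge cases rather than the old clip(x_amax, min=1e-4) behaviour: a zero amax maps to scale 1.0 instead of an infinite scale, and an infinite scale arising from a subnormal amax is clamped to float32 max. A small worked example with the default eps of 0.0, assuming the patch is applied so the helper is importable from its new location:

import paddle

from fastdeploy.model_executor.layers.utils import scale_wrapper

amax = paddle.to_tensor([0.0, 448.0, 896.0], dtype=paddle.float32)
# 448 / amax, with the zero case forced to 1.0: expect [1.0, 1.0, 0.5].
print(scale_wrapper(amax))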