From 9e15dfca3e479b4787f57e88c04b007a224a4dc8 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Mon, 12 Jan 2026 15:23:16 +0800 Subject: [PATCH 01/14] quant stash --- .../layers/moe/fused_moe_deepgemm_backend.py | 31 ++++++++++++++----- .../layers/quantization/block_wise_fp8.py | 9 ++++-- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 881f9a22c4d..8cac63277c5 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -155,9 +155,15 @@ def apply_ep_prefill( topk_ids_hookfunc(topk_ids=topk_idx) # 2. Dynamic compute blockwise quantization scales - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - x, self.quant_config.weight_block_size[0] + # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( + # x, self.quant_config.weight_block_size[0] + # ) + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False, + output_scale_transpose=False ) + x_scale_tensor = x_scale_tensor[:x.shape[0]] event = deep_ep.Buffer.capture() let_another_thread_run() @@ -225,11 +231,15 @@ def apply_ep_prefill( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0] + # ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( + # ffn_out, self.quant_config.weight_block_size[0] + # ) + # ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous().transpose([1, 0]) + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, + using_pow2_scale=False ) - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) + paddle_scale_fp8 = paddle_scale_fp8.T[:ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), @@ -381,7 +391,14 @@ def apply_tp( tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) - recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) + # recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) + recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False, + output_scale_transpose=False, + ) + recv_x_scale = recv_x_scale[:recv_x.shape[0]] + ( permute_input, diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index 59daa238480..59aecd3083f 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -226,9 +226,14 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( - x, self.quant_config.weight_block_size[0] + # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( + # x, self.quant_config.weight_block_size[0] + # ) + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False ) 
+ x_scale_tensor = x_scale_tensor.T[:x.shape[0]] linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) from fastdeploy.model_executor.ops.gpu import deep_gemm From db6202ac4ceb5bf61b14a9c8174137834a0048a0 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Mon, 12 Jan 2026 16:08:31 +0800 Subject: [PATCH 02/14] blockwise_quant --- fastdeploy/model_executor/layers/utils.py | 34 ++++++++++++++--------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index c18f062457e..da71448674d 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -237,19 +237,27 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten dtype=x.dtype, ) x_padded[:m, :n] = x - x_view = paddle.view( - x_padded, - (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), - ) - - x_abs = paddle.abs(x_view).astype(paddle.float32) - x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) - x_amax = paddle.clip(x_amax, min=1e-4) - x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) - - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) - ) + # x_view = paddle.view( + # x_padded, + # (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), + # ) + from paddle.incubate.nn.functional.fp8 import fp8_quant_blockwise + x_q, scale = fp8_quant_blockwise( + x_padded, + quant_method="128x128", + input_transpose=False, + output_scale_transpose=False, + using_pow2_scale=False + ) + return x_q[:m, :n].contiguous(), scale + # x_abs = paddle.abs(x_view).astype(paddle.float32) + # x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) + # x_amax = paddle.clip(x_amax, min=1e-4) + # x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) + + # return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + # paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) + # ) def per_token_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]: From 3bc27694967ab64d1ae63a62f83e055fbb1b18d0 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 10:40:57 +0800 Subject: [PATCH 03/14] rm tensor.cut --- .../model_executor/layers/quantization/block_wise_fp8.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index 59aecd3083f..f4903e2f0ba 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -226,14 +226,7 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( - # x, self.quant_config.weight_block_size[0] - # ) - x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( - x, - using_pow2_scale=False - ) - x_scale_tensor = x_scale_tensor.T[:x.shape[0]] + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(x, using_pow2_scale=False) linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) from fastdeploy.model_executor.ops.gpu import deep_gemm From 4ca6e63ee22cbe3b1aa666f91e63e4137711d9c1 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 
2026 11:01:45 +0800 Subject: [PATCH 04/14] tp ok --- .../model_executor/layers/quantization/block_wise_fp8.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index f4903e2f0ba..24abf681791 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -226,7 +226,10 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(x, using_pow2_scale=False) + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=True + ) + x_scale_tensor = x_scale_tensor.T linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) from fastdeploy.model_executor.ops.gpu import deep_gemm From 2ce15d8ee85244086bf1e5477337974bc293d991 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 15:40:06 +0800 Subject: [PATCH 05/14] add paddle swiglu --- fastdeploy/model_executor/layers/activation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py index 35aa40b77e0..9b038bae62b 100644 --- a/fastdeploy/model_executor/layers/activation.py +++ b/fastdeploy/model_executor/layers/activation.py @@ -120,6 +120,8 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor: Returns: Tensor: Output tensor. """ + if self.bias is None and self.quant_scale == -1: + return paddle.nn.functional.swiglu(x) return fused_bias_act( x, bias=self.bias, From 63dcefa35a1ed4c94daef10e34c0c78028348a41 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 15:58:54 +0800 Subject: [PATCH 06/14] 21B test ok --- .../layers/moe/fused_moe_deepgemm_backend.py | 30 +++++-------------- .../layers/moe/fused_moe_triton_backend.py | 10 +++++-- fastdeploy/model_executor/layers/utils.py | 22 +++----------- 3 files changed, 19 insertions(+), 43 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 8cac63277c5..dc088cf9eb9 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -155,15 +155,10 @@ def apply_ep_prefill( topk_ids_hookfunc(topk_ids=topk_idx) # 2. 
Dynamic compute blockwise quantization scales - # x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - # x, self.quant_config.weight_block_size[0] - # ) x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( - x, - using_pow2_scale=False, - output_scale_transpose=False + x, using_pow2_scale=False, output_scale_transpose=False ) - x_scale_tensor = x_scale_tensor[:x.shape[0]] + x_scale_tensor = x_scale_tensor[: x.shape[0]] event = deep_ep.Buffer.capture() let_another_thread_run() @@ -231,15 +226,10 @@ def apply_ep_prefill( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None) # down_proj - # ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - # ffn_out, self.quant_config.weight_block_size[0] - # ) - # ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous().transpose([1, 0]) ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( - ffn_out, - using_pow2_scale=False + ffn_out, using_pow2_scale=False ) - paddle_scale_fp8 = paddle_scale_fp8.T[:ffn_in_x.shape[0]] + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), @@ -391,14 +381,12 @@ def apply_tp( tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) - # recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( x, using_pow2_scale=False, output_scale_transpose=False, ) - recv_x_scale = recv_x_scale[:recv_x.shape[0]] - + recv_x_scale = recv_x_scale[: recv_x.shape[0]] ( permute_input, @@ -439,12 +427,10 @@ def apply_tp( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0] + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, using_pow2_scale=False ) - - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index da705357c12..922729d91bd 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -1525,7 +1525,10 @@ def apply( from .triton_moe_kernels import fused_moe_kernel_paddle - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, self.quant_config.weight_block_size[0]) + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=False + ) + x_scale = x_scale[: x.shape[0]] fused_moe_kernel_paddle[grid]( x_q, @@ -1578,9 +1581,10 @@ def apply( ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]), ) - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( - intermediate_cache2, self.quant_config.weight_block_size[0] + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False ) + x_scale = x_scale[: 
x_q.shape[0]] fused_moe_kernel_paddle[grid]( x_q, diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index da71448674d..63b4fe7ffc4 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -237,27 +237,13 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten dtype=x.dtype, ) x_padded[:m, :n] = x - # x_view = paddle.view( - # x_padded, - # (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), - # ) + from paddle.incubate.nn.functional.fp8 import fp8_quant_blockwise + x_q, scale = fp8_quant_blockwise( - x_padded, - quant_method="128x128", - input_transpose=False, - output_scale_transpose=False, - using_pow2_scale=False - ) + x_padded, quant_method="128x128", input_transpose=False, output_scale_transpose=False, using_pow2_scale=False + ) return x_q[:m, :n].contiguous(), scale - # x_abs = paddle.abs(x_view).astype(paddle.float32) - # x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) - # x_amax = paddle.clip(x_amax, min=1e-4) - # x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) - - # return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - # paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) - # ) def per_token_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]: From 01bed15491beea9c990671ded8c25b250c56430b Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Tue, 13 Jan 2026 16:09:11 +0800 Subject: [PATCH 07/14] pre-commit --- fastdeploy/model_executor/layers/quantization/block_wise_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index 24abf681791..c13b429095a 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -18,7 +18,6 @@ import paddle -import fastdeploy from fastdeploy import envs from fastdeploy.model_executor.layers.linear import ( MergedColumnParallelLinear, From 7c1bd99018cc7938ff4d2c514d3fea5bcc38ecf5 Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Thu, 15 Jan 2026 17:21:12 +0800 Subject: [PATCH 08/14] fix ut error --- tests/layers/test_activation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/layers/test_activation.py b/tests/layers/test_activation.py index 70f011d3964..b564c267520 100644 --- a/tests/layers/test_activation.py +++ b/tests/layers/test_activation.py @@ -84,8 +84,11 @@ def test_forward_cuda(self, mock_fused, mock_platform): layer = SiluAndMul(fd_config) x = paddle.ones([2, 2]) out = layer.forward(x) - self.assertTrue((out.numpy() == 1).all()) - mock_fused.assert_called_once() + if layer.bias is None and layer.quant_scale == -1: + self.assertTrue((out.numpy() == 0.73105854).all()) + else: + self.assertTrue((out.numpy() == 1).all()) + mock_fused.assert_called_once() # Test forward computation on GCU platform @patch( From 3261a53c18b9e8c144fd8646d0e6bfe4d1f3ecfc Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Fri, 23 Jan 2026 16:40:58 +0800 Subject: [PATCH 09/14] fix block quant --- fastdeploy/model_executor/layers/utils.py | 44 ++++++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index 63b4fe7ffc4..fd55846aba7 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ 
-220,6 +220,35 @@ def group_wise_int4_weight_quantize(weight: paddle.Tensor, group_size: int = 128 return quant_weight.astype(paddle.int8), weight_scale +def scale_wrapper(x_amax: paddle.Tensor, eps: float = 0.0) -> paddle.Tensor: + """ + Paddle implementation of CUDA ScaleWrapper logic. + Args: + x_amax (paddle.Tensor): amax tensor (float32 recommended) + eps (float): epsilon to avoid division by zero + Returns: + paddle.Tensor: scale tensor, same shape as x_amax + """ + fp8_max = 448.0 + float_max = paddle.finfo(paddle.float32).max + amax_mod = paddle.maximum( + x_amax, + paddle.full_like(x_amax, eps), + ) + scale = fp8_max / amax_mod + scale = paddle.where( + amax_mod == 0, + paddle.ones_like(scale), + scale, + ) + scale = paddle.where( + paddle.isinf(scale), + paddle.full_like(scale, float_max), + scale, + ) + return scale + + def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Tensor, Tensor]: """ Only used in deep_gemm block wise quant weight. @@ -237,13 +266,18 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten dtype=x.dtype, ) x_padded[:m, :n] = x + x_view = paddle.view( + x_padded, + (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), + ) - from paddle.incubate.nn.functional.fp8 import fp8_quant_blockwise - - x_q, scale = fp8_quant_blockwise( - x_padded, quant_method="128x128", input_transpose=False, output_scale_transpose=False, using_pow2_scale=False + x_abs = paddle.abs(x_view).astype(paddle.float32) + x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) + scale = scale_wrapper(x_amax) + x_scaled = (x_view * scale).astype(paddle.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + paddle.view(1.0 / scale, (x_view.shape[0], x_view.shape[2])) ) - return x_q[:m, :n].contiguous(), scale def per_token_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]: From 2e20113d528152b88c1052d31f8dbdff6d4c2145 Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Fri, 23 Jan 2026 19:26:16 +0800 Subject: [PATCH 10/14] edit whl --- .github/workflows/_accuracy_test.yml | 2 +- .github/workflows/_base_test.yml | 2 +- .github/workflows/_build_linux.yml | 2 +- .github/workflows/_logprob_test_linux.yml | 2 +- .github/workflows/_pre_ce_test.yml | 2 +- .github/workflows/_stable_test.yml | 2 +- .github/workflows/_unit_test_coverage.yml | 2 +- scripts/run_xpu_ci_pytest.sh | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml index 832d6f266a4..7f969fa7397 100644 --- a/.github/workflows/_accuracy_test.yml +++ b/.github/workflows/_accuracy_test.yml @@ -161,7 +161,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml index 56808b9fd49..377714b05bc 100644 --- a/.github/workflows/_base_test.yml +++ b/.github/workflows/_base_test.yml @@ -186,7 +186,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus 
'"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_build_linux.yml b/.github/workflows/_build_linux.yml index 32c689d1ada..d6bb583d2d0 100644 --- a/.github/workflows/_build_linux.yml +++ b/.github/workflows/_build_linux.yml @@ -173,7 +173,7 @@ jobs: elif [[ "${PADDLEVERSION}" != "" ]];then python -m pip install paddlepaddle-gpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ else - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl fi pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index fd71f57c350..cf29580a05e 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -156,7 +156,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml index 768d73b1c85..72720a6a682 100644 --- a/.github/workflows/_pre_ce_test.yml +++ b/.github/workflows/_pre_ce_test.yml @@ -172,7 +172,7 @@ jobs: --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl python -m pip install ${fd_wheel_url} bash scripts/run_pre_ce.sh ' diff --git a/.github/workflows/_stable_test.yml b/.github/workflows/_stable_test.yml index 175f6288d76..4fd8739c41a 100644 --- a/.github/workflows/_stable_test.yml +++ b/.github/workflows/_stable_test.yml @@ -164,7 +164,7 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip 
config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 92843fd15bf..146df7e0fa7 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -203,7 +203,7 @@ jobs: git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt - python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ + python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install -r scripts/unittest_requirement.txt diff --git a/scripts/run_xpu_ci_pytest.sh b/scripts/run_xpu_ci_pytest.sh index f5053988eb3..f57e096f71e 100644 --- a/scripts/run_xpu_ci_pytest.sh +++ b/scripts/run_xpu_ci_pytest.sh @@ -74,7 +74,7 @@ python -m pip uninstall fastdeploy-xpu -y # 安装PaddlePaddle Release分支安装对应的paddle echo "安装release分支PaddlePaddle..." -python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Xpu-P800-SelfBuiltPypiUse/latest/paddlepaddle_xpu-0.0.0-cp310-cp310-linux_x86_64.whl # ============ 编译项目 ============ From b9e180fbd0874f825d25f4f4d19e00a8cb02628e Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Sat, 24 Jan 2026 11:13:46 +0800 Subject: [PATCH 11/14] e baseline --- .github/workflows/_logprob_test_linux.yml | 2 +- tests/ce/server/test_logprobs.py | 12 ++++++------ tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 4 ++-- tests/e2e/test_EB_VL_Lite_serving.py | 4 ++-- tests/model_loader/test_torch_model.py | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index cf29580a05e..3af3b7a6052 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -185,7 +185,7 @@ jobs: -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}" set +e rm -rf ./baseline_output - cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output + cp -r baseline_24/ERNIE-4.5-0.3B-Paddle ./baseline_output LOGPROB_EXIT_CODE=0 python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$? 
echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env diff --git a/tests/ce/server/test_logprobs.py b/tests/ce/server/test_logprobs.py index 83ca89486c9..3674b3a6b96 100644 --- a/tests/ce/server/test_logprobs.py +++ b/tests/ce/server/test_logprobs.py @@ -25,10 +25,10 @@ def test_unstream_with_logprobs(): # 校验返回内容与概率信息 assert resp_json["choices"][0]["message"]["content"] == "牛顿的" assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], "top_logprobs": None, } @@ -102,10 +102,10 @@ def test_stream_with_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], } @@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953 + assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.006811376195400953, + "logprob": -0.0068125599063932896, "bytes": [231, 137, 155, 233, 161, 191], } diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index e51018f201e..8d4d157de8d 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev") + base_file = os.path.join(base_path, " ernie-4_5-vl-base-tp2-24") else: - base_file = "ernie-4_5-vl-base-tp2-dev" + base_file = " ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index f93f355a754..7783b844148 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -204,9 +204,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24") else: - base_file = "ernie-4_5-vl-base-tp2-dev" + base_file = "ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() diff --git a/tests/model_loader/test_torch_model.py 
b/tests/model_loader/test_torch_model.py index bc8252a4427..0170bef1da6 100644 --- a/tests/model_loader/test_torch_model.py +++ b/tests/model_loader/test_torch_model.py @@ -140,7 +140,7 @@ def test_model_against_baseline( # Get baseline suffix from config model_config = hugging_face_model_param_map.get(model_name_or_path, {}) - baseline_suffix = model_config.get("baseline_suffix", "tp2") + baseline_suffix = model_config.get("baseline_suffix", "tp2-24") baseline_filename = f"{model_name_or_path}-{baseline_suffix}" if base_path: From 0ef0a00edabed6e54a9a9fed9f192bdb13ded21f Mon Sep 17 00:00:00 2001 From: fanxiangyu Date: Sat, 24 Jan 2026 11:20:20 +0800 Subject: [PATCH 12/14] e baseline 2 --- tests/e2e/utils/rollout_routing_replay_test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py index 499bbbed688..e5ecd4ca33f 100644 --- a/tests/e2e/utils/rollout_routing_replay_test_utils.py +++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py @@ -151,9 +151,9 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/" model_path = os.getenv("MODEL_PATH") if model_path: - baseline_path = os.path.join(model_path, f"R3_BaseLine/routing_replay_output_baseline_{model_name}") + baseline_path = os.path.join(model_path, f"R3_BaseLine_24/routing_replay_output_baseline_{model_name}") else: - baseline_path = f"./R3_BaseLine/routing_replay_output_baseline_{model_name}" + baseline_path = f"./R3_BaseLine_24/routing_replay_output_baseline_{model_name}" stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream") From 7f8c74ebe41f84acefccfce17ea5ae3fd43fd75c Mon Sep 17 00:00:00 2001 From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Date: Sat, 24 Jan 2026 12:47:10 +0800 Subject: [PATCH 13/14] chore: remove extra whitespace in test_EB_VL_Lite_serving.py --- tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 8d4d157de8d..acbf7872e66 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, " ernie-4_5-vl-base-tp2-24") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24") else: - base_file = " ernie-4_5-vl-base-tp2-24" + base_file = "ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() From b72882c23963fa6cc13d3a25d0a2980a3c891846 Mon Sep 17 00:00:00 2001 From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Date: Sat, 24 Jan 2026 14:45:01 +0800 Subject: [PATCH 14/14] chore: keep paddlepaddle-xpu unchanged --- scripts/run_xpu_ci_pytest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_xpu_ci_pytest.sh b/scripts/run_xpu_ci_pytest.sh index f57e096f71e..f5053988eb3 100644 --- a/scripts/run_xpu_ci_pytest.sh +++ b/scripts/run_xpu_ci_pytest.sh @@ -74,7 +74,7 @@ python -m pip uninstall fastdeploy-xpu -y # 安装PaddlePaddle Release分支安装对应的paddle echo "安装release分支PaddlePaddle..." 
-python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Xpu-P800-SelfBuiltPypiUse/latest/paddlepaddle_xpu-0.0.0-cp310-cp310-linux_x86_64.whl +python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ # ============ 编译项目 ============
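A few notes on the series follow.

The recurring pattern in patches 01 through 06, calling paddle.incubate.nn.functional.fp8_quant_blockwise and then slicing the returned scale with scale[: x.shape[0]] (or scale.T[: x.shape[0]] when the scale comes back transposed), is there because the fused op pads the token dimension of the scale tensor. Below is a minimal pure-Paddle sketch of the 1x128 activation quantization being migrated to; ref_per_token_quant is a hypothetical helper written for this note, and only the fp8_quant_blockwise keyword arguments that already appear in the patches are assumed.

import paddle

def ref_per_token_quant(x: paddle.Tensor, block_size: int = 128):
    """Reference 1x128 blockwise FP8 quantization (a sketch, not the fused op)."""
    m, n = x.shape
    assert n % block_size == 0, "hidden size must be a multiple of the block size"
    # Per-block amax over groups of 128 along the hidden dimension.
    x_view = x.reshape([m, n // block_size, block_size]).astype(paddle.float32)
    x_amax = paddle.amax(paddle.abs(x_view), axis=-1, keepdim=True)
    # Zero-amax blocks quantize with scale 1.0, mirroring scale_wrapper in patch 09.
    scale = paddle.where(x_amax == 0, paddle.ones_like(x_amax), 448.0 / x_amax)
    x_q = (x_view * scale).reshape([m, n]).astype(paddle.float8_e4m3fn)
    # Dequant scale of shape [m, n // block_size]; the fused op returns this
    # padded (and optionally transposed), hence the slicing seen in the patches.
    return x_q, (1.0 / scale).reshape([m, n // block_size])

x = paddle.randn([4, 256]).astype(paddle.bfloat16)
x_q_ref, scale_ref = ref_per_token_quant(x)
x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
    x, using_pow2_scale=False, output_scale_transpose=False
)
x_scale = x_scale[: x.shape[0]]  # drop the padded rows
# x_q and x_scale should match the reference up to FP8 rounding.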
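The 0.73105854 baseline asserted in patch 08 follows directly from swiglu's definition: the input is chunked in half along the last axis and the output is silu(x1) * x2, so an all-ones [2, 2] input gives silu(1) = 1 / (1 + e^-1) ≈ 0.7310586 in every output element. A quick check against the incubate op these patches already use:

import paddle

x = paddle.ones([2, 2])
out = paddle.incubate.nn.functional.swiglu(x)  # shape [2, 1]
print(out)  # ≈ 0.7310586 everywhere, matching the updated unit test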
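Finally, the scale_wrapper added in patch 09 reproduces the CUDA ScaleWrapper edge cases rather than the old clip(x_amax, min=1e-4) behaviour: a zero amax maps to scale 1.0 instead of an infinite scale, and an infinite scale arising from a subnormal amax is clamped to float32 max. A small worked example with the default eps of 0.0, assuming the patch is applied so the helper is importable from its new location:

import paddle

from fastdeploy.model_executor.layers.utils import scale_wrapper

amax = paddle.to_tensor([0.0, 448.0, 896.0], dtype=paddle.float32)
# 448 / amax, with the zero case forced to 1.0: expect [1.0, 1.0, 0.5].
print(scale_wrapper(amax))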