diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml
index 832d6f266a4..7f969fa7397 100644
--- a/.github/workflows/_accuracy_test.yml
+++ b/.github/workflows/_accuracy_test.yml
@@ -161,7 +161,7 @@ jobs:
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
           --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl

           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml
index 56808b9fd49..377714b05bc 100644
--- a/.github/workflows/_base_test.yml
+++ b/.github/workflows/_base_test.yml
@@ -186,7 +186,7 @@ jobs:
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
           --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl

           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
diff --git a/.github/workflows/_build_linux.yml b/.github/workflows/_build_linux.yml
index 32c689d1ada..d6bb583d2d0 100644
--- a/.github/workflows/_build_linux.yml
+++ b/.github/workflows/_build_linux.yml
@@ -173,7 +173,7 @@ jobs:
           elif [[ "${PADDLEVERSION}" != "" ]];then
             python -m pip install paddlepaddle-gpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
           else
-            python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+            python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
           fi

           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml
index fd71f57c350..3af3b7a6052 100644
--- a/.github/workflows/_logprob_test_linux.yml
+++ b/.github/workflows/_logprob_test_linux.yml
@@ -156,7 +156,7 @@ jobs:
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
           --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-          python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+          python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl

           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
@@ -185,7 +185,7 @@ jobs:
             -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}"
           set +e
           rm -rf ./baseline_output
-          cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output
+          cp -r baseline_24/ERNIE-4.5-0.3B-Paddle ./baseline_output
           LOGPROB_EXIT_CODE=0
          python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$?
          echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env
diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml
index 768d73b1c85..72720a6a682 100644
--- a/.github/workflows/_pre_ce_test.yml
+++ b/.github/workflows/_pre_ce_test.yml
@@ -172,7 +172,7 @@ jobs:
          --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
          git config --global --add safe.directory /workspace/FastDeploy
          cd FastDeploy
-         python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+         python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
          python -m pip install ${fd_wheel_url}
          bash scripts/run_pre_ce.sh
          '
diff --git a/.github/workflows/_stable_test.yml b/.github/workflows/_stable_test.yml
index 175f6288d76..4fd8739c41a 100644
--- a/.github/workflows/_stable_test.yml
+++ b/.github/workflows/_stable_test.yml
@@ -164,7 +164,7 @@ jobs:
          -v "${CACHE_DIR}/ConfigDir:/root/.config" \
          -e TZ="Asia/Shanghai" \
          --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
-         python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+         python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl

          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml
index 92843fd15bf..146df7e0fa7 100644
--- a/.github/workflows/_unit_test_coverage.yml
+++ b/.github/workflows/_unit_test_coverage.yml
@@ -203,7 +203,7 @@ jobs:
          git config --global --add safe.directory /workspace/FastDeploy
          cd FastDeploy
          git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
-         python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+         python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
          pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          python -m pip install -r scripts/unittest_requirement.txt
diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py
index 35aa40b77e0..9b038bae62b 100644
--- a/fastdeploy/model_executor/layers/activation.py
+++ b/fastdeploy/model_executor/layers/activation.py
@@ -120,6 +120,8 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
         Returns:
             Tensor: Output tensor.
         """
+        if self.bias is None and self.quant_scale == -1:
+            return paddle.nn.functional.swiglu(x)
         return fused_bias_act(
             x,
             bias=self.bias,
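For context on the activation change above: when the layer has no bias and no activation quant scale (`quant_scale == -1`), the fused kernel reduces to plain SwiGLU, so returning `paddle.nn.functional.swiglu(x)` directly skips the `fused_bias_act` dispatch. The sketch below is illustrative only; it assumes the single-tensor form of `swiglu` splits the last dimension in half, which is what the updated expectation in `tests/layers/test_activation.py` implies, and it shows where the `0.73105854` constant comes from: `silu(1) * 1 = sigmoid(1) ≈ 0.7310586`.

# Illustrative sketch -- not part of the patch above.
import paddle
import paddle.nn.functional as F

x = paddle.ones([2, 2])                    # same input as the unit test
gate, up = paddle.split(x, 2, axis=-1)     # assumed split: first half gates the second
reference = F.silu(gate) * up              # silu(1) * 1 = sigmoid(1) ~ 0.7310586
fast_path = F.swiglu(x)                    # path taken when bias is None and quant_scale == -1
assert paddle.allclose(fast_path, reference)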
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
index 881f9a22c4d..dc088cf9eb9 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
@@ -155,9 +155,10 @@ def apply_ep_prefill(
         topk_ids_hookfunc(topk_ids=topk_idx)

         # 2. Dynamic compute blockwise quantization scales
-        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
-            x, self.quant_config.weight_block_size[0]
+        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=False
         )
+        x_scale_tensor = x_scale_tensor[: x.shape[0]]

         event = deep_ep.Buffer.capture()
         let_another_thread_run()
@@ -225,11 +226,10 @@
         ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None)

         # down_proj
-        ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
-            ffn_out, self.quant_config.weight_block_size[0]
+        ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            ffn_out, using_pow2_scale=False
         )
-        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous()
-        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
+        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]]

         ffn_out = paddle.empty(
             (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]),
@@ -381,7 +381,12 @@ def apply_tp(
         tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)

-        recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128)
+        recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x,
+            using_pow2_scale=False,
+            output_scale_transpose=False,
+        )
+        recv_x_scale = recv_x_scale[: recv_x.shape[0]]

         (
             permute_input,
@@ -422,12 +427,10 @@
         ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out)

         # down_proj
-        ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
-            ffn_out, self.quant_config.weight_block_size[0]
+        ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            ffn_out, using_pow2_scale=False
         )
-
-        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous()
-        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
+        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]]

         ffn_out = paddle.empty(
             (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]),
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index da705357c12..922729d91bd 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -1525,7 +1525,10 @@ def apply(
         from .triton_moe_kernels import fused_moe_kernel_paddle

-        x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, self.quant_config.weight_block_size[0])
+        x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=False
+        )
+        x_scale = x_scale[: x.shape[0]]

         fused_moe_kernel_paddle[grid](
             x_q,
@@ -1578,9 +1581,10 @@
             ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),
         )

-        x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
-            intermediate_cache2, self.quant_config.weight_block_size[0]
+        x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False
         )
+        x_scale = x_scale[: x_q.shape[0]]

         fused_moe_kernel_paddle[grid](
             x_q,
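The MoE backend changes above all follow one pattern: the custom `per_token_quant` op is replaced by `paddle.incubate.nn.functional.fp8_quant_blockwise`, and the returned scales are then trimmed (`[: x.shape[0]]`) or transposed (`.T`) into the layout the deep_gemm and Triton kernels expect. As a rough mental model only, the sketch below shows what per-token, 128-wide blockwise FP8 quantization computes; the block width of 128 and the e4m3 range of ±448 are taken from this diff, while the padding and scale layout of the Paddle op itself are assumptions, not documented behavior.

# Illustrative reference only -- an assumed model of blockwise FP8 quantization,
# not the implementation of paddle.incubate.nn.functional.fp8_quant_blockwise.
import paddle

def blockwise_fp8_ref(x, block=128):
    m, n = x.shape
    blocks = x.astype("float32").reshape([m, n // block, block])
    amax = paddle.amax(paddle.abs(blocks), axis=-1, keepdim=True)
    scale = 448.0 / paddle.clip(amax, min=1e-12)       # one scale per (token, 128-wide block)
    x_q = (blocks * scale).reshape([m, n]).astype(paddle.float8_e4m3fn)
    return x_q, (1.0 / scale).squeeze(-1)              # dequant scales, shape [m, n // block]

x = paddle.randn([4, 256])
x_q, x_scale = blockwise_fp8_ref(x)                    # x is roughly x_q dequantized with x_scale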
diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index 59daa238480..c13b429095a 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -18,7 +18,6 @@

 import paddle

-import fastdeploy
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -226,9 +225,10 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal
         layer.weight_scale_inv.set_value(weight_scale)

     def apply(self, layer, x):
-        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
-            x, self.quant_config.weight_block_size[0]
+        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=True
         )
+        x_scale_tensor = x_scale_tensor.T
         linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)

         from fastdeploy.model_executor.ops.gpu import deep_gemm
diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py
index c18f062457e..fd55846aba7 100644
--- a/fastdeploy/model_executor/layers/utils.py
+++ b/fastdeploy/model_executor/layers/utils.py
@@ -220,6 +220,35 @@ def group_wise_int4_weight_quantize(weight: paddle.Tensor, group_size: int = 128
     return quant_weight.astype(paddle.int8), weight_scale


+def scale_wrapper(x_amax: paddle.Tensor, eps: float = 0.0) -> paddle.Tensor:
+    """
+    Paddle implementation of CUDA ScaleWrapper logic.
+    Args:
+        x_amax (paddle.Tensor): amax tensor (float32 recommended)
+        eps (float): epsilon to avoid division by zero
+    Returns:
+        paddle.Tensor: scale tensor, same shape as x_amax
+    """
+    fp8_max = 448.0
+    float_max = paddle.finfo(paddle.float32).max
+    amax_mod = paddle.maximum(
+        x_amax,
+        paddle.full_like(x_amax, eps),
+    )
+    scale = fp8_max / amax_mod
+    scale = paddle.where(
+        amax_mod == 0,
+        paddle.ones_like(scale),
+        scale,
+    )
+    scale = paddle.where(
+        paddle.isinf(scale),
+        paddle.full_like(scale, float_max),
+        scale,
+    )
+    return scale
+
+
 def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Tensor, Tensor]:
     """
     Only used in deep_gemm block wise quant weight.
@@ -244,11 +273,10 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten
     x_abs = paddle.abs(x_view).astype(paddle.float32)
     x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True)
-    x_amax = paddle.clip(x_amax, min=1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn)
-
+    scale = scale_wrapper(x_amax)
+    x_scaled = (x_view * scale).astype(paddle.float8_e4m3fn)
     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
-        paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2]))
+        paddle.view(1.0 / scale, (x_view.shape[0], x_view.shape[2]))
     )
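The `utils.py` hunks above replace the old `clip(amax, min=1e-4)` guard with `scale_wrapper`, which maps an all-zero block to a scale of exactly 1.0 (instead of 448 / 1e-4) and lets `per_block_cast_to_fp8` return the inverse scale directly. A small usage sketch, assuming the two functions from this diff are in scope:

# Illustrative usage only, relying on scale_wrapper / per_block_cast_to_fp8 as defined above.
import paddle

amax = paddle.to_tensor([0.0, 2.0, 448.0], dtype="float32")
print(scale_wrapper(amax).numpy())            # [1.0, 224.0, 1.0]: zero-amax blocks keep scale 1.0

w = paddle.randn([256, 256], dtype="float32")
w_fp8, w_scale_inv = per_block_cast_to_fp8(w)             # 128x128 blocks -> scales of shape [2, 2]
block = w_fp8[:128, :128].astype("float32") * w_scale_inv[0, 0]
max_err = float(paddle.abs(block - w[:128, :128]).max())  # bounded e4m3 quantization error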
diff --git a/tests/ce/server/test_logprobs.py b/tests/ce/server/test_logprobs.py
index 83ca89486c9..3674b3a6b96 100644
--- a/tests/ce/server/test_logprobs.py
+++ b/tests/ce/server/test_logprobs.py
@@ -25,10 +25,10 @@ def test_unstream_with_logprobs():
     # Verify the returned content and probability info
     assert resp_json["choices"][0]["message"]["content"] == "牛顿的"
     assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
-    assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
+    assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
     assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
         "token": "牛顿",
-        "logprob": -0.031025361269712448,
+        "logprob": -0.03113006055355072,
         "bytes": [231, 137, 155, 233, 161, 191],
         "top_logprobs": None,
     }
@@ -102,10 +102,10 @@ def test_stream_with_logprobs():

     # Verify the probability fields
     assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
-    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
+    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
         "token": "牛顿",
-        "logprob": -0.031025361269712448,
+        "logprob": -0.03113006055355072,
         "bytes": [231, 137, 155, 233, 161, 191],
     }
@@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs():

     # Verify the probability fields
     assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
-    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953
+    assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896
     assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
         "token": "牛顿",
-        "logprob": -0.006811376195400953,
+        "logprob": -0.0068125599063932896,
         "bytes": [231, 137, 155, 233, 161, 191],
     }
diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index e51018f201e..acbf7872e66 100644
--- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):

     # base result
     base_path = os.getenv("MODEL_PATH")
     if base_path:
-        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev")
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24")
     else:
-        base_file = "ernie-4_5-vl-base-tp2-dev"
+        base_file = "ernie-4_5-vl-base-tp2-24"
     with open(base_file, "r") as f:
         content2 = f.read()
diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py
index f93f355a754..7783b844148 100644
--- a/tests/e2e/test_EB_VL_Lite_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -204,9 +204,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):

     # base result
     base_path = os.getenv("MODEL_PATH")
     if base_path:
-        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev")
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24")
     else:
-        base_file = "ernie-4_5-vl-base-tp2-dev"
+        base_file = "ernie-4_5-vl-base-tp2-24"
     with open(base_file, "r") as f:
         content2 = f.read()
diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py
index 499bbbed688..e5ecd4ca33f 100644
--- a/tests/e2e/utils/rollout_routing_replay_test_utils.py
+++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py
@@ -151,9 +151,9 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode
     cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/"
     model_path = os.getenv("MODEL_PATH")
     if model_path:
-        baseline_path = os.path.join(model_path, f"R3_BaseLine/routing_replay_output_baseline_{model_name}")
+        baseline_path = os.path.join(model_path, f"R3_BaseLine_24/routing_replay_output_baseline_{model_name}")
     else:
-        baseline_path = f"./R3_BaseLine/routing_replay_output_baseline_{model_name}"
+        baseline_path = f"./R3_BaseLine_24/routing_replay_output_baseline_{model_name}"
     stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream")
     nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")
diff --git a/tests/layers/test_activation.py b/tests/layers/test_activation.py
index 70f011d3964..b564c267520 100644
--- a/tests/layers/test_activation.py
+++ b/tests/layers/test_activation.py
@@ -84,8 +84,11 @@ def test_forward_cuda(self, mock_fused, mock_platform):
         layer = SiluAndMul(fd_config)
         x = paddle.ones([2, 2])
         out = layer.forward(x)
-        self.assertTrue((out.numpy() == 1).all())
-        mock_fused.assert_called_once()
+        if layer.bias is None and layer.quant_scale == -1:
+            self.assertTrue((out.numpy() == 0.73105854).all())
+        else:
+            self.assertTrue((out.numpy() == 1).all())
+            mock_fused.assert_called_once()

     # Test forward computation on GCU platform
     @patch(
diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py
index bc8252a4427..0170bef1da6 100644
--- a/tests/model_loader/test_torch_model.py
+++ b/tests/model_loader/test_torch_model.py
@@ -140,7 +140,7 @@ def test_model_against_baseline(
     # Get baseline suffix from config
     model_config = hugging_face_model_param_map.get(model_name_or_path, {})
-    baseline_suffix = model_config.get("baseline_suffix", "tp2")
+    baseline_suffix = model_config.get("baseline_suffix", "tp2-24")
     baseline_filename = f"{model_name_or_path}-{baseline_suffix}"

     if base_path: