diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml index 2dfd68aa9d9..40c9d5bd98b 100644 --- a/.github/workflows/_accuracy_test.yml +++ b/.github/workflows/_accuracy_test.yml @@ -39,29 +39,47 @@ jobs: docker_image: ${{ inputs.DOCKER_IMAGE }} fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} run: | - set -x - REPO="https://github.com/${{ github.repository }}.git" - FULL_REPO="${{ github.repository }}" - REPO_NAME="${FULL_REPO##*/}" - BASE_BRANCH="${{ github.base_ref }}" - docker pull ${docker_image} - # Clean the repository directory before starting - docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ - -e "REPO_NAME=${REPO_NAME}" \ - ${docker_image} /bin/bash -c ' - if [ -d ${REPO_NAME} ]; then - echo "Directory ${REPO_NAME} exists, removing it..." - rm -rf ${REPO_NAME}* + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + docker pull ${docker_image} + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." + rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break fi - ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz - rm -rf FastDeploy.tar.gz - cd FastDeploy - git config --global user.name "FastDeployCI" - git config --global user.email "fastdeploy_ci@example.com" - git log -n 3 --oneline + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 + fi + ' + + wget -q --no-proxy ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline - name: Run FastDeploy Base Tests shell: bash @@ -143,14 +161,16 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install ${fastdeploy_wheel_url} python -m pip install pytest - wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64 + wget --no-proxy https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64 chmod +x ./llm-deploy-linux-amd64 ./llm-deploy-linux-amd64 -python python3.10 \ -model_name ERNIE-4.5-0.3B-Paddle \ diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml index 60e650e9184..a6ae5cf07c9 100644 --- 
a/.github/workflows/_base_test.yml +++ b/.github/workflows/_base_test.yml @@ -39,29 +39,72 @@ jobs: docker_image: ${{ inputs.DOCKER_IMAGE }} fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} run: | - set -x - REPO="https://github.com/${{ github.repository }}.git" - FULL_REPO="${{ github.repository }}" - REPO_NAME="${FULL_REPO##*/}" - BASE_BRANCH="${{ github.base_ref }}" - docker pull ${docker_image} - # Clean the repository directory before starting - docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ - -e "REPO_NAME=${REPO_NAME}" \ - ${docker_image} /bin/bash -c ' - if [ -d ${REPO_NAME} ]; then - echo "Directory ${REPO_NAME} exists, removing it..." - rm -rf ${REPO_NAME}* + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + docker pull ${docker_image} + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." + rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break + fi + + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 fi ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz - rm -rf FastDeploy.tar.gz - cd FastDeploy - git config --global user.name "FastDeployCI" - git config --global user.email "fastdeploy_ci@example.com" - git log -n 3 --oneline + # Download with retry and validation + MAX_RETRIES=3 + RETRY_COUNT=0 + while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do + if wget -q --no-proxy ${fd_archive_url} && [ -f FastDeploy.tar.gz ] && [ -s FastDeploy.tar.gz ]; then + echo "Download successful, file size: $(stat -c%s FastDeploy.tar.gz) bytes" + break + else + RETRY_COUNT=$((RETRY_COUNT + 1)) + echo "Download failed or file is empty, retry $RETRY_COUNT/$MAX_RETRIES..." + rm -f FastDeploy.tar.gz + sleep 2 + fi + done + + if [ ! -f FastDeploy.tar.gz ] || [ ! -s FastDeploy.tar.gz ]; then + echo "ERROR: Failed to download FastDeploy.tar.gz after $MAX_RETRIES attempts" + exit 1 + fi + + # Verify tar.gz integrity before extraction + if ! 
tar -tzf FastDeploy.tar.gz > /dev/null 2>&1; then + echo "ERROR: FastDeploy.tar.gz is corrupted or incomplete" + exit 1 + fi + + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline - name: Run FastDeploy Base Tests shell: bash @@ -143,7 +186,9 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -209,7 +254,7 @@ jobs: export TEMPLATE=TOKEN_NORMAL curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \ -H "Content-Type: application/json" \ - -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-VL-28B-A3B-Thinking\", \"--reasoning-parser\": \"ernie-45-vl-thinking\", \"--tool-call-parser\": \"ernie-45-vl-thinking\", \"--tensor-parallel-size\": 1, \"--quantization\": \"wint4\", \"--max-model-len\": 131072, \"--max-num-seqs\": 32}" + -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-VL-28B-A3B-Thinking\", \"--reasoning-parser\": \"ernie-45-vl-thinking\", \"--tool-call-parser\": \"ernie-45-vl-thinking\", \"--tensor-parallel-size\": 1, \"--quantization\": \"wint4\", \"--max-model-len\": 131072, \"--max-num-seqs\": 32, \"--no-enable-prefix-caching\": true}" check_service 90 python -m pytest -sv test_prompt_ids.py || TEST_EXIT_CODE=1 diff --git a/.github/workflows/_build_linux.yml b/.github/workflows/_build_linux.yml index d723e4b2ac5..0ead47d1ce8 100644 --- a/.github/workflows/_build_linux.yml +++ b/.github/workflows/_build_linux.yml @@ -76,9 +76,27 @@ jobs: docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ -e "REPO_NAME=${REPO_NAME}" \ ${docker_image} /bin/bash -c ' - if [ -d ${REPO_NAME} ]; then - echo "Directory ${REPO_NAME} exists, removing it..." - rm -rf ${REPO_NAME}* + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." + rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! 
ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break + fi + + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 fi ' @@ -155,7 +173,9 @@ jobs: elif [[ "${PADDLEVERSION}" != "" ]];then python -m pip install paddlepaddle-gpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ else - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl fi pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_build_linux_rl.yml b/.github/workflows/_build_linux_rl.yml new file mode 100644 index 00000000000..38f052473e8 --- /dev/null +++ b/.github/workflows/_build_linux_rl.yml @@ -0,0 +1,204 @@ +name: FastDeploy Linux GPU Build Task +description: "FastDeploy packages build and upload" + +on: + workflow_call: + inputs: + DOCKER_IMAGE: + description: "Build Images" + required: true + type: string + default: "iregistry.baidu-int.com/tiangexiao/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-rc2" + FASTDEPLOY_ARCHIVE_URL: + description: "URL of the compressed FastDeploy code archive." + required: true + type: string + COMPILE_ARCH: + description: "Build GPU Archs" + required: true + type: string + default: "80,90" + WITH_NIGHTLY_BUILD: + description: "Enable nightly build mode (e.g. add date suffix to version)" + required: false + type: string + default: "OFF" + FD_VERSION: + description: "FastDeploy Package Version" + required: false + type: string + default: "" + PADDLEVERSION: + description: "Paddle Version Build Use" + required: false + type: string + default: "" + PADDLE_WHL_URL: + description: "Paddle Wheel Package URL" + required: false + type: string + default: "" + UPLOAD: + description: "Upload Package" + required: false + type: string + default: "ON" + CACHE_DIR: + description: "Cache Dir Use" + required: false + type: string + default: "" + outputs: + wheel_path_rl: + description: "Output path of the generated wheel" + value: ${{ jobs.fd-build-rl.outputs.wheel_path_rl }} +jobs: + fd-build-rl: + runs-on: [self-hosted, GPU-Build] + timeout-minutes: 360 + outputs: + wheel_path_rl: ${{ steps.set_output.outputs.wheel_path_rl }} + steps: + - name: Code Prepare + shell: bash + env: + docker_image: ${{ inputs.DOCKER_IMAGE }} + fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} + IS_PR: ${{ github.event_name == 'pull_request' }} + run: | + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." 
+ rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break + fi + + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 + fi + ' + + wget -q --no-proxy ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline + - name: FastDeploy Build + shell: bash + env: + docker_image: ${{ inputs.DOCKER_IMAGE }} + compile_arch: ${{ inputs.COMPILE_ARCH }} + fd_version: ${{ inputs.FD_VERSION }} + CACHE_DIR: ${{ inputs.CACHE_DIR }} + BRANCH_REF: ${{ github.ref_name }} + PADDLEVERSION: ${{ inputs.PADDLEVERSION }} + PADDLE_WHL_URL: ${{ inputs.PADDLE_WHL_URL }} + WITH_NIGHTLY_BUILD: ${{ inputs.WITH_NIGHTLY_BUILD }} + run: | + set -x + runner_name="${{ runner.name }}" + CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}') + gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,) + + IFS='/' read -ra parts <<< "${GITHUB_WORKSPACE}" + len=${#parts[@]} + CCACHE_DEFAULT_DIR="/$(IFS=/; echo "${parts[*]:1:$((len-5))}")" + echo "$CCACHE_DEFAULT_DIR" + + CACHE_DIR="${CACHE_DIR:-$CCACHE_DEFAULT_DIR}" + echo "CACHE_DIR is set to ${CACHE_DIR}" + if [ ! -f "${CACHE_DIR}/gitconfig" ]; then + touch "${CACHE_DIR}/gitconfig" + fi + PARENT_DIR=$(dirname "$WORKSPACE") + echo "PARENT_DIR:$PARENT_DIR" + docker run --rm --net=host \ + --cap-add=SYS_PTRACE --privileged --shm-size=64G \ + -v $(pwd):/workspace -w /workspace \ + -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ + -v "${CACHE_DIR}/.cache_rl:/root/.cache" \ + -v "${CACHE_DIR}/.ccache_rl:/root/.ccache" \ + -v "${CACHE_DIR}/ConfigDir:/root/.config" \ + -e TZ="Asia/Shanghai" \ + -e "COMPILE_ARCH=${compile_arch}" \ + -e "FD_VERSION=${fd_version}" \ + -e "WITH_NIGHTLY_BUILD=${WITH_NIGHTLY_BUILD}" \ + -e "PADDLEVERSION=${PADDLEVERSION}" \ + -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \ + -e "BRANCH_REF=${BRANCH_REF}" \ + -e "CCACHE_MAXSIZE=50G" \ + --gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c ' + if [[ -n "${FD_VERSION}" ]]; then + export FASTDEPLOY_VERSION=${FD_VERSION} + echo "Custom FastDeploy version: ${FASTDEPLOY_VERSION}" + fi + + git config --global --add safe.directory /workspace/FastDeploy + chown -R $(whoami) /workspace/FastDeploy + cd FastDeploy + + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Paddle-RL-Compile/release/3.3/latest/paddlepaddle_gpu-3.3.0.dev-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu* + + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + python -m pip install wheel + # 编译RDMA + export FD_ENABLE_RDMA_COMPILE=1 + bash build.sh 1 python false [${COMPILE_ARCH}] + ls ./dist/*.whl + ' + - name: Package Upload + id: set_output + env: + compile_arch: ${{ inputs.COMPILE_ARCH }} + run: | + set -x + commit_id=${{ github.sha }} + branch_name=${{ github.ref_name }} + target_path=paddle-github-action/BRANCH/FastDeploy_RL/${branch_name}/${commit_id}/SM${compile_arch//,/_} + + wget -q --no-proxy --no-check-certificate 
https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py + push_file=$(realpath bos_tools.py) + python --version + python -m pip install bce-python-sdk==0.9.29 + cd FastDeploy/dist/ + matches=($(ls fastdeploy*.whl)) + if [ ${#matches[@]} -ne 1 ]; then + echo "Error: Found ${#matches[@]} matching files, expected exactly 1" + exit 1 + fi + fd_wheel_name=${matches[0]} + echo "Found: $fd_wheel_name" + tree -L 3 + python ${push_file} fastdeploy*.whl ${target_path} + target_path_stripped="${target_path#paddle-github-action/}" + WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name} + echo "wheel_path_rl=${WHEEL_PATH}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index 8ca3c7d7f64..066acd79c95 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -40,21 +40,43 @@ jobs: docker_image: ${{ inputs.DOCKER_IMAGE }} paddletest_archive_url: ${{ inputs.PADDLETEST_ARCHIVE_URL }} run: | - docker pull ${docker_image} - # Clean the repository directory before starting - docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ - -e "REPO_NAME=${REPO_NAME}" \ - -e "BASE_BRANCH=${BASE_BRANCH}" \ - ${docker_image} /bin/bash -c ' - rm -rf /workspace/* - ' - wget -q --no-proxy ${paddletest_archive_url} - tar -xf PaddleTest.tar.gz - rm -rf PaddleTest.tar.gz - cd PaddleTest - git config --global user.name "FastDeployCI" - git config --global user.email "fastdeploy_ci@example.com" - git log -n 3 --oneline + docker pull ${docker_image} + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + -e "BASE_BRANCH=${BASE_BRANCH}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove /workspace/* ..." + rm -rf /workspace/* || true + sleep 2 + + # Check if anything matching /workspace/* still exists + if ! 
ls /workspace/* >/dev/null 2>&1; then + echo "All /workspace/* removed successfully" + break + fi + + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls /workspace/* >/dev/null 2>&1; then + echo "ERROR: Failed to clean /workspace/* after multiple attempts" + ls -ld /workspace/* + exit 1 + fi + ' + wget -q --no-proxy ${paddletest_archive_url} + tar -xf PaddleTest.tar.gz + rm -rf PaddleTest.tar.gz + cd PaddleTest + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline + - name: logprob test shell: bash env: @@ -134,13 +156,15 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install ${fastdeploy_wheel_url} - wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64 + wget --no-proxy https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64 chmod +x ./llm-deploy-linux-amd64 ./llm-deploy-linux-amd64 -python python3.10 \ -model_name ERNIE-4.5-0.3B-Paddle \ @@ -163,7 +187,7 @@ jobs: -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}" set +e rm -rf ./baseline_output - cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output + cp -r baseline_24/ERNIE-4.5-0.3B-Paddle ./baseline_output LOGPROB_EXIT_CODE=0 python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$? echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml index 4db32567796..b78a5862507 100644 --- a/.github/workflows/_pre_ce_test.yml +++ b/.github/workflows/_pre_ce_test.yml @@ -41,29 +41,47 @@ jobs: docker_image: ${{ inputs.DOCKER_IMAGE }} fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} run: | - set -x - REPO="https://github.com/${{ github.repository }}.git" - FULL_REPO="${{ github.repository }}" - REPO_NAME="${FULL_REPO##*/}" - BASE_BRANCH="${{ github.base_ref }}" - docker pull ${docker_image} - # Clean the repository directory before starting - docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ - -e "REPO_NAME=${REPO_NAME}" \ - ${docker_image} /bin/bash -c ' - if [ -d ${REPO_NAME} ]; then - echo "Directory ${REPO_NAME} exists, removing it..." 
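The _base_test.yml change earlier in this diff additionally validates the archive it downloads: a bounded wget retry, a non-empty-file check, and a tar -tzf probe before extraction. A minimal standalone sketch of that pattern follows; the helper name is illustrative, and the URL is the workflow's existing fd_archive_url variable.

    fetch_and_verify() {
        # fetch_and_verify <url> <file> - download with retries and refuse to
        # proceed unless the result is a non-empty, readable tar.gz archive.
        local url="$1" out="$2" tries=0 max_tries=3
        while [ "$tries" -lt "$max_tries" ]; do
            if wget -q --no-proxy -O "$out" "$url" && [ -s "$out" ]; then
                echo "Download successful, file size: $(stat -c%s "$out") bytes"
                if tar -tzf "$out" >/dev/null 2>&1; then
                    return 0
                fi
                echo "$out is corrupted or incomplete, retrying..."
            fi
            rm -f "$out"
            tries=$((tries + 1))
            sleep 2
        done
        echo "ERROR: failed to download a valid $out after ${max_tries} attempts" >&2
        return 1
    }

    fetch_and_verify "${fd_archive_url}" FastDeploy.tar.gz && tar -xf FastDeploy.tar.gz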
- rm -rf ${REPO_NAME}* + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + docker pull ${docker_image} + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." + rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break fi - ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz - rm -rf FastDeploy.tar.gz - cd FastDeploy - git config --global user.name "FastDeployCI" - git config --global user.email "fastdeploy_ci@example.com" - git log -n 3 --oneline + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 + fi + ' + + wget -q --no-proxy ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline - name: Run CI unittest env: @@ -154,7 +172,11 @@ jobs: --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install ${fd_wheel_url} bash scripts/run_pre_ce.sh ' diff --git a/.github/workflows/_stable_test.yml b/.github/workflows/_stable_test.yml index f39b90767e8..c4857f4b474 100644 --- a/.github/workflows/_stable_test.yml +++ b/.github/workflows/_stable_test.yml @@ -39,29 +39,47 @@ jobs: docker_image: ${{ inputs.DOCKER_IMAGE }} fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} run: | - set -x - REPO="https://github.com/${{ github.repository }}.git" - FULL_REPO="${{ github.repository }}" - REPO_NAME="${FULL_REPO##*/}" - BASE_BRANCH="${{ github.base_ref }}" - docker pull ${docker_image} - # Clean the repository directory before starting - docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ - -e "REPO_NAME=${REPO_NAME}" \ - ${docker_image} /bin/bash -c ' - if [ -d ${REPO_NAME} ]; then - echo "Directory ${REPO_NAME} exists, removing it..." 
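The pip change that recurs through these jobs (including the one just above) swaps the nightly-index install of paddlepaddle-gpu for a wget of a fixed-name wheel followed by an install from the local file, so pip's cache cannot hand back a stale paddlepaddle_gpu-0.0.0 build. A condensed sketch of the idea; the URL is abbreviated here, and --force-reinstall is an optional extra safeguard, not something this PR adds.

    # Fetch the wheel fresh from BOS on every run; installing the local file skips
    # index resolution, so the constant 0.0.0 version number cannot be satisfied
    # from a previously cached copy.
    WHEEL="paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl"
    wget -q --no-proxy "https://paddle-qa.bj.bcebos.com/paddle-pipeline/.../latest/${WHEEL}"
    python -m pip install --force-reinstall "./${WHEEL}"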
- rm -rf ${REPO_NAME}* + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + docker pull ${docker_image} + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." + rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break fi - ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz - rm -rf FastDeploy.tar.gz - cd FastDeploy - git config --global user.name "FastDeployCI" - git config --global user.email "fastdeploy_ci@example.com" - git log -n 3 --oneline + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 + fi + ' + + wget -q --no-proxy ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline - name: Run FastDeploy Stable Tests shell: bash @@ -146,7 +164,9 @@ jobs: -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 3559cc66505..ce4ad4fd79a 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -55,29 +55,48 @@ jobs: docker_image: ${{ inputs.DOCKER_IMAGE }} fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} run: | - set -x - REPO="https://github.com/${{ github.repository }}.git" - FULL_REPO="${{ github.repository }}" - REPO_NAME="${FULL_REPO##*/}" - BASE_BRANCH="${{ github.base_ref }}" - docker pull ${docker_image} - # Clean the repository directory before starting - docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ - -e "REPO_NAME=${REPO_NAME}" \ - ${docker_image} /bin/bash -c ' - if [ -d ${REPO_NAME} ]; then - echo "Directory ${REPO_NAME} exists, removing it..." 
- rm -rf ${REPO_NAME}* + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + docker pull ${docker_image} + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." + rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break fi - ' - - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz - rm -rf FastDeploy.tar.gz - cd FastDeploy - git config --global user.name "FastDeployCI" - git config --global user.email "fastdeploy_ci@example.com" - git log -n 3 --oneline + + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 + fi + ' + + wget -q --no-proxy ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline + - name: Run FastDeploy Unit Tests and Coverage shell: bash env: @@ -184,7 +203,9 @@ jobs: git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # Avoid using pip cache to ensure the wheel is updated to the latest version + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install -r scripts/unittest_requirement.txt diff --git a/.github/workflows/ce_job.yml b/.github/workflows/ce_job.yml index ccb728018e8..92ebe43245b 100644 --- a/.github/workflows/ce_job.yml +++ b/.github/workflows/ce_job.yml @@ -156,6 +156,19 @@ jobs: FD_VERSION: 0.0.0 PADDLE_WHL_URL: ${{ needs.ce_job_pre_check.outputs.compile_use_paddle_whl_url }} + build_sm8090_rl: + name: BUILD_SM8090_RL + needs: [clone, ce_job_pre_check] + if: ${{ needs.ce_job_pre_check.outputs.sm8090_match == 'true' }} + uses: ./.github/workflows/_build_linux_rl.yml + with: + DOCKER_IMAGE: iregistry.baidu-int.com/new_rl_infra/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-v2.4.0-rc1 + FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} + COMPILE_ARCH: "80,90" + WITH_NIGHTLY_BUILD: OFF + FD_VERSION: 0.0.0 + PADDLE_WHL_URL: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Paddle-RL-Compile-test/release/3.3/cbf3469113cd76b7d5f4cba7b8d7d5f55d9e9911/7/paddlepaddle_gpu-3.3.0.dev-cp310-cp310-linux_x86_64.whl + build_sm8689: name: BUILD_SM8689 needs: [clone, ce_job_pre_check] @@ -219,6 +232,52 @@ jobs: echo "commit wheel url is ${WHEEL_PATH}" echo "latest wheel url is ${WHEEL_PATH_LATEST}" + ce_upload_sm8090_rl: 
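The Package Upload step earlier in _build_linux_rl.yml and the ce_upload_sm8090_rl job that begins here share one small convention: push the wheel to BOS with bos_tools.py, derive the public URL by stripping the bucket prefix from the target path, and (in the build job) export it through $GITHUB_OUTPUT. A compact sketch of that derivation, with an illustrative helper name and the bucket/host names taken from the diff:

    publish_wheel_url() {
        # publish_wheel_url <wheel-file> <bos-target-path>
        # Upload via bos_tools.py, then emit the public download URL as a step output.
        local wheel="$1" target_path="$2"
        python bos_tools.py "${wheel}" "${target_path}"
        # Objects under paddle-github-action/<rest> are served from
        # https://paddle-github-action.bj.bcebos.com/<rest>
        local stripped="${target_path#paddle-github-action/}"
        echo "wheel_path_rl=https://paddle-github-action.bj.bcebos.com/${stripped}/$(basename "${wheel}")" >> "$GITHUB_OUTPUT"
    }

    publish_wheel_url fastdeploy*.whl "paddle-github-action/BRANCH/FastDeploy_RL/${branch_name}/${commit_id}/SM${compile_arch//,/_}"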
+ environment: CodeSync + name: CE_UPLOAD_RL + needs: build_sm8090_rl + runs-on: ubuntu-latest + env: + AK: ${{ secrets.BOS_AK }} + SK: ${{ secrets.BOS_SK }} + FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090_rl.outputs.wheel_path_rl }} + COMPILE_ARCH: "80,90" + steps: + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Wheel Info Show and Upload + run: | + echo "The wheel is located at: ${{ needs.build_sm8090_rl.outputs.wheel_path_rl }}" + wget -q --no-check-certificate ${{ needs.build_sm8090_rl.outputs.wheel_path_rl }} + filename=$(basename ${{ needs.build_sm8090_rl.outputs.wheel_path_rl }}) + + commit_id=${{ github.sha }} + branch_name=${{ github.ref_name }} + + wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py + push_file=$(realpath bos_tools.py) + python -m pip install bce-python-sdk==0.9.29 + + target_paths=( + "paddle-qa/paddle-pipeline/FastDeploy_ActionCE_RL/cu129/SM_8090/${branch_name}/${commit_id}" + "paddle-qa/paddle-pipeline/FastDeploy_ActionCE_RL/cu129/SM_8090/${branch_name}/latest" + ) + + for target_path in "${target_paths[@]}"; do + echo "Uploading ${filename} to ${target_path}" + python "${push_file}" "${filename}" "${target_path}" + done + + base_prefix="paddle-qa/" + commit_path_stripped="${target_paths[0]#${base_prefix}}" + latest_path_stripped="${target_paths[1]#${base_prefix}}" + WHEEL_PATH="https://paddle-qa.bj.bcebos.com/${commit_path_stripped}/${filename}" + WHEEL_PATH_LATEST="https://paddle-qa.bj.bcebos.com/${latest_path_stripped}/${filename}" + + echo "commit wheel url is ${WHEEL_PATH}" + echo "latest wheel url is ${WHEEL_PATH_LATEST}" + ce_upload_sm8689: environment: CodeSync name: CE_UPLOAD diff --git a/.github/workflows/ci_image_update.yml b/.github/workflows/ci_image_update.yml index 7e6544e6364..da1256e204c 100644 --- a/.github/workflows/ci_image_update.yml +++ b/.github/workflows/ci_image_update.yml @@ -137,10 +137,19 @@ jobs: FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }} MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData" + stable_test: + name: Run Stable Tests + needs: [clone,build_sm8090,ci_image_build] + uses: ./.github/workflows/_stable_test.yml + with: + DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }} + FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} + FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }} + MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData" publish_pre_check: name: Publish Docker Images Pre Check - needs: [ci_image_build, unittest_coverage,logprob_test,pre_ce_test,base_test] + needs: [ci_image_build,unittest_coverage,logprob_test,pre_ce_test,base_test,stable_test] runs-on: [self-hosted, Docker-Build] steps: - name: Images Uploading diff --git a/.github/workflows/pr_build_and_test.yml b/.github/workflows/pr_build_and_test.yml index 5abd24966d8..da1630e07cf 100644 --- a/.github/workflows/pr_build_and_test.yml +++ b/.github/workflows/pr_build_and_test.yml @@ -75,3 +75,13 @@ jobs: FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData" + + stable_test: + name: Run Stable Tests + needs: [clone,build] + uses: ./.github/workflows/_stable_test.yml + with: + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate + FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} + 
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} + MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData" diff --git a/.github/workflows/publish_job.yml b/.github/workflows/publish_job.yml index 45b1331c725..a301a79309e 100644 --- a/.github/workflows/publish_job.yml +++ b/.github/workflows/publish_job.yml @@ -287,11 +287,11 @@ jobs: shell: bash env: docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate - fd_archive_url: ${{ env.FASTDEPLOY_ARCHIVE_URL }} run: | set -x FULL_REPO="${{ github.repository }}" REPO_NAME="${FULL_REPO##*/}" + fd_archive_url="${{ needs.clone.outputs.repo_archive_url }}" # Clean the repository directory before starting docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ @@ -310,6 +310,8 @@ jobs: git config --global user.email "fastdeploy_ci@example.com" git log -n 3 --oneline + cd ./dockerfiles + PRODUCT_NAME=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:${FD_VERSION} docker build --no-cache -t ${PRODUCT_NAME} -f Dockerfile.gpu . \ --network host \ diff --git a/custom_ops/gpu_ops/append_attn/append_attention_func.cuh b/custom_ops/gpu_ops/append_attn/append_attention_func.cuh index 9f0b9eba1f3..cc0deb82bf7 100644 --- a/custom_ops/gpu_ops/append_attn/append_attention_func.cuh +++ b/custom_ops/gpu_ops/append_attn/append_attention_func.cuh @@ -2414,7 +2414,8 @@ template + bool ENABLE_PREFILL = true, + bool DECODE_ONLY = true> __global__ void merge_multi_chunks_v2_kernel( const T* __restrict__ multi_out, // [token_num, num_chunks, num_heads, // head_dim] @@ -2451,21 +2452,23 @@ __global__ void merge_multi_chunks_v2_kernel( if (bid == -1) { continue; } + const uint32_t local_seq_id = qid - cu_seqlens_q[bid]; const int seq_len_q = seq_lens_q[bid]; if (seq_len_q == 0) continue; int seq_len_kv = seq_lens_kv[bid]; if (ENABLE_PREFILL) { seq_len_kv += seq_len_q; if (seq_len_kv == 0) continue; - - const int seq_len_enc = seq_lens_encoder[bid]; - if (seq_len_enc <= 0) { - continue; - } } else { if (seq_len_kv == 0) continue; seq_len_kv += seq_len_q; } + if constexpr (DECODE_ONLY) { + const int seq_len_enc = seq_lens_encoder[bid]; + if (seq_len_enc > 0) { + continue; + } + } const int num_chunks_this_seq = div_up(seq_len_kv, chunk_size); if (num_chunks_this_seq <= 1) { continue; @@ -2494,14 +2497,32 @@ __global__ void merge_multi_chunks_v2_kernel( } #pragma unroll 2 for (int i = ty; i < num_chunks_this_seq; i += bdy) { - uint32_t offset = (qid * num_chunks + i) * num_heads + hid; + uint32_t offset; + if (ENABLE_PREFILL) { + offset = (qid * num_chunks + i) * num_heads + hid; + } else { + offset = + ((bid * speculate_max_draft_token_num + local_seq_id) * num_chunks + + i) * + num_heads + + hid; + } float m_prev = m; float d_prev = d; const float m_now = multi_m[offset]; const float d_now = multi_d[offset]; m = max(m_prev, m_now); - offset = (qid * num_chunks * num_heads + i * num_heads + hid) * head_dim + - vid * vec_size; + if (ENABLE_PREFILL) { + offset = + (qid * num_chunks * num_heads + i * num_heads + hid) * head_dim + + vid * vec_size; + } else { + offset = ((bid * speculate_max_draft_token_num + local_seq_id) * + num_chunks * num_heads + + i * num_heads + hid) * + head_dim + + vid * vec_size; + } Load(&multi_out[offset], &load_vec); const float scale1 = __expf(m_prev - m), scale2 = __expf(m_now - m); const T scale1_T = static_cast(scale1), diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh 
b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh index 5c141d7e334..5d1daed91e6 100644 --- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh @@ -849,6 +849,315 @@ __global__ void append_decode_cache_T_quant_neox_rope_kernel( #endif } +template +__global__ void append_decode_cache_T_int8_neox_rope_kernel( + const T* __restrict__ quant_qkv, // [bsz, num_heads + 2 * kv_num_heads, + // head_size] + uint8_t* __restrict__ key_cache, // [num_blocks, kv_num_heads, + // block_size, head_size // 2] + uint8_t* __restrict__ value_cache, // [num_blocks, kv_num_heads, + // block_size, head_size // 2] + T* __restrict__ qkv_out, + const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq] + const int* __restrict__ cu_seqlens_q, + const int* __restrict__ seq_lens, // [bsz] + const int* __restrict__ seq_lens_encoder, // [bsz] + const float* __restrict__ cos_emb, + const float* __restrict__ sin_emb, + T* __restrict__ cache_k_scale, + T* __restrict__ cache_v_scale, + const int max_seq_len, + const int max_blocks_per_seq, + const int num_heads, + const int block_size, + const float max_bound, + const float min_bound, + const int kv_num_heads, + const bool rope_3d, + const float rms_norm_eps) { + static_assert(HeadDim == 128, "just support HeadDim be 128 now!"); + static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!"); + constexpr int NUM_WARPS = 4; + const int tid = threadIdx.x; + const int wid = tid / 32; + const int lane_id = tid % 32; + const int bid = blockIdx.x, head_idx = blockIdx.y * NUM_WARPS + wid; + int q_head_idx, k_head_idx, v_idx; + const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim; + constexpr int half_head_size = HeadDim / 2; + const int start_token_idx = cu_seqlens_q[bid]; + if (seq_lens_encoder[bid] > 0) return; + const int write_seq_id = seq_lens[bid]; + if (write_seq_id == 0) return; + const int* block_table_now = nullptr; + + block_table_now = block_tables + bid * max_blocks_per_seq; + const int block_idx = __ldg(&block_table_now[write_seq_id / block_size]); + const int block_offset = write_seq_id % block_size; + + float thread_m2 = 0.0f; + float warp_m2 = 0.0f; +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif + if (head_idx < num_heads) { + // q + using LoadT = AlignedVector; + using LoadBiasT = AlignedVector; + constexpr int HalfVecSize = VecSize / 2; + using LoadEmbT = AlignedVector; + + LoadT src_vec; + LoadT src_vec_right; + LoadBiasT out_vec; + LoadBiasT out_vec_right; + LoadEmbT cos_emb_vec; + LoadEmbT sin_emb_vec; + const T* qkv_now = quant_qkv + start_token_idx * hidden_size; + T* qkv_out_now = qkv_out + start_token_idx * hidden_size; +#pragma unroll + for (uint32_t head_bias = lane_id * VecSize; head_bias < half_head_size; + head_bias += 32 * VecSize) { + const int bias_idx = head_idx * HeadDim + head_bias; + Load(&qkv_now[bias_idx], &src_vec); + Load(&qkv_now[bias_idx + half_head_size], &src_vec_right); + // q rope + const uint32_t emb_idx = write_seq_id * HeadDim + head_bias; + const uint32_t new_emb_idx = + rope_3d ? 
emb_idx + bid * max_seq_len * HeadDim : emb_idx; + Load(&cos_emb[new_emb_idx], &cos_emb_vec); + Load(&sin_emb[new_emb_idx], &sin_emb_vec); +#pragma unroll + for (int i = 0; i < VecSize; i++) { + // dequant + add_bias + rope + float input_left = static_cast(src_vec[i]); + float input_right = static_cast(src_vec_right[i]); + + const float cos_tmp = cos_emb_vec[i]; + const float sin_tmp = sin_emb_vec[i]; + float tmp1 = input_left * cos_tmp - input_right * sin_tmp; + float tmp2 = input_right * cos_tmp + input_left * sin_tmp; + thread_m2 += tmp1 * tmp1 + tmp2 * tmp2; + out_vec[i] = static_cast(tmp1); + out_vec_right[i] = static_cast(tmp2); + } + Store(out_vec, &qkv_out_now[bias_idx]); + Store(out_vec_right, &qkv_out_now[bias_idx + half_head_size]); + } + } else if (head_idx < num_heads + 2 * kv_num_heads) { + // k + constexpr int KV_VEC_SIZE = 16 / sizeof(uint8_t); // 16 + using LoadPadKVT = AlignedVector; + const uint32_t kv_head_idx = (head_idx - num_heads) % kv_num_heads; + if (block_offset == 0) { + // pad zero for this kv_head_idx for this block + LoadPadKVT pad_cache_vec; + *(reinterpret_cast(pad_cache_vec.val)) = make_uint4(0, 0, 0, 0); + if (head_idx < num_heads + kv_num_heads) { + constexpr int num_vecs_per_head_dim = HeadDim / KV_VEC_SIZE; + constexpr int num_token_each_time = 32 / num_vecs_per_head_dim; + const uint32_t tgt_idx = + (block_idx * kv_num_heads + kv_head_idx) * block_size * HeadDim + + lane_id % num_vecs_per_head_dim * KV_VEC_SIZE; + for (int block_i = lane_id / num_vecs_per_head_dim; + block_i < block_size; + block_i += num_token_each_time) { + Store(pad_cache_vec, + &key_cache[tgt_idx + block_i * HeadDim]); + } + } else { + const int num_vecs_per_head_dim = block_size / KV_VEC_SIZE; + const int num_token_each_time = 32 / num_vecs_per_head_dim; + const uint32_t tgt_idx = + (block_idx * kv_num_heads + kv_head_idx) * HeadDim * block_size + + lane_id % num_vecs_per_head_dim * KV_VEC_SIZE; + for (int block_i = lane_id / num_vecs_per_head_dim; block_i < HeadDim; + block_i += num_token_each_time) { + Store( + pad_cache_vec, &value_cache[tgt_idx + block_i * block_size]); + } + } + __syncwarp(); + } + + constexpr int K_VEC_SIZE = 4; + constexpr int HALF_K_VEC_SIZE = 2; + using LoadKVResT = AlignedVector; + using LoadKVT = AlignedVector; + using LoadT = AlignedVector; + using LoadBiasT = AlignedVector; + using LoadEmbT = AlignedVector; + LoadKVResT cache_vec; + LoadT src_vec1, src_vec1_right, src_vec2, src_vec2_right; + LoadBiasT out_vec1, out_vec2; + LoadEmbT cos_emb_vec1, cos_emb_vec2; + LoadEmbT sin_emb_vec1, sin_emb_vec2; + + const T* qkv_now = quant_qkv + start_token_idx * hidden_size; + const int head_bias = lane_id / 4 * 16 + lane_id % 4 * 2; + const int bias_idx = head_idx * HeadDim + head_bias; + Load(&qkv_now[bias_idx], &src_vec1); + Load(&qkv_now[bias_idx + 8], &src_vec2); + T scale = T(1.0f); + const int k_head_idx = head_idx - num_heads; + const int v_head_idx = head_idx - num_heads - kv_num_heads; + if (head_idx < num_heads + kv_num_heads) { + Load( + &qkv_now[head_idx * HeadDim + (head_bias + half_head_size) % HeadDim], + &src_vec1_right); + Load( + &qkv_now[head_idx * HeadDim + + (head_bias + 8 + half_head_size) % HeadDim], + &src_vec2_right); + + const uint32_t emb_idx = write_seq_id * HeadDim + head_bias; + const uint32_t new_emb_idx = + rope_3d ? 
emb_idx + bid * max_seq_len * HeadDim : emb_idx; + Load(&cos_emb[new_emb_idx], &cos_emb_vec1); + Load(&cos_emb[new_emb_idx + 8], &cos_emb_vec2); + Load(&sin_emb[new_emb_idx], &sin_emb_vec1); + Load(&sin_emb[new_emb_idx + 8], &sin_emb_vec2); + } + + if (head_idx < num_heads + kv_num_heads) { + float input_left = static_cast(src_vec1[0]); + float input_right = static_cast(src_vec1_right[0]); + float cos_tmp = cos_emb_vec1[0]; + float sin_tmp = sin_emb_vec1[0]; + float tmp1 = 0; + if (head_bias < half_head_size) { + tmp1 = input_left * cos_tmp - input_right * sin_tmp; + } else { + tmp1 = input_left * cos_tmp + input_right * sin_tmp; + } + out_vec1[0] = static_cast(tmp1); + input_left = static_cast(src_vec1[1]); + input_right = static_cast(src_vec1_right[1]); + cos_tmp = cos_emb_vec1[1]; + sin_tmp = sin_emb_vec1[1]; + if (head_bias < half_head_size) { + tmp1 = input_left * cos_tmp - input_right * sin_tmp; + } else { + tmp1 = input_left * cos_tmp + input_right * sin_tmp; + } + out_vec1[1] = static_cast(tmp1); + } else { + out_vec1[0] = src_vec1[0]; + out_vec1[1] = src_vec1[1]; + } + + // rope + if (head_idx < num_heads + kv_num_heads) { + float input_left = static_cast(src_vec2[0]); + float input_right = static_cast(src_vec2_right[0]); + float cos_tmp = cos_emb_vec2[0]; + float sin_tmp = sin_emb_vec2[0]; + float tmp1 = 0; + if (head_bias < half_head_size) { + tmp1 = input_left * cos_tmp - input_right * sin_tmp; + } else { + tmp1 = input_left * cos_tmp + input_right * sin_tmp; + } + out_vec2[0] = static_cast(tmp1); + input_left = static_cast(src_vec2[1]); + input_right = static_cast(src_vec2_right[1]); + cos_tmp = cos_emb_vec2[1]; + sin_tmp = sin_emb_vec2[1]; + if (head_bias < half_head_size) { + tmp1 = input_left * cos_tmp - input_right * sin_tmp; + } else { + tmp1 = input_left * cos_tmp + input_right * sin_tmp; + } + out_vec2[1] = static_cast(tmp1); + } else { + out_vec2[0] = src_vec2[0]; + out_vec2[1] = src_vec2[1]; + } + if constexpr (IsDynamic) { + // reduce max, 1 head per warp + T local_max = -INFINITY; +#pragma unroll + for (int i = 0; i < HALF_K_VEC_SIZE; i++) { + local_max = __hmax(local_max, __habs(out_vec1[i])); + local_max = __hmax(local_max, __habs(out_vec2[i])); + } +#pragma unroll + for (int m_offset = 16; m_offset > 0; m_offset /= 2) { + local_max = + __hmax(local_max, __shfl_xor_sync(0xffffffff, local_max, m_offset)); + } + scale = __hdiv(448, local_max); + + int cache_offset; + if (head_idx < num_heads) { + cache_offset = 0; + } else if (head_idx < num_heads + 2 * kv_num_heads) { + cache_offset = block_idx * kv_num_heads * block_size + + (head_idx - num_heads) % kv_num_heads * block_size + + block_offset; + } + T* cache_k_scale_now = cache_k_scale + cache_offset; + T* cache_v_scale_now = cache_v_scale + cache_offset; + if (lane_id == 0) { + if (head_idx < num_heads + kv_num_heads) { + cache_k_scale_now[0] = __hdiv(1, scale); + } else { + cache_v_scale_now[0] = __hdiv(1, scale); + } + } + } else { + if (head_idx < num_heads + kv_num_heads) { + scale = __ldg(&cache_k_scale[kv_head_idx]); + } else { + scale = __ldg(&cache_v_scale[kv_head_idx]); + } + } + +#pragma unroll + for (uint32_t i = 0; i < HALF_K_VEC_SIZE; i++) { + cache_vec[i] = QuantToC8( + scale, out_vec1[i], max_bound, min_bound); + cache_vec[i + HALF_K_VEC_SIZE] = QuantToC8( + scale, out_vec2[i], max_bound, min_bound); + } + if (head_idx < num_heads + kv_num_heads) { + const int start_block_16 = + block_offset / 16 * 16 + block_offset % 8 + lane_id / 4 % 2 * 8; + const uint32_t tgt_cache_idx = + block_idx * 
kv_num_heads * block_size * HeadDim + + kv_head_idx * block_size * HeadDim + start_block_16 * HeadDim + + lane_id / 4 / 2 * 32 + (block_offset % 16) / 8 * 16 + lane_id % 4 * 4; + Store(cache_vec, &key_cache[tgt_cache_idx]); + } else { + const uint32_t base_tgt_cache_idx = + block_idx * kv_num_heads * HeadDim * block_size + + kv_head_idx * HeadDim * block_size + + (lane_id / 4 * 16 + lane_id % 4 * 2) * block_size + + block_offset / 16 % 2 * 8 * block_size + block_offset / 16 / 2 * 32; + const uint32_t tgt_cache_idx1 = base_tgt_cache_idx + + block_offset % 8 / 2 * 4 // per 4 + + block_offset % 16 / 8 * 2 // per 2 + + block_offset % 2; // per 1 + const uint32_t tgt_cache_idx2 = tgt_cache_idx1 + block_size; + const uint32_t tgt_cache_idx3 = tgt_cache_idx1 + 16; + const uint32_t tgt_cache_idx4 = tgt_cache_idx3 + block_size; + value_cache[tgt_cache_idx1] = cache_vec[0]; + value_cache[tgt_cache_idx2] = cache_vec[1]; + value_cache[tgt_cache_idx3] = cache_vec[2]; + value_cache[tgt_cache_idx4] = cache_vec[3]; + } + } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaTriggerProgrammaticLaunchCompletion(); +#endif +} + template , - grids, - num_warps * 32, - 0, - stream, - reinterpret_cast(qkv_ptr), - key_cache_out->data(), - value_cache_out->data(), - reinterpret_cast(qkv_out->data()), - block_tables.data(), - cu_seqlens_q.data(), - seq_lens.data(), - seq_lens_encoder.data(), - cos_emb, - sin_emb, - const_cast(reinterpret_cast( - cache_k_scale.get().data())), - const_cast(reinterpret_cast( - (cache_v_scale.get().data()))), - nullptr, - nullptr, - max_seq_len, - max_blocks_per_seq, - num_heads, - block_size, - 127.0f, - -127.0f, - kv_num_heads, - rope_3d, - rms_norm_eps); + if (use_neox_rotary_style) { + launchWithPdlWhenEnabled( + append_decode_cache_T_int8_neox_rope_kernel, + grids, + num_warps * 32, + 0, + stream, + reinterpret_cast(qkv_ptr), + key_cache_out->data(), + value_cache_out->data(), + reinterpret_cast(qkv_out->data()), + block_tables.data(), + cu_seqlens_q.data(), + seq_lens.data(), + seq_lens_encoder.data(), + cos_emb, + sin_emb, + const_cast(reinterpret_cast( + cache_k_scale.get().data())), + const_cast(reinterpret_cast( + (cache_v_scale.get().data()))), + max_seq_len, + max_blocks_per_seq, + num_heads, + block_size, + 127.0f, + -127.0f, + kv_num_heads, + rope_3d, + rms_norm_eps); + } else { + launchWithPdlWhenEnabled( + append_decode_cache_int8_rope_qk_norm_kernel, + grids, + num_warps * 32, + 0, + stream, + reinterpret_cast(qkv_ptr), + key_cache_out->data(), + value_cache_out->data(), + reinterpret_cast(qkv_out->data()), + block_tables.data(), + cu_seqlens_q.data(), + seq_lens.data(), + seq_lens_encoder.data(), + cos_emb, + sin_emb, + const_cast(reinterpret_cast( + cache_k_scale.get().data())), + const_cast(reinterpret_cast( + (cache_v_scale.get().data()))), + nullptr, + nullptr, + max_seq_len, + max_blocks_per_seq, + num_heads, + block_size, + 127.0f, + -127.0f, + kv_num_heads, + rope_3d, + rms_norm_eps); + } } else if (cache_quant_type_str == "cache_int4_zp") { append_decode_cache_int4_rope( reinterpret_cast(qkv_ptr), diff --git a/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu b/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu index 53b7e626651..7221ccf4720 100644 --- a/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu +++ b/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu @@ -17,6 +17,7 @@ #include "paddle/extension.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/memory/memcpy.h" +#include "qwen3_rope.h" #include 
"remote_cache_kv_ipc.h" template @@ -28,7 +29,7 @@ __global__ void GQAVariableLengthRotarySplitKernel( const float *k_norm_weight, const int *batch_id_per_token, const int *cu_seqlens_q, - const int *seq_lens, + const int *seq_lens_encoder, const int *seq_lens_decoder, const int *cu_seqlens_k, T *qkv_out, @@ -38,8 +39,8 @@ __global__ void GQAVariableLengthRotarySplitKernel( const int64_t elem_cnt, const int q_num_head, const int kv_num_head, - const int seq_len, - const int last_dim, + const int max_model_len, + const int head_dim, const bool rope_3d, const float rms_norm_eps) { using LoadT = AlignedVector; @@ -53,30 +54,33 @@ __global__ void GQAVariableLengthRotarySplitKernel( LoadFloat q_norm_vec, k_norm_vec; int64_t global_warp_idx = blockDim.y * blockIdx.x + threadIdx.y; int64_t all_warp_num = gridDim.x * blockDim.y; - const int half_lastdim = last_dim / 2; + const int half_headdim = head_dim / 2; const int offset = - (q_num_head + kv_num_head * 2) * last_dim; // for all q,k,v - const int all_head_num = elem_cnt / last_dim; + (q_num_head + kv_num_head * 2) * head_dim; // for all q,k,v + const int all_head_num = elem_cnt / head_dim; for (int gloabl_hi = global_warp_idx; gloabl_hi < all_head_num; gloabl_hi += all_warp_num) { int64_t linear_index = - gloabl_hi * last_dim + threadIdx.x * VecSize; // 全局index + gloabl_hi * head_dim + threadIdx.x * VecSize; // 全局index const int token_idx = linear_index / offset; // token id(第几个token,不分qkv) const int ori_bi = batch_id_per_token[token_idx]; // 第几个batch - if (seq_lens[ori_bi] == 0) continue; + + int cache_kv_len = seq_lens_decoder[ori_bi]; + // 这里其实是不需要处理的,但是由于FA3的bug,所以必须! + if (seq_lens_encoder[ori_bi] == 0) cache_kv_len = 0; + const int bias = linear_index % offset; - const int hi = bias / last_dim; - const int h_bias = bias % last_dim; + const int hi = bias / head_dim; + const int h_bias = bias % head_dim; const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + - seq_lens_decoder - [ori_bi]; // 在当前seq中的id(拼接了seq到一个batch的情况下有效) + cache_kv_len; // 在当前seq中的id(拼接了seq到一个batch的情况下有效) const int64_t emb_idx = - ori_seq_id * half_lastdim + h_bias / 2; // embedding的id + ori_seq_id * half_headdim + h_bias / 2; // embedding的id const int64_t base_idx = - token_idx * (q_num_head + 2 * kv_num_head) * last_dim + hi * last_dim + + token_idx * (q_num_head + 2 * kv_num_head) * head_dim + hi * head_dim + h_bias; Load(&qkv[base_idx], &src_vec); const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id; @@ -84,21 +88,21 @@ __global__ void GQAVariableLengthRotarySplitKernel( T *out_p = nullptr; if (hi < q_num_head) { base_split_idx = - token_idx * q_num_head * last_dim + hi * last_dim + h_bias; + token_idx * q_num_head * head_dim + hi * head_dim + h_bias; out_p = q; } else if (hi < q_num_head + kv_num_head) { - base_split_idx = kv_write_idx * kv_num_head * last_dim + - (hi - q_num_head) * last_dim + h_bias; + base_split_idx = kv_write_idx * kv_num_head * head_dim + + (hi - q_num_head) * head_dim + h_bias; out_p = k; } else { out_p = v; - base_split_idx = kv_write_idx * kv_num_head * last_dim + - (hi - q_num_head - kv_num_head) * last_dim + h_bias; + base_split_idx = kv_write_idx * kv_num_head * head_dim + + (hi - q_num_head - kv_num_head) * head_dim + h_bias; } // TODO check this correct or not int64_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + rope_3d ? 
emb_idx + ori_bi * head_dim * max_model_len : emb_idx; float thread_m2 = 0.0f; float warp_m2 = 0.0f; @@ -122,7 +126,7 @@ __global__ void GQAVariableLengthRotarySplitKernel( WelfordWarpAllReduce(thread_m2, &warp_m2); // 单个head的标准差 if (hi < q_num_head + kv_num_head) { // only q and k need norm - float row_variance = max(warp_m2 / last_dim, 0.0f); + float row_variance = max(warp_m2 / head_dim, 0.0f); float row_inv_var = Rsqrt(row_variance + rms_norm_eps); if (hi < q_num_head) { Load(&q_norm_weight[threadIdx.x * VecSize], @@ -165,12 +169,12 @@ __global__ void GQAVariableLengthRotarySplitKernel( template void gqa_rotary_qk_split_variable( - T *qkv_out, // [token_num, 3, num_head, dim_head] + T *qkv_out, // [token_num, 3, num_head, head_dim] T *q, T *k, T *v, const T *qkv_input, - const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2] + const float *rotary_emb, // [2, 1, seq_len, 1, head_dim / 2] const float *q_norm_weight, const float *k_norm_weight, const int *batch_id_per_token, @@ -181,14 +185,14 @@ void gqa_rotary_qk_split_variable( const int token_num, const int num_heads, const int kv_num_heads, - const int seq_len, + const int max_model_len, const int input_output_len, - const int dim_head, + const int head_dim, const bool rope_3d, const float rms_norm_eps, const cudaStream_t &stream) { - assert(dim_head == 128 && "dim_head must be 128"); - int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * dim_head; + assert(head_dim == 128 && "head_dim must be 128"); + int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * head_dim; constexpr int HEAD_DIM = 128; constexpr int PackSize = HEAD_DIM / kWarpSize; @@ -199,7 +203,7 @@ void gqa_rotary_qk_split_variable( dim3 block_size(kWarpSize, blocksize / kWarpSize); const float *cos_emb = rotary_emb; - const float *sin_emb = rotary_emb + input_output_len * dim_head / 2; + const float *sin_emb = rotary_emb + input_output_len * head_dim / 2; launchWithPdlWhenEnabled(GQAVariableLengthRotarySplitKernel, grid_size, block_size, @@ -222,12 +226,185 @@ void gqa_rotary_qk_split_variable( elem_nums, num_heads, kv_num_heads, - seq_len, - dim_head, + max_model_len, + head_dim, rope_3d, rms_norm_eps); } +template +__global__ void GQAVariableLengthNeoxPartialRotarySplitKernel( + const T *qkv, + const float *cos_emb, + const float *sin_emb, + const int *batch_id_per_token, + const int *cu_seqlens_q, + const int *seq_lens_encoder, + const int *seq_lens_decoder, + const int *cu_seqlens_k, + T *qkv_out, + T *q, + T *k, + T *v, + const int64_t elem_cnt, + const int q_num_head, + const int kv_num_head, + const int max_model_len, + const int head_dim, + const int rotary_dim) { + using LoadT = AlignedVector; + using LoadEmbT = AlignedVector; + LoadT src_vec; + LoadT src_vec_right; + LoadEmbT cos_emb_vec; + LoadEmbT sin_emb_vec; + int64_t global_warp_idx = blockDim.y * blockIdx.x + threadIdx.y; + int64_t all_warp_num = gridDim.x * blockDim.y; + const int half_rotary_dim = rotary_dim / 2; + const int half_headdim = head_dim / 2; + const int offset = + (q_num_head + kv_num_head * 2) * head_dim; // for all q,k,v + const int all_head_num = elem_cnt / head_dim; +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif + for (int gloabl_hi = global_warp_idx; gloabl_hi < all_head_num; + gloabl_hi += all_warp_num) { + int64_t linear_index = + gloabl_hi * head_dim + threadIdx.x * VecSize; // 全局index + const int token_idx = + linear_index / offset; // token id(第几个token,不分qkv) + const int ori_bi = 
batch_id_per_token[token_idx]; // 第几个batch + + int cache_kv_len = seq_lens_decoder[ori_bi]; + // 这里其实是不需要处理的,但是由于FA3的bug,所以必须! + if (seq_lens_encoder[ori_bi] == 0) cache_kv_len = 0; + + const int bias = linear_index % offset; + const int hi = bias / head_dim; + const int h_bias = bias % head_dim; + + const int ori_seq_id = + (token_idx - cu_seqlens_q[ori_bi]) + + cache_kv_len; // 在当前seq中的id(拼接了seq到一个batch的情况下有效) + const int64_t base_idx = + token_idx * (q_num_head + 2 * kv_num_head) * head_dim + hi * head_dim + + h_bias; + Load(&qkv[base_idx], &src_vec); + const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id; + int64_t base_split_idx; + T *out_p = nullptr; + if (hi < q_num_head) { + base_split_idx = + token_idx * q_num_head * head_dim + hi * head_dim + h_bias; + out_p = q; + } else if (hi < q_num_head + kv_num_head) { + base_split_idx = kv_write_idx * kv_num_head * head_dim + + (hi - q_num_head) * head_dim + h_bias; + out_p = k; + } else { + out_p = v; + base_split_idx = kv_write_idx * kv_num_head * head_dim + + (hi - q_num_head - kv_num_head) * head_dim + h_bias; + } + + if (hi < q_num_head + kv_num_head) { + if (h_bias < rotary_dim) { + int64_t emb_idx = ori_seq_id * half_rotary_dim; + if (h_bias < half_rotary_dim) { + Load(&qkv[base_idx + half_rotary_dim], &src_vec_right); + emb_idx += h_bias; + } else { + Load(&qkv[base_idx - half_rotary_dim], &src_vec_right); + emb_idx += h_bias - half_rotary_dim; + } + Load(&cos_emb[emb_idx], &cos_emb_vec); + Load(&sin_emb[emb_idx], &sin_emb_vec); +#pragma unroll + for (int i = 0; i < VecSize; i++) { + const float input_left = static_cast(src_vec[i]); + const float input_right = static_cast(src_vec_right[i]); + const float cos_tmp = cos_emb_vec[i]; + const float sin_tmp = sin_emb_vec[i]; + if (h_bias < half_rotary_dim) { + src_vec[i] = + static_cast(input_left * cos_tmp - input_right * sin_tmp); + } else { + src_vec[i] = + static_cast(input_left * cos_tmp + input_right * sin_tmp); + } + } + } + } + + Store(src_vec, &qkv_out[base_idx]); + Store(src_vec, &out_p[base_split_idx]); + } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaTriggerProgrammaticLaunchCompletion(); +#endif +} + +template +void gqa_neox_partial_rotary_qk_split_variable( + T *qkv_out, // [token_num, 3, num_head, head_dim] + T *q, + T *k, + T *v, + const T *qkv_input, + const float *rotary_emb, // [2, 1, seq_len, 1, head_dim / 4] + const int *batch_id_per_token, + const int *seq_lens_encoder, + const int *seq_lens_decoder, + const int *cu_seqlens_q, + const int *cu_seqlens_k, + const int token_num, + const int num_heads, + const int kv_num_heads, + const int max_model_len, + const int head_dim, + const int rotary_dim, + const cudaStream_t &stream) { + assert(head_dim == 128 && "head_dim must be 128"); + int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * head_dim; + + constexpr int HEAD_DIM = 128; + constexpr int PackSize = HEAD_DIM / kWarpSize; + assert(rotary_dim / 2 % PackSize == 0); + const int pack_num = elem_nums / PackSize; + const int blocksize = 128; + int grid_size = 1; + GetNumBlocks<128>(pack_num, &grid_size); + dim3 block_size(kWarpSize, blocksize / kWarpSize); + + const float *cos_emb = rotary_emb; + const float *sin_emb = rotary_emb + max_model_len * rotary_dim / 2; + launchWithPdlWhenEnabled( + GQAVariableLengthNeoxPartialRotarySplitKernel, + grid_size, + block_size, + 0, + stream, + qkv_input, + cos_emb, + sin_emb, + batch_id_per_token, + cu_seqlens_q, + seq_lens_encoder, + seq_lens_decoder, + cu_seqlens_k, + qkv_out, + q, + k, + v, + 
elem_nums, + num_heads, + kv_num_heads, + max_model_len, + head_dim, + rotary_dim); +} + template GQARopeWriteCacheKernel( const int kv_token_num, const int max_seq_len, const float rms_norm_eps, + const bool use_neox_rotary_style, const std::string &cache_quant_type, const bool rope_3d) { typedef PDTraits traits_; @@ -1153,6 +1331,31 @@ std::vector GQARopeWriteCacheKernel( const int num_heads = qkv_dims[qkv_dims.size() - 1] / head_dim - 2 * kv_num_heads; const float softmax_scale = 1.f / sqrt(head_dim); + int rotary_dim = head_dim; + + PADDLE_ENFORCE_EQ(batch_id_per_token.dims().size(), 1); + PADDLE_ENFORCE_EQ(batch_id_per_token.dims()[0], token_num); + + if (!rope_3d) { + PADDLE_ENFORCE_EQ(rotary_embs.dims().size(), 5); + PADDLE_ENFORCE_EQ(rotary_embs.dims()[0], 2); + PADDLE_ENFORCE_EQ(rotary_embs.dims()[1], 1); + PADDLE_ENFORCE_EQ(rotary_embs.dims()[2], max_seq_len); + PADDLE_ENFORCE_EQ(rotary_embs.dims()[3], 1); + if (use_neox_rotary_style) { + // Note(ZKK) Qwen3 like model + // the [0,head_dim/2), [head_dim/2,head_dim) data are totally same! + if (rotary_embs.dims()[4] == head_dim) { + rotary_dim = head_dim; + } else { + // for glm partial rotary style + PADDLE_ENFORCE_EQ(rotary_embs.dims()[4], head_dim / 4); + rotary_dim = head_dim / 2; + } + } else { + PADDLE_ENFORCE_EQ(rotary_embs.dims()[4], head_dim / 2); + } + } AppendAttnMetaData meta_data; meta_data.token_nums = token_num; @@ -1163,9 +1366,6 @@ std::vector GQARopeWriteCacheKernel( meta_data.block_size = block_size; meta_data.batch_size = seq_lens_this_time.dims()[0]; - phi::GPUContext *dev_ctx = static_cast( - phi::DeviceContextPool::Instance().Get(qkv.place())); - auto stream = qkv.stream(); paddle::Tensor qkv_out = GetEmptyTensor(qkv.dims(), qkv.dtype(), qkv.place()); paddle::Tensor q = GetEmptyTensor( @@ -1175,30 +1375,73 @@ std::vector GQARopeWriteCacheKernel( paddle::Tensor v = GetEmptyTensor( {kv_token_num, kv_num_heads, head_dim}, qkv.dtype(), qkv.place()); - // rope - gqa_rotary_qk_split_variable( - qkv_out.data(), - q.data(), - k.data(), - v.data(), - qkv.data(), - rotary_embs.data(), - q_norm_weight ? q_norm_weight.get().data() : nullptr, - k_norm_weight ? k_norm_weight.get().data() : nullptr, - batch_id_per_token.data(), - seq_lens_encoder.data(), - seq_lens_decoder.data(), - cu_seqlens_q.data(), - cu_seqlens_k.data(), - token_num, - num_heads, - kv_num_heads, - max_seq_len, - rope_3d ? rotary_embs.dims()[3] : rotary_embs.dims()[2], - head_dim, - rope_3d, - rms_norm_eps, - stream); + if (use_neox_rotary_style) { + if (rotary_dim == head_dim) { + gqa_rotary_qk_split_variable_qwen3( + qkv_out.data(), + q.data(), + k.data(), + v.data(), + qkv.data(), + rotary_embs.data(), + batch_id_per_token.data(), + seq_lens_encoder.data(), + seq_lens_decoder.data(), + cu_seqlens_q.data(), + cu_seqlens_k.data(), + token_num, + num_heads, + kv_num_heads, + rope_3d ? rotary_embs.dims()[3] : rotary_embs.dims()[2], + head_dim, + rope_3d, + stream); + } else { + gqa_neox_partial_rotary_qk_split_variable( + qkv_out.data(), + q.data(), + k.data(), + v.data(), + qkv.data(), + rotary_embs.data(), + batch_id_per_token.data(), + seq_lens_encoder.data(), + seq_lens_decoder.data(), + cu_seqlens_q.data(), + cu_seqlens_k.data(), + token_num, + num_heads, + kv_num_heads, + max_seq_len, + head_dim, + rotary_dim, + stream); + } + } else { + gqa_rotary_qk_split_variable( + qkv_out.data(), + q.data(), + k.data(), + v.data(), + qkv.data(), + rotary_embs.data(), + q_norm_weight ? q_norm_weight.get().data() : nullptr, + k_norm_weight ? 
k_norm_weight.get().data() : nullptr, + batch_id_per_token.data(), + seq_lens_encoder.data(), + seq_lens_decoder.data(), + cu_seqlens_q.data(), + cu_seqlens_k.data(), + token_num, + num_heads, + kv_num_heads, + max_seq_len, + rope_3d ? rotary_embs.dims()[3] : rotary_embs.dims()[2], + head_dim, + rope_3d, + rms_norm_eps, + stream); + } if (token_num < kv_token_num) { AppendCacheKV(key_cache, @@ -1347,6 +1590,7 @@ PD_BUILD_STATIC_OP(gqa_rope_write_cache) .Attrs({"kv_token_num: int", "max_seq_len: int", "rms_norm_eps: float", + "use_neox_rotary_style: bool", "cache_quant_type: std::string", "rope_3d: bool"}) .SetKernelFn(PD_KERNEL(GQARopeWriteCacheKernel)); diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh index 8bbc7727bf2..e3f03b98e83 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh @@ -134,9 +134,17 @@ __global__ void multi_query_append_attention_kernel( T *o_base_ptr_T = nullptr; OutT *o_base_ptr_int8 = nullptr; if constexpr (partition_kv) { - o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); + if (ENABLE_PREFILL) { + o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); + } else { + o_base_ptr_T = + tmp_workspace + + batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); + } } else { o_base_ptr_int8 = out + o_offset; } @@ -386,8 +394,18 @@ __global__ void multi_query_append_attention_kernel( const uint32_t qo_head_idx = q_head_idx + qo_idx_now % GROUP_SIZE; const uint32_t qo_idx = q_start_seq_id + qo_idx_now / GROUP_SIZE; if (qo_idx - q_start_seq_id < q_len) { - uint32_t offset = - (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + uint32_t offset; + if (ENABLE_PREFILL) { + offset = + (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + } else { + offset = ((batch_id * speculate_max_draft_token_num + + qo_idx_now / GROUP_SIZE) * + num_chunks + + chunk_idx) * + q_num_heads + + qo_head_idx; + } tmp_m[offset] = m_frag[fx][j]; tmp_d[offset] = d_frag[fx][j]; } @@ -423,6 +441,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel( const T *__restrict__ sinks, // [q_num_heads] const int *__restrict__ seq_lens, const int *__restrict__ seq_lens_kv, + const int *__restrict__ seq_lens_encoder, const int *__restrict__ batch_ids, const int *__restrict__ tile_ids_per_batch, const int *__restrict__ cu_seqlens_q, @@ -483,6 +502,10 @@ __global__ void multi_query_append_attention_warp1_4_kernel( } kv_len += q_len; } + const int seq_len_enc = seq_lens_encoder[batch_id]; + if (seq_len_enc > 0) { + return; + } const uint32_t num_chunks_this_seq = div_up(kv_len, chunk_size); if (chunk_idx >= num_chunks_this_seq) { return; @@ -524,9 +547,11 @@ __global__ void multi_query_append_attention_warp1_4_kernel( chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + tid % 8 * num_elems_per_128b(); } else { - o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); + o_base_ptr_T = + tmp_workspace + + batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + + chunk_idx * 
q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); } } const int *mask_offset_this_seq = @@ -794,8 +819,12 @@ __global__ void multi_query_append_attention_warp1_4_kernel( offset = (batch_id * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; } else { - offset = - (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + offset = ((batch_id * speculate_max_draft_token_num + + qo_idx_now / GROUP_SIZE) * + num_chunks + + chunk_idx) * + q_num_heads + + qo_head_idx; } tmp_m[offset] = m_frag[fx][j]; tmp_d[offset] = d_frag[fx][j]; @@ -1036,7 +1065,8 @@ void MultiQueryAppendAttention( blocky, HEAD_DIM, OUT_NV_TYPE, - ENABLE_PREFILL>; + ENABLE_PREFILL, + false>; launchWithPdlWhenEnabled( kernelFn, grids_merge, @@ -1154,6 +1184,7 @@ void MultiQueryAppendAttention( : nullptr, seq_lens_q.data(), seq_lens_kv.data(), + seq_lens_encoder.data(), batch_ids.data(), tile_ids_per_batch.data(), cu_seqlens_q.data(), @@ -1189,15 +1220,31 @@ void MultiQueryAppendAttention( phi::SizeOf(paddle::DataType::FLOAT32) * static_cast(bsz * num_chunks * num_heads)); } else { - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(token_num * num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); + if (ENABLE_PREFILL) { + tmp_workspace = + allocator->Allocate(phi::SizeOf(qkv.dtype()) * + static_cast(token_num * num_chunks * + num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + } else { + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + } } launchWithPdlWhenEnabled( split_kv_kernel, @@ -1219,6 +1266,7 @@ void MultiQueryAppendAttention( : nullptr, seq_lens_q.data(), seq_lens_kv.data(), + seq_lens_encoder.data(), batch_ids.data(), tile_ids_per_batch.data(), cu_seqlens_q.data(), @@ -1296,7 +1344,8 @@ void MultiQueryAppendAttention( blocky, HEAD_DIM, OUT_NV_TYPE, - ENABLE_PREFILL>; + ENABLE_PREFILL, + true>; launchWithPdlWhenEnabled( kernelFn, grids_merge, diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c4_impl.cuh b/custom_ops/gpu_ops/append_attn/multiquery_attention_c4_impl.cuh index 9629acf5d95..9748010b452 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c4_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c4_impl.cuh @@ -169,9 +169,17 @@ __global__ void multi_query_append_attention_c4_kernel( T *o_base_ptr_T = nullptr; OutT *o_base_ptr_int8 = nullptr; if constexpr (partition_kv) { - o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); + if (ENABLE_PREFILL) { + o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + + chunk_idx * 
q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); + } else { + o_base_ptr_T = + tmp_workspace + + batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); + } } else { o_base_ptr_int8 = out + o_offset; } @@ -477,8 +485,18 @@ __global__ void multi_query_append_attention_c4_kernel( const uint32_t qo_head_idx = q_head_idx + qo_idx_now % GROUP_SIZE; const uint32_t qo_idx = q_start_seq_id + qo_idx_now / GROUP_SIZE; if (qo_idx - q_start_seq_id < q_len) { - uint32_t offset = - (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + uint32_t offset; + if (ENABLE_PREFILL) { + offset = + (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + } else { + offset = ((batch_id * speculate_max_draft_token_num + + qo_idx_now / GROUP_SIZE) * + num_chunks + + chunk_idx) * + q_num_heads + + qo_head_idx; + } tmp_m[offset] = m_frag[fx][j]; tmp_d[offset] = d_frag[fx][j]; } @@ -519,6 +537,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel( const T *__restrict__ sinks, // [q_num_heads] const int *__restrict__ seq_lens, const int *__restrict__ seq_lens_kv, + const int *__restrict__ seq_lens_encoder, const int *__restrict__ batch_ids, const int *__restrict__ tile_ids_per_batch, const int *__restrict__ cu_seqlens_q, @@ -587,6 +606,10 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel( } kv_len += q_len; } + const int seq_len_enc = seq_lens_encoder[batch_id]; + if (seq_len_enc > 0) { + return; + } const uint32_t num_chunks_this_seq = div_up(kv_len, chunk_size); if (chunk_idx >= num_chunks_this_seq) { return; @@ -651,9 +674,11 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel( chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + tid % 8 * num_elems_per_128b(); } else { - o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); + o_base_ptr_T = + tmp_workspace + + batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); } } const int *mask_offset_this_seq = @@ -969,8 +994,12 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel( offset = (batch_id * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; } else { - offset = - (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + offset = ((batch_id * speculate_max_draft_token_num + + qo_idx_now / GROUP_SIZE) * + num_chunks + + chunk_idx) * + q_num_heads + + qo_head_idx; } tmp_m[offset] = m_frag[fx][j]; tmp_d[offset] = d_frag[fx][j]; @@ -1161,15 +1190,30 @@ void MultiQueryAppendC4Attention( sliding_window); } else { phi::Allocator::AllocationPtr tmp_workspace, tmp_m, tmp_d; - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(token_num * num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); + if (ENABLE_PREFILL) { + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(token_num * num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + tmp_d = allocator->Allocate( + 
phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + } else { + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + } launchWithPdlWhenEnabled( split_kv_kernel, grids, @@ -1220,6 +1264,7 @@ void MultiQueryAppendC4Attention( sliding_window); // merge constexpr int vec_size = num_elems_per_128b(); + constexpr int blockx = HEAD_DIM / vec_size; constexpr int blocky = (128 + blockx - 1) / blockx; dim3 grids_merge(min(sm_count * 4, token_num), num_heads); @@ -1230,7 +1275,8 @@ void MultiQueryAppendC4Attention( blocky, HEAD_DIM, OUT_NV_TYPE, - ENABLE_PREFILL>, + ENABLE_PREFILL, + false>, grids_merge, blocks_merge, 0, @@ -1366,6 +1412,7 @@ void MultiQueryAppendC4Attention( : nullptr, seq_lens_q.data(), seq_lens_kv.data(), + seq_lens_encoder.data(), batch_ids.data(), tile_ids_per_batch.data(), cu_seqlens_q.data(), @@ -1402,15 +1449,31 @@ void MultiQueryAppendC4Attention( phi::SizeOf(paddle::DataType::FLOAT32) * static_cast(bsz * num_chunks * num_heads)); } else { - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(token_num * num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); + if (ENABLE_PREFILL) { + tmp_workspace = + allocator->Allocate(phi::SizeOf(qkv.dtype()) * + static_cast(token_num * num_chunks * + num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + } else { + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + } } launchWithPdlWhenEnabled( split_kv_kernel, @@ -1440,6 +1503,7 @@ void MultiQueryAppendC4Attention( : nullptr, seq_lens_q.data(), seq_lens_kv.data(), + seq_lens_encoder.data(), batch_ids.data(), tile_ids_per_batch.data(), cu_seqlens_q.data(), @@ -1517,7 +1581,8 @@ void MultiQueryAppendC4Attention( blocky, HEAD_DIM, OUT_NV_TYPE, - ENABLE_PREFILL>, + ENABLE_PREFILL, + true>, grids_merge, blocks_merge, 0, diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c8_impl.cuh b/custom_ops/gpu_ops/append_attn/multiquery_attention_c8_impl.cuh index dc8e3b5cdfb..59a838373e7 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c8_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c8_impl.cuh @@ -178,9 +178,17 @@ __global__ void multi_query_append_attention_c8_kernel( T *o_base_ptr_T = nullptr; OutT *o_base_ptr_int8 = 
nullptr; if constexpr (partition_kv) { - o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); + if (ENABLE_PREFILL) { + o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); + } else { + o_base_ptr_T = + tmp_workspace + + batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); + } } else { o_base_ptr_int8 = out + o_offset; } @@ -524,8 +532,18 @@ __global__ void multi_query_append_attention_c8_kernel( const uint32_t qo_head_idx = q_head_idx + qo_idx_now % GROUP_SIZE; const uint32_t qo_idx = q_start_seq_id + qo_idx_now / GROUP_SIZE; if (qo_idx - q_start_seq_id < q_len) { - uint32_t offset = - (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + uint32_t offset; + if (ENABLE_PREFILL) { + offset = + (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + } else { + offset = ((batch_id * speculate_max_draft_token_num + + qo_idx_now / GROUP_SIZE) * + num_chunks + + chunk_idx) * + q_num_heads + + qo_head_idx; + } tmp_m[offset] = m_frag[fx][j]; tmp_d[offset] = d_frag[fx][j]; } @@ -566,6 +584,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel( const T *__restrict__ sinks, // [q_num_heads] const int *__restrict__ seq_lens, const int *__restrict__ seq_lens_kv, + const int *__restrict__ seq_lens_encoder, const int *__restrict__ batch_ids, const int *__restrict__ tile_ids_per_batch, const int *__restrict__ cu_seqlens_q, @@ -620,6 +639,10 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel( if (q_len <= 0) { return; } + const int seq_len_enc = seq_lens_encoder[batch_id]; + if (seq_len_enc > 0) { + return; + } T cache_k_scale_reg[IsDynamicC8 ? num_frags_z * 2 : num_frags_y * 4]; T cache_v_scale_reg[IsDynamicC8 ? 
num_frags_z * 4 : num_frags_y * 2]; if constexpr (!IsDynamicC8) { @@ -702,9 +725,11 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel( chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + tid % 8 * num_elems_per_128b(); } else { - o_base_ptr_T = tmp_workspace + q_start_seq_id * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); + o_base_ptr_T = + tmp_workspace + + batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); } } const int *mask_offset_this_seq = @@ -1063,8 +1088,12 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel( offset = (batch_id * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; } else { - offset = - (qo_idx * num_chunks + chunk_idx) * q_num_heads + qo_head_idx; + offset = ((batch_id * speculate_max_draft_token_num + + qo_idx_now / GROUP_SIZE) * + num_chunks + + chunk_idx) * + q_num_heads + + qo_head_idx; } tmp_m[offset] = m_frag[fx][j]; tmp_d[offset] = d_frag[fx][j]; @@ -1288,15 +1317,30 @@ void MultiQueryAppendC8Attention( sliding_window); } else { phi::Allocator::AllocationPtr tmp_workspace, tmp_m, tmp_d; - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(token_num * num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); + if (ENABLE_PREFILL) { + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(token_num * num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + } else { + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + } launchWithPdlWhenEnabled( split_kv_kernel, grids, @@ -1351,7 +1395,8 @@ void MultiQueryAppendC8Attention( blocky, HEAD_DIM, OUT_NV_TYPE, - ENABLE_PREFILL>, + ENABLE_PREFILL, + false>, grids_merge, blocks_merge, 0, @@ -1519,6 +1564,7 @@ void MultiQueryAppendC8Attention( : nullptr, seq_lens_q.data(), seq_lens_kv.data(), + seq_lens_encoder.data(), batch_ids.data(), tile_ids_per_batch.data(), cu_seqlens_q.data(), @@ -1555,15 +1601,31 @@ void MultiQueryAppendC8Attention( phi::SizeOf(paddle::DataType::FLOAT32) * static_cast(bsz * num_chunks * num_heads)); } else { - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(token_num * num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); + if (ENABLE_PREFILL) { + tmp_workspace = + allocator->Allocate(phi::SizeOf(qkv.dtype()) * + 
static_cast(token_num * num_chunks * + num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(token_num * num_chunks * num_heads)); + } else { + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + } } launchWithPdlWhenEnabled( split_kv_kernel, @@ -1587,6 +1649,7 @@ void MultiQueryAppendC8Attention( : nullptr, seq_lens_q.data(), seq_lens_kv.data(), + seq_lens_encoder.data(), batch_ids.data(), tile_ids_per_batch.data(), cu_seqlens_q.data(), @@ -1665,7 +1728,8 @@ void MultiQueryAppendC8Attention( blocky, HEAD_DIM, OUT_NV_TYPE, - ENABLE_PREFILL>, + ENABLE_PREFILL, + true>, grids_merge, blocks_merge, 0, diff --git a/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu b/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu index 15da09e081c..492b3a26647 100644 --- a/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu +++ b/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu @@ -16,25 +16,26 @@ #include "paddle/extension.h" #include "paddle/phi/core/memory/memcpy.h" -__global__ void pre_cache_len_concat(const int* __restrict__ seq_lens_decoder, - const int* __restrict__ seq_lens_this_time, - int* __restrict__ cu_seqlens_k, - int* __restrict__ batch_ids, - int* __restrict__ tile_ids_per_batch, - int* __restrict__ num_blocks_x, - int* __restrict__ kv_token_num, - const int bsz, - const int num_row_per_block) { +__global__ void pre_cache_len_concat(const int* __restrict__ seq_lens_encoder, + const int* __restrict__ seq_lens_decoder, + const int* __restrict__ seq_lens_this_time, + int* __restrict__ cu_seqlens_k, + int* __restrict__ batch_ids, + int* __restrict__ tile_ids_per_batch, + int* __restrict__ num_blocks_x, + int* __restrict__ kv_token_num, + const int bsz, + const int num_row_per_block) { if (threadIdx.x == 0) { int gridx = 0; int index = 0; int total_tokens = 0; cu_seqlens_k[0] = 0; for (uint32_t bid = 0; bid < bsz; bid++) { - int cache_len = seq_lens_decoder[bid]; - const int q_len = seq_lens_this_time[bid]; - if (q_len <= 0) { - cache_len = 0; + int cache_len = 0; + if (seq_lens_encoder[bid] > 0) { + // only deal with chunked prefill case. 
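+      // NOTE: tiles are only generated for chunked-prefill requests
+      // (seq_lens_encoder > 0): their pre-cached KV of length
+      // seq_lens_decoder[bid] is split into div_up(cache_len, num_row_per_block)
+      // tiles, while cu_seqlens_k accumulates cache_len + q_len for every batch.
+      // Illustrative values only: with num_row_per_block = 64,
+      // seq_lens_encoder = {512, 0}, seq_lens_decoder = {128, 7},
+      // seq_lens_this_time = {512, 1} this yields
+      //   bid 0: cache_len = 128 -> 2 tiles, cu_seqlens_k[1] = 128 + 512 = 640
+      //   bid 1: cache_len = 0   -> 0 tiles, cu_seqlens_k[2] = 640 + 1   = 641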
+ cache_len = seq_lens_decoder[bid]; } const int loop_times = div_up(cache_len, num_row_per_block); for (uint32_t tile_id = 0; tile_id < loop_times; tile_id++) { @@ -42,6 +43,7 @@ __global__ void pre_cache_len_concat(const int* __restrict__ seq_lens_decoder, tile_ids_per_batch[index++] = tile_id; } gridx += loop_times; + const int q_len = seq_lens_this_time[bid]; total_tokens += (cache_len + q_len); cu_seqlens_k[bid + 1] = total_tokens; } @@ -51,6 +53,7 @@ __global__ void pre_cache_len_concat(const int* __restrict__ seq_lens_decoder, } std::vector PreCacheLenConcat( + const paddle::Tensor& seq_lens_encoder, const paddle::Tensor& seq_lens_decoder, const paddle::Tensor& seq_lens_this_time, const int max_dec_len, @@ -58,45 +61,43 @@ std::vector PreCacheLenConcat( auto stream = seq_lens_decoder.stream(); auto place = seq_lens_decoder.place(); int bsz = seq_lens_this_time.shape()[0]; - const uint32_t max_tile_size_per_bs_pre_cache = div_up(max_dec_len, block_size); + const uint32_t max_tile_size_per_bs_pre_cache = + div_up(max_dec_len, block_size); - paddle::Tensor cu_seqlens_k = GetEmptyTensor( - {bsz + 1}, - paddle::DataType::INT32, - place); + paddle::Tensor cu_seqlens_k = + GetEmptyTensor({bsz + 1}, paddle::DataType::INT32, place); paddle::Tensor pre_cache_batch_ids = GetEmptyTensor( - {bsz * max_tile_size_per_bs_pre_cache}, - paddle::DataType::INT32, - place); + {bsz * max_tile_size_per_bs_pre_cache}, paddle::DataType::INT32, place); paddle::Tensor pre_cache_tile_ids_per_batch = GetEmptyTensor( - {bsz * max_tile_size_per_bs_pre_cache}, - paddle::DataType::INT32, - place); + {bsz * max_tile_size_per_bs_pre_cache}, paddle::DataType::INT32, place); paddle::Tensor pre_cache_num_blocks = - GetEmptyTensor({1}, paddle::DataType::INT32, place); + GetEmptyTensor({1}, paddle::DataType::INT32, place); paddle::Tensor kv_token_num = - GetEmptyTensor({1}, paddle::DataType::INT32, place); + GetEmptyTensor({1}, paddle::DataType::INT32, place); pre_cache_len_concat<<<1, 32, 0, stream>>>( - seq_lens_decoder.data(), - seq_lens_this_time.data(), - cu_seqlens_k.data(), - pre_cache_batch_ids.data(), - pre_cache_tile_ids_per_batch.data(), - pre_cache_num_blocks.data(), - kv_token_num.data(), - bsz, - block_size - ); - paddle::Tensor pre_cache_num_blocks_cpu = pre_cache_num_blocks.copy_to(paddle::CPUPlace(), false); - paddle::Tensor kv_token_num_cpu = kv_token_num.copy_to(paddle::CPUPlace(), false); + seq_lens_encoder.data(), + seq_lens_decoder.data(), + seq_lens_this_time.data(), + cu_seqlens_k.data(), + pre_cache_batch_ids.data(), + pre_cache_tile_ids_per_batch.data(), + pre_cache_num_blocks.data(), + kv_token_num.data(), + bsz, + block_size); + paddle::Tensor pre_cache_num_blocks_cpu = + pre_cache_num_blocks.copy_to(paddle::CPUPlace(), false); + paddle::Tensor kv_token_num_cpu = + kv_token_num.copy_to(paddle::CPUPlace(), false); - return {cu_seqlens_k, - pre_cache_batch_ids, - pre_cache_tile_ids_per_batch, - pre_cache_num_blocks_cpu, /*cpu*/ - kv_token_num_cpu /*cpu*/ - }; + return { + cu_seqlens_k, + pre_cache_batch_ids, + pre_cache_tile_ids_per_batch, + pre_cache_num_blocks_cpu, /*cpu*/ + kv_token_num_cpu /*cpu*/ + }; } std::vector PreCacheLenConcatInferDtype( @@ -121,15 +122,13 @@ std::vector> PreCacheLenConcatInferShape( } PD_BUILD_STATIC_OP(pre_cache_len_concat) - .Inputs({"seq_lens_decoder", - "seq_lens_this_time"}) + .Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time"}) .Outputs({"cu_seqlens_k", "pre_cache_batch_ids", "pre_cache_tile_ids_per_batch", "pre_cache_num_blocks_cpu", /*cpu*/ 
- "kv_token_num_cpu"}) /*cpu*/ - .Attrs({"max_dec_len: int", - "block_size: int"}) + "kv_token_num_cpu"}) /*cpu*/ + .Attrs({"max_dec_len: int", "block_size: int"}) .SetKernelFn(PD_KERNEL(PreCacheLenConcat)) .SetInferShapeFn(PD_INFER_SHAPE(PreCacheLenConcatInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(PreCacheLenConcatInferDtype)); diff --git a/custom_ops/gpu_ops/append_attn/qwen3_rope.h b/custom_ops/gpu_ops/append_attn/qwen3_rope.h new file mode 100644 index 00000000000..6c6325c335f --- /dev/null +++ b/custom_ops/gpu_ops/append_attn/qwen3_rope.h @@ -0,0 +1,171 @@ +#include "encoder_write_cache_with_rope_impl.cuh" +#include "helper.h" +#include "paddle/extension.h" +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/core/memory/memcpy.h" +#include "remote_cache_kv_ipc.h" + +template +__global__ void GQAVariableLengthRotarySplitKernel_Qwen3( + const T *qkv, + const float *cos_emb, + const float *sin_emb, + const int *batch_id_per_token, + const int *cu_seqlens_q, + const int *seq_lens_encoder, + const int *seq_lens_decoder, + const int *cu_seqlens_k, + T *qkv_out, + T *q, + T *k, + T *v, + const int64_t elem_cnt, + const int q_num_head, + const int kv_num_head, + const int max_model_len, + const int head_dim, + const bool rope_3d) { + using LoadT = AlignedVector; + using LoadEmbT = AlignedVector; + LoadEmbT cos_emb_vec; + LoadEmbT sin_emb_vec; + + const int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x; + const int offset = (q_num_head + kv_num_head * 2) * (head_dim / 2); + const int64_t loop_times = elem_cnt / 2; + + for (int64_t linear_index = global_thread_idx * VecSize; + linear_index < loop_times; + linear_index += gridDim.x * blockDim.x * VecSize) { + const int token_idx = linear_index / offset; + + const int ori_bi = batch_id_per_token[token_idx]; // 第几个batch + + int cache_kv_len = seq_lens_decoder[ori_bi]; + // 这里其实是不需要处理的,但是由于FA3的bug,所以必须! + if (seq_lens_encoder[ori_bi] == 0) cache_kv_len = 0; + + const int bias = linear_index % offset; + const int hi = bias / (head_dim / 2); + const int h_bias = bias % (head_dim / 2); + // we should handle token_idx, hi 头 的 h_bias 部分! + + const int ori_seq_id = + (token_idx - cu_seqlens_q[ori_bi]) + + cache_kv_len; // 在当前seq中的id(拼接了seq到一个batch的情况下有效) + + const int half_headdim = head_dim / 2; + const int64_t emb_idx = ori_seq_id * head_dim + h_bias; // embedding的id + + const int64_t read_idx = + token_idx * (q_num_head + 2 * kv_num_head) * head_dim + hi * head_dim + + h_bias; + + LoadT src_vec0; + LoadT src_vec1; + + Load(&qkv[read_idx], &src_vec0); + Load(&qkv[read_idx + 64], &src_vec1); + + const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id; + int64_t base_split_idx; + T *out_p = nullptr; + if (hi < q_num_head) { + base_split_idx = + token_idx * q_num_head * head_dim + hi * head_dim + h_bias; + out_p = q; + } else if (hi < q_num_head + kv_num_head) { + base_split_idx = kv_write_idx * kv_num_head * head_dim + + (hi - q_num_head) * head_dim + h_bias; + out_p = k; + } else { + out_p = v; + base_split_idx = kv_write_idx * kv_num_head * head_dim + + (hi - q_num_head - kv_num_head) * head_dim + h_bias; + } + + // TODO check this correct or not + int64_t new_emb_idx = + rope_3d ? 
emb_idx + ori_bi * 2 * max_model_len * head_dim : emb_idx; + + if (hi < q_num_head + kv_num_head) { + Load(&cos_emb[new_emb_idx], &cos_emb_vec); + Load(&sin_emb[new_emb_idx], &sin_emb_vec); +#pragma unroll + for (int i = 0; i < VecSize; i++) { + float input_left = static_cast(src_vec0[i]); + float input_right = static_cast(src_vec1[i]); + + const float cos_tmp = cos_emb_vec[i]; + const float sin_tmp = sin_emb_vec[i]; + src_vec0[i] = + static_cast(input_left * cos_tmp - input_right * sin_tmp); + src_vec1[i] = + static_cast(input_right * cos_tmp + input_left * sin_tmp); + } + } + Store(src_vec0, &qkv_out[read_idx]); + Store(src_vec0, &out_p[base_split_idx]); + Store(src_vec1, &qkv_out[read_idx + 64]); + Store(src_vec1, &out_p[base_split_idx + 64]); + } +} + +template +void gqa_rotary_qk_split_variable_qwen3(T *qkv_out, + T *q, + T *k, + T *v, + const T *qkv_input, + const float *rotary_emb, + const int *batch_id_per_token, + const int *seq_lens_encoder, + const int *seq_lens_decoder, + const int *cu_seqlens_q, + const int *cu_seqlens_k, + const int token_num, + const int num_heads, + const int kv_num_heads, + const int max_model_len, + const int head_dim, + const bool rope_3d, + const cudaStream_t &stream) { + assert(head_dim == 128 && "head_dim must be 128"); + + int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * head_dim; + + constexpr int HEAD_DIM = 128; + constexpr int PackSize = 8; + const int pack_num = elem_nums / PackSize; + const int blocksize = 128; + int grid_size = 1; + GetNumBlocks<128>(pack_num, &grid_size); + dim3 block_size(128); + + const float *cos_emb = rotary_emb; + const float *sin_emb = rotary_emb + max_model_len * head_dim; + launchWithPdlWhenEnabled( + GQAVariableLengthRotarySplitKernel_Qwen3, + grid_size, + block_size, + 0, + stream, + qkv_input, + cos_emb, + sin_emb, + batch_id_per_token, + cu_seqlens_q, + seq_lens_encoder, + seq_lens_decoder, + cu_seqlens_k, + qkv_out, + q, + k, + v, + elem_nums, + num_heads, + kv_num_heads, + max_model_len, + head_dim, + rope_3d); +} diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh index bf0a22b6e2d..c321107237b 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh @@ -31,6 +31,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel( const int* __restrict__ batch_id_per_token, // [num_tokens] const int* __restrict__ cu_seqlens_q, const int* __restrict__ seq_lens_decoder, // [bsz] + const int* __restrict__ seq_lens_encoder, // [bsz] const float* __restrict__ cos_emb, const float* __restrict__ sin_emb, const float* @@ -75,7 +76,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel( const int ori_bi = batch_id_per_token[token_id]; if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding - if (seq_lens_decoder[ori_bi] == 0) continue; + if (seq_lens_encoder[ori_bi] > 0) continue; const int bias = linear_index % hidden_size; const int hi = bias / head_size; // q + k + v const int h_bias = bias % head_size; @@ -87,7 +88,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel( const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq; const int block_idx = block_table_now[write_seq_id / block_size]; if (block_idx < 0) { - return; // NOTE(gongshaotian): For CUDAGraph padding + continue; // NOTE(gongshaotian): For CUDAGraph padding } const int 
block_offset = write_seq_id % block_size; @@ -343,6 +344,7 @@ __global__ void append_speculate_cache_rope_kernel( const int* __restrict__ batch_id_per_token, // [num_tokens] const int* __restrict__ cu_seqlens_q, const int* __restrict__ seq_lens_decoder, // [bsz] + const int* __restrict__ seq_lens_encoder, // [bsz] const float* __restrict__ cos_emb, const float* __restrict__ sin_emb, const float* @@ -380,7 +382,7 @@ __global__ void append_speculate_cache_rope_kernel( const int ori_bi = batch_id_per_token[token_id]; if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding - if (seq_lens_decoder[ori_bi] == 0) continue; + if (seq_lens_encoder[ori_bi] > 0) continue; const int bias = linear_index % hidden_size; const int hi = bias / head_size; // q + k + v const int h_bias = bias % head_size; @@ -392,7 +394,7 @@ __global__ void append_speculate_cache_rope_kernel( const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq; const int block_idx = block_table_now[write_seq_id / block_size]; if (block_idx < 0) { - return; // NOTE(gongshaotian): For CUDAGraph padding + continue; // NOTE(gongshaotian): For CUDAGraph padding } const int block_offset = write_seq_id % block_size; @@ -473,6 +475,7 @@ __global__ void append_speculate_cache_neox_rope_kernel( const int* __restrict__ batch_id_per_token, // [num_tokens] const int* __restrict__ cu_seqlens_q, const int* __restrict__ seq_lens_decoder, // [bsz] + const int* __restrict__ seq_lens_encoder, // [bsz] const float* __restrict__ cos_emb, const float* __restrict__ sin_emb, const float* @@ -509,7 +512,7 @@ __global__ void append_speculate_cache_neox_rope_kernel( const int token_id = linear_index / half_hidden_size; const int ori_bi = batch_id_per_token[token_id]; if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding - if (seq_lens_decoder[ori_bi] == 0) continue; + if (seq_lens_encoder[ori_bi] > 0) continue; const int bias = linear_index % half_hidden_size; const int hi = bias / half_head_size; // q + k + v const int h_bias = bias % half_head_size; @@ -521,7 +524,7 @@ __global__ void append_speculate_cache_neox_rope_kernel( const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq; const int block_idx = block_table_now[write_seq_id / block_size]; if (block_idx < 0) { - return; // NOTE(gongshaotian): For CUDAGraph padding + continue; // NOTE(gongshaotian): For CUDAGraph padding } const int block_offset = write_seq_id % block_size; @@ -598,6 +601,166 @@ __global__ void append_speculate_cache_neox_rope_kernel( } } +template +__global__ void append_speculate_cache_neox_partial_rope_kernel( + const InT* __restrict__ qkv, // [token_num, num_heads + 2 * gqa_group_size, + // head_size] + T* __restrict__ key_cache, // [num_blocks, gqa_group_size, block_size, + // head_size // 2] + T* __restrict__ value_cache, // [num_blocks, gqa_group_size, block_size, + // head_size // 2] + T* __restrict__ qkv_out, + const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq] + const int* __restrict__ batch_id_per_token, // [num_tokens] + const int* __restrict__ cu_seqlens_q, + const int* __restrict__ seq_lens_decoder, // [bsz] + const int* __restrict__ seq_lens_encoder, // [bsz] + const float* __restrict__ cos_emb, + const float* __restrict__ sin_emb, + const float* + qkv_out_scales, // [(num_heads + 2 * gqa_group_size) * head_size] + const T* qkv_biases, // [num_head + 2 * gqa_group_size, dim_head] + const int max_seq_len, + const int max_blocks_per_seq, + const int num_heads, + const int output_inner_dim, + 
const int head_size, + const int rotary_dim, + const int block_size, + const int elem_cnt, + const int gqa_group_size, + const bool rope_3d) { + using LoadT = AlignedVector; + using LoadFloat = AlignedVector; + using LoadInT = AlignedVector; + constexpr int HalfVecSize = VecSize / 2; + using LoadEmbT = AlignedVector; + LoadInT left_vec, right_vec; + LoadT left_bias_vec, right_bias_vec; + LoadFloat left_out_scale_vec, right_out_scale_vec; + LoadEmbT cos_emb_vec; + LoadEmbT sin_emb_vec; + + int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x; + const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * head_size; + const int half_head_size = head_size / 2; + const int half_rotary_dim = rotary_dim / 2; + const int64_t half_hidden_size = hidden_size / 2; + for (int32_t linear_index = global_thread_idx * VecSize, + step = gridDim.x * blockDim.x * VecSize; + linear_index < elem_cnt; + linear_index += step) { + const int token_id = linear_index / half_hidden_size; + const int ori_bi = batch_id_per_token[token_id]; + if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding + if (seq_lens_encoder[ori_bi] > 0) continue; + const int bias = linear_index % half_hidden_size; + const int hi = bias / half_head_size; // q + k + v + const int h_bias = bias % half_head_size; + if (hi < num_heads && h_bias >= half_rotary_dim) { + continue; + } + const int start_token_idx = cu_seqlens_q[ori_bi]; + const int write_seq_id = + seq_lens_decoder[ori_bi] + token_id - start_token_idx; + if (write_seq_id == 0) continue; + + const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq; + const int block_idx = block_table_now[write_seq_id / block_size]; + if (block_idx < 0) { + continue; // NOTE(gongshaotian): For CUDAGraph padding + } + const int block_offset = write_seq_id % block_size; + + const int bias_idx_left = hi * head_size + h_bias; + const int bias_idx_right = bias_idx_left + half_head_size; + int ori_idx_left = token_id * hidden_size + hi * head_size + h_bias; + int ori_idx_right = ori_idx_left + half_head_size; + if (hi < num_heads) { + ori_idx_right = ori_idx_left + half_rotary_dim; + } else if (hi < num_heads + gqa_group_size) { + if (h_bias < half_rotary_dim) { + ori_idx_right = ori_idx_left + half_rotary_dim; + } else { + ori_idx_left = ori_idx_left + half_rotary_dim; + ori_idx_right = ori_idx_left + half_rotary_dim; + } + } + Load(&qkv[ori_idx_left], &left_vec); + Load(&qkv[ori_idx_right], &right_vec); + if (qkv_biases) { + Load(&qkv_biases[bias_idx_left], &left_bias_vec); + Load(&qkv_biases[bias_idx_right], &right_bias_vec); + } + if (qkv_out_scales) { + Load(&qkv_out_scales[bias_idx_left], &left_out_scale_vec); + Load(&qkv_out_scales[bias_idx_right], + &right_out_scale_vec); + } + if (hi < num_heads + gqa_group_size) { + // q k rope + const int64_t emb_idx = write_seq_id * half_rotary_dim + h_bias; + int64_t new_emb_idx = + rope_3d ? 
emb_idx + ori_bi * max_seq_len * head_size * 2 : emb_idx; + if (h_bias < half_rotary_dim) { + Load(&cos_emb[new_emb_idx], &cos_emb_vec); + Load(&sin_emb[new_emb_idx], &sin_emb_vec); + } + } +#pragma unroll + for (int i = 0; i < VecSize; i++) { + // add_bias + rope + float input_left = static_cast(left_vec[i]); + float input_right = static_cast(right_vec[i]); + if (qkv_out_scales) { + input_left *= left_out_scale_vec[i]; + input_right *= right_out_scale_vec[i]; + } + if (qkv_biases) { + input_left = input_left + static_cast(left_bias_vec[i]); + input_right = input_right + static_cast(right_bias_vec[i]); + } + if (hi < num_heads + gqa_group_size && h_bias < half_rotary_dim) { + const float cos_tmp = cos_emb_vec[i]; + const float sin_tmp = sin_emb_vec[i]; + left_bias_vec[i] = + static_cast(input_left * cos_tmp - input_right * sin_tmp); + right_bias_vec[i] = + static_cast(input_right * cos_tmp + input_left * sin_tmp); + } else { + left_bias_vec[i] = static_cast(input_left); + right_bias_vec[i] = static_cast(input_right); + } + } + if (hi < num_heads) { + // write q + Store(left_bias_vec, &qkv_out[ori_idx_left]); + Store(right_bias_vec, &qkv_out[ori_idx_right]); + } else { + // write k/v + const int kv_head_idx = (hi - num_heads) % gqa_group_size; + int tgt_idx_left = (block_idx * gqa_group_size * block_size * head_size + + kv_head_idx * block_size * head_size + + block_offset * head_size + h_bias); + uint32_t tgt_idx_right = tgt_idx_left + half_head_size; + // write + if (hi < num_heads + gqa_group_size) { + if (h_bias < half_rotary_dim) { + tgt_idx_right = tgt_idx_left + half_rotary_dim; + } else { + tgt_idx_left = tgt_idx_left + half_rotary_dim; + tgt_idx_right = tgt_idx_left + half_rotary_dim; + } + Store(left_bias_vec, &key_cache[tgt_idx_left]); + Store(right_bias_vec, &key_cache[tgt_idx_right]); + } else { + Store(left_bias_vec, &value_cache[tgt_idx_left]); + Store(right_bias_vec, &value_cache[tgt_idx_right]); + } + } + } +} + template - <<>>( - qkv, // [token_num, num_heads + 2 * gqa_group_size, head_size] - key_cache, - value_cache, - qkv_out, - block_tables, - batch_id_per_token, - cu_seqlens_q, - seq_lens, - cos_emb, - sin_emb, - qkv_out_scales, - qkv_biases, // [num_head + 2 * gqa_group_size, dim_head] - max_seq_len, - max_blocks_per_seq, - num_heads, - output_inner_dim, - dim_head, - block_size, - elem_nums, - kv_num_heads, - rope_3d); + if (rotary_dim < dim_head) { + append_speculate_cache_neox_partial_rope_kernel + <<>>( + qkv, // [token_num, num_heads + 2 * gqa_group_size, head_size] + key_cache, + value_cache, + qkv_out, + block_tables, + batch_id_per_token, + cu_seqlens_q, + seq_lens, + seq_lens_encoder, + cos_emb, + sin_emb, + qkv_out_scales, + qkv_biases, // [num_head + 2 * gqa_group_size, dim_head] + max_seq_len, + max_blocks_per_seq, + num_heads, + output_inner_dim, + dim_head, + rotary_dim, + block_size, + elem_nums, + kv_num_heads, + rope_3d); + } else { + append_speculate_cache_neox_rope_kernel + <<>>( + qkv, // [token_num, num_heads + 2 * gqa_group_size, head_size] + key_cache, + value_cache, + qkv_out, + block_tables, + batch_id_per_token, + cu_seqlens_q, + seq_lens, + seq_lens_encoder, + cos_emb, + sin_emb, + qkv_out_scales, + qkv_biases, // [num_head + 2 * gqa_group_size, dim_head] + max_seq_len, + max_blocks_per_seq, + num_heads, + output_inner_dim, + dim_head, + block_size, + elem_nums, + kv_num_heads, + rope_3d); + } } else { append_speculate_cache_rope_kernel <<>>( @@ -158,6 +189,7 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv, batch_id_per_token, 
cu_seqlens_q, seq_lens, + seq_lens_encoder, cos_emb, sin_emb, qkv_out_scales, @@ -496,11 +528,24 @@ void SpeculateWriteCacheWithRoPEKernel( const float* cos_emb = rotary_embs ? rotary_embs.get().data() : nullptr; const float* sin_emb; + int rotary_dim = dim_head; if (rotary_embs) { sin_emb = use_neox_rotary_style ? rotary_embs.get().data() + max_seq_len * dim_head : rotary_embs.get().data() + max_seq_len * dim_head / 2; + rotary_dim = + rotary_embs.get().dims()[rotary_embs.get().dims().size() - 1] * 2; + if (rotary_dim < dim_head) { + if (!use_neox_rotary_style || qkv_out_scales || q_norm_weight || + k_norm_weight || cache_quant_type_str != "none") { + PADDLE_THROW(phi::errors::Fatal( + "partial_rotary_factor < 1.0 only supports neox_rotary_style=True, " + "qkv_out_scales is None, q_norm_weight/k_norm_weight) is None, and " + "cache_quant_type_str is 'none'.")); + } + sin_emb = rotary_embs.get().data() + max_seq_len * rotary_dim / 2; + } } if (q_norm_weight && k_norm_weight) { if (cache_quant_type_str == "none") { @@ -624,6 +669,7 @@ void SpeculateWriteCacheWithRoPEKernel( num_heads, kv_num_heads, dim_head, + rotary_dim, block_size, bsz, token_nums, diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index abf16db95c9..d51da2d17cb 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -49,6 +49,21 @@ void cuda_host_free(uintptr_t ptr) { check_cuda_error(cudaFreeHost(reinterpret_cast(ptr))); } +void FlashAttentionMask(const paddle::Tensor& q_input, + const paddle::Tensor& k_input, + const paddle::Tensor& v_input, + const paddle::Tensor& cu_seq_q, + const paddle::Tensor& cu_seq_k, + const paddle::Tensor& seq_len_encoder, + const paddle::Tensor& attn_out, + const paddle::optional& mask, + const int head_num, + const int kv_head_num, + const int head_dim, + const int max_seq_len, + const int q_token_num, + const int k_token_num); + std::vector AppendAttention( const paddle::Tensor& qkv, const paddle::Tensor& key_cache, @@ -190,10 +205,12 @@ std::vector GQARopeWriteCacheKernel( const int kv_token_num, const int max_seq_len, const float rms_norm_eps, + const bool use_neox_rotary_style, const std::string& cache_quant_type, const bool rope_3d); std::vector PreCacheLenConcat( + const paddle::Tensor& seq_lens_encoder, const paddle::Tensor& seq_lens_decoder, const paddle::Tensor& seq_lens_this_time, const int max_dec_len, @@ -883,6 +900,8 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens, const paddle::Tensor& is_block_step, const paddle::Tensor& batch_drop, const paddle::Tensor& pre_ids, + const paddle::Tensor& mask_rollback, + const paddle::Tensor& recompute_token_num, const paddle::Tensor& accept_tokens, const paddle::Tensor& accept_num, const paddle::Tensor& base_model_seq_lens_this_time, @@ -1022,7 +1041,6 @@ void SpeculateLimitThinkingContentLengthV1( const paddle::Tensor& step_idx, const paddle::Tensor& limit_think_status, const paddle::Tensor& accept_num, - const paddle::Tensor& seq_lens_decoder, const paddle::Tensor& stop_flags, const paddle::Tensor& eos_token_ids, const int64_t think_end_id); @@ -1033,7 +1051,6 @@ void SpeculateLimitThinkingContentLengthV2( const paddle::Tensor& step_idx, const paddle::Tensor& limit_think_status, const paddle::Tensor& accept_num, - const paddle::Tensor& seq_lens_decoder, const paddle::Tensor& stop_flags, const int64_t think_end_id, const int64_t line_break_id); @@ -1156,6 +1173,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("append_attention_with_output", 
&AppendAttentionWithOutput, "append attention with output function"); + m.def("flash_mask_attention", &FlashAttentionMask, "flash_mask_attention"); /** * gqa_rope_write_cache.cu * gqa_rope_write_cache diff --git a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh index fea3d63fef9..b17ece59036 100644 --- a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh +++ b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh @@ -18,21 +18,23 @@ #include #include -#include #include +#include #include #include #include #include -#define CUDACHECK(cmd) \ - do { \ - cudaError_t e = cmd; \ - if (e != cudaSuccess) { \ - printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ - cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) namespace paddle { @@ -188,7 +190,8 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) { // semantic is used to enforce memory access order before and after this // barrier. template -DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, +DINLINE void multi_gpu_barrier(const RankSignals& sg, + Signal* self_sg, int rank) { if constexpr (!is_start) __syncthreads(); static_assert( @@ -205,10 +208,12 @@ DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x]; if constexpr (need_fence) { st_flag_release(peer_counter_ptr, val); - while (ld_flag_acquire(self_counter_ptr) != val); + while (ld_flag_acquire(self_counter_ptr) != val) + ; } else { st_flag_volatile(peer_counter_ptr, val); - while (ld_flag_volatile(self_counter_ptr) != val); + while (ld_flag_volatile(self_counter_ptr) != val) + ; } } if constexpr (is_start || need_fence) __syncthreads(); @@ -226,8 +231,12 @@ DINLINE P packed_reduce(const P* ptrs[], int idx) { template __global__ void __launch_bounds__(512, 1) - cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg, - T* __restrict__ result, int rank, int size) { + cross_device_reduce_1stage(RankData* _dp, + RankSignals sg, + Signal* self_sg, + T* __restrict__ result, + int rank, + int size) { using P = typename packed_t::P; using A = typename packed_t::A; // note: we don't reorder the address so the accumulation order is the same @@ -249,8 +258,12 @@ DINLINE P* get_tmp_buf(Signal* sg) { template __global__ void __launch_bounds__(512, 1) - cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg, - T* __restrict__ result, int rank, int size) { + cross_device_reduce_2stage(RankData* _dp, + RankSignals sg, + Signal* self_sg, + T* __restrict__ result, + int rank, + int size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = gridDim.x * blockDim.x; using P = typename packed_t::P; @@ -323,7 +336,7 @@ class CustomAllreduce { // 3. (In Python) all gather the IPC handles. // 4. Obtain the peer pointers by opening the IPC handles, and store them in // the rank data array at corresponding positions. - RankData *d_rank_data_base_, *d_rank_data_end_; + RankData *d_rank_data_base_origin_, *d_rank_data_base_, *d_rank_data_end_; std::vector graph_unreg_buffers_; // a map from IPC handles to opened IPC pointers std::map ipc_handles_; @@ -338,8 +351,12 @@ class CustomAllreduce { * Note: this class does not own any device memory. 
Any required buffers * are passed in from the constructor. */ - CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, - int rank, int world_size, bool full_nvlink = true) + CustomAllreduce(Signal** signals, + void* rank_data, + size_t rank_data_sz, + int rank, + int world_size, + bool full_nvlink = true) : rank_(rank), world_size_(world_size), full_nvlink_(full_nvlink), @@ -349,6 +366,7 @@ class CustomAllreduce { for (int i = 0; i < world_size_; i++) { sg_.signals[i] = signals[i]; } + d_rank_data_base_origin_ = d_rank_data_base_; } char* open_ipc_handle(const void* ipc_handle) { @@ -405,6 +423,7 @@ class CustomAllreduce { CUDACHECK( cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); buffers_[ptrs[rank_]] = d_data; + d_rank_data_base_origin_ = d_rank_data_base_; } // Note: when registering graph buffers, we intentionally choose to not @@ -434,7 +453,8 @@ class CustomAllreduce { } } } - CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(), + CUDACHECK(cudaMemcpy(d_rank_data_base_, + rank_data.data(), sizeof(RankData) * num_buffers, cudaMemcpyHostToDevice)); d_rank_data_base_ += num_buffers; @@ -451,8 +471,12 @@ class CustomAllreduce { * guess is that too many SMs will cause contention on NVLink bus. */ template - void allreduce(cudaStream_t stream, T* input, T* output, int size, - int threads = 512, int block_limit = 36) { + void allreduce(cudaStream_t stream, + T* input, + T* output, + int size, + int threads = 512, + int block_limit = 36) { auto d = packed_t::P::size; if (size % d != 0) throw std::runtime_error( @@ -483,9 +507,9 @@ class CustomAllreduce { size /= d; auto bytes = size * sizeof(typename packed_t::P); int blocks = std::min(block_limit, (size + threads - 1) / threads); -#define KL(ngpus, name) \ - name<<>>(ptrs, sg_, self_sg_, output, \ - rank_, size); +#define KL(ngpus, name) \ + name<<>>( \ + ptrs, sg_, self_sg_, output, rank_, size); #define REDUCE_CASE(ngpus) \ case ngpus: { \ @@ -517,15 +541,15 @@ class CustomAllreduce { #undef KL } - void clear_ipc_handles(){ + void clear_ipc_handles() { for (auto [_, ptr] : ipc_handles_) { CUDACHECK(cudaIpcCloseMemHandle(ptr)); } + ipc_handles_.clear(); + d_rank_data_base_ = d_rank_data_base_origin_; } - ~CustomAllreduce() { - clear_ipc_handles(); - } + ~CustomAllreduce() { clear_ipc_handles(); } }; } // namespace paddle diff --git a/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp b/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp index 070290383d6..0816667c2a7 100644 --- a/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp +++ b/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp @@ -445,7 +445,7 @@ struct CollectiveMainloopAttn { if constexpr (NeedMask) { const int lane_id = thread_idx % 32; - mask_start_idx = mask[0] / kBlockN - 1; + mask_start_idx = mask[0] / kBlockN; mask_row_id = thread_idx / 32 * 16 + lane_id / 4; @@ -485,12 +485,6 @@ struct CollectiveMainloopAttn { consumer_wait(pipeline_k, smem_pipe_read_k); warp_scheduler_barrier_sync(); - if constexpr (NeedMask) { - if (n_block >= mask_start_idx) { - app_mask(tSrS, mask, mask_row_id, col_base + n_block * kBlockN); - } - } - gemm( tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); softmax.rescale_o(tOrO, scores_scale); @@ -500,6 +494,14 @@ struct CollectiveMainloopAttn { warp_scheduler_barrier_arrive(); warpgroup_wait<1>(); pipeline_k.consumer_release(smem_pipe_read_k); // release K + + if constexpr (NeedMask) { + if (n_block - 1 >= mask_start_idx) { + app_mask( + tSrS, mask, mask_row_id, col_base + n_block * 
kBlockN - kBlockN); + } + } + cute::copy(softmax.template max( tSrS, mainloop_params.softmax_scale_log2), scores_scale); diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu index 45bf8f7041d..9bfb31beef2 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu @@ -33,7 +33,7 @@ __global__ void limit_thinking_content_length_kernel_v1( if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行 - if (current_limit_think_status == 2 && stop_flags[bid]) { + if (current_limit_think_status == 2 || stop_flags[bid]) { return; } diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu index ea5f8c9c402..b261e01b2f5 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu @@ -35,7 +35,7 @@ __global__ void limit_thinking_content_length_kernel_v2( if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行 - if (current_limit_think_status == 3 && stop_flags[bid]) { + if (current_limit_think_status == 3 || stop_flags[bid]) { return; } diff --git a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu index 79317afab48..7124b684d67 100644 --- a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu +++ b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu @@ -38,6 +38,11 @@ __VA_ARGS__ \ break; \ } \ + case 5: { \ + constexpr size_t NUM_EXPERTS_PER_RANK = 5; \ + __VA_ARGS__ \ + break; \ + } \ case 6: { \ constexpr size_t NUM_EXPERTS_PER_RANK = 6; \ __VA_ARGS__ \ @@ -53,6 +58,11 @@ __VA_ARGS__ \ break; \ } \ + case 10: { \ + constexpr size_t NUM_EXPERTS_PER_RANK = 10; \ + __VA_ARGS__ \ + break; \ + } \ case 16: { \ constexpr size_t NUM_EXPERTS_PER_RANK = 16; \ __VA_ARGS__ \ diff --git a/custom_ops/gpu_ops/set_data_ipc.cu b/custom_ops/gpu_ops/set_data_ipc.cu index b7336e5ae65..b8deb0e5d8f 100644 --- a/custom_ops/gpu_ops/set_data_ipc.cu +++ b/custom_ops/gpu_ops/set_data_ipc.cu @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
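The CollectiveMainloopAttn change above recomputes mask_start_idx as mask[0] / kBlockN and applies app_mask to the previous K tile (column base col_base + n_block * kBlockN - kBlockN) only after the QK GEMM has been issued and the K stage released. For reference, the dense semantics of a column-cutoff attention mask can be sketched in NumPy as below; the convention that mask[q] is the first key index query q may not attend to is an assumption for illustration, not taken from the kernel:

```python
# Hedged reference: dense attention with a per-query column cutoff mask.
# Assumption (not taken from the kernel): mask[q] is the first key index
# that query q is NOT allowed to attend to; smaller indices stay visible.
import numpy as np

def masked_attention(q, k, v, mask, scale):
    # q: [Lq, D], k/v: [Lk, D], mask: [Lq] integer cutoffs
    scores = (q @ k.T) * scale                     # [Lq, Lk]
    cols = np.arange(k.shape[0])[None, :]          # [1, Lk]
    scores = np.where(cols < mask[:, None], scores, -np.inf)
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    return probs @ v

Lq, Lk, D = 4, 16, 8
rng = np.random.default_rng(0)
q = rng.standard_normal((Lq, D)).astype(np.float32)
k = rng.standard_normal((Lk, D)).astype(np.float32)
v = rng.standard_normal((Lk, D)).astype(np.float32)
mask = np.array([4, 8, 12, 16])                    # each query sees a different key prefix
out = masked_attention(q, k, v, mask, scale=1.0 / np.sqrt(D))
```

In the tiled kernel only K tiles at or beyond mask[0] / kBlockN can contain cut-off columns, which appears to be what mask_start_idx tracks.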
-#include "helper.h" #include "cuda_multiprocess.h" +#include "helper.h" -int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info) { +int sharedMemoryCreate(const char* name, size_t sz, sharedMemoryInfo* info) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) info->size = sz; - info->shmHandle = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, - PAGE_READWRITE, 0, (DWORD)sz, name); + info->shmHandle = CreateFileMapping( + INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, (DWORD)sz, name); if (info->shmHandle == 0) { return GetLastError(); } @@ -42,20 +42,22 @@ int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info) { status = ftruncate(info->shmFd, sz); if (status != 0) { - return status; + return errno; } info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0); - if (info->addr == NULL) { + if (info->addr == MAP_FAILED) { return errno; } + close(info->shmFd); + info->shmFd = -1; return 0; #endif } template -__global__ void set_data(T *input, int n) { +__global__ void set_data(T* input, int n) { if (threadIdx.x == 0) { for (int i = 0; i < n; ++i) { *(input + i) = static_cast(i); @@ -65,7 +67,7 @@ __global__ void set_data(T *input, int n) { } template -__global__ void print_data(const T *input, int n) { +__global__ void print_data(const T* input, int n) { if (threadIdx.x == 0) { for (int i = 0; i < n; ++i) { printf("input[%d]: %f\n", i, input[i]); @@ -81,72 +83,57 @@ void set_data_ipc(const paddle::Tensor& tmp_input, typedef typename traits_::data_t data_t; sharedMemoryInfo info; - volatile shmStruct *shm = NULL; + volatile shmStruct* shm = NULL; if (sharedMemoryCreate(shm_name.c_str(), sizeof(*shm), &info) != 0) { - printf("Failed to create shared memory slab\n"); - printf("Func sharedMemoryCreate. Shm_name: %s\n", shm_name.c_str()); - exit(EXIT_FAILURE); + printf("Failed to create shared memory slab\n"); + printf("Func sharedMemoryCreate. Shm_name: %s\n", shm_name.c_str()); + exit(EXIT_FAILURE); } - shm = (volatile shmStruct *)info.addr; - memset((void *)shm, 0, sizeof(*shm)); + shm = (volatile shmStruct*)info.addr; + memset((void*)shm, 0, sizeof(*shm)); - void *data_ptr_now = reinterpret_cast(const_cast(tmp_input.data())); + void* data_ptr_now = + reinterpret_cast(const_cast(tmp_input.data())); #ifdef PADDLE_WITH_HIP - checkCudaErrors(hipIpcGetMemHandle((hipIpcMemHandle_t *)&shm->memHandle, data_ptr_now)); + checkCudaErrors( + hipIpcGetMemHandle((hipIpcMemHandle_t*)&shm->memHandle, data_ptr_now)); #else - checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle, data_ptr_now)); + checkCudaErrors( + cudaIpcGetMemHandle((cudaIpcMemHandle_t*)&shm->memHandle, data_ptr_now)); #endif - - } -void SetDataIpc(const paddle::Tensor& tmp_input, - const std::string& shm_name) { - std::vector shape = tmp_input.shape(); - - switch (tmp_input.type()) { - case paddle::DataType::BFLOAT16: { - return set_data_ipc( - tmp_input, - shm_name - ); - } - case paddle::DataType::FLOAT16: { - return set_data_ipc( - tmp_input, - shm_name - ); - } - case paddle::DataType::FLOAT32: { - return set_data_ipc( - tmp_input, - shm_name - ); - } - case paddle::DataType::INT8: { - return set_data_ipc( - tmp_input, - shm_name - ); - } - case paddle::DataType::UINT8: { - return set_data_ipc( - tmp_input, - shm_name - ); - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16, bfloat16 and float32 are supported. 
"); - break; - } +void SetDataIpc(const paddle::Tensor& tmp_input, const std::string& shm_name) { + std::vector shape = tmp_input.shape(); + + switch (tmp_input.type()) { + case paddle::DataType::BFLOAT16: { + return set_data_ipc(tmp_input, shm_name); } + case paddle::DataType::FLOAT16: { + return set_data_ipc(tmp_input, shm_name); + } + case paddle::DataType::FLOAT32: { + return set_data_ipc(tmp_input, shm_name); + } + case paddle::DataType::INT8: { + return set_data_ipc(tmp_input, shm_name); + } + case paddle::DataType::UINT8: { + return set_data_ipc(tmp_input, shm_name); + } + default: { + PD_THROW( + "NOT supported data type. " + "Only float16, bfloat16 and float32 are supported. "); + break; + } + } } PD_BUILD_STATIC_OP(set_data_ipc) .Inputs({"tmp_input"}) - .Attrs({ "shm_name: std::string"}) + .Attrs({"shm_name: std::string"}) .Outputs({"tmp_input_out"}) .SetInplaceMap({{"tmp_input", "tmp_input_out"}}) .SetKernelFn(PD_KERNEL(SetDataIpc)); diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/draft_model_preprocess.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/draft_model_preprocess.cu index 051d20a0324..ea9063640e0 100644 --- a/custom_ops/gpu_ops/speculate_decoding/draft_model/draft_model_preprocess.cu +++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/draft_model_preprocess.cu @@ -15,33 +15,34 @@ #include "helper.h" #include "paddle/extension.h" - #define DISPATCH_BLOCKSIZE(BLOCK_SIZE, ...) \ - do { \ - constexpr int BlockSize = BLOCK_SIZE; \ - __VA_ARGS__; \ + do { \ + constexpr int BlockSize = BLOCK_SIZE; \ + __VA_ARGS__; \ } while (0) -#define DISPATCH_TRUNCATE_FIRST_TOKEN(truncate_first_token, TRUNCATE_FIRST_TOKEN, ...) \ - do { \ - if (truncate_first_token) { \ - constexpr bool TRUNCATE_FIRST_TOKEN = true; \ - __VA_ARGS__; \ - } else { \ - constexpr bool TRUNCATE_FIRST_TOKEN = false; \ - __VA_ARGS__; \ - } \ +#define DISPATCH_TRUNCATE_FIRST_TOKEN( \ + truncate_first_token, TRUNCATE_FIRST_TOKEN, ...) \ + do { \ + if (truncate_first_token) { \ + constexpr bool TRUNCATE_FIRST_TOKEN = true; \ + __VA_ARGS__; \ + } else { \ + constexpr bool TRUNCATE_FIRST_TOKEN = false; \ + __VA_ARGS__; \ + } \ } while (0) -#define DISPATCH_KVCACHE_SCHEDULER(kvcache_scheduler_v1, KVCACHE_SCHEDULER_V1, ...) \ - do { \ - if (kvcache_scheduler_v1) { \ - constexpr bool KVCACHE_SCHEDULER_V1 = true; \ - __VA_ARGS__; \ - } else { \ - constexpr bool KVCACHE_SCHEDULER_V1 = false; \ - __VA_ARGS__; \ - } \ +#define DISPATCH_KVCACHE_SCHEDULER( \ + kvcache_scheduler_v1, KVCACHE_SCHEDULER_V1, ...) \ + do { \ + if (kvcache_scheduler_v1) { \ + constexpr bool KVCACHE_SCHEDULER_V1 = true; \ + __VA_ARGS__; \ + } else { \ + constexpr bool KVCACHE_SCHEDULER_V1 = false; \ + __VA_ARGS__; \ + } \ } while (0) #define DISPATCH_SPLITWISE_PREFILL(splitwise_prefill, SPLITWISE_PREFILL, ...) 
\ @@ -55,8 +56,9 @@ } \ } while (0) - -template +template __global__ void process_splitwise_prefill( int64_t* draft_tokens, int64_t* input_ids, @@ -123,10 +125,9 @@ __global__ void process_splitwise_prefill( } } - - - -template +template __global__ void draft_model_preprocess_kernel( int64_t* draft_tokens, int64_t* input_ids, @@ -139,6 +140,8 @@ __global__ void draft_model_preprocess_kernel( bool* is_block_step, bool* batch_drop, int64_t* pre_ids, + int* mask_rollback, + int* recompute_token_num, const int64_t* accept_tokens, const int* accept_num, const int* base_model_seq_lens_this_time, @@ -170,7 +173,8 @@ __global__ void draft_model_preprocess_kernel( auto* base_model_draft_tokens_now = base_model_draft_tokens + tid * base_model_draft_tokens_len; auto base_model_seq_len_decoder = base_model_seq_lens_decoder[tid]; - const int32_t base_model_seq_len_this_time = base_model_seq_lens_this_time[tid]; + const int32_t base_model_seq_len_this_time = + base_model_seq_lens_this_time[tid]; auto* pre_ids_now = pre_ids + tid * pre_ids_len; #pragma unroll for (int i = 1; i < base_model_draft_tokens_len; i++) { @@ -180,7 +184,7 @@ __global__ void draft_model_preprocess_kernel( // 1. process block_step situation // -- In v0 mode, block_step will drop mtp query. // -- In v1 mode, block_step will continue to infer. - if constexpr(KVCACHE_SCHEDULER_V1) { + if constexpr (KVCACHE_SCHEDULER_V1) { if (base_model_stop_flags[tid] && base_model_is_block_step[tid]) { stop_flags[tid] = true; is_block_step[tid] = true; @@ -213,7 +217,7 @@ __global__ void draft_model_preprocess_kernel( } } else { // decode generation if constexpr (KVCACHE_SCHEDULER_V1) { - // 3. try to recover mtp infer in V1 mode + // 3. try to recover mtp infer in V1 mode if (!base_model_is_block_step[tid] && is_block_step[tid]) { is_block_step[tid] = false; } @@ -221,16 +225,24 @@ __global__ void draft_model_preprocess_kernel( if (stop_flags[tid]) { stop_flags[tid] = false; // TODO: check - seq_lens_decoder[tid] = base_model_seq_len_decoder - base_model_seq_len_this_time; - step_idx[tid] = base_model_step_idx[tid] - base_model_seq_len_this_time; + seq_lens_decoder[tid] = + base_model_seq_len_decoder - base_model_seq_len_this_time; + step_idx[tid] = + base_model_step_idx[tid] - base_model_seq_len_this_time; } else { // 2: Last base model generated token and first MTP token - seq_lens_decoder[tid] -= num_model_step - 1; - step_idx[tid] -= num_model_step - 1; + const int recompute_token_num_now = recompute_token_num[tid]; + seq_lens_decoder[tid] -= recompute_token_num_now; + step_idx[tid] -= recompute_token_num_now; + mask_rollback[tid] += recompute_token_num_now; + // NOTE(liuzichang): Used for PD-split mode and future dynamic + // strategies. 
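A compact sketch of the rollback bookkeeping introduced above for a decode slot that keeps running, including the re-arming of recompute_token_num that follows this comment (plain Python with list-backed state, mirroring the per-tid updates shown in the kernel):

```python
# Hedged sketch of the per-slot rollback bookkeeping for a running decode slot.
def rollback_draft_state(slot, seq_lens_decoder, step_idx,
                         mask_rollback, recompute_token_num, num_model_step):
    rewind = recompute_token_num[slot]
    seq_lens_decoder[slot] -= rewind                  # rewind the decoder length
    step_idx[slot] -= rewind                          # rewind the step counter
    mask_rollback[slot] += rewind                     # accumulate the total rolled-back amount
    recompute_token_num[slot] = num_model_step - 1    # default rewind for the next step

seq_lens_decoder, step_idx = [37], [21]
mask_rollback, recompute_token_num = [0], [2]
rollback_draft_state(0, seq_lens_decoder, step_idx,
                     mask_rollback, recompute_token_num, num_model_step=3)
assert (seq_lens_decoder[0], step_idx[0],
        mask_rollback[0], recompute_token_num[0]) == (35, 19, 2, 2)
```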
+ recompute_token_num[tid] = num_model_step - 1; } for (int i = 0; i < accept_num_now; i++) { draft_tokens_now[i] = accept_tokens_now[i]; - const int pre_id_pos = base_model_step_idx[tid] - (accept_num_now - i); + const int pre_id_pos = + base_model_step_idx[tid] - (accept_num_now - i); const int64_t accept_token = accept_tokens_now[i]; pre_ids_now[pre_id_pos] = accept_token; } @@ -250,103 +262,107 @@ __global__ void draft_model_preprocess_kernel( } } - -void DispatchRunner( - const cudaStream_t &stream, - int64_t* draft_tokens, - int64_t* input_ids, - bool* stop_flags, - int* seq_lens_this_time, - int* seq_lens_encoder, - int* seq_lens_decoder, - int64_t* step_idx, - bool* not_need_stop, - bool* is_block_step, - bool* batch_drop, - int64_t* pre_ids, - const int64_t* accept_tokens, - const int* accept_num, - const int* base_model_seq_lens_this_time, - const int* base_model_seq_lens_encoder, - const int* base_model_seq_lens_decoder, - const int64_t* base_model_step_idx, - const bool* base_model_stop_flags, - const bool* base_model_is_block_step, - int64_t* base_model_draft_tokens, - const int bsz, - const int num_model_step, - const int accept_tokens_len, - const int draft_tokens_len, - const int input_ids_len, - const int base_model_draft_tokens_len, - const int pre_ids_len, - const bool truncate_first_token, - const bool splitwise_prefill, - const bool kvcache_scheduler_v1) { +void DispatchRunner(const cudaStream_t& stream, + int64_t* draft_tokens, + int64_t* input_ids, + bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_encoder, + int* seq_lens_decoder, + int64_t* step_idx, + bool* not_need_stop, + bool* is_block_step, + bool* batch_drop, + int64_t* pre_ids, + int* mask_rollback, + int* recompute_token_num, + const int64_t* accept_tokens, + const int* accept_num, + const int* base_model_seq_lens_this_time, + const int* base_model_seq_lens_encoder, + const int* base_model_seq_lens_decoder, + const int64_t* base_model_step_idx, + const bool* base_model_stop_flags, + const bool* base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1) { DISPATCH_BLOCKSIZE(512, { DISPATCH_TRUNCATE_FIRST_TOKEN(truncate_first_token, TRUNCATE_FIRST_TOKEN, { DISPATCH_KVCACHE_SCHEDULER(kvcache_scheduler_v1, KVCACHE_SCHEDULER_V1, { DISPATCH_SPLITWISE_PREFILL(splitwise_prefill, SPLITWISE_PREFILL, { if constexpr (SPLITWISE_PREFILL) { - process_splitwise_prefill - <<<1, BlockSize, 0, stream>>>( - draft_tokens, - input_ids, - stop_flags, - seq_lens_this_time, - seq_lens_encoder, - seq_lens_decoder, - step_idx, - not_need_stop, - is_block_step, - batch_drop, - pre_ids, - accept_tokens, - accept_num, - base_model_seq_lens_this_time, - base_model_seq_lens_encoder, - base_model_seq_lens_decoder, - base_model_step_idx, - base_model_stop_flags, - base_model_is_block_step, - base_model_draft_tokens, - bsz, - num_model_step, - accept_tokens_len, - draft_tokens_len, - input_ids_len, - base_model_draft_tokens_len, - pre_ids_len); + process_splitwise_prefill + <<<1, BlockSize, 0, stream>>>(draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + step_idx, + not_need_stop, + is_block_step, + batch_drop, + pre_ids, + accept_tokens, + accept_num, + 
base_model_seq_lens_this_time, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags, + base_model_is_block_step, + base_model_draft_tokens, + bsz, + num_model_step, + accept_tokens_len, + draft_tokens_len, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len); } else { - draft_model_preprocess_kernel - <<<1, BlockSize, 0, stream>>>( - draft_tokens, - input_ids, - stop_flags, - seq_lens_this_time, - seq_lens_encoder, - seq_lens_decoder, - step_idx, - not_need_stop, - is_block_step, - batch_drop, - pre_ids, - accept_tokens, - accept_num, - base_model_seq_lens_this_time, - base_model_seq_lens_encoder, - base_model_seq_lens_decoder, - base_model_step_idx, - base_model_stop_flags, - base_model_is_block_step, - base_model_draft_tokens, - bsz, - num_model_step, - accept_tokens_len, - draft_tokens_len, - input_ids_len, - base_model_draft_tokens_len, - pre_ids_len); + draft_model_preprocess_kernel + <<<1, BlockSize, 0, stream>>>(draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + step_idx, + not_need_stop, + is_block_step, + batch_drop, + pre_ids, + mask_rollback, + recompute_token_num, + accept_tokens, + accept_num, + base_model_seq_lens_this_time, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags, + base_model_is_block_step, + base_model_draft_tokens, + bsz, + num_model_step, + accept_tokens_len, + draft_tokens_len, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len); } }); }); @@ -365,6 +381,8 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens, const paddle::Tensor& is_block_step, const paddle::Tensor& batch_drop, const paddle::Tensor& pre_ids, + const paddle::Tensor& mask_rollback, + const paddle::Tensor& recompute_token_num, const paddle::Tensor& accept_tokens, const paddle::Tensor& accept_num, const paddle::Tensor& base_model_seq_lens_this_time, @@ -389,38 +407,39 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens, auto not_need_stop_gpu = not_need_stop.copy_to(seq_lens_this_time.place(), false); - DispatchRunner( - cu_stream, - const_cast(draft_tokens.data()), - const_cast(input_ids.data()), - const_cast(stop_flags.data()), - const_cast(seq_lens_this_time.data()), - const_cast(seq_lens_encoder.data()), - const_cast(seq_lens_decoder.data()), - const_cast(step_idx.data()), - const_cast(not_need_stop_gpu.data()), - const_cast(is_block_step.data()), - const_cast(batch_drop.data()), - const_cast(pre_ids.data()), - accept_tokens.data(), - accept_num.data(), - base_model_seq_lens_this_time.data(), - base_model_seq_lens_encoder.data(), - base_model_seq_lens_decoder.data(), - base_model_step_idx.data(), - base_model_stop_flags.data(), - base_model_is_block_step.data(), - const_cast(base_model_draft_tokens.data()), - real_bsz, - num_model_step, - accept_tokens_len, - draft_tokens_len, - input_ids_len, - base_model_draft_tokens_len, - pre_ids_len, - truncate_first_token, - splitwise_prefill, - kvcache_scheduler_v1); + DispatchRunner(cu_stream, + const_cast(draft_tokens.data()), + const_cast(input_ids.data()), + const_cast(stop_flags.data()), + const_cast(seq_lens_this_time.data()), + const_cast(seq_lens_encoder.data()), + const_cast(seq_lens_decoder.data()), + const_cast(step_idx.data()), + const_cast(not_need_stop_gpu.data()), + const_cast(is_block_step.data()), + const_cast(batch_drop.data()), + const_cast(pre_ids.data()), + const_cast(mask_rollback.data()), + 
const_cast(recompute_token_num.data()), + accept_tokens.data(), + accept_num.data(), + base_model_seq_lens_this_time.data(), + base_model_seq_lens_encoder.data(), + base_model_seq_lens_decoder.data(), + base_model_step_idx.data(), + base_model_stop_flags.data(), + base_model_is_block_step.data(), + const_cast(base_model_draft_tokens.data()), + real_bsz, + num_model_step, + accept_tokens_len, + draft_tokens_len, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len, + truncate_first_token, + splitwise_prefill, + kvcache_scheduler_v1); auto not_need_stop_cpu = not_need_stop_gpu.copy_to(not_need_stop.place(), false); @@ -428,7 +447,6 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens, not_need_stop_data[0] = not_need_stop_cpu.data()[0]; } - PD_BUILD_STATIC_OP(draft_model_preprocess) .Inputs({"draft_tokens", "input_ids", @@ -441,6 +459,8 @@ PD_BUILD_STATIC_OP(draft_model_preprocess) "is_block_step", "batch_drop", "pre_ids", + "mask_rollback", + "recompute_token_num", "accept_tokens", "accept_num", "base_model_seq_lens_this_time", @@ -460,7 +480,10 @@ PD_BUILD_STATIC_OP(draft_model_preprocess) "not_need_stop_out", "batch_drop_out", "pre_ids_out"}) - .Attrs({"num_model_step: int", "truncate_first_token: bool", "splitwise_prefill: bool", "kvcache_scheduler_v1: bool"}) + .Attrs({"num_model_step: int", + "truncate_first_token: bool", + "splitwise_prefill: bool", + "kvcache_scheduler_v1: bool"}) .SetInplaceMap({{"draft_tokens", "draft_tokens_out"}, {"input_ids", "input_ids_out"}, {"stop_flags", "stop_flags_out"}, diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu index 0a703639c71..7d681b0454c 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu @@ -22,7 +22,6 @@ __global__ void speculate_limit_thinking_content_length_kernel_v1( const int64_t* eos_token_ids, int* limit_think_status, int* accept_num, - int* seq_lens_decoder, bool* stop_flags, const int64_t think_end_id, const int tokens_per_step, @@ -39,7 +38,7 @@ __global__ void speculate_limit_thinking_content_length_kernel_v1( if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行 - if (current_limit_think_status == 2 && stop_flags[bid]) { + if (current_limit_think_status == 2 || stop_flags[bid]) { return; } @@ -106,7 +105,6 @@ __global__ void speculate_limit_thinking_content_length_kernel_v1( int discarded_tokens = original_accept_num - new_accept_num; if (discarded_tokens > 0) { step_idx[bid] -= discarded_tokens; - seq_lens_decoder[bid] -= discarded_tokens; } accept_num[bid] = new_accept_num; @@ -119,7 +117,6 @@ void SpeculateLimitThinkingContentLengthV1( const paddle::Tensor& step_idx, const paddle::Tensor& limit_think_status, const paddle::Tensor& accept_num, - const paddle::Tensor& seq_lens_decoder, const paddle::Tensor& stop_flags, const paddle::Tensor& eos_token_ids, const int64_t think_end_id) { @@ -134,7 +131,6 @@ void SpeculateLimitThinkingContentLengthV1( eos_token_ids.data(), const_cast(limit_think_status.data()), const_cast(accept_num.data()), - const_cast(seq_lens_decoder.data()), const_cast(stop_flags.data()), think_end_id, tokens_per_step, @@ -148,7 +144,6 @@ PD_BUILD_STATIC_OP(speculate_limit_thinking_content_length_v1) "step_idx", "limit_think_status", "accept_num", - 
"seq_lens_decoder", "stop_flags", "eos_token_ids"}) .Attrs({"think_end_id: int64_t"}) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu index 709911d2ba0..177892aa755 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu @@ -25,7 +25,6 @@ __global__ void speculate_limit_thinking_content_length_kernel_v2( int64_t* step_idx, int* limit_think_status, int* accept_num, - int* seq_lens_decoder, const bool* stop_flags, const int64_t think_end_id, const int64_t line_break_id, @@ -42,7 +41,7 @@ __global__ void speculate_limit_thinking_content_length_kernel_v2( if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. - if (current_limit_think_status == 3 && stop_flags[bid]) { + if (current_limit_think_status == 3 || stop_flags[bid]) { return; } @@ -115,7 +114,6 @@ __global__ void speculate_limit_thinking_content_length_kernel_v2( int discarded_tokens = original_accept_num - new_accept_num; if (discarded_tokens > 0) { step_idx[bid] -= discarded_tokens; - seq_lens_decoder[bid] -= discarded_tokens; } accept_num[bid] = new_accept_num; @@ -128,7 +126,6 @@ void SpeculateLimitThinkingContentLengthV2( const paddle::Tensor& step_idx, const paddle::Tensor& limit_think_status, const paddle::Tensor& accept_num, - const paddle::Tensor& seq_lens_decoder, const paddle::Tensor& stop_flags, const int64_t think_end_id, const int64_t line_break_id) { @@ -141,7 +138,6 @@ void SpeculateLimitThinkingContentLengthV2( const_cast(step_idx.data()), const_cast(limit_think_status.data()), const_cast(accept_num.data()), - const_cast(seq_lens_decoder.data()), stop_flags.data(), think_end_id, line_break_id, @@ -155,7 +151,6 @@ PD_BUILD_STATIC_OP(speculate_limit_thinking_content_length_v2) "step_idx", "limit_think_status", "accept_num", - "seq_lens_decoder", "stop_flags"}) .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"}) .Outputs({"next_tokens_out"}) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc index 4e547d29776..9ef563c62ca 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc @@ -54,6 +54,10 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, if (!save_each_rank && rank_id > 0) { return; } + + int max_draft_tokens = sampled_token_ids.shape()[1]; + int bsz = token_num_per_batch.shape()[0]; + auto sampled_token_ids_cpu = sampled_token_ids.copy_to(paddle::CPUPlace(), false); auto logprob_token_ids_cpu = @@ -128,7 +132,6 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, msg_sed.meta[0] = not_need_stop.data()[0] ? 
inference_msg_id_from_env : -inference_msg_id_from_env; msg_sed.meta[1] = message_flag; - int bsz = token_num_per_batch.shape()[0]; msg_sed.meta[2] = bsz; int max_num_logprobs = logprob_token_ids.shape()[1]; for (int i = 0; i < bsz; i++) { @@ -146,7 +149,7 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, auto* cur_scores = &cur_batch_msg_sed->scores[j * (K + 1)]; for (int k = 0; k < K + 1; k++) { if (k == 0) { - cur_tokens[k] = (int)sampled_token_ids_data[token_offset + j]; + cur_tokens[k] = (int)sampled_token_ids_data[i * max_draft_tokens + j]; cur_scores[k] = logprob_scores_data[(token_offset + j) * (K + 1) + k]; } else if (k < max_num_logprobs) { cur_tokens[k] = diff --git a/custom_ops/gpu_ops/update_inputs_v1.cu b/custom_ops/gpu_ops/update_inputs_v1.cu index 64230ae2565..7dd786dabfb 100644 --- a/custom_ops/gpu_ops/update_inputs_v1.cu +++ b/custom_ops/gpu_ops/update_inputs_v1.cu @@ -50,6 +50,11 @@ __global__ void update_inputs_kernel_v1(bool* not_need_stop, } if (thread_idx < bsz) { if (stop_flag_now) { + // chuned when max_tokens=1 + if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] < + prompt_lens[thread_idx]) { + topk_ids[thread_idx] = -1; + } seq_lens_this_time[thread_idx] = 0; // stop at next step seq_lens_decoder[thread_idx] = 0; seq_lens_encoder[thread_idx] = 0; diff --git a/custom_ops/xpu_ops/download_dependencies.sh b/custom_ops/xpu_ops/download_dependencies.sh index ad6d4d2dea6..a0ee3b58fb5 100644 --- a/custom_ops/xpu_ops/download_dependencies.sh +++ b/custom_ops/xpu_ops/download_dependencies.sh @@ -12,8 +12,8 @@ rm -rf "$THIRDPARTY_DIR" mkdir -p "$THIRDPARTY_DIR" || exit 1 if [ "$1" == "stable" ]; then - version_xvllm="20251017" - version_xtdk="3.4.0.1" + version_xvllm="20260112" + version_xtdk="4.4.41.1" else version_xvllm="latest" version_xtdk="latest" diff --git a/custom_ops/xpu_ops/src/ops/get_output_msg_with_topk.cc b/custom_ops/xpu_ops/src/ops/get_output_msg_with_topk.cc index f00313e8718..afff264ddc3 100644 --- a/custom_ops/xpu_ops/src/ops/get_output_msg_with_topk.cc +++ b/custom_ops/xpu_ops/src/ops/get_output_msg_with_topk.cc @@ -23,8 +23,8 @@ #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) #endif -#define MAX_BSZ 128 -#define K 5 +#define MAX_BSZ 512 +#define K 20 struct msgdata { long mtype; diff --git a/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess.cc b/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess.cc index a4cf8e68748..bf2f09b9342 100644 --- a/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess.cc +++ b/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess.cc @@ -33,6 +33,8 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens, const paddle::Tensor& is_block_step, const paddle::Tensor& batch_drop, const paddle::Tensor& pre_ids, + const paddle::Tensor& mask_rollback, + const paddle::Tensor& recompute_token_num, const paddle::Tensor& accept_tokens, const paddle::Tensor& accept_num, const paddle::Tensor& base_model_seq_lens_this_time, @@ -114,6 +116,8 @@ PD_BUILD_STATIC_OP(draft_model_preprocess) "is_block_step", "batch_drop", "pre_ids", + "mask_rollback", + "recompute_token_num", "accept_tokens", "accept_num", "base_model_seq_lens_this_time", diff --git a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc index 0400aa02d7d..74d8d829580 100644 --- a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc +++ b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc @@ -292,6 +292,8 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens, const paddle::Tensor& 
is_block_step, const paddle::Tensor& batch_drop, const paddle::Tensor& pre_ids, + const paddle::Tensor& mask_rollback, + const paddle::Tensor& recompute_token_num, const paddle::Tensor& accept_tokens, const paddle::Tensor& accept_num, const paddle::Tensor& base_model_seq_lens_this_time, @@ -659,6 +661,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("is_block_step"), py::arg("batch_drop"), py::arg("pre_ids"), + py::arg("mask_rollback"), + py::arg("recompute_token_num"), py::arg("accept_tokens"), py::arg("accept_num"), py::arg("base_model_seq_lens_this_time"), diff --git a/custom_ops/xpu_ops/src/ops/save_output_msg_with_topk.cc b/custom_ops/xpu_ops/src/ops/save_output_msg_with_topk.cc index 596eb4763c4..07122503209 100644 --- a/custom_ops/xpu_ops/src/ops/save_output_msg_with_topk.cc +++ b/custom_ops/xpu_ops/src/ops/save_output_msg_with_topk.cc @@ -23,8 +23,8 @@ #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) #endif -#define MAX_BSZ 128 -#define K 5 +#define MAX_BSZ 512 +#define K 20 // #define SAVE_WITH_OUTPUT_DEBUG struct msgdata { diff --git a/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc b/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc index 7e1bb881569..f46336426f6 100644 --- a/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc +++ b/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc @@ -17,19 +17,12 @@ #include #include #include +#include "msg_utils.h" #include "paddle/extension.h" -#define MAX_BSZ 256 - -// #define SAVE_WITH_OUTPUT_DEBUG -struct msgdata { - long mtype; - int mtext[MAX_BSZ + 2]; // stop_flag, bsz, tokens -}; - // #define SAVE_WITH_OUTPUT_DEBUG -void SaveOutMmsg(const paddle::Tensor &x, - const paddle::Tensor ¬_need_stop, +void SaveOutMmsg(const paddle::Tensor& x, + const paddle::Tensor& not_need_stop, int64_t rank_id, int msg_queue_id, bool save_each_rank) { @@ -37,10 +30,10 @@ void SaveOutMmsg(const paddle::Tensor &x, return; } auto x_cpu = x.copy_to(paddle::CPUPlace(), false); - int64_t *x_data = x_cpu.data(); + int64_t* x_data = x_cpu.data(); static struct msgdata msg_sed; - if (const char *inference_msg_queue_id_env_p = + if (const char* inference_msg_queue_id_env_p = std::getenv("INFERENCE_MSG_QUEUE_ID")) { std::string inference_msg_queue_id_env_str(inference_msg_queue_id_env_p); int inference_msg_queue_id_from_env = @@ -57,7 +50,7 @@ void SaveOutMmsg(const paddle::Tensor &x, #endif } int inference_msg_id_from_env = 1; - if (const char *inference_msg_id_env_p = std::getenv("INFERENCE_MSG_ID")) { + if (const char* inference_msg_id_env_p = std::getenv("INFERENCE_MSG_ID")) { std::string inference_msg_id_env_str(inference_msg_id_env_p); inference_msg_id_from_env = std::stoi(inference_msg_id_env_str); if (inference_msg_id_from_env == 2) { @@ -111,15 +104,15 @@ void SaveOutMmsg(const paddle::Tensor &x, return; } -void SaveOutMmsgStatic(const paddle::Tensor &x, - const paddle::Tensor ¬_need_stop, +void SaveOutMmsgStatic(const paddle::Tensor& x, + const paddle::Tensor& not_need_stop, int64_t rank_id, bool save_each_rank) { SaveOutMmsg(x, not_need_stop, rank_id, 1, save_each_rank); } -void SaveOutMmsgDynamic(const paddle::Tensor &x, - const paddle::Tensor ¬_need_stop, +void SaveOutMmsgDynamic(const paddle::Tensor& x, + const paddle::Tensor& not_need_stop, int64_t rank_id, int msg_queue_id, bool save_each_rank) { diff --git a/dockerfiles/Dockerfile.gpu b/dockerfiles/Dockerfile.gpu index a9639286140..5ce8b05b199 100644 --- a/dockerfiles/Dockerfile.gpu +++ b/dockerfiles/Dockerfile.gpu @@ -1,6 +1,6 @@ FROM 
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:tag-base -ARG PADDLE_VERSION=3.2.1 -ARG FD_VERSION=2.3.0 +ARG PADDLE_VERSION=3.3.0 +ARG FD_VERSION=2.4.0 ENV DEBIAN_FRONTEND=noninteractive diff --git a/docs/get_started/installation/kunlunxin_xpu.md b/docs/get_started/installation/kunlunxin_xpu.md index d3052c9bb47..7c506973566 100644 --- a/docs/get_started/installation/kunlunxin_xpu.md +++ b/docs/get_started/installation/kunlunxin_xpu.md @@ -28,9 +28,9 @@ Verified platform: ```bash mkdir Work cd Work -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.3.0 +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.4.0 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \ - ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.3.0 \ + ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.4.0 \ /bin/bash docker exec -it fastdeploy-xpu /bin/bash ``` @@ -40,7 +40,7 @@ docker exec -it fastdeploy-xpu /bin/bash ### Install PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` Alternatively, you can install the latest version of PaddlePaddle (Not recommended) @@ -52,7 +52,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/ ### Install FastDeploy (**Do NOT install via PyPI source**) ```bash -python -m pip install fastdeploy-xpu==2.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-xpu==2.4.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple ``` Alternatively, you can install the latest version of FastDeploy (Not recommended) @@ -66,7 +66,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa ### Install PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` Alternatively, you can install the latest version of PaddlePaddle (Not recommended) diff --git a/docs/get_started/installation/nvidia_gpu.md b/docs/get_started/installation/nvidia_gpu.md index 8a9a91f18f3..29f2bbc3a5b 100644 --- a/docs/get_started/installation/nvidia_gpu.md +++ b/docs/get_started/installation/nvidia_gpu.md @@ -23,7 +23,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12 First install paddlepaddle-gpu. For detailed instructions, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) ```shell # Install stable release -python -m pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ # Install latest Nightly build python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ @@ -34,7 +34,7 @@ Then install fastdeploy. **Do not install from PyPI**. 
Use the following methods For SM80/90 architecture GPUs(e.g A30/A100/H100/): ``` # Install stable release -python -m pip install fastdeploy-gpu==2.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-gpu==2.4.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple # Install latest Nightly build python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -43,7 +43,7 @@ python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages For SM86/89 architecture GPUs(e.g A10/4090/L20/L40): ``` # Install stable release -python -m pip install fastdeploy-gpu==2.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-gpu==2.4.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple # Install latest Nightly build python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -64,7 +64,7 @@ docker build -f dockerfiles/Dockerfile.gpu -t fastdeploy:gpu . First install paddlepaddle-gpu. For detailed instructions, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) ```shell -python -m pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ ``` Then clone the source code and build: @@ -92,7 +92,7 @@ First, install paddlepaddle-gpu. For detailed instructions, please refer to the [PaddlePaddle Installation Guide](https://www.paddlepaddle.org.cn/). ```shell -python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ ``` Then, clone the FastDeploy repository and build using the precompiled operator wheels: diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md index bbf88fd1d26..2e0afdafe53 100644 --- a/docs/online_serving/README.md +++ b/docs/online_serving/README.md @@ -223,6 +223,9 @@ include_draft_logprobs: Optional[bool] = False # Whether to return log probabilities during draft stages (e.g., pre-generation or intermediate steps) # for debugging or analysis of the generation process (default False means not returned). +include_logprobs_decode_token: Optional[bool] = True +# Whether to include decoded token in the logprobs/prompt_logprobs results, (default True means the decoded token is always include in results). + logits_processors_args: Optional[Dict] = None # Additional arguments for logits processors, enabling customization of generation logic # (e.g., dynamically adjusting probability distributions). 
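A hedged client-side example of turning this flag off. Only include_logprobs_decode_token itself comes from the parameter list above; the endpoint, port, model name, and the assumption that the flag is accepted at the top level of an OpenAI-style chat request are illustrative:

```python
# Hypothetical client call; endpoint, port, and payload layout are assumptions,
# only the include_logprobs_decode_token field itself is documented above.
import requests

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Hello"}],
    "logprobs": True,
    "top_logprobs": 2,
    "include_logprobs_decode_token": False,   # drop decoded-token text from logprob entries
}
resp = requests.post("http://127.0.0.1:8180/v1/chat/completions", json=payload, timeout=60)
print(resp.json())
```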
@@ -479,6 +482,9 @@ include_draft_logprobs: Optional[bool] = False # Whether to return log probabilities during draft stages (e.g., pre-generation or intermediate steps) # for debugging or analysis of the generation process (default False means not returned). +include_logprobs_decode_token: Optional[bool] = True +# Whether to include decoded token in the prompt_logprobs results, (default True means the decoded token is always include in results). + logits_processors_args: Optional[Dict] = None # Additional arguments for logits processors, enabling customization of generation logic # (e.g., dynamically adjusting probability distributions). diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md index c4c319f83aa..b0c63e8c64e 100644 --- a/docs/usage/environment_variables.md +++ b/docs/usage/environment_variables.md @@ -88,5 +88,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), + + # Worker process health check timeout when waiting for responses in seconds (default: 30) + "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), } ``` diff --git a/docs/zh/get_started/installation/kunlunxin_xpu.md b/docs/zh/get_started/installation/kunlunxin_xpu.md index 5573e8639f5..b0e7f2a64f3 100644 --- a/docs/zh/get_started/installation/kunlunxin_xpu.md +++ b/docs/zh/get_started/installation/kunlunxin_xpu.md @@ -28,9 +28,9 @@ ```bash mkdir Work cd Work -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.3.0 +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.4.0 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \ - ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.3.0 \ + ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.4.0 \ /bin/bash docker exec -it fastdeploy-xpu /bin/bash ``` @@ -40,7 +40,7 @@ docker exec -it fastdeploy-xpu /bin/bash ### 安装 PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` 或者您也可以安装最新版 PaddlePaddle(不推荐) @@ -52,7 +52,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/ ### 安装 FastDeploy(**注意不要通过 pypi 源安装**) ```bash -python -m pip install fastdeploy-xpu==2.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-xpu==2.4.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple ``` 或者你也可以安装最新版 FastDeploy(不推荐) @@ -66,7 +66,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa ### 安装 PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` 或者您也可以安装最新版 PaddlePaddle(不推荐) diff --git a/docs/zh/get_started/installation/nvidia_gpu.md b/docs/zh/get_started/installation/nvidia_gpu.md index 9cb8d65304c..4c3ebdfe623 100644 --- a/docs/zh/get_started/installation/nvidia_gpu.md +++ b/docs/zh/get_started/installation/nvidia_gpu.md @@ -26,7 
+26,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12 ``` shell # Install stable release -python -m pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ # Install latest Nightly build python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ @@ -38,7 +38,7 @@ python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/ ``` # 安装稳定版本fastdeploy -python -m pip install fastdeploy-gpu==2.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-gpu==2.4.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple # 安装Nightly Build的最新版本fastdeploy python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -48,7 +48,7 @@ python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages ``` # 安装稳定版本fastdeploy -python -m pip install fastdeploy-gpu==2.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-gpu==2.4.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple # 安装Nightly Build的最新版本fastdeploy python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -70,7 +70,7 @@ docker build -f dockerfiles/Dockerfile.gpu -t fastdeploy:gpu . 
首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/) ``` shell -python -m pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ ``` 接着克隆源代码,编译安装 @@ -98,7 +98,7 @@ FastDeploy 提供了 GPU 算子预编译版 Wheel 包,可在无需完整源码 首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/) ``` shell -python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +python -m pip install paddlepaddle-gpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ ``` 接着克隆源代码,拉取 whl 包并安装 diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md index 59debbdbf4d..6c2f94de26c 100644 --- a/docs/zh/online_serving/README.md +++ b/docs/zh/online_serving/README.md @@ -218,6 +218,9 @@ top_p_normalized_logprobs: Optional[bool] = False include_draft_logprobs: Optional[bool] = False # 是否在预生成或中间步骤返回对数概率(log probabilities),用于调试或分析生成过程(默认 False 表示不返回)。 +include_logprobs_decode_token: Optional[bool] = True +# 是否在logprobs/prompt_logprobs结果中返回解码后的token,(默认的True表示总是在结果中返回) + logits_processors_args: Optional[Dict] = None # 传递给 logits 处理器(logits processors)的额外参数,用于自定义生成过程中的逻辑(如动态调整概率分布)。 @@ -467,6 +470,9 @@ top_p_normalized_logprobs: Optional[bool] = False include_draft_logprobs: Optional[bool] = False # 是否在预生成或中间步骤返回对数概率(log probabilities),用于调试或分析生成过程(默认 False 表示不返回)。 +include_logprobs_decode_token: Optional[bool] = True +# 是否在prompt_logprobs结果中返回解码后的token,(默认的True表示总是在结果中返回) + logits_processors_args: Optional[Dict] = None # 传递给 logits 处理器(logits processors)的额外参数,用于自定义生成过程中的逻辑(如动态调整概率分布)。 diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md index b0a162a8aa8..119f9fb38bc 100644 --- a/docs/zh/usage/environment_variables.md +++ b/docs/zh/usage/environment_variables.md @@ -87,5 +87,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), # cache_transfer_manager 进程残留时连续错误阈值 - "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),} + "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), + + # Worker 进程响应等待时的健康检查超时时间(秒),默认 30 秒 + "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), +} ``` diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py index b2b8218c805..08f384ec483 100644 --- a/fastdeploy/cache_manager/cache_transfer_manager.py +++ b/fastdeploy/cache_manager/cache_transfer_manager.py @@ -85,6 +85,13 @@ def parse_args(): default="ipc", help="cache transfer protocol, only support ipc now", ) + parser.add_argument( + "--default_dtype", + type=str, + default="bfloat16", + choices=["float16", "bfloat16", "uint8"], + help="paddle default dtype, swap_cache_batch only support float16、bfloat16 and uint8 now", + ) parser.add_argument("--local_data_parallel_id", type=int, default=0) parser.add_argument("--rdma_port", type=str, default="", help="rmda port") parser.add_argument( @@ -125,6 +132,7 @@ def __init__(self, args): self.num_extra_layers = self.speculative_config.num_extra_cache_layer self.num_extra_layer_gpu_blocks = int(self.num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio) + paddle.set_default_dtype(args.default_dtype) self.swap_to_cpu_thread_pool = 
concurrent.futures.ThreadPoolExecutor(max_workers=1) self.swap_to_gpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) self.transfer_task_queue = queue.Queue() # 用来接收传输任务 @@ -149,7 +157,7 @@ def __init__(self, args): name="cache_ready_signal", array=cache_ready_signal_data, dtype=np.int32, - suffix=self.engine_pid, + suffix=args.engine_worker_queue_port, create=False, ) swap_space_ready_data = np.zeros(shape=[args.mp_num], dtype=np.int32) @@ -157,7 +165,7 @@ def __init__(self, args): name="swap_space_ready_signal", array=swap_space_ready_data, dtype=np.int32, - suffix=self.engine_pid, + suffix=args.engine_worker_queue_port, create=False, ) @@ -172,7 +180,7 @@ def __init__(self, args): name="cache_task_broadcast_signal", array=cache_task_broadcast_data, dtype=np.int32, - suffix=args.engine_pid, + suffix=args.engine_worker_queue_port, create=False, ) @@ -186,20 +194,22 @@ def __init__(self, args): suffix=args.engine_worker_queue_port, create=False, ) - threading.Thread(target=self.clear_or_update_caches, args=[args], daemon=True).start() + # Initialize update/clear signals for RL + self.kv_cache_status_signal = IPCSignal( + name="kv_cache_status", + array=np.zeros([1], dtype=np.int32), + dtype=np.int32, + suffix=args.engine_worker_queue_port, + create=False, + ) + threading.Thread(target=self.check_cache_status, args=[args], daemon=True).start() + + self._pause_cond = threading.Condition() + self.is_paused = False # transfer manager state + self.inflight = 0 # number of inflight transfer tasks def _init_gpu_cache(self, args): - try: - assert not args.create_cache_tensor - except: - logger.warn( - f"In current implementation, cache transfer manager do not create cache tensors at all, " - f"meaning create_cache_tensor should be False, while we got {args.create_cache_tensor}. " - f"Cache tensor creation will occur in: 1) model runner in case of mixed deployment; " - f"or 2) cache messager in case of disaggregation deployment. " - f"Please check the codes and make sure they work correctly." 
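The signals used here (cache_ready_signal, swap_space_ready_signal, cache_task_broadcast_signal, kv_cache_status) are small shared int32 arrays that several processes open by a common name and suffix. A self-contained sketch of that underlying idea with the standard library, not the IPCSignal implementation itself:

```python
# Minimal stand-in for a cross-process status flag: a named shared-memory
# block viewed as an int32 array. Not FastDeploy's IPCSignal, just the idea.
from multiprocessing import shared_memory
import numpy as np

name = "demo_kv_cache_status_8021"            # illustrative name/suffix
shm = shared_memory.SharedMemory(name=name, create=True, size=4)
status = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
status[0] = 0                                 # e.g. NORMAL

# Another process would attach by the same name and flip the flag:
peer = shared_memory.SharedMemory(name=name)
peer_view = np.ndarray((1,), dtype=np.int32, buffer=peer.buf)
peer_view[0] = 1                              # e.g. CLEARING

print(int(status[0]))                         # the first view observes the update
peer.close()
shm.close()
shm.unlink()
```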
- ) if not args.create_cache_tensor: logger.info(f"[rank {self.rank}/{self.n_ranks}] Waiting for runners or messagers to create kv cache.") while self.cache_ready_signal.value[self.rank] != 1: @@ -427,6 +437,22 @@ def check_work_status(self, time_interval_threashold=envs.FD_CACHE_PROC_EXIT_TIM return True, "" + def submit_task(self, thread_pool: concurrent.futures.ThreadPoolExecutor, task_fn, *args): + + def inflight_task(fn, *args): + try: + return fn(*args) + finally: + with self._pause_cond: + self.inflight -= 1 + if self.inflight == 0: + self._pause_cond.notify_all() + + with self._pause_cond: + self._pause_cond.wait_for(lambda: not self.is_paused) + self.inflight += 1 + thread_pool.submit(inflight_task, task_fn, *args) + def do_data_transfer(self): """ do data transfer task @@ -459,7 +485,8 @@ def do_data_transfer(self): transfer_task_id, ) = data if event_type.value == CacheStatus.SWAP2CPU.value: - self.swap_to_cpu_thread_pool.submit( + self.submit_task( + self.swap_to_cpu_thread_pool, self._do_swap_to_cpu_task, swap_node_ids, gpu_block_id, @@ -468,7 +495,8 @@ def do_data_transfer(self): transfer_task_id, ) else: - self.swap_to_gpu_thread_pool.submit( + self.submit_task( + self.swap_to_gpu_thread_pool, self._do_swap_to_gpu_task, swap_node_ids, gpu_block_id, @@ -634,97 +662,141 @@ def _transfer_data( transfer_task_id, ) - def clear_or_update_caches(self, args): + def check_cache_status(self, args): # TODO XPU support RL if unset_data_ipc is None: return - logger.info("Start a thread to clear/restore kv cache when model weights are cleared/updated.") - logger.info(f"FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}") - kv_cache_status = np.zeros([1], dtype=np.int32) - kv_cache_status_signal = IPCSignal( - name="kv_cache_status", - array=kv_cache_status, - dtype=np.int32, - suffix=self.engine_pid, - create=False, - ) + logger.info("[RL] Launch a thread to clear/restore kv cache when model weights are cleared/updated.") while True: - if kv_cache_status_signal.value[0] == KVCacheStatus.CLEARING: + # handle cache clearing/restoring + if self.kv_cache_status_signal.value[0] == KVCacheStatus.CLEARING: assert args.splitwise_role == "mixed", "Only mixed mode supports clearing cache." 
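submit_task above gates new transfer work behind a condition variable and counts in-flight tasks so that a later pause can wait for the pool to drain. The same pattern as a standalone sketch (simplified; not the methods defined in this file):

```python
# Simplified pause/drain pattern: block new submissions while paused and
# let pause() return only after every in-flight task has finished.
import threading
from concurrent.futures import ThreadPoolExecutor

class PausableSubmitter:
    def __init__(self, workers=1):
        self._pool = ThreadPoolExecutor(max_workers=workers)
        self._cond = threading.Condition()
        self._paused = False
        self._inflight = 0

    def submit(self, fn, *args):
        with self._cond:
            self._cond.wait_for(lambda: not self._paused)   # hold new work while paused
            self._inflight += 1

        def run():
            try:
                return fn(*args)
            finally:
                with self._cond:
                    self._inflight -= 1
                    if self._inflight == 0:
                        self._cond.notify_all()

        return self._pool.submit(run)

    def pause(self):
        with self._cond:
            self._paused = True
            self._cond.wait_for(lambda: self._inflight == 0)  # drain before returning

    def resume(self):
        with self._cond:
            self._paused = False
            self._cond.notify_all()
```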
try: - logger.info( - f"[rank {self.rank}/{self.n_ranks}] Start clearing caches {self.cache_ready_signal.value}" - ) + # wait for inflight transfer tasks to finish and pause transfer manager + self.pause() + # clear cpu caches - if envs.FD_ENABLE_SWAP_SPACE_CLEARING: + logger.info("[RL] start clearing caches") + logger.debug("[RL] start clearing cpu caches") + if self.num_cpu_blocks > 0 and envs.FD_ENABLE_SWAP_SPACE_CLEARING: paddle.set_device("cpu") for ptrs in self.k_dst_ptrs + self.v_dst_ptrs: cuda_host_free(ptrs) self.cpu_cache_kvs.clear() self.k_dst_ptrs.clear() self.v_dst_ptrs.clear() + if self.cache_dtype == "block_wise_fp8": + self.k_scales_ptrs.clear() + self.v_scales_ptrs.clear() gc.collect() + logger.debug("[RL] successfully cleared cpu caches") # reset swap_space_ready_signal self.swap_space_ready_signal.value[self.rank] = 0 while np.sum(self.swap_space_ready_signal.value) != 0: time.sleep(0.1) + logger.debug("[RL] all ranks cleared cpu caches") + else: + logger.debug("[RL] skip clearing cpu caches") # clear gpu caches - set_device(self.device) - for name, tensor in self.gpu_cache_kvs.items(): - unset_data_ipc(tensor, name, True, False) - self.gpu_cache_kvs.clear() - self.gpu_cache_k_tensors.clear() - self.gpu_cache_v_tensors.clear() - - # reset cache_ready_signal - self.cache_ready_signal.value[self.rank] = 0 - logger.info( - f"[rank {self.rank}/{self.n_ranks}] Finish clearing caches {self.cache_ready_signal.value}" - ) + logger.debug("[RL] start clearing gpu caches") + if args.create_cache_tensor: + logger.info("[RL] waiting for gpu runner to unlink cuda ipc") + while self.cache_ready_signal.value[self.rank] != 0: + time.sleep(0.1) + logger.info("[RL] stop waiting! gpu runner has unlinked cuda ipc") + paddle.set_device(f"gpu:{self.device}") + self.gpu_cache_kvs.clear() + self.gpu_cache_k_tensors.clear() + self.gpu_cache_v_tensors.clear() + if self.cache_dtype == "block_wise_fp8": + self.gpu_cache_scales_k_tensors.clear() + self.gpu_cache_scales_v_tensors.clear() + paddle.device.cuda.empty_cache() + logger.debug("[RL] successfully cleared gpu caches") + else: + for name, tensor in self.gpu_cache_kvs.items(): + unset_data_ipc(tensor, name, True, False) + logger.debug("[RL] successfully unlinked gpu caches cuda ipc") + self.cache_ready_signal.value[self.rank] = 0 - # wait for all ranks caches to be cleared - if np.sum(self.cache_ready_signal.value) != 0: + while np.sum(self.cache_ready_signal.value) != 0: time.sleep(0.1) + logger.info("[RL] all ranks cleared caches!") # reset kv_cache_status_signal - kv_cache_status_signal.value[0] = KVCacheStatus.CLEARED - logger.info("All ranks finish clearing caches") + self.kv_cache_status_signal.value[0] = KVCacheStatus.CLEARED + + self._log_memory("after clearing caches") except Exception as e: - logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to clear caches: {e}") + logger.error(f"[RL] failed to clear caches: {e}") - elif kv_cache_status_signal.value[0] == KVCacheStatus.UPDATING: + elif self.kv_cache_status_signal.value[0] == KVCacheStatus.UPDATING: assert args.splitwise_role == "mixed", "Only mixed mode supports updating cache." 
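Both the clearing branch above and the restoring branch that follows spin on np.sum(...) of a shared signal until every rank has reported. That polling idiom, factored into a hedged helper (the timeout parameter is an illustrative addition, not present in the code here):

```python
# Poll a shared int32 signal array until its sum reaches the expected value.
import time
import numpy as np

def wait_for_ranks(signal_array, expected_sum, poll_s=0.1, timeout_s=None):
    start = time.monotonic()
    while int(np.sum(signal_array)) != expected_sum:
        if timeout_s is not None and time.monotonic() - start > timeout_s:
            raise TimeoutError(f"signal stuck at {int(np.sum(signal_array))}, expected {expected_sum}")
        time.sleep(poll_s)

ready = np.zeros(4, dtype=np.int32)   # e.g. one slot per tensor-parallel rank
ready[:] = 1                          # pretend all ranks reported in
wait_for_ranks(ready, expected_sum=4)
```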
try: - logger.info( - f"[rank {self.rank}/{self.n_ranks}] Start restoring caches {self.cache_ready_signal.value}" - ) # restore cpu cache - if envs.FD_ENABLE_SWAP_SPACE_CLEARING: + logger.info("[RL] start restoring caches") + logger.debug("[RL] start restoring cpu caches") + if self.num_cpu_blocks > 0 and envs.FD_ENABLE_SWAP_SPACE_CLEARING: self._init_cpu_cache(args) + logger.debug("[RL] successfully restored cpu caches") while np.sum(self.swap_space_ready_signal.value) != args.mp_num: time.sleep(0.1) + logger.debug("[RL] all ranks restored cpu caches") + else: + logger.debug("[RL] skip restoring cpu caches") # restore gpu cache and set cache_ready_signal + logger.debug("[RL] start restoring gpu caches") self._init_gpu_cache(args) - logger.info( - f"[rank {self.rank}/{self.n_ranks}] Finish restoring caches {self.cache_ready_signal.value}" - ) + logger.debug("[RL] successfully restored gpu caches") # wait for all ranks caches to be ready while np.sum(self.cache_ready_signal.value) != args.mp_num: time.sleep(0.1) + logger.info("[RL] all ranks restored caches!") # set kv_cache_status_signal - logger.info("All ranks finish restoring caches") - kv_cache_status_signal.value[0] = KVCacheStatus.NORMAL + self.kv_cache_status_signal.value[0] = KVCacheStatus.NORMAL + + self._log_memory("after restoring caches") + + # resume transfer + self.resume() except Exception as e: - logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to restore caches: {e}") + logger.error(f"[RL] failed to restore caches: {e}") time.sleep(0.1) + def pause(self): + logger.info("[RL] wait for inflight transfer tasks to finish and pause transfer manager 🔴") + with self._pause_cond: + self.is_paused = True + self._pause_cond.wait_for(lambda: self.inflight == 0) + + def resume(self): + logger.info("[RL] resume transfer manager and start to do transfer tasks 🟢") + with self._pause_cond: + self.is_paused = False + self._pause_cond.notify_all() + + def _log_memory(self, context: str): + """Log current GPU memory usage.""" + max_alloc = paddle.device.cuda.max_memory_allocated() / (1024**3) + max_reserved = paddle.device.cuda.max_memory_reserved() / (1024**3) + curr_alloc = paddle.device.cuda.memory_allocated() / (1024**3) + curr_reserved = paddle.device.cuda.memory_reserved() / (1024**3) + + logger.warning( + f"GPU memory usage {context}:" + f"max_allocated: {max_alloc:.2f}GB " + f"max_reserved: {max_reserved:.2f}GB " + f"current_allocated: {curr_alloc:.2f}GB " + f"current_reserved: {curr_reserved:.2f}GB" + ) + def main(): """ diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index a3c610965a5..311645f6d5f 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -254,36 +254,38 @@ def launch_cache_manager( val_shape_str = str(val_cache_shape) val_cache_arg_str = f" --value_cache_shape {val_shape_str}" - for i in range(tensor_parallel_size): - launch_cmd = ( - "FLAGS_allocator_strategy=auto_growth " - + visible_devices - + " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0" - + f" FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}" - + f" {sys.executable} {py_path}" - + f" --device_id {int(device_ids[i])}" - + f" --rank {i}" - + f" --splitwise_role {self.splitwise_role}" - + f" --num_layers {cache_config.model_cfg.num_hidden_layers}" - + f" --mp_num {tensor_parallel_size}" - + f" --cache_dtype {cache_config.cache_dtype}" - + f" --key_cache_shape {key_cache_shape}" - + val_cache_arg_str - + f" 
--cache_queue_port {cache_config.cache_queue_port}" - + f" --enable_splitwise {int(self.enable_splitwise)}" - + f" --pod_ip {pod_ip}" - + f" --engine_worker_queue_port {engine_worker_queue_port}" - + f" --num_cpu_blocks {cache_config.num_cpu_blocks}" - + f" --engine_pid {pid_suffix}" - + f" --protocol {cache_config.cache_transfer_protocol}" - + f" --local_data_parallel_id {self.local_data_parallel_id}" - + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" - + f" --speculative_config '{self.speculative_config.to_json_string()}'" - + (" --create_cache_tensor" if create_cache_tensor else "") - + f" >{log_dir}/launch_cache_transfer_manager_tprank{i}.log 2>&1" - ) - logger.info(f"Launch cache transfer manager, command:{launch_cmd}") - cache_manager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid)) + if self.cache_config.enable_hierarchical_cache: + for i in range(tensor_parallel_size): + launch_cmd = ( + "FLAGS_allocator_strategy=auto_growth " + + visible_devices + + " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0" + + f" FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}" + + f" {sys.executable} {py_path}" + + f" --device_id {int(device_ids[i])}" + + f" --rank {i}" + + f" --splitwise_role {self.splitwise_role}" + + f" --num_layers {cache_config.model_cfg.num_hidden_layers}" + + f" --mp_num {tensor_parallel_size}" + + f" --cache_dtype {cache_config.cache_dtype}" + + f" --key_cache_shape {key_cache_shape}" + + val_cache_arg_str + + f" --cache_queue_port {cache_config.cache_queue_port}" + + f" --enable_splitwise {int(self.enable_splitwise)}" + + f" --pod_ip {pod_ip}" + + f" --engine_worker_queue_port {engine_worker_queue_port}" + + f" --num_cpu_blocks {cache_config.num_cpu_blocks}" + + f" --engine_pid {pid_suffix}" + + f" --default_dtype '{self.config.model_config.dtype}'" + + f" --protocol {cache_config.cache_transfer_protocol}" + + f" --local_data_parallel_id {self.local_data_parallel_id}" + + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --speculative_config '{self.speculative_config.to_json_string()}'" + + (" --create_cache_tensor" if not self.enable_splitwise else "") + + f" >{log_dir}/launch_cache_transfer_manager_tprank{i}.log 2>&1" + ) + logger.info(f"Launch cache transfer manager, command:{launch_cmd}") + cache_manager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid)) logger.info("PrefixCacheManager is waiting for kv cache to be initialized.") while np.sum(self.cache_ready_signal.value) != tensor_parallel_size: @@ -293,13 +295,14 @@ def launch_cache_manager( while np.sum(self.swap_space_ready_signal.value) != tensor_parallel_size: time.sleep(1) - exit_code = cache_manager_processes[-1].poll() - if exit_code is None: - logger.info("Launch cache transfer manager successful") - else: - logger.info( - "Launch cache transfer manager failed, see launch_cache_transfer_manager.log for more information" - ) + if cache_manager_processes: + exit_code = cache_manager_processes[-1].poll() + if exit_code is None: + logger.info("Launch cache transfer manager successful") + else: + logger.info( + "Launch cache transfer manager failed, see launch_cache_transfer_manager.log for more information" + ) # Start additional threads if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0: @@ -684,6 +687,8 @@ def request_match_blocks(self, task, block_size, *args): "cpu_cache_blocks": 0, "gpu_match_token_num": 0, 
"cpu_match_token_num": 0, + "match_gpu_block_ids": [], + "match_cpu_block_ids": [], } self.metrics.req_count += 1 if isinstance(task.prompt_token_ids, np.ndarray): @@ -742,6 +747,8 @@ def request_match_blocks(self, task, block_size, *args): hit_info["cpu_cache_blocks"] = len(match_cpu_block_ids) hit_info["gpu_match_token_num"] = gpu_match_token_num hit_info["cpu_match_token_num"] = cpu_match_token_num + hit_info["match_gpu_block_ids"] = match_gpu_block_ids + hit_info["match_cpu_block_ids"] = match_cpu_block_ids self.metrics._update_history_hit_metrics() if self.metrics.req_count % 10000 == 0: self.metrics.reset_metrics() @@ -1269,64 +1276,6 @@ def hash_block_features(self, input_ids, extra_keys: list = []): """ return hashlib.sha256(pickle.dumps((input_ids, extra_keys))).hexdigest() - def _revert_match_blocks( - self, - request, - matched_token_num: int, - block_size: int, - chunk_idx: int, - match_node_ids: list, - matche_nodes: list, - match_gpu_block_ids: list, - match_cpu_block_ids: list, - gpu_match_token_num: int, - cpu_match_token_num: int, - swap_node_ids: list, - ): - position = request.multimodal_inputs["mm_positions"][chunk_idx] - revert_tokens = matched_token_num - position.offset - match_block_ids = [node.block_id for node in matche_nodes] - logger.warning( - f"match_block: req_id {request.request_id} revert tokens: {revert_tokens} from matched nodes: {match_block_ids}" - ) - while revert_tokens >= block_size: - if len(matche_nodes) == 0: - logger.error(f"req_id {request.request_id} revert nodes error, tokens: {revert_tokens}") - break - revert_tokens -= block_size - revert_block = matche_nodes.pop() - revert_block_id = revert_block.block_id - if revert_block_id in match_gpu_block_ids: - match_gpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - gpu_match_token_num -= block_size - elif revert_block_id in match_cpu_block_ids: - match_cpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - cpu_match_token_num -= block_size - else: - logger.error( - f"req_id {request.request_id} revert nodes error, nodes: {revert_block_id}, " - f"match_gpu_block_ids: {match_gpu_block_ids}, match_cpu_block_ids: {match_cpu_block_ids}" - ) - break - if revert_block_id in swap_node_ids: - swap_node_ids.remove(revert_block_id) - - if revert_tokens > 0: - last_block_id = matche_nodes[-1].block_id - if last_block_id in match_gpu_block_ids: - gpu_match_token_num -= revert_tokens - elif last_block_id in match_cpu_block_ids: - cpu_match_token_num -= revert_tokens - else: - logger.error( - f"req_id {request.request_id} revert nodes error, revert_tokens: {revert_tokens}, nodes: {last_block_id}, " - f"match_gpu_block_ids: {match_gpu_block_ids}, match_cpu_block_ids: {match_cpu_block_ids}" - ) - current_node = self.radix_tree_root if len(matche_nodes) == 0 else matche_nodes[-1] - return gpu_match_token_num, cpu_match_token_num, current_node - def mm_match_block(self, request, block_size): """ Match and retrieve cached blocks for multimodal requests using a radix tree structure. 
@@ -1415,28 +1364,6 @@ def mm_match_block(self, request, block_size): if has_modified_cpu_lru_leaf_heap: heapq.heapify(self.cpu_lru_leaf_heap) - if self.cache_config.disable_chunked_mm_input: - matched_token_num = gpu_match_token_num + cpu_match_token_num - is_chunked, chunk_idx = self.is_chunked_mm_input(request.multimodal_inputs, matched_token_num) - if is_chunked: - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self._revert_match_blocks( - request=request, - matched_token_num=matched_token_num, - block_size=block_size, - chunk_idx=chunk_idx, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=swap_node_ids, - ) - logger.info(f"match_block: req_id {request.request_id} matched nodes: {match_node_ids}") return ( match_gpu_block_ids, diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 63ac382d108..da63eb0bdcf 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -127,6 +127,11 @@ class ErnieArchitectures: "Ernie4_5_VLMoeForProcessRewardModel", } + ERNIE5_MODELS = { + "Ernie5ForCausalLM", + "Ernie5MoeForCausalLM", + } + @classmethod def register_ernie_model_arch(cls, model_class): if model_class.name().startswith("Ernie") and model_class.name() not in cls.ARCHITECTURES: @@ -142,6 +147,11 @@ def is_ernie_arch(cls, architecture): """Check if the given architecture is an ERNIE architecture.""" return architecture in cls.ARCHITECTURES + @classmethod + def is_ernie5_arch(cls, architectures): + """Check if the given architecture is an ERNIE5 architecture.""" + return any(arch in architectures for arch in cls.ERNIE5_MODELS) + PRETRAINED_INIT_CONFIGURATION = { "top_p": 1.0, @@ -200,6 +210,7 @@ def __init__( self.revision = None self.prefix_layer_name = "layers" self.kv_cache_quant_scale_path = "" + self.enable_entropy = False self.partial_rotary_factor: float = 1.0 self.num_nextn_predict_layers = 0 @@ -313,6 +324,9 @@ def override_name_from_config(self): self.moe_num_experts = self.num_experts if hasattr(self, "n_routed_experts") and getattr(self, "moe_num_experts") is None: self.moe_num_experts = self.n_routed_experts + if hasattr(self, "n_shared_experts") and getattr(self, "moe_num_shared_experts") is None: + # Because the ERNIE 4.5 config.json contains two sets of keys, adaptation is required. + self.moe_num_shared_experts = self.n_shared_experts def read_from_env(self): """ @@ -566,6 +580,8 @@ def __init__( self.use_internode_ll_two_stage: bool = False # disable sequence parallel moe self.disable_sequence_parallel_moe: bool = False + # shutdown comm group if worker idle + self.shutdown_comm_group_if_worker_idle: bool = None self.pod_ip: str = None # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce). 
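A quick usage note on the new is_ernie5_arch helper: unlike is_ernie_arch, it takes the full architectures list from the model config and checks membership against ERNIE5_MODELS; postprocess() later relies on it to force disable_chunked_mm_input for ERNIE5 models. A standalone sketch of the check, mirroring the class attributes added above:

ERNIE5_MODELS = {"Ernie5ForCausalLM", "Ernie5MoeForCausalLM"}

def is_ernie5_arch(architectures):
    """Return True if any listed architecture is an ERNIE5 model."""
    return any(arch in architectures for arch in ERNIE5_MODELS)

assert is_ernie5_arch(["Ernie5MoeForCausalLM"])
assert not is_ernie5_arch(["Ernie4_5_VLMoeForProcessRewardModel"])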
@@ -585,6 +601,9 @@ def __init__( self.expert_parallel_size = 1 self.use_ep = self.expert_parallel_size > 1 + if self.shutdown_comm_group_if_worker_idle is None: + self.shutdown_comm_group_if_worker_idle = not self.use_ep + # pd_disaggregation use_pd_disaggregation: int = int(os.getenv("FLAGS_use_pd_disaggregation", 0)) use_pd_disaggregation_per_chunk: int = int(os.getenv("FLAGS_use_pd_disaggregation_per_chunk", 0)) @@ -685,6 +704,8 @@ def __init__( self.num_extra_cache_layer = 0 + self.enable_draft_logprob: bool = False + for key, value in args.items(): if hasattr(self, key): setattr(self, key, value) @@ -901,17 +922,19 @@ def init_with_cudagrpah_size(self, max_capture_size: int = 0) -> None: self.real_shape_to_captured_size[bs] = end self.real_shape_to_captured_size[self.max_capture_size] = self.max_capture_size - def _set_cudagraph_sizes(self, max_capture_size: int = 0): + def _set_cudagraph_sizes(self, max_capture_size: int = 0, dec_token_per_query_per_step: int = 1): """ Calculate a series of candidate capture sizes, and then extract a portion of them as the capture list for the CUDA graph based on user input. """ - # Shape [1, 2, 4, 8, 16, ... 120, 128] - draft_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)] - # Shape [128, 144, ... 240, 256] - draft_capture_sizes += [16 * i for i in range(9, 17)] - # Shape [256, 288, ... 992, 1024] - draft_capture_sizes += [32 * i for i in range(9, 33)] + # Shape [1, 2, 4, 8, 16, ... 120, 128] * dec_token_per_query_per_step + draft_capture_sizes = [i * dec_token_per_query_per_step for i in [1, 2, 4]] + [ + 8 * i * dec_token_per_query_per_step for i in range(1, 17) + ] + # Shape [128, 144, ... 240, 256] * dec_token_per_query_per_step + draft_capture_sizes += [16 * i * dec_token_per_query_per_step for i in range(9, 17)] + # Shape [256, 288, ... 
992, 1024] * dec_token_per_query_per_step + draft_capture_sizes += [32 * i * dec_token_per_query_per_step for i in range(9, 33)] draft_capture_sizes.append(max_capture_size) self.cudagraph_capture_sizes = sorted(draft_capture_sizes) @@ -1271,6 +1294,8 @@ def __init__(self, args): self.max_processor_cache = None self.enable_output_caching = False self.disable_chunked_mm_input = False + self.num_cpu_blocks = None + for key, value in args.items(): if hasattr(self, key): setattr(self, key, value) @@ -1322,10 +1347,12 @@ def __init__(self, args): * byte_size ) - if self.swap_space is None: - self.num_cpu_blocks = 0 - else: - self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block) + if self.num_cpu_blocks is None: + if self.swap_space is None: + self.num_cpu_blocks = 0 + else: + self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block) + self._verify_args() def metrics_info(self): @@ -1484,6 +1511,40 @@ def __str__(self) -> str: return json.dumps({key: value for key, value in self.__dict__.items()}) +class RoutingReplayConfig: + """Configuration for Routing Replay used in RL training""" + + def __init__(self, args) -> None: + + self.enable_routing_replay: bool = False + + # Routing store type: local/rdma + self.routing_store_type: str = "local" + + # Local routing store + self.local_store_dir: str = "./routing_replay_output" + + # RDMA routing store + self.rdma_store_server: str = "" + + # Only save last turn + self.only_last_turn: bool = False + + # Fused routing of all layers + self.use_fused_put: bool = False + + if args is not None: + for key, value in args.items(): + if hasattr(self, key) and value != "None": + setattr(self, key, value) + + def to_json_string(self): + """ + Convert routing replay config to json string. + """ + return json.dumps({key: value for key, value in self.__dict__.items()}) + + class FDConfig: """ The configuration class which contains all fastdeploy-related configuration. 
This @@ -1517,6 +1578,7 @@ def __init__( early_stop_config: Optional[Dict[str, Any]] = None, tool_parser: str = None, test_mode=False, + routing_replay_config: Optional[RoutingReplayConfig] = None, ): self.model_config: ModelConfig = model_config # type: ignore self.cache_config: CacheConfig = cache_config # type: ignore @@ -1533,6 +1595,7 @@ def __init__( self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config self.router_config: RouterConfig = router_config + self.routing_replay_config = routing_replay_config # Initialize cuda graph capture list max_capture_shape = self.scheduler_config.max_num_seqs @@ -1547,12 +1610,16 @@ def __init__( max_capture_shape = min(512, max_capture_shape) if self.graph_opt_config.cudagraph_capture_sizes is None: - self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape) + dec_token_per_query_per_step = ( + self.speculative_config.num_speculative_tokens + 1 + if self.speculative_config is not None and self.speculative_config.method is not None + else 1 + ) + self.graph_opt_config._set_cudagraph_sizes( + max_capture_size=max_capture_shape, dec_token_per_query_per_step=dec_token_per_query_per_step + ) self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape) - if self.parallel_config.use_ep: - self.graph_opt_config.cudagraph_capture_sizes = [0] + self.graph_opt_config.cudagraph_capture_sizes - self.tokenizer = tokenizer self.ips = ips self.tool_parser = tool_parser @@ -1634,6 +1701,11 @@ def postprocess(self): """ calculate some parameters """ + # Unified field model config + if self.model_config.architectures[0] == "Glm4MoeForCausalLM": + # The first moe layer id of GLM4.5 model + self.model_config.moe_layer_start_index = self.model_config.first_k_dense_replace + self.local_device_ids = self.parallel_config.device_ids.split(",")[: self.parallel_config.tensor_parallel_size] if self.parallel_config.tensor_parallel_size <= self.worker_num_per_node or self.node_rank == 0: @@ -1664,7 +1736,6 @@ def postprocess(self): self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs) if self.model_config is not None and self.model_config.enable_mm and not envs.ENABLE_V1_KVCACHE_SCHEDULER: self.cache_config.enable_prefix_caching = False - if ( self.structured_outputs_config is not None and self.structured_outputs_config.guided_decoding_backend != "off" @@ -1714,18 +1785,6 @@ def postprocess(self): if not current_platform.is_cuda(): self.graph_opt_config.use_cudagraph = False logger.info("CUDAGraph currently only support on GPU!") - if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: - if self.scheduler_config.max_num_seqs < self.parallel_config.tensor_parallel_size: - self.parallel_config.use_sequence_parallel_moe = False - logger.info( - "Warning: sequence parallel moe do not support max_num_seqs < tensor_parallel_size when cudagraph enabled. We set use_sequence_parallel_moe to False." 
- ) - else: - # It will hang when real batch_size < tp_size - self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size) - if self.model_config.enable_mm and self.graph_opt_config.use_cudagraph: - self.cache_config.enable_prefix_caching = False - logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!") if self.scheduler_config.splitwise_role == "mixed": self._disable_sequence_parallel_moe_if_needed("Mixed") @@ -1737,6 +1796,19 @@ def postprocess(self): self.model_config.moe_phase = MoEPhase(phase="decode") else: raise NotImplementedError + if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: + if self.scheduler_config.max_num_seqs < self.parallel_config.tensor_parallel_size: + self.parallel_config.use_sequence_parallel_moe = False + logger.info( + "Warning: sequence parallel moe do not support max_num_seqs < tensor_parallel_size when cudagraph enabled. We set use_sequence_parallel_moe to False." + ) + else: + # It will hang when real batch_size < tp_size + self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size) + + if ErnieArchitectures.is_ernie5_arch(self.model_config.architectures): + # ernie5 model not support chunked_mm_input + self.cache_config.disable_chunked_mm_input = True def check(self): """ diff --git a/fastdeploy/distributed/communication.py b/fastdeploy/distributed/communication.py index a8581595670..922fbb3df8e 100644 --- a/fastdeploy/distributed/communication.py +++ b/fastdeploy/distributed/communication.py @@ -56,6 +56,8 @@ def tensor_model_parallel_all_reduce( group_: paddle.distributed.communication.group.Group = None, ) -> paddle.Tensor: """All-reduce the input tensor across model parallel group.""" + if input_.shape[0] == 0: + return input_ global _TP_AR if _TP_AR is not None and _TP_AR.should_custom_ar(input_): # TODO: supports different_group custom allreduce @@ -90,6 +92,8 @@ def all_reduce( @paddle.jit.marker.unified def tensor_model_parallel_all_reduce_custom(input_: paddle.Tensor) -> paddle.Tensor: """All-reduce the input tensor across model parallel group on calc stream.""" + if input_.shape[0] == 0: + return input_ if paddle.in_dynamic_mode(): hcg = dist.fleet.get_hybrid_communicate_group() mp_group = hcg.get_model_parallel_group() diff --git a/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py b/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py index dfbed094dd6..0c9be796ced 100644 --- a/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py +++ b/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py @@ -207,6 +207,10 @@ def register_graph_buffers(self): def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]: """The main allreduce API that provides support for cuda graph.""" + + if input.shape[0] == 0: + return input + if self.capturing: lib = cuda_wrapper.CudaRTLibrary() stream = paddle.device.current_stream() diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 1eaf535498a..75e84447761 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -35,6 +35,7 @@ PlasAttentionConfig, PoolerConfig, RouterConfig, + RoutingReplayConfig, RunnerOption, SpeculativeConfig, StructuredOutputsConfig, @@ -237,7 +238,7 @@ class EngineArgs: """ Flag to enable prefix caching. 
""" - enable_output_caching: bool = True + enable_output_caching: bool = False """ Flag to enable kv cache for output tokens, only valid in V1 scheduler. """ @@ -265,6 +266,11 @@ class EngineArgs: # This optimization is enabled by default, and can be disabled by using this flag. """ + shutdown_comm_group_if_worker_idle: bool = None + """ + Whether to shutdown the comm group when the weight is cleared. + """ + engine_worker_queue_port: str = "0" """ Port for worker queue communication. @@ -491,6 +497,16 @@ class EngineArgs: Configuration for eplb. """ + routing_replay_config: Optional[Dict[str, Any]] = None + """ + Flag to rollout routing replay(r3) + """ + + enable_entropy: bool = False + """ + Flag to enable entropy output. Default is False (disabled). + """ + def __post_init__(self): """ Post-initialization processing to set default tokenizer if not provided. @@ -798,6 +814,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.logits_processors, help="FQCNs (Fully Qualified Class Names) of logits processors supported by the service.", ) + model_group.add_argument( + "--enable-entropy", + action="store_true", + default=EngineArgs.enable_entropy, + help="Enable output of token-level entropy.", + ) # Parallel processing parameters group parallel_group = parser.add_argument_group("Parallel Configuration") @@ -882,6 +904,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.eplb_config, help="Config of eplb.", ) + parallel_group.add_argument( + "--routing-replay-config", + type=json.loads, + default=EngineArgs.routing_replay_config, + help="Flag of rollout routing replay(r3).", + ) parallel_group.add_argument( "--enable-chunked-moe", action="store_true", @@ -894,6 +922,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.chunked_moe_size, help="Chunked size of moe input.", ) + parallel_group.add_argument( + "--shutdown-comm-group-if-worker-idle", + action=argparse.BooleanOptionalAction, + default=EngineArgs.shutdown_comm_group_if_worker_idle, + help="Shutdown communication group when worker is idle.", + ) # Load group load_group = parser.add_argument_group("Load Configuration") @@ -1235,6 +1269,14 @@ def create_eplb_config(self) -> EPLBConfig: eplb_args["enable_eplb"] = self.enable_eplb return EPLBConfig(eplb_args) + def create_routing_repaly_config(self) -> RoutingReplayConfig: + """ """ + routing_replay_args = asdict(self) + if self.routing_replay_config is not None: + for k, v in self.routing_replay_config.items(): + routing_replay_args[k] = v + return RoutingReplayConfig(routing_replay_args) + def create_engine_config(self, port_availability_check=True) -> FDConfig: """ Create and return a Config object based on the current settings. 
@@ -1278,6 +1320,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig: graph_opt_cfg = self.create_graph_optimization_config() plas_attention_config = self.create_plas_attention_config() eplb_cfg = self.create_eplb_config() + routing_replay_config = self.create_routing_repaly_config() router_config = RouterConfig(all_dict) early_stop_cfg = self.create_early_stop_config() @@ -1310,4 +1353,5 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig: graph_opt_config=graph_opt_cfg, plas_attention_config=plas_attention_config, early_stop_config=early_stop_cfg, + routing_replay_config=routing_replay_config, ) diff --git a/fastdeploy/engine/async_llm.py b/fastdeploy/engine/async_llm.py index 50e74f30153..1bfde17eaea 100644 --- a/fastdeploy/engine/async_llm.py +++ b/fastdeploy/engine/async_llm.py @@ -835,6 +835,7 @@ def _start_worker_service(self): f" --logprobs_mode {self.cfg.model_config.logprobs_mode}" f" --max_logprobs {self.cfg.model_config.max_logprobs}" f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'" + f" --num_cpu_blocks {self.cfg.cache_config.num_cpu_blocks}" ) worker_store_true_flag = { diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 4a493843df7..8f3eb9d026c 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -483,9 +483,6 @@ def _setting_environ_variables(self): if self.cfg.scheduler_config.splitwise_role == "prefill": variables["FLAGS_fmt_write_cache_completed_signal"] = 1 - if self.cfg.model_config.enable_mm: - variables["FLAGS_max_partition_size"] = 1024 - command_prefix = "" for k, v in variables.items(): command_prefix += f"{k}={v} " @@ -568,6 +565,8 @@ def _start_worker_service(self): f" --logprobs_mode {self.cfg.model_config.logprobs_mode}" f" --max_logprobs {self.cfg.model_config.max_logprobs}" f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'" + f" --routing_replay_config '{self.cfg.routing_replay_config.to_json_string()}'" + f" --num_cpu_blocks {self.cfg.cache_config.num_cpu_blocks}" ) if self.cfg.structured_outputs_config.logits_processors is not None: arguments += f" --logits-processors {' '.join(self.cfg.structured_outputs_config.logits_processors)}" @@ -585,6 +584,8 @@ def _start_worker_service(self): "disable_sequence_parallel_moe": self.cfg.parallel_config.disable_sequence_parallel_moe, "enable_logprob": self.cfg.model_config.enable_logprob, "lm_head_fp32": self.cfg.model_config.lm_head_fp32, + "shutdown_comm_group_if_worker_idle": self.cfg.parallel_config.shutdown_comm_group_if_worker_idle, + "enable_entropy": self.cfg.model_config.enable_entropy, } for worker_flag, value in worker_store_true_flag.items(): if value: diff --git a/fastdeploy/engine/expert_service.py b/fastdeploy/engine/expert_service.py index 3b8c40cca3c..d462e37f23c 100644 --- a/fastdeploy/engine/expert_service.py +++ b/fastdeploy/engine/expert_service.py @@ -67,13 +67,6 @@ def __init__(self, cfg, local_data_parallel_id, start_queue=True): else: self.do_profile = False - if cfg.scheduler_config.splitwise_role != "mixed": - if len(self.cfg.cache_config.pd_comm_port) == 1: - self.cfg.cache_config.pd_comm_port[0] = ( - int(self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id - ) - else: - self.cfg.cache_config.pd_comm_port = [self.cfg.cache_config.pd_comm_port[local_data_parallel_id]] self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id self.engine = EngineService(self.cfg, start_queue) if self.cfg.scheduler_config.name == "splitwise": diff --git 
a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 9f281c3e68c..f6bb75183c4 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -31,7 +31,12 @@ from fastdeploy.engine.sampling_params import SamplingParams from fastdeploy.entrypoints.openai.protocol import ToolCall from fastdeploy.utils import data_processor_logger -from fastdeploy.worker.output import LogprobsLists, PromptLogprobs, SampleLogprobs +from fastdeploy.worker.output import ( + LogprobsLists, + PromptLogprobs, + SampleLogprobs, + SpeculateMetrics, +) class RequestStatus(Enum): @@ -187,7 +192,12 @@ def from_dict(cls, d: dict): pooling_params = PoolingParams.from_dict(d["pooling_params"]) else: sampling_params = SamplingParams.from_dict(d) - + logprobs = d.get("logprobs", None) + if logprobs is not None: + if logprobs is True: + sampling_params.logprobs = d.get("top_logprobs", None) + elif logprobs is False: + sampling_params.logprobs = None if ( isinstance(d.get("multimodal_inputs"), dict) and isinstance(d["multimodal_inputs"].get("mm_positions"), list) @@ -397,6 +407,7 @@ class CompletionOutput: text: Optional[str] = None reasoning_content: Optional[str] = None tool_calls: Optional[ToolCall] = None + speculate_metrics: Optional[SpeculateMetrics] = None def to_dict(self): """ @@ -470,24 +481,13 @@ class RequestMetrics: llm_engine_recv_req_timestamp: Optional[float] = None llm_engine_send_req_to_engine_timestamp: Optional[float] = None llm_engine_recv_token_timestamp: Optional[float] = None + speculate_metrics: Optional[SpeculateMetrics] = None def to_dict(self): """ Convert the RequestMetrics object to a dictionary. """ - return { - "arrival_time": self.arrival_time, - "inference_start_time": self.inference_start_time, - "first_token_time": self.first_token_time, - "time_in_queue": self.time_in_queue, - "preprocess_cost_time": self.preprocess_cost_time, - "model_forward_time": self.model_forward_time, - "model_execute_time": self.model_execute_time, - "request_start_time": self.request_start_time, - "llm_engine_recv_req_timestamp": self.llm_engine_recv_req_timestamp, - "llm_engine_send_req_to_engine_timestamp": self.llm_engine_send_req_to_engine_timestamp, - "llm_engine_recv_token_timestamp": self.llm_engine_recv_token_timestamp, - } + return {k: v for k, v in asdict(self).items()} @classmethod def from_dict(cls, req_dict: dict[str, Any]) -> RequestMetrics: @@ -589,6 +589,8 @@ def add(self, next_output: RequestOutput) -> None: self.outputs.draft_top_logprobs.sampled_token_ranks.extend( next_output.outputs.draft_top_logprobs.sampled_token_ranks ) + if next_output.metrics.speculate_metrics is not None: + self.outputs.speculate_metrics = next_output.metrics.speculate_metrics def __repr__(self) -> str: return ( diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 440acb81045..97baff7d71f 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -182,7 +182,7 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, l name="need_block_num_signal", array=need_block_num_data, dtype=np.int32, - suffix=local_data_parallel_id, + suffix=self.config.parallel_config.engine_worker_queue_port[local_data_parallel_id], create=True, ) @@ -200,6 +200,19 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, l self.bos_client = None self.async_preprocess_pool = ThreadPoolExecutor(max_workers=4) + self.init_reserve_output_block_num = ( + 
envs.FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL + ) # int + self.decay_output_block_num = ( + envs.FD_RESERVE_DECAY_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL + ) # float + self.min_reserve_output_block_num = ( + envs.FD_RESERVE_MIN_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL + ) # int + self.current_reserve_output_block_num = self.init_reserve_output_block_num + self.current_reserve_output_block_num_float = self.init_reserve_output_block_num + self.can_relax_prefill_strategy = True + def allocated_slots(self, request: Request): return len(request.block_tables) * self.config.cache_config.block_size @@ -293,6 +306,9 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re # The request can be scheduled. can_schedule = True break + self.current_reserve_output_block_num = self.init_reserve_output_block_num + self.current_reserve_output_block_num_float = self.init_reserve_output_block_num + self.can_relax_prefill_strategy = False return can_schedule def _update_mm_hashes(self, request): @@ -329,7 +345,7 @@ def _update_mm_hashes(self, request): token_st += h * w // 4 inputs["mm_positions"] = new_mm_positions inputs["mm_hashes"] = new_mm_hashes - else: + elif inputs.get("mm_positions", None) is None or inputs.get("mm_hashes", None) is None: inputs["mm_positions"] = [] inputs["mm_hashes"] = [] @@ -353,6 +369,32 @@ def _is_mm_request(self, request): return False + def revert_chunked_mm_input(self, mm_inputs, matched_token_num): + """ + revert mm_inputs that is chunked + """ + if mm_inputs is None or "mm_positions" not in mm_inputs or len(mm_inputs["mm_positions"]) == 0: + return matched_token_num + + position_idx = len(mm_inputs["mm_positions"]) - 1 + while matched_token_num > 0 and position_idx >= 0: + position = mm_inputs["mm_positions"][position_idx] + if position.offset < matched_token_num < position.offset + position.length: + matched_token_num = ( + position.offset // self.config.cache_config.block_size + ) * self.config.cache_config.block_size + position_idx -= 1 + elif matched_token_num <= position.offset: + position_idx -= 1 + elif matched_token_num >= position.offset + position.length: + break + else: + llm_logger.error( + f"revert_chunked_mm_input error, matched_token_num:{matched_token_num} position:{position}, {mm_inputs['mm_positions']}" + ) + break + return matched_token_num + def _get_num_new_tokens(self, request, token_budget): # TODO: set condition to new _get_num_new_tokens num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens @@ -375,6 +417,18 @@ def _get_num_new_tokens(self, request, token_budget): start_patch_idx = inputs["patch_idx"][-1] else: start_patch_idx = inputs["patch_idx"][pre_end_idx] + if ( + pre_end_idx > 0 + and request.prompt_token_ids[pre_end_idx] + in [ + inputs["image_patch_id"], + inputs["video_patch_id"], + inputs["audio_patch_id"], + ] + and request.prompt_token_ids[pre_end_idx] != request.prompt_token_ids[pre_end_idx - 1] + ): + # It just hit the starting position of the image / video / audio + start_patch_idx -= 1 start_patch_map = inputs["patch_map"][start_patch_idx] request.image_start = start_patch_map["image_num"] request.video_start = start_patch_map["video_num"] @@ -511,6 +565,19 @@ def cache_output_tokens(self, request): request, self.config.cache_config.block_size, request.num_total_tokens - 1 ) + def _get_can_schedule_prefill_threshold_block(self, request, num_chunk_new_block): + if self.can_relax_prefill_strategy: + can_schedule_block_num_threshold = 
num_chunk_new_block + else: + can_schedule_block_num_threshold = ( + request.need_prefill_tokens + self.config.cache_config.block_size - 1 + ) // self.config.cache_config.block_size + len(self.running) * self.current_reserve_output_block_num + if self.config.speculative_config.method is not None: + can_schedule_block_num_threshold = min( + can_schedule_block_num_threshold + 1, self.config.cache_config.max_block_num_per_seq + ) + return can_schedule_block_num_threshold + def schedule(self): """ Try to pull a batch of requests from the waiting queue and schedule them. @@ -691,8 +758,11 @@ def _allocate_decode_and_extend(): num_new_tokens = self._get_num_new_tokens(request, token_budget) num_new_block = self.get_new_block_nums(request, num_new_tokens) + can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block( + request, num_new_block + ) # Allocate blocks to prefill - if self.cache_manager.can_allocate_gpu_blocks(num_new_block): + if self.cache_manager.can_allocate_gpu_blocks(can_schedule_block_num_threshold): if not request.get("skip_allocate", False): request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block)) self.waiting.popleft() @@ -736,8 +806,11 @@ def _allocate_decode_and_extend(): break num_new_tokens = self._get_num_new_tokens(request, token_budget) num_new_block = self.get_new_block_nums(request, num_new_tokens) + can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block( + request, num_new_block + ) # Allocate blocks to prefill - if self.cache_manager.can_allocate_gpu_blocks(num_new_block): + if self.cache_manager.can_allocate_gpu_blocks(can_schedule_block_num_threshold): if not request.get("skip_allocate", False): request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block)) self.waiting.popleft() @@ -763,7 +836,14 @@ def _allocate_decode_and_extend(): if scheduled_reqs: llm_logger.debug(f"schedued_reqs: {scheduled_reqs}") - + self.current_reserve_output_block_num_float -= self.decay_output_block_num + self.current_reserve_output_block_num = max( + int(self.current_reserve_output_block_num_float), + self.min_reserve_output_block_num, + 0, + ) + if self.current_reserve_output_block_num == 0: + self.can_relax_prefill_strategy = True self.update_metrics() return scheduled_reqs, error_reqs @@ -893,22 +973,43 @@ def get_prefix_cached_blocks(self, request: Request): ) request.num_cached_tokens = matched_token_num - request.gpu_cache_token_num = hit_info["gpu_match_token_num"] - request.cpu_cache_token_num = hit_info["cpu_match_token_num"] request.cache_info = (matched_block_num, no_cache_block_num) request.block_tables = common_block_ids request.skip_allocate = False + if self.config.cache_config.disable_chunked_mm_input: + if matched_token_num == request.need_prefill_tokens: + matched_token_num = matched_token_num - self.config.cache_config.block_size + request.skip_allocate = True + request.num_computed_tokens = self.revert_chunked_mm_input( + request.multimodal_inputs, matched_token_num + ) + else: + if matched_token_num == request.need_prefill_tokens: + request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size + request.skip_allocate = True + else: + request.num_computed_tokens = matched_token_num + + if request.num_cached_tokens != request.num_computed_tokens: + revert_tokens_num = request.num_cached_tokens - request.num_computed_tokens + llm_logger.info( + f"request {request.request_id} num_cached_tokens: {request.num_cached_tokens}, revert_tokens_num: 
{revert_tokens_num}" + ) + + revert_block_idx = len(common_block_ids) - revert_tokens_num // self.config.cache_config.block_size - 1 + for block_idx in range(len(common_block_ids) - 1, revert_block_idx, -1): + if common_block_ids[block_idx] in hit_info["match_gpu_block_ids"]: + hit_info["gpu_match_token_num"] -= self.config.cache_config.block_size + elif common_block_ids[block_idx] in hit_info["match_cpu_block_ids"]: + hit_info["cpu_match_token_num"] -= self.config.cache_config.block_size + + request.gpu_cache_token_num = hit_info["gpu_match_token_num"] + request.cpu_cache_token_num = hit_info["cpu_match_token_num"] # Report the number of cached tokens to Prometheus metrics - main_process_metrics.prefix_cache_token_num.inc(matched_token_num) + main_process_metrics.prefix_cache_token_num.inc(request.num_computed_tokens) main_process_metrics.prefix_gpu_cache_token_num.inc(request.gpu_cache_token_num) main_process_metrics.prefix_cpu_cache_token_num.inc(request.cpu_cache_token_num) - - if matched_token_num == request.need_prefill_tokens: - request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size - request.skip_allocate = True - else: - request.num_computed_tokens = matched_token_num request.cache_prepare_time = time.time() - cache_prepare_time return True except Exception as e: diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index 7d387acc609..9378cef9e2c 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -79,8 +79,17 @@ def __init__(self, pid: int | str, port: int | str, fd_config: FDConfig, workers ) self.max_model_len = self.fd_config.model_config.max_model_len self.enable_prefix_caching = self.fd_config.cache_config.enable_prefix_caching + self.enable_cache_transfer = self.fd_config.cache_config.swap_space self.enable_splitwise = self.fd_config.scheduler_config.splitwise_role != "mixed" self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 + self.num_dp_per_node = self.max_chips_per_node // self.fd_config.parallel_config.tensor_parallel_size + self.data_parallel_rank = ( + self.fd_config.node_rank * self.num_dp_per_node + self.fd_config.parallel_config.local_data_parallel_id + ) + self.data_parallel_info = { + "dp_rank": self.data_parallel_rank, + "local_dp_rank": self.fd_config.parallel_config.local_data_parallel_id, + } if self.enable_mm and self.enable_prefix_caching: from fastdeploy.cache_manager.cache_data import ( @@ -548,43 +557,57 @@ def update_model_weight(self, timeout=300): 2 : worker update finish and notify client """ with self.clear_update_lock: + if self.enable_prefix_caching: + # prefix_tree_status_signal: CLEARED -> UPDATING -> NORMAL + if self.prefix_tree_status_signal.value[0] == PrefixTreeStatus.CLEARED: + self.prefix_tree_status_signal.value[0] = PrefixTreeStatus.UPDATING + api_server_logger.info( + f">>> start updating prefix tree (status: {self.prefix_tree_status_signal.value[0]})" + ) + while timeout >= 0 and self.prefix_tree_status_signal.value[0] != PrefixTreeStatus.NORMAL: + api_server_logger.info(f"... 
prefix tree status: {self.prefix_tree_status_signal.value[0]}") + time.sleep(1) + timeout -= 1 + if timeout < 0: + return 404, {**self.data_parallel_info, "msg": "update prefix tree timeout"} + api_server_logger.info( + f"<<< finish updating prefix tree (status: {self.prefix_tree_status_signal.value[0]})" + ) + + # model_weights_status_signal: CLEARED -> UPDATING -> NORMAL if self.model_weights_status_signal.value[0] == ModelWeightsStatus.NORMAL: - return True, "" + return 200, {**self.data_parallel_info, "msg": "model weight is updated"} if self.model_weights_status_signal.value[0] == ModelWeightsStatus.UPDATING: - return False, "worker is updating model weight already" + return 400, {**self.data_parallel_info, "msg": "worker is updating model weight already"} if self.model_weights_status_signal.value[0] == ModelWeightsStatus.CLEARING: - return False, "worker is clearing model weight, cannot update now" + return 403, {**self.data_parallel_info, "msg": "worker is clearing model weight, cannot update now"} self.model_weights_status_signal.value[0] = ModelWeightsStatus.UPDATING - if self.enable_prefix_caching or self.enable_splitwise: - self.kv_cache_status_signal.value[0] = KVCacheStatus.UPDATING - if self.enable_prefix_caching: - self.prefix_tree_status_signal.value[0] = PrefixTreeStatus.UPDATING - api_server_logger.info(f"start update model weight {self.model_weights_status_signal.value}") - all_updated = False - while timeout >= 0 and not all_updated: + api_server_logger.info( + f">>> start updating model weight (weight status: {self.model_weights_status_signal.value[0]})" + if not self.enable_cache_transfer + else f">>> start updating model weight (weight status: {self.model_weights_status_signal.value[0]} cache status: {self.kv_cache_status_signal.value[0]})" + ) + while timeout >= 0: api_server_logger.info( - f"Updating model weights.. " - f"model_weights_status: {self.model_weights_status_signal.value[0]}, " - f"prefix_tree_status: {self.prefix_tree_status_signal.value[0]}, " - f"kv_cache_status: {self.kv_cache_status_signal.value[0]} " + f"... weight status: {self.model_weights_status_signal.value[0]}" + if not self.enable_cache_transfer + else f"... 
weight status: {self.model_weights_status_signal.value[0]} cache status: {self.kv_cache_status_signal.value[0]}" ) weight_updated = self.model_weights_status_signal.value[0] == ModelWeightsStatus.NORMAL cache_updated = self.kv_cache_status_signal.value[0] == KVCacheStatus.NORMAL - prefix_updated = self.prefix_tree_status_signal.value[0] == PrefixTreeStatus.NORMAL - if self.enable_prefix_caching or self.enable_splitwise: - if self.enable_prefix_caching: - all_updated = weight_updated and cache_updated and prefix_updated - else: - all_updated = weight_updated and cache_updated - else: - all_updated = weight_updated + if weight_updated and (not self.enable_cache_transfer or cache_updated): + break time.sleep(1) timeout -= 1 if timeout < 0: - return False, "Update model weight timeout" - time.sleep(1) - return True, "" + return 404, {**self.data_parallel_info, "msg": "update model weight timeout"} + api_server_logger.info( + f"<<< finish updating model weight (weight status: {self.model_weights_status_signal.value[0]})" + if not self.enable_cache_transfer + else f"<<< finish updating model weight (weight status: {self.model_weights_status_signal.value[0]} cache status: {self.kv_cache_status_signal.value[0]})" + ) + return 200, {**self.data_parallel_info, "msg": "update model weight successfully"} def clear_load_weight(self, timeout=300): """ @@ -594,45 +617,57 @@ def clear_load_weight(self, timeout=300): """ with self.clear_update_lock: + if self.enable_prefix_caching: + # prefix_tree_status_signal: NORMAL -> CLEARING -> CLEARED + if self.prefix_tree_status_signal.value[0] == PrefixTreeStatus.NORMAL: + self.prefix_tree_status_signal.value[0] = PrefixTreeStatus.CLEARING + api_server_logger.info( + f">>> start clearing prefix tree (status: {self.prefix_tree_status_signal.value[0]})" + ) + while timeout >= 0 and self.prefix_tree_status_signal.value[0] != PrefixTreeStatus.CLEARED: + api_server_logger.info(f"... 
prefix tree status: {self.prefix_tree_status_signal.value[0]}") + time.sleep(1) + timeout -= 1 + if timeout < 0: + return 404, {**self.data_parallel_info, "msg": "clear prefix tree timeout"} + api_server_logger.info( + f"<<< finish clearing prefix tree (status: {self.prefix_tree_status_signal.value[0]})" + ) + + # model_weights_status_signal: NORMAL -> CLEARING -> CLEARED if self.model_weights_status_signal.value[0] == ModelWeightsStatus.CLEARED: - return True, "" + return 200, {**self.data_parallel_info, "msg": "model weight is cleared"} if self.model_weights_status_signal.value[0] == ModelWeightsStatus.CLEARING: - return False, "worker is clearing model weight already" + return 400, {**self.data_parallel_info, "msg": "worker is clearing model weight already"} if self.model_weights_status_signal.value[0] == ModelWeightsStatus.UPDATING: - return False, "worker is updating model weight, cannot clear now" + return 403, {**self.data_parallel_info, "msg": "worker is updating model weight, cannot clear now"} self.model_weights_status_signal.value[0] = ModelWeightsStatus.CLEARING - if self.enable_prefix_caching or self.enable_splitwise: - self.kv_cache_status_signal.value[0] = KVCacheStatus.CLEARING - if self.enable_prefix_caching: - self.prefix_tree_status_signal.value[0] = PrefixTreeStatus.CLEARING - - api_server_logger.info(f"start clear model weight {self.model_weights_status_signal.value}") - all_cleared = False - while timeout >= 0 and not all_cleared: + api_server_logger.info( + f">>> start clearing model weight (weight status: {self.model_weights_status_signal.value[0]}" + if not self.enable_cache_transfer + else f">>> start clearing model weight (weight status: {self.model_weights_status_signal.value[0]} cache status: {self.kv_cache_status_signal.value[0]})" + ) + while timeout >= 0: api_server_logger.info( - f"Clearing model weights.. " - f"model_weights_status: {self.model_weights_status_signal.value[0]}, " - f"prefix_tree_status: {self.prefix_tree_status_signal.value[0]}, " - f"kv_cache_status: {self.kv_cache_status_signal.value[0]} " + f"... weight status: {self.model_weights_status_signal.value[0]}" + if not self.enable_cache_transfer + else f"... 
weight status: {self.model_weights_status_signal.value[0]} cache status: {self.kv_cache_status_signal.value[0]}" ) weight_cleared = self.model_weights_status_signal.value[0] == ModelWeightsStatus.CLEARED cache_cleared = self.kv_cache_status_signal.value[0] == KVCacheStatus.CLEARED - prefix_cleared = self.prefix_tree_status_signal.value[0] == PrefixTreeStatus.CLEARED - if self.enable_prefix_caching or self.enable_splitwise: - if self.enable_prefix_caching: - all_cleared = weight_cleared and cache_cleared and prefix_cleared - else: - all_cleared = weight_cleared and cache_cleared - else: - all_cleared = weight_cleared + if weight_cleared and (not self.enable_cache_transfer or cache_cleared): + break time.sleep(1) timeout -= 1 - if timeout < 0: - return False, "Clear model weight timeout" - time.sleep(1) - return True, "" + return 404, {**self.data_parallel_info, "msg": "clear model weight timeout"} + api_server_logger.info( + f"<<< finish clearing model weight (weight status: {self.model_weights_status_signal.value[0]})" + if not self.enable_cache_transfer + else f"<<< finish clearing model weight (weight status: {self.model_weights_status_signal.value[0]} cache status: {self.kv_cache_status_signal.value[0]})" + ) + return 200, {**self.data_parallel_info, "msg": "clear model weight successfully"} def check_model_weight_status(self): return self.model_weights_status_signal.value[0] < 0 diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 8da77548951..f4e5fb39202 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -476,12 +476,10 @@ def update_model_weight(request: Request) -> Response: update model weight """ if app.state.dynamic_load_weight: - status, msg = app.state.engine_client.update_model_weight() - if not status: - return Response(content=msg, status_code=404) - return Response(status_code=200) + status_code, msg = app.state.engine_client.update_model_weight() + return JSONResponse(content=msg, status_code=status_code) else: - return Response(content="Dynamic Load Weight Disabled.", status_code=404) + return JSONResponse(content={"error": "Dynamic Load Weight Disabled."}, status_code=404) @app.get("/clear_load_weight") @@ -490,12 +488,10 @@ def clear_load_weight(request: Request) -> Response: clear model weight """ if app.state.dynamic_load_weight: - status, msg = app.state.engine_client.clear_load_weight() - if not status: - return Response(content=msg, status_code=404) - return Response(status_code=200) + status_code, msg = app.state.engine_client.clear_load_weight() + return JSONResponse(content=msg, status_code=status_code) else: - return Response(content="Dynamic Load Weight Disabled.", status_code=404) + return JSONResponse(content={"error": "Dynamic Load Weight Disabled."}, status_code=404) @app.post("/rearrange_experts") diff --git a/fastdeploy/entrypoints/openai/multi_api_server.py b/fastdeploy/entrypoints/openai/multi_api_server.py index a34cb137a9a..c1c4ae09c1f 100644 --- a/fastdeploy/entrypoints/openai/multi_api_server.py +++ b/fastdeploy/entrypoints/openai/multi_api_server.py @@ -52,6 +52,7 @@ def start_servers(server_count, server_args, ports, metrics_ports, controller_po env = os.environ.copy() env["FD_LOG_DIR"] = env.get("FD_LOG_DIR", "log") + f"/log_{i}" + env["FD_ENABLE_MULTI_API_SERVER"] = "1" cmd = [ sys.executable, "-m", diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index 000861470dd..02b78773f3a 
100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -31,7 +31,7 @@ ) from fastdeploy.engine.pooling_params import PoolingParams -from fastdeploy.worker.output import PromptLogprobs +from fastdeploy.worker.output import PromptLogprobs, SpeculateMetrics class InvalidParameterException(Exception): @@ -230,6 +230,7 @@ class ChatCompletionResponseChoice(BaseModel): draft_logprobs: Optional[LogProbs] = None prompt_logprobs: Optional[PromptLogprobs] = None finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]] + speculate_metrics: Optional[SpeculateMetrics] = None class ChatCompletionResponse(BaseModel): @@ -295,6 +296,7 @@ class ChatCompletionResponseStreamChoice(BaseModel): prompt_logprobs: Optional[PromptLogprobs] = None finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None arrival_time: Optional[float] = None + speculate_metrics: Optional[SpeculateMetrics] = None class ChatCompletionStreamResponse(BaseModel): @@ -329,6 +331,7 @@ class CompletionResponseChoice(BaseModel): reasoning_content: Optional[str] = None finish_reason: Optional[Literal["stop", "length", "tool_calls"]] tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None + speculate_metrics: Optional[SpeculateMetrics] = None class CompletionResponse(BaseModel): @@ -374,6 +377,7 @@ class CompletionResponseStreamChoice(BaseModel): reasoning_content: Optional[str] = None finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None + speculate_metrics: Optional[SpeculateMetrics] = None class CompletionStreamResponse(BaseModel): @@ -455,6 +459,7 @@ class CompletionRequest(BaseModel): frequency_penalty: Optional[float] = Field(default=None, ge=-2, le=2) logprobs: Optional[int] = None include_draft_logprobs: Optional[bool] = False + include_logprobs_decode_token: Optional[bool] = True prompt_logprobs: Optional[int] = None # For logits and logprobs post processing temp_scaled_logprobs: bool = False @@ -616,6 +621,7 @@ class ChatCompletionRequest(BaseModel): top_logprobs: Optional[int] = None prompt_logprobs: Optional[int] = None include_draft_logprobs: Optional[bool] = False + include_logprobs_decode_token: Optional[bool] = True # For logits and logprobs post processing temp_scaled_logprobs: bool = False diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index aa8cf8dd7b0..6c1d63a0070 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -24,6 +24,7 @@ import numpy as np +import fastdeploy.envs as envs from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -57,6 +58,7 @@ LogprobsLists, LogprobsTensors, PromptLogprobs, + SpeculateMetrics, ) NONES = itertools.repeat(None) @@ -263,7 +265,7 @@ async def chat_completion_stream_generator( except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: if choices: chunk.choices = choices @@ -306,7 +308,7 @@ async def chat_completion_stream_generator( else self.engine_client.ori_vocab_size ) prompt_logprobs_res = self._build_prompt_logprobs( - prompt_logprobs_tensors, num_prompt_logprobs + prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token ) choice 
= ChatCompletionResponseStreamChoice( index=i, @@ -373,14 +375,22 @@ async def chat_completion_stream_generator( request.top_logprobs if request.top_logprobs != -1 else self.engine_client.ori_vocab_size ) logprobs_res = self._create_chat_logprobs( - output_top_logprobs, request.logprobs, num_top_logprobs + output_top_logprobs, + request.logprobs, + num_top_logprobs, + request.include_logprobs_decode_token, ) if request.include_draft_logprobs and output_draft_top_logprobs is not None: draft_logprobs_res = self._create_chat_logprobs( - output_draft_top_logprobs, request.logprobs, num_top_logprobs + output_draft_top_logprobs, + request.logprobs, + num_top_logprobs, + request.include_logprobs_decode_token, ) + output_speculate_metrics = res["metrics"].get("speculate_metrics", None) + delta_message = DeltaMessage( reasoning_content="", prompt_token_ids=None, @@ -412,6 +422,7 @@ async def chat_completion_stream_generator( logprobs=logprobs_res, draft_logprobs=draft_logprobs_res, arrival_time=arrival_time, + speculate_metrics=output_speculate_metrics, ) if res["finished"]: num_choices -= 1 @@ -530,6 +541,7 @@ async def chat_completion_full_generator( decoder_base_url=self.tokenizer_base_url, ) prompt_logprobs_res_list = [[] for _ in range(num_choices)] + speculate_metrics = [None for _ in range(num_choices)] choices = [] while num_choices > 0: if self.engine_client.check_model_weight_status(): @@ -546,7 +558,7 @@ async def chat_completion_full_generator( except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: raise ValueError(f"Engine is not healthy: {msg}") else: @@ -577,7 +589,10 @@ async def chat_completion_full_generator( ) # logprobs logprobs_res = self._create_chat_logprobs( - output_top_logprobs, request.logprobs, num_top_logprobs + output_top_logprobs, + request.logprobs, + num_top_logprobs, + request.include_logprobs_decode_token, ) if logprobs_res and logprobs_res.content is not None: logprob_contents[idx].extend(logprobs_res.content) @@ -585,7 +600,10 @@ async def chat_completion_full_generator( # draft_logprobs if request.include_draft_logprobs and output_draft_top_logprobs is not None: draft_logprobs_res = self._create_chat_logprobs( - output_draft_top_logprobs, request.logprobs, num_top_logprobs + output_draft_top_logprobs, + request.logprobs, + num_top_logprobs, + request.include_logprobs_decode_token, ) if draft_logprobs_res and draft_logprobs_res.content is not None: draft_logprob_contents[idx].extend(draft_logprobs_res.content) @@ -596,9 +614,12 @@ async def chat_completion_full_generator( if request.prompt_logprobs != -1 else self.engine_client.ori_vocab_size ) - prompt_logprobs_res = self._build_prompt_logprobs(prompt_logprobs_tensors, num_prompt_logprobs) + prompt_logprobs_res = self._build_prompt_logprobs( + prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token + ) if prompt_logprobs_res: prompt_logprobs_res_list[idx].extend(clamp_prompt_logprobs(prompt_logprobs_res)) + speculate_metrics[idx] = data["metrics"].get("speculate_metrics", None) if data["finished"]: num_choices -= 1 reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0) @@ -621,6 +642,7 @@ async def chat_completion_full_generator( response_processor=response_processor, prompt_logprobs_res_list=prompt_logprobs_res_list, max_tokens=max_tokens, + 
speculate_metrics=speculate_metrics[idx], ) choices.append(choice) finally: @@ -674,6 +696,7 @@ async def _create_chat_completion_choice( prompt_logprobs_res_list: list, response_processor: ChatResponseProcessor, max_tokens: int, + speculate_metrics: SpeculateMetrics | None, ) -> ChatCompletionResponseChoice: idx = int(data["request_id"].split("_")[-1]) output = data["outputs"] @@ -731,6 +754,7 @@ async def _create_chat_completion_choice( draft_logprobs=draft_logprobs_full_res, prompt_logprobs=prompt_logprobs_full_res, finish_reason=finish_reason, + speculate_metrics=speculate_metrics, ) def _create_chat_logprobs( @@ -738,6 +762,7 @@ def _create_chat_logprobs( output_top_logprobs, request_logprobs: Optional[bool] = None, request_top_logprobs: Optional[int] = None, + request_decode_flag: Optional[bool] = True, ) -> Optional[LogProbs]: """Create OpenAI-style logprobs for chat completions.""" if output_top_logprobs is None or len(output_top_logprobs) < 3 or any(not lst for lst in output_top_logprobs): @@ -755,6 +780,7 @@ def _create_chat_logprobs( request_logprobs=request_logprobs, response_logprobs=top_logprobs, request_top_logprobs=request_top_logprobs, + request_decode_flag=request_decode_flag, ) if logprobs_res is None: logprobs_res = step_logprobs_res @@ -767,6 +793,7 @@ def _build_logprobs_response( request_logprobs: bool, response_logprobs: Optional[LogprobsLists], request_top_logprobs: int, + request_decode_flag: bool, ) -> Optional[LogProbs]: """ Construct a logprobs response object in line with the OpenAI style. @@ -796,12 +823,16 @@ def _build_logprobs_response( # Construct the candidate token structure (LogProbEntry) of topk top_logprob_entries: List[LogProbEntry] = [] for tid, lp in zip(topk_token_ids, topk_logprobs): - token_str = self.engine_client.data_processor.process_logprob_response( - [tid], clean_up_tokenization_spaces=False - ) - token_bytes = token_str.encode("utf-8", errors="replace") - if "\ufffd" in token_str: - token_str = "bytes:" + "".join(f"\\x{byte:02x}" for byte in token_bytes) + if request_decode_flag: + token_str = self.engine_client.data_processor.process_logprob_response( + [tid], clean_up_tokenization_spaces=False + ) + token_bytes = token_str.encode("utf-8", errors="replace") + if "\ufffd" in token_str: + token_str = "bytes:" + "".join(f"\\x{byte:02x}" for byte in token_bytes) + else: + token_str = "" + token_bytes = [] entry = LogProbEntry(token=token_str, logprob=lp, bytes=list(token_bytes)) top_logprob_entries.append(entry) # Construct the sampled token object (avoid sharing references with top_logprob_entries) @@ -840,6 +871,7 @@ def _build_prompt_logprobs( self, prompt_logprobs_tensors: LogprobsTensors, num_prompt_logprobs: int, + include_logprobs_decode_token: bool, ): """Update with prompt logprobs from worker. Args: @@ -851,10 +883,13 @@ def _build_prompt_logprobs( # Detokenize non-incrementally. # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps] - decoded_tokens = [ - self.engine_client.data_processor.process_logprob_response(token_id) - for token_id in token_ids.flatten().tolist() - ] + if include_logprobs_decode_token: + decoded_tokens = [ + self.engine_client.data_processor.process_logprob_response(token_id) + for token_id in token_ids.flatten().tolist() + ] + else: + decoded_tokens = None # Recover shapes. 
num_prompt_tokens, num_logprobs = logprobs.shape diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 93013531759..b7b1220a777 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -15,6 +15,7 @@ """ import asyncio +import inspect import itertools import time import traceback @@ -24,6 +25,7 @@ import numpy as np +import fastdeploy.envs as envs from fastdeploy.engine.request import RequestOutput from fastdeploy.entrypoints.openai.protocol import ( CompletionLogprobs, @@ -73,6 +75,7 @@ def __init__(self, engine_client, models, pid, ips, max_waiting_time): else: self.master_ip = "0.0.0.0" self.is_master_ip = True + self._is_process_response_dict_async = None api_server_logger.info(f"master ip: {self.master_ip}") def _check_master(self): @@ -261,6 +264,7 @@ async def completion_full_generator( aggregated_token_ids = [[] for _ in range(num_choices)] aggregated_prompt_logprobs_tensors = [None] * num_choices completion_batched_token_ids = [[] for _ in range(num_choices)] + aggregated_speculate_metrics = [None] * num_choices current_waiting_time = 0 while num_choices > 0: if self.engine_client.check_model_weight_status(): @@ -277,7 +281,7 @@ async def completion_full_generator( except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: raise ValueError(f"Engine is not healthy: {msg}") else: @@ -309,18 +313,21 @@ async def completion_full_generator( aggregated_prompt_logprobs_tensors[rid] = output_prompt_logprobs_tensors aggregated_token_ids[rid].extend(data["outputs"]["token_ids"]) - - self.engine_client.data_processor.process_response_dict( - data, stream=False, include_stop_str_in_output=request.include_stop_str_in_output - ) + await self._call_process_response_dict(data, request, stream=False) output_tokens[rid] += len(data["outputs"]["token_ids"]) completion_batched_token_ids[rid].extend(data["outputs"]["token_ids"]) + + output_speculate_metrics = data["metrics"].get("speculate_metrics", None) + if output_speculate_metrics is not None: + aggregated_speculate_metrics[rid] = output_speculate_metrics + if data.get("finished", False): data["output_token_ids"] = output_tokens[rid] data["outputs"]["top_logprobs"] = aggregated_top_logprobs[rid] data["outputs"]["draft_top_logprobs"] = aggregated_draft_top_logprobs[rid] data["outputs"]["token_ids"] = aggregated_token_ids[rid] data["prompt_logprobs_tensors"] = aggregated_prompt_logprobs_tensors[rid] + data["speculate_metrics"] = aggregated_speculate_metrics[rid] valid_results[rid] = data num_choices -= 1 break @@ -430,7 +437,7 @@ async def completion_stream_generator( except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: raise ValueError(f"Engine is not healthy: {msg}") else: @@ -452,7 +459,7 @@ async def completion_stream_generator( else self.engine_client.ori_vocab_size ) prompt_logprobs_res = self._build_prompt_logprobs( - prompt_logprobs_tensors, num_prompt_logprobs + prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token ) if request.return_token_ids: chunk = CompletionStreamResponse( @@ 
-480,9 +487,7 @@ async def completion_stream_generator( ) first_iteration[idx] = False - self.engine_client.data_processor.process_response_dict( - res, stream=True, include_stop_str_in_output=request.include_stop_str_in_output - ) + await self._call_process_response_dict(res, request, stream=True) if res["metrics"].get("first_token_time") is not None: arrival_time = res["metrics"]["first_token_time"] inference_start_time[idx] = res["metrics"]["inference_start_time"] @@ -512,6 +517,7 @@ async def completion_stream_generator( output_tokens[idx] += output.get("num_image_tokens") num_image_tokens[idx] += output.get("num_image_tokens") reasoning_tokens[idx] += output.get("reasoning_token_num", 0) + output_speculate_metrics = res["metrics"].get("speculate_metrics", None) delta_message = CompletionResponseStreamChoice( index=idx, text=output["text"], @@ -524,6 +530,7 @@ async def completion_stream_generator( logprobs=logprobs_res, prompt_logprobs=clamp_prompt_logprobs(prompt_logprobs_res), draft_logprobs=draft_logprobs_res, + speculate_metrics=output_speculate_metrics, ) if not res["finished"] and "delta_message" in output: delta_message_output = output["delta_message"] @@ -648,7 +655,9 @@ def request_output_to_completion_response( num_prompt_logprobs = ( request.prompt_logprobs if request.prompt_logprobs != -1 else self.engine_client.ori_vocab_size ) - prompt_logprobs_res = self._build_prompt_logprobs(prompt_logprobs_tensors, num_prompt_logprobs) + prompt_logprobs_res = self._build_prompt_logprobs( + prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token + ) if request.echo: prompt_text = self._echo_back_prompt(request, idx // (1 if request.n is None else request.n)) token_ids = [*prompt_token_ids, *output["token_ids"]] @@ -681,6 +690,7 @@ def request_output_to_completion_response( draft_logprobs=aggregated_draft_logprobs, prompt_logprobs=clamp_prompt_logprobs(prompt_logprobs_res), finish_reason=finish_reason, + speculate_metrics=final_res["metrics"].get("speculate_metrics", None), ) choices.append(choice_data) @@ -714,6 +724,20 @@ def request_output_to_completion_response( usage=usage, ) + async def _call_process_response_dict(self, res, request, stream): + if self._is_process_response_dict_async is None: + self._is_process_response_dict_async = inspect.iscoroutinefunction( + self.engine_client.data_processor.process_response_dict + ) + if self._is_process_response_dict_async: + await self.engine_client.data_processor.process_response_dict( + res, stream=stream, include_stop_str_in_output=request.include_stop_str_in_output + ) + else: + self.engine_client.data_processor.process_response_dict( + res, stream=stream, include_stop_str_in_output=request.include_stop_str_in_output + ) + def _create_completion_logprobs( self, output_top_logprobs, @@ -814,6 +838,7 @@ def _build_prompt_logprobs( self, prompt_logprobs_tensors: LogprobsTensors, num_prompt_logprobs: int, + include_logprobs_decode_token: bool, ): """Update with prompt logprobs from worker. Args: @@ -825,10 +850,13 @@ def _build_prompt_logprobs( # Detokenize non-incrementally. # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps] - decoded_tokens = [ - self.engine_client.data_processor.process_logprob_response(token_id) - for token_id in token_ids.flatten().tolist() - ] + if include_logprobs_decode_token: + decoded_tokens = [ + self.engine_client.data_processor.process_logprob_response(token_id) + for token_id in token_ids.flatten().tolist() + ] + else: + decoded_tokens = None # Recover shapes. 
num_prompt_tokens, num_logprobs = logprobs.shape diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 93f135d09da..676ab6a5fd6 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -151,6 +151,21 @@ # "Number of tokens in the group for Mixture of Experts (MoE) computation processing on HPU" "FD_HPU_CHUNK_SIZE": lambda: int(os.getenv("FD_HPU_CHUNK_SIZE", "64")), "FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS": lambda: int(os.getenv("FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS", "30")), + # Reserve output blocks for decoding requests when schedule new prefill requests + "FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL": lambda: int( + os.getenv("FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL", "16") + ), + "FD_RESERVE_DECAY_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL": lambda: float( + os.getenv("FD_RESERVE_DECAY_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL", "0.025") + ), + "FD_RESERVE_MIN_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL": lambda: int( + os.getenv("FD_RESERVE_MIN_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL", "0") + ), + "GLOBAL_LOGGING_INSTRUMENT": lambda: int(os.getenv("GLOBAL_LOGGING_INSTRUMENT", "0")), + # Timeout for worker process health check in seconds + "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), + # Whether to use PFCCLab/DeepEP. + "FD_USE_PFCC_DEEP_EP": lambda: bool(int(os.getenv("FD_USE_PFCC_DEEP_EP", "0"))), } diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py index ec89e838329..4da49a9666d 100644 --- a/fastdeploy/metrics/metrics.py +++ b/fastdeploy/metrics/metrics.py @@ -143,9 +143,9 @@ class MetricsManager: request_success_total: "Counter" spec_decode_draft_acceptance_rate: "Gauge" spec_decode_efficiency: "Gauge" - spec_decode_num_accepted_tokens_total: "Counter" + spec_decode_num_accepted_tokens_total: "Gauge" spec_decode_num_draft_tokens_total: "Counter" - spec_decode_num_emitted_tokens_total: "Counter" + spec_decode_num_emitted_tokens_total: "Gauge" spec_decode_draft_single_head_acceptance_rate: "list[Gauge]" # for YIYAN Adapter @@ -598,13 +598,13 @@ def _init_speculative_metrics(self, speculative_method, num_speculative_tokens): "kwargs": {}, }, "spec_decode_num_accepted_tokens_total": { - "type": Counter, + "type": Gauge, "name": "fastdeploy:spec_decode_num_accepted_tokens_total", "description": "Total number of tokens accepted by the scoring model and verification program", "kwargs": {}, }, "spec_decode_num_emitted_tokens_total": { - "type": Counter, + "type": Gauge, "name": "fastdeploy:spec_decode_num_emitted_tokens_total", "description": "Total number of tokens output by the entire system", "kwargs": {}, diff --git a/fastdeploy/metrics/trace_util.py b/fastdeploy/metrics/trace_util.py index 111c2c85343..7385c4a40b3 100644 --- a/fastdeploy/metrics/trace_util.py +++ b/fastdeploy/metrics/trace_util.py @@ -1,4 +1,5 @@ import json +import logging import os from fastapi import FastAPI @@ -7,6 +8,7 @@ from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from opentelemetry.instrumentation.logging import LoggingInstrumentor from opentelemetry.propagate import extract, inject +from opentelemetry.sdk._logs import LoggingHandler from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import SpanProcessor, TracerProvider from opentelemetry.sdk.trace.export import ( @@ -118,14 +120,40 @@ def instrument(app: FastAPI): llm_logger.info("Applying instrumentors...") 
FastAPIInstrumentor.instrument_app(app) try: - LoggingInstrumentor().instrument(set_logging_format=True) - except Exception: + global_instrument = envs.GLOBAL_LOGGING_INSTRUMENT + if global_instrument: + LoggingInstrumentor().instrument() + else: + target_logger = logging.getLogger("legacy.trace") + custom_handler = CustomLoggingHandler(level=logging.NOTSET) + target_logger.handlers.insert(0, custom_handler) + except Exception as e: + llm_logger.warning(f"Logging instrument failed: {e}") pass except: llm_logger.info("instrument failed") pass +class CustomLoggingHandler(LoggingHandler): + def emit(self, record): + try: + current_span = trace.get_current_span() + trace_id = 0 + span_id = 0 + if current_span and current_span.is_recording(): + span_context = current_span.get_span_context() + if span_context.trace_id != 0: + trace_id = span_context.trace_id + if span_context.span_id != 0: + span_id = span_context.span_id + record.otelTraceID = "0" if trace_id == 0 else format(trace_id, "032x") + record.otelSpanID = "0" if span_id == 0 else format(span_id, "016x") + except: + record.otelTraceID = "0" + record.otelSpanID = "0" + + def inject_to_metadata(request, metadata_attr="metadata"): """ Inject OpenTelemetry trace context into the metadata field of the request. diff --git a/fastdeploy/model_executor/entropy_utils.py b/fastdeploy/model_executor/entropy_utils.py new file mode 100644 index 00000000000..21d1b3421e9 --- /dev/null +++ b/fastdeploy/model_executor/entropy_utils.py @@ -0,0 +1,106 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import paddle + +from fastdeploy.utils import data_processor_logger + + +def get_entropy(logits): + # Check for -inf values in logits + if paddle.any(paddle.isinf(logits) & (logits < 0)): + data_processor_logger.debug("Detected -inf values in logits, clipping to minimum value") + logits = paddle.clip(logits, min=1e-9) + + a0 = logits - paddle.max(logits, axis=-1, keepdim=True) + ea0 = paddle.exp(a0) + z0 = paddle.sum(ea0, axis=-1, keepdim=True) + p0 = ea0 / z0 + return paddle.sum(p0 * (paddle.log(z0) - a0), axis=-1) + + +def calculate_logits_entropy(logits, share_inputs, temperature): + real_bsz = share_inputs["seq_lens_this_time"].shape[0] + real_seq_lens = paddle.where( + share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0, + paddle.ones([1], dtype="int32"), + share_inputs["seq_lens_this_time"].squeeze(1), + ) + + batch_indices = paddle.arange(real_bsz, dtype="int32") + batch_id_per_token = paddle.repeat_interleave(batch_indices, real_seq_lens) + for i in range(logits.shape[0]): + if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0: + logits[i] = logits[i].scale_(1 / temperature[batch_id_per_token[i]]) + + entropy_tensor = get_entropy(logits) + entropy = entropy_tensor.tolist() + + for i in range(real_bsz): + for _ in range(real_seq_lens[i]): + share_inputs["entropy_list"][i].append(entropy.pop(0)) + if ( + share_inputs["stop_flags"][i] + and share_inputs["seq_lens_decoder"][i] != 0 + and len(share_inputs["entropy_list"][i]) != 0 + ): + data_processor_logger.info( + f"req_id: {share_inputs['req_ids'][i]}, entropy: {sum(share_inputs['entropy_list'][i])/len(share_inputs['entropy_list'][i])}" + ) + share_inputs["entropy_list"][i] = [] + + +def speculate_calculate_logits_entropy(logits, share_inputs, temperature): + # get accepted logits + real_bsz = share_inputs["seq_lens_this_time"].shape[0] + total_accepted_num = paddle.sum(share_inputs["accept_num"]) + real_seq_lens = paddle.where( + share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0, + paddle.ones([1], dtype="int32"), + share_inputs["seq_lens_this_time"].squeeze(1), + ) + seq_start_idx = paddle.concat([paddle.zeros([1], dtype="int32"), paddle.cumsum(real_seq_lens, dtype="int32")]) + repeated_starts = paddle.repeat_interleave(seq_start_idx[:-1], share_inputs["accept_num"][:real_bsz]) + offsets = paddle.concat([paddle.arange(share_inputs["accept_num"][i].item()) for i in range(real_bsz)]).astype( + "int32" + ) + accepted_idx = repeated_starts + offsets + + accepted_logits = paddle.empty([total_accepted_num, logits.shape[1]], dtype=logits.dtype) + for i in range(total_accepted_num): + accepted_logits[i] = logits[accepted_idx[i]] + + batch_indices = paddle.arange(share_inputs["accept_num"].shape[0], dtype="int32") + batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"]) + for i in range(accepted_logits.shape[0]): + if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0: + accepted_logits[i] = accepted_logits[i].scale_(1 / temperature[batch_id_per_token[i]]) + + entropy_tensor = get_entropy(accepted_logits) + entropy = entropy_tensor.tolist() + + for i in range(real_bsz): + for _ in range(share_inputs["accept_num"][i]): + share_inputs["entropy_list"][i].append(entropy.pop(0)) + if ( + share_inputs["stop_flags"][i] + and share_inputs["seq_lens_decoder"][i] != 0 + and len(share_inputs["entropy_list"][i]) != 0 + ): + data_processor_logger.info( + f"req_id: {share_inputs['req_ids'][i]}, entropy: 
{sum(share_inputs['entropy_list'][i])/len(share_inputs['entropy_list'][i])}" + ) + share_inputs["entropy_list"][i] = [] diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py index 4e9df0d3ce3..787ec77c0eb 100644 --- a/fastdeploy/model_executor/forward_meta.py +++ b/fastdeploy/model_executor/forward_meta.py @@ -142,6 +142,8 @@ class ForwardMeta: caches: Optional[list[paddle.Tensor]] = None # Flag of profile run is_dummy_or_profile_run: bool = False + # Routing Replay table buffer + routing_replay_table: Optional[paddle.Tensor] = None # chunked MoE related moe_num_chunk: int = 1 diff --git a/fastdeploy/model_executor/guided_decoding/guidance_backend.py b/fastdeploy/model_executor/guided_decoding/guidance_backend.py index 5bcaa628d14..e1234f15e08 100644 --- a/fastdeploy/model_executor/guided_decoding/guidance_backend.py +++ b/fastdeploy/model_executor/guided_decoding/guidance_backend.py @@ -22,7 +22,6 @@ import llguidance import llguidance.hf import llguidance.torch -import torch from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request @@ -69,13 +68,13 @@ def _check_error(self): self._printed_error = True llm_logger.warning(f"LLGuidance Matcher error: {err}") - def allocate_token_bitmask(self) -> torch.Tensor: + def allocate_token_bitmask(self): """ Allocate a token bitmask tensor for grammar constraints. """ return llguidance.torch.allocate_token_bitmask(self.batch_size, self.vocab_size) - def fill_token_bitmask(self, token_bitmask: torch.Tensor, idx: int) -> None: + def fill_token_bitmask(self, token_bitmask, idx: int) -> None: """ Fill the token bitmask with allowed tokens for the given index. This will automatically provide an EOS mask if the matcher is stopped. diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py index 35aa40b77e0..9b038bae62b 100644 --- a/fastdeploy/model_executor/layers/activation.py +++ b/fastdeploy/model_executor/layers/activation.py @@ -120,6 +120,8 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor: Returns: Tensor: Output tensor. 
""" + if self.bias is None and self.quant_scale == -1: + return paddle.nn.functional.swiglu(x) return fused_bias_act( x, bias=self.bias, diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 346251a3040..4608bd81e92 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -206,20 +206,9 @@ def get_kv_cache_shape( Calculate kv cache shape """ key_cache_shape = [max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim] - value_cache_shape = [max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim] if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp": - key_cache_shape = [ - max_num_blocks, - self.kv_num_heads, - self.block_size, - self.head_dim // 2, - ] - value_cache_shape = [ - max_num_blocks, - self.kv_num_heads, - self.block_size, - self.head_dim // 2, - ] + key_cache_shape[-1] = self.head_dim // 2 + value_cache_shape = key_cache_shape return key_cache_shape, value_cache_shape def forward_mixed( diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py index 79804aa2d5c..a5ac1876e34 100644 --- a/fastdeploy/model_executor/layers/attention/attention.py +++ b/fastdeploy/model_executor/layers/attention/attention.py @@ -229,6 +229,11 @@ def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): self.sinks.set_value(sinks_tensor) def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): + if self.use_qk_norm and ("q_norm" in param.name or "k_norm" in param.name): + loaded_weight = get_tensor(loaded_weight).astype("float32") + param.copy_(loaded_weight, False) + return + loaded_weight = get_tensor(loaded_weight).cast(paddle.get_default_dtype()) if self.quant_method.cache_quant_config.has_zero_point: # cache_int4_zp loaded_weight = 1.0 / loaded_weight diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index 3f570aacfb0..927ef99b0ae 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -63,13 +63,7 @@ class FlashAttentionMetadata(AttentionMetadata): FlashAttentionMetadata """ - rotary_embs: Optional[paddle.Tensor] = None - block_tables: Optional[paddle.Tensor] = None - - cu_seqlens_q: paddle.Tensor = None cu_seqlens_k: paddle.Tensor = None - max_seqlen_q: int = 0 - max_seqlen_k: int = 0 pre_cache_batch_ids = None pre_cache_tile_ids_per_batch = None @@ -83,7 +77,6 @@ class FlashAttentionMetadata(AttentionMetadata): _fuse_kernel_compute_dtype: str = "bf16" _dtype: paddle.dtype = paddle.bfloat16 - max_len_tensor_cpu: paddle.Tensor = None max_len_tensor_cpu_decoder: paddle.Tensor = None @@ -109,7 +102,6 @@ def __init__( FlashAttentionBackend __init__ """ super().__init__() - self.attention_metadata: FlashAttentionMetadata = None self.max_seq_len = fd_config.model_config.max_model_len self.causal = getattr(fd_config.model_config, "causal", True) @@ -133,9 +125,6 @@ def __init__( self.start_layer_index: int = fd_config.model_config.start_layer_index - if fd_config.parallel_config.expert_parallel_rank is None: - fd_config.parallel_config.expert_parallel_rank = 0 - self.rank, self.device_id = init_rank_and_device_id(fd_config) if self.flash_attn_func is None: @@ -154,15 
+143,12 @@ def __init__( "The current platform does not support Flash Attention V3, so Flash Attention V2 will be used instead." ) self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) - self.max_partition_size: int = int(os.getenv("FLAGS_max_partition_size", "32768")) + # Note(ZKK): here must be consistent with append_attn_backend.py + self.max_partition_size: int = int(os.getenv("FLAGS_max_partition_size", 1024)) self.zero_seq_enc_lens_for_decode = paddle.zeros( shape=[fd_config.scheduler_config.max_num_seqs, 1], dtype=paddle.int32 ) - def get_attntion_meta(self): - """get_attntion_meta""" - return self.attention_metadata - def get_kv_cache_shape( self, max_num_blocks: int, @@ -172,27 +158,13 @@ def get_kv_cache_shape( Calculate kv cache shape """ key_cache_shape = [max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim] - value_cache_shape = [max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim] if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp": - key_cache_shape = [ - max_num_blocks, - self.kv_num_heads, - self.block_size, - self.head_dim // 2, - ] - value_cache_shape = [ - max_num_blocks, - self.kv_num_heads, - self.block_size, - self.head_dim // 2, - ] + key_cache_shape[-1] = self.head_dim // 2 + value_cache_shape = key_cache_shape return key_cache_shape, value_cache_shape def init_attention_metadata(self, forward_meta: ForwardMeta): metadata = FlashAttentionMetadata() - metadata.cu_seqlens_q = forward_meta.cu_seqlens_q - metadata.rotary_embs = forward_meta.rotary_embs - metadata.block_tables = forward_meta.block_tables get_block_shape_and_split_kv_block( forward_meta.seq_lens_encoder, forward_meta.seq_lens_decoder, @@ -215,18 +187,20 @@ def init_attention_metadata(self, forward_meta: ForwardMeta): self.block_size, ) - ( - metadata.cu_seqlens_k, - metadata.pre_cache_batch_ids, - metadata.pre_cache_tile_ids_per_batch, - metadata.pre_cache_num_blocks_cpu, - metadata.kv_token_num_cpu, - ) = pre_cache_len_concat( - forward_meta.seq_lens_decoder, - forward_meta.seq_lens_this_time, - forward_meta.max_len_tensor_cpu[2], - self.block_size, - ) + if forward_meta.max_len_tensor_cpu[1] > 0: + ( + metadata.cu_seqlens_k, + metadata.pre_cache_batch_ids, + metadata.pre_cache_tile_ids_per_batch, + metadata.pre_cache_num_blocks_cpu, + metadata.kv_token_num_cpu, + ) = pre_cache_len_concat( + forward_meta.seq_lens_encoder, + forward_meta.seq_lens_decoder, + forward_meta.seq_lens_this_time, + forward_meta.max_len_tensor_cpu[2], + self.block_size, + ) # pd_disaggregation metadata.kv_signal_data_list = [None] * self.num_layers @@ -251,11 +225,10 @@ def init_attention_metadata(self, forward_meta: ForwardMeta): elif metadata._dtype == "float32": metadata._fuse_kernel_compute_dtype = "fp32" - metadata.max_len_tensor_cpu = forward_meta.max_len_tensor_cpu - metadata.max_len_tensor_cpu_decoder = paddle.clone(metadata.max_len_tensor_cpu) + metadata.max_len_tensor_cpu_decoder = paddle.clone(forward_meta.max_len_tensor_cpu) metadata.max_len_tensor_cpu_decoder[1] = 0 - self.attention_metadata = metadata + forward_meta.attention_metadata = metadata def forward_mixed( self, @@ -268,7 +241,7 @@ def forward_mixed( layer: Attention, forward_meta: ForwardMeta, ): - metadata = self.attention_metadata + metadata = forward_meta.attention_metadata if self.pd_disaggregation_mode == "per_query": metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( @@ -276,19 +249,21 @@ def forward_mixed( layer.layer_id + self.start_layer_index, ) - if 
metadata.max_len_tensor_cpu[1] > 0: + use_fa_do_prefill = forward_meta.max_len_tensor_cpu[1].item() > 0 + + if use_fa_do_prefill: q, k, v, _ = gqa_rope_write_cache( qkv, forward_meta.caches[2 * layer.layer_id], forward_meta.caches[2 * layer.layer_id + 1], - metadata.cu_seqlens_q, + forward_meta.cu_seqlens_q, metadata.cu_seqlens_k, - metadata.rotary_embs, + forward_meta.rotary_embs, forward_meta.seq_lens_this_time, forward_meta.seq_lens_encoder, forward_meta.seq_lens_decoder, forward_meta.batch_id_per_token, - metadata.block_tables, + forward_meta.block_tables, forward_meta.kv_batch_ids, forward_meta.kv_tile_ids_per_batch, forward_meta.kv_num_blocks_x_cpu, @@ -307,6 +282,7 @@ def forward_mixed( metadata.kv_token_num_cpu[0].item(), self.max_seq_len, getattr(layer, "rms_norm_eps", 1e-6), + layer.use_neox_rotary_style, getattr(layer, "cache_quant_type_str", "none"), self.rope_3d, ) @@ -315,7 +291,7 @@ def forward_mixed( q, k, v, - metadata.cu_seqlens_q, + forward_meta.cu_seqlens_q, metadata.cu_seqlens_k, max_seqlen_q=forward_meta.max_len_tensor_cpu[0], max_seqlen_k=forward_meta.max_len_tensor_cpu[3], @@ -327,23 +303,23 @@ def forward_mixed( qkv, forward_meta.caches[2 * layer.layer_id], forward_meta.caches[2 * layer.layer_id + 1], - self.zero_seq_enc_lens_for_decode, + self.zero_seq_enc_lens_for_decode if use_fa_do_prefill else forward_meta.seq_lens_encoder, forward_meta.seq_lens_decoder, forward_meta.seq_lens_this_time, forward_meta.batch_id_per_token, forward_meta.cu_seqlens_q, - metadata.block_tables, + forward_meta.block_tables, forward_meta.encoder_batch_ids, forward_meta.encoder_tile_ids_per_batch, forward_meta.encoder_num_blocks_x_cpu, forward_meta.kv_batch_ids, forward_meta.kv_tile_ids_per_batch, forward_meta.kv_num_blocks_x_cpu, - forward_meta.decoder_batch_ids, # from buffer - forward_meta.decoder_tile_ids_per_batch, # from buffer + forward_meta.decoder_batch_ids, + forward_meta.decoder_tile_ids_per_batch, forward_meta.decoder_num_blocks_cpu, - metadata.max_len_tensor_cpu_decoder, - metadata.rotary_embs, + metadata.max_len_tensor_cpu_decoder if use_fa_do_prefill else forward_meta.max_len_tensor_cpu, + forward_meta.rotary_embs, forward_meta.attn_mask, layer.qkv_bias, layer.qkv_scale, @@ -378,7 +354,7 @@ def forward_mixed( self.speculative_method is not None, ) - if metadata.max_len_tensor_cpu[1] > 0: + if use_fa_do_prefill: merge_prefill_decode_output( res_encoder, res_decoder, diff --git a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py index e2b6e4fd38c..2ace2cd893a 100644 --- a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py @@ -16,6 +16,7 @@ from __future__ import annotations +import os from dataclasses import dataclass, field from typing import TYPE_CHECKING, List, Optional @@ -28,6 +29,7 @@ AttentionMetadata, ) from fastdeploy.model_executor.layers.attention.ops import ( + append_attention, flash_mask_attention, get_block_shape_and_split_kv_block, gqa_rope_write_cache, @@ -48,8 +50,6 @@ else: merge_prefill_decode_output = None -import os - @dataclass class FlashMaskAttentionMetadata(AttentionMetadata): @@ -57,19 +57,6 @@ class FlashMaskAttentionMetadata(AttentionMetadata): FlashAttentionMetadata """ - rotary_embs: Optional[paddle.Tensor] = None - block_tables: Optional[paddle.Tensor] = None - - cu_seqlens_q: paddle.Tensor = None - cu_seqlens_k: paddle.Tensor = None - max_seqlen_q: int = 0 - 
max_seqlen_k: int = 0 - - pre_cache_batch_ids = None - pre_cache_tile_ids_per_batch = None - pre_cache_num_blocks_cpu = None - kv_token_num_cpu = None - # pd_disaggregation kv_signal_metadata: Optional[paddle.Tensor] = None kv_signal_data_list: List[Optional[paddle.Tensor]] = field(default_factory=list) @@ -77,7 +64,6 @@ class FlashMaskAttentionMetadata(AttentionMetadata): _fuse_kernel_compute_dtype: str = "bf16" _dtype: paddle.dtype = paddle.bfloat16 - max_len_tensor_cpu: paddle.Tensor = None max_len_tensor_cpu_decoder: paddle.Tensor = None @@ -103,7 +89,6 @@ def __init__( FlashAttentionBackend __init__ """ super().__init__() - self.attention_metadata: FlashMaskAttentionMetadata = None self.max_seq_len = fd_config.model_config.max_model_len self.causal = getattr(fd_config.model_config, "causal", True) @@ -142,10 +127,6 @@ def __init__( shape=[fd_config.scheduler_config.max_num_seqs, 1], dtype=paddle.int32 ) - def get_attntion_meta(self): - """get_attntion_meta""" - return self.attention_metadata - def get_kv_cache_shape( self, max_num_blocks: int, @@ -155,63 +136,14 @@ def get_kv_cache_shape( Calculate kv cache shape """ key_cache_shape = [max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim] - value_cache_shape = [max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim] if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp": - key_cache_shape = [ - max_num_blocks, - self.kv_num_heads, - self.block_size, - self.head_dim // 2, - ] - value_cache_shape = [ - max_num_blocks, - self.kv_num_heads, - self.block_size, - self.head_dim // 2, - ] + key_cache_shape[-1] = self.head_dim // 2 + value_cache_shape = key_cache_shape return key_cache_shape, value_cache_shape def init_attention_metadata(self, forward_meta: ForwardMeta): metadata = FlashMaskAttentionMetadata() - metadata.cu_seqlens_q = forward_meta.cu_seqlens_q - metadata.rotary_embs = forward_meta.rotary_embs - metadata.block_tables = forward_meta.block_tables - get_block_shape_and_split_kv_block( - forward_meta.seq_lens_encoder, - forward_meta.seq_lens_decoder, - forward_meta.seq_lens_this_time, - forward_meta.decoder_batch_ids, - forward_meta.decoder_tile_ids_per_batch, - forward_meta.decoder_num_blocks_cpu, - forward_meta.decoder_num_blocks_device, - forward_meta.decoder_chunk_size_device, - forward_meta.max_len_tensor_cpu, - forward_meta.encoder_batch_ids, - forward_meta.encoder_tile_ids_per_batch, - forward_meta.encoder_num_blocks_x_cpu, - forward_meta.kv_batch_ids, - forward_meta.kv_tile_ids_per_batch, - forward_meta.kv_num_blocks_x_cpu, - self.encoder_block_shape_q, - self.decoder_block_shape_q, - self.group_size, - self.block_size, - ) - - ( - metadata.cu_seqlens_k, - metadata.pre_cache_batch_ids, - metadata.pre_cache_tile_ids_per_batch, - metadata.pre_cache_num_blocks_cpu, - metadata.kv_token_num_cpu, - ) = pre_cache_len_concat( - forward_meta.seq_lens_decoder, - forward_meta.seq_lens_this_time, - forward_meta.max_len_tensor_cpu[2], - self.block_size, - ) - - # pd_disaggregation + # metadata only save pd_disaggregation info. 
metadata.kv_signal_data_list = [None] * self.num_layers if self.pd_disaggregation_mode == "per_chunk": if not self.keep_pd_step_flag and not forward_meta.is_dummy_or_profile_run: @@ -234,11 +166,10 @@ def init_attention_metadata(self, forward_meta: ForwardMeta): elif metadata._dtype == "float32": metadata._fuse_kernel_compute_dtype = "fp32" - metadata.max_len_tensor_cpu = forward_meta.max_len_tensor_cpu - metadata.max_len_tensor_cpu_decoder = paddle.clone(metadata.max_len_tensor_cpu) + metadata.max_len_tensor_cpu_decoder = paddle.clone(forward_meta.max_len_tensor_cpu) metadata.max_len_tensor_cpu_decoder[1] = 0 - self.attention_metadata = metadata + forward_meta.attention_metadata = metadata def forward_mixed( self, @@ -251,7 +182,7 @@ def forward_mixed( layer: Attention, forward_meta: ForwardMeta, ): - metadata = self.attention_metadata + metadata = forward_meta.attention_metadata if self.pd_disaggregation_mode == "per_query": metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( @@ -259,26 +190,67 @@ def forward_mixed( layer.layer_id + self.start_layer_index, ) - if metadata.max_len_tensor_cpu[1] > 0: + if layer.layer_id == 0: + get_block_shape_and_split_kv_block( + forward_meta.seq_lens_encoder, + forward_meta.seq_lens_decoder, + forward_meta.seq_lens_this_time, + forward_meta.decoder_batch_ids, + forward_meta.decoder_tile_ids_per_batch, + forward_meta.decoder_num_blocks_cpu, + forward_meta.decoder_num_blocks_device, + forward_meta.decoder_chunk_size_device, + forward_meta.max_len_tensor_cpu, + forward_meta.encoder_batch_ids, + forward_meta.encoder_tile_ids_per_batch, + forward_meta.encoder_num_blocks_x_cpu, + forward_meta.kv_batch_ids, + forward_meta.kv_tile_ids_per_batch, + forward_meta.kv_num_blocks_x_cpu, + self.encoder_block_shape_q, + self.decoder_block_shape_q, + self.group_size, + self.block_size, + ) + + # here we add five members,this is ugly, just for now. 
+ if forward_meta.max_len_tensor_cpu[1].item() > 0: + ( + forward_meta.attn_cu_seqlens_k, + forward_meta.pre_cache_batch_ids, + forward_meta.pre_cache_tile_ids_per_batch, + forward_meta.pre_cache_num_blocks_cpu, + forward_meta.kv_token_num_cpu, + ) = pre_cache_len_concat( + forward_meta.seq_lens_encoder, + forward_meta.seq_lens_decoder, + forward_meta.seq_lens_this_time, + forward_meta.max_len_tensor_cpu[2], + self.block_size, + ) + + use_fa_do_prefill = forward_meta.max_len_tensor_cpu[1].item() > 0 + + if use_fa_do_prefill: res_encoder = paddle.zeros([qkv.shape[0], self.num_heads * self.head_dim], dtype=qkv.dtype) q, k, v, _ = gqa_rope_write_cache( qkv, forward_meta.caches[2 * layer.layer_id], forward_meta.caches[2 * layer.layer_id + 1], - metadata.cu_seqlens_q, - metadata.cu_seqlens_k, - metadata.rotary_embs, + forward_meta.cu_seqlens_q, + forward_meta.attn_cu_seqlens_k, + forward_meta.rotary_embs, forward_meta.seq_lens_this_time, forward_meta.seq_lens_encoder, forward_meta.seq_lens_decoder, forward_meta.batch_id_per_token, - metadata.block_tables, + forward_meta.block_tables, forward_meta.kv_batch_ids, forward_meta.kv_tile_ids_per_batch, forward_meta.kv_num_blocks_x_cpu, - metadata.pre_cache_batch_ids, - metadata.pre_cache_tile_ids_per_batch, - metadata.pre_cache_num_blocks_cpu, + forward_meta.pre_cache_batch_ids, + forward_meta.pre_cache_tile_ids_per_batch, + forward_meta.pre_cache_num_blocks_cpu, getattr(layer, "q_norm_weight", None), getattr(layer, "k_norm_weight", None), getattr(layer, "cache_k_scale", None), @@ -288,9 +260,10 @@ def forward_mixed( getattr(layer, "cache_k_zp", None), getattr(layer, "cache_v_zp", None), metadata.kv_signal_data_list[layer.layer_id], - metadata.kv_token_num_cpu[0].item(), + forward_meta.kv_token_num_cpu[0].item(), self.max_seq_len, getattr(layer, "rms_norm_eps", 1e-6), + layer.use_neox_rotary_style, getattr(layer, "cache_quant_type_str", "none"), self.rope_3d, ) @@ -299,8 +272,8 @@ def forward_mixed( q, k, v, - metadata.cu_seqlens_q, - metadata.cu_seqlens_k, + forward_meta.cu_seqlens_q, + forward_meta.attn_cu_seqlens_k, forward_meta.seq_lens_encoder, res_encoder, forward_meta.attn_mask_offsets, @@ -311,6 +284,74 @@ def forward_mixed( q.shape[0], k.shape[0], ) + + res_decoder = append_attention( + qkv, + forward_meta.caches[2 * layer.layer_id], + forward_meta.caches[2 * layer.layer_id + 1], + self.zero_seq_enc_lens_for_decode if use_fa_do_prefill else forward_meta.seq_lens_encoder, + forward_meta.seq_lens_decoder, + forward_meta.seq_lens_this_time, + forward_meta.batch_id_per_token, + forward_meta.cu_seqlens_q, + forward_meta.block_tables, + forward_meta.encoder_batch_ids, + forward_meta.encoder_tile_ids_per_batch, + forward_meta.encoder_num_blocks_x_cpu, + forward_meta.kv_batch_ids, + forward_meta.kv_tile_ids_per_batch, + forward_meta.kv_num_blocks_x_cpu, + forward_meta.decoder_batch_ids, + forward_meta.decoder_tile_ids_per_batch, + forward_meta.decoder_num_blocks_cpu, + metadata.max_len_tensor_cpu_decoder if use_fa_do_prefill else forward_meta.max_len_tensor_cpu, + forward_meta.rotary_embs, + forward_meta.attn_mask, + layer.qkv_bias, + layer.qkv_scale, + getattr(layer, "cache_k_scale", None), + getattr(layer, "cache_v_scale", None), + getattr(layer, "cache_k_out_scale", None), + getattr(layer, "cache_v_out_scale", None), + getattr(layer, "cache_k_zp", None), + getattr(layer, "cache_v_zp", None), + layer.linear_shift, + layer.linear_smooth, + forward_meta.attn_mask_offsets, + metadata.kv_signal_data_list[layer.layer_id], + getattr(layer, 
"q_norm_weight", None), + getattr(layer, "k_norm_weight", None), + getattr(layer, "sinks", None), + getattr(layer, "rms_norm_eps", 1e-6), + metadata._fuse_kernel_compute_dtype, + getattr(layer, "cache_quant_type_str", "none"), + layer.use_neox_rotary_style, + self.rope_3d, + self.max_seq_len, + getattr(layer, "quant_max_bound", 0.0), + getattr(layer, "quant_min_bound", 0.0), + getattr(layer, "out_scale", -1.0), + self.encoder_block_shape_q, + self.decoder_block_shape_q, + self.max_partition_size, + self.max_seq_len, + self.speculate_max_draft_token_num + 1, + self.causal, + self.speculative_method is not None, + ) + + if use_fa_do_prefill: + merge_prefill_decode_output( + res_encoder, + res_decoder, + forward_meta.seq_lens_encoder, + forward_meta.seq_lens_decoder, + forward_meta.seq_lens_this_time, + forward_meta.cu_seqlens_q, + self.num_heads, + self.head_dim, + self.speculate_max_draft_token_num + 1, + ) return res_encoder else: - raise NotImplementedError("FlashMaskAttentionBackend is not supported for decode.") + return res_decoder diff --git a/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py b/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py index 670fa65f3ef..ef9ab022dd0 100644 --- a/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py +++ b/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py @@ -51,6 +51,7 @@ def gqa_rope_write_cache( kv_token_num: int = 1, max_seq_len: int = 0, rms_norm_eps: float = 1e-6, + use_neox_rotary_style: bool = False, cache_quant_type: str = "none", rope_3d: bool = False, ): @@ -87,6 +88,7 @@ def gqa_rope_write_cache( kv_token_num, max_seq_len, rms_norm_eps, + use_neox_rotary_style, cache_quant_type, rope_3d, ) diff --git a/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py b/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py index 42a931d18f4..68eed2c8a21 100644 --- a/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py +++ b/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py @@ -24,6 +24,7 @@ def pre_cache_len_concat( + seq_lens_encoder: paddle.Tensor, seq_lens_decoder: paddle.Tensor, seq_lens_this_time: paddle.Tensor, max_dec_len: int = 0, @@ -32,7 +33,7 @@ def pre_cache_len_concat( if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import pre_cache_len_concat - out = pre_cache_len_concat(seq_lens_decoder, seq_lens_this_time, max_dec_len, block_size) + out = pre_cache_len_concat(seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, max_dec_len, block_size) return out else: raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py index 918450c74f1..192c0b8833a 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py +++ b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py @@ -14,6 +14,8 @@ # limitations under the License. """ +from typing import Callable + import paddle from paddle import nn @@ -101,6 +103,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Triton compute Fused MoE. 
@@ -117,6 +120,8 @@ def apply( scores += layer.gate_correction_bias topk_weights, topk_ids = paddle.topk(scores, k=top_k, axis=-1, sorted=False) topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) intermediate_cache1 = paddle.empty( [token_num * top_k, moe_intermediate_size * 2], diff --git a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py index e67dd6dbdaf..2260d7caf7b 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py @@ -16,6 +16,7 @@ import multiprocessing import os +from typing import Callable import numpy as np import paddle @@ -182,6 +183,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle gcu compute Fused MoE. @@ -194,6 +196,7 @@ def apply_ep_prefill( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP prefill method. @@ -205,6 +208,7 @@ def apply_ep_decode( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP decoder method. @@ -216,6 +220,7 @@ def apply_tp( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle Cutlass compute Fused MoE. @@ -381,6 +386,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle gcu compute Fused MoE. diff --git a/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py b/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py index d47bfc86b93..8e4d7b1cc5e 100644 --- a/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py +++ b/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py @@ -14,6 +14,8 @@ # limitations under the License. """ +from typing import Callable + import paddle from paddle import nn @@ -48,6 +50,7 @@ def apply_ep_prefill( layer: nn.Layer, x: paddle.Tensor, gate_out: paddle.Tensor, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP prefill method. @@ -59,6 +62,7 @@ def apply_ep_decode( layer: nn.Layer, x: paddle.Tensor, gate_out: paddle.Tensor, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP decoder method. @@ -70,6 +74,7 @@ def apply_tp( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle hpu Fused MoE. @@ -142,6 +147,7 @@ def apply_ep_prefill( layer: nn.Layer, x: paddle.Tensor, gate_out: paddle.Tensor, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP prefill method. @@ -153,6 +159,7 @@ def apply_ep_decode( layer: nn.Layer, x: paddle.Tensor, gate_out: paddle.Tensor, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP decoder method. @@ -164,6 +171,7 @@ def apply_tp( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle hpu Fused MoE. 
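Note on the new topk_ids_hookfunc parameter threaded through the DCU/GCU/HPU (and, below, Metax/XPU) MoE backends: the backends that actually implement it call the hook with the keyword argument topk_ids, a [num_tokens, top_k] tensor of selected expert ids, right after top-k routing and before the expert GEMMs. A minimal caller-side sketch, assuming only the call contract visible in this patch (the storage scheme and helper names below are illustrative, not part of the change):

from collections import defaultdict

def make_topk_ids_hook(storage):
    # Return a hook matching the topk_ids_hookfunc(topk_ids=...) call contract used by the backends.
    def hook(topk_ids):
        # topk_ids: paddle.Tensor of shape [num_tokens, top_k] holding the chosen expert ids.
        # Keep a host-side copy so the routing decision can be inspected or replayed later.
        storage["topk_ids"].append(topk_ids.numpy())
    return hook

routing_log = defaultdict(list)
hook = make_topk_ids_hook(routing_log)
# A backend that supports the hook would then be invoked roughly as:
# out = quant_method.apply(layer, x, gate, topk_ids_hookfunc=hook)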
diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py index 7b61d58b6f5..fbbfac277b8 100644 --- a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py @@ -14,6 +14,8 @@ # limitations under the License. """ +from typing import Callable + import paddle from paddle import nn @@ -245,6 +247,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Triton compute Fused MoE. @@ -274,6 +277,9 @@ def apply( True, # apply_norm_weight False, ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) + up_gate_proj_out = paddle.empty( [token_num * top_k, moe_intermediate_size * 2], dtype=x.dtype, diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index 3a14e28e305..4356f8cc442 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -14,6 +14,8 @@ # limitations under the License. """ +from typing import Callable + import paddle from paddle import nn @@ -235,6 +237,7 @@ def apply_tp_fused_op( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply TP Fused Op. @@ -262,6 +265,7 @@ def apply_tp_scatter_op( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply TP Scatter Op. @@ -318,6 +322,7 @@ def apply_tp( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ apply tp @@ -368,6 +373,7 @@ def apply_ep_prefill( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP prefill method. @@ -442,6 +448,7 @@ def apply_ep_decode( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP decoder method. @@ -488,6 +495,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ compute Fused MoE. 
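One plausible consumer of this hook is the routing_replay_table buffer added to ForwardMeta earlier in this diff; the patch does not wire the two together here, so the following is only a sketch of how a per-layer hook could populate such a preallocated table (the table shape, layer_idx, and token_offset below are assumptions, not taken from this patch):

import paddle

def make_replay_hook(routing_replay_table, layer_idx, token_offset=0):
    # Assumed table layout: [num_moe_layers, max_num_tokens, top_k], preallocated by the model runner.
    def hook(topk_ids):
        num_tokens = topk_ids.shape[0]
        # Write this layer's routing result into the replay buffer at the current token window.
        routing_replay_table[layer_idx, token_offset : token_offset + num_tokens] = topk_ids
    return hook

# Example with dummy shapes only:
table = paddle.zeros([4, 128, 8], dtype="int32")
ids = paddle.randint(0, 64, shape=[16, 8], dtype="int32")
make_replay_hook(table, layer_idx=0)(topk_ids=ids)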
diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index 52d7dadeebc..5ae82efe4ca 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -283,10 +283,12 @@ def weight_loader(self, param, loaded_weight, shard_id=None): if output_dim == 0: h2d_copy(param[: shard_weight.shape[0]], shard_weight) if not current_platform.is_maca(): - param[shard_weight.shape[0] :].fill_(0) + if param.shape[0] != shard_weight.shape[0]: + param[shard_weight.shape[0] :].fill_(0) else: h2d_copy(param[:, : shard_weight.shape[1]], shard_weight) - param[:, shard_weight.shape[1] :].fill_(0) + if param.shape[1] != shard_weight.shape[1]: + param[:, shard_weight.shape[1] :].fill_(0) def forward(self, ids_remove_padding=None) -> paddle.Tensor: """ diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index e126aed2ba1..49b25dc3d0c 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -356,22 +356,31 @@ def __init__( self.output_sizes = output_sizes def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): - assert loaded_shard_id in ["q_a", "kv_a"] if not param._is_initialized(): param.initialize() + if loaded_shard_id is None: + axis = -1 if (self.fd_config.model_config.model_format == "torch") ^ True else 0 + if hasattr(param, "tensor_track"): + param.tensor_track.mark(start=0, end=loaded_weight.shape[axis]) - if loaded_shard_id == "q_a": - param_shard_offset = 0 - param_shard_size = self.output_sizes[0] else: - # loaded_shard_id == "kv_a" - param_shard_offset = self.output_sizes[0] - param_shard_size = self.output_sizes[1] - param_output_dim = True - if hasattr(param, "tensor_track"): - param_output_dim = param.tensor_track.output_dim - param.tensor_track.mark(start=param_shard_offset, end=param_shard_offset + param_shard_size) - param = slice_fn(param, param_output_dim, start=param_shard_offset, end=param_shard_offset + param_shard_size) + assert loaded_shard_id in ["q_a", "kv_a", "gate", "up"] + + if loaded_shard_id in ["q_a", "gate"]: + param_shard_offset = 0 + param_shard_size = self.output_sizes[0] + elif loaded_shard_id in ["kv_a", "up"]: + param_shard_offset = self.output_sizes[0] + param_shard_size = self.output_sizes[1] + + if hasattr(param, "tensor_track"): + param.tensor_track.mark(start=param_shard_offset, end=param_shard_offset + param_shard_size) + param = slice_fn( + param, + (self.fd_config.model_config.model_format == "torch") ^ True, + start=param_shard_offset, + end=param_shard_offset + param_shard_size, + ) assert param.shape == loaded_weight.shape, ( f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})" ) @@ -974,7 +983,12 @@ def __init__( def process_weights_after_loading(self): if self.fd_config.load_config.dynamic_load_weight: return - w = self.kv_b_proj.weight.reshape( + w = ( + self.kv_b_proj.weight.transpose([1, 0]) + if self.fd_config.model_config.model_format == "torch" + else self.kv_b_proj.weight + ) + w = w.reshape( [ self.kv_lora_rank, self.num_heads_per_partition, diff --git a/fastdeploy/model_executor/layers/lm_head.py b/fastdeploy/model_executor/layers/lm_head.py index ff2797a0415..a7bff3905b0 100644 --- a/fastdeploy/model_executor/layers/lm_head.py +++ b/fastdeploy/model_executor/layers/lm_head.py @@ -102,6 +102,10 @@ def __init__( }, ) set_weight_attrs(self.linear.weight, {"output_dim": True}) + if 
self.tp_size > 1: + if with_bias: + set_weight_attrs(self.linear.bias, {"output_dim": True}) + else: self.linear = RowParallelLinear( embedding_dim, diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py index b61fe48f6a1..43bfd1a0557 100644 --- a/fastdeploy/model_executor/layers/moe/ep.py +++ b/fastdeploy/model_executor/layers/moe/ep.py @@ -14,24 +14,64 @@ # limitations under the License. """ +from __future__ import annotations + +import traceback from abc import abstractmethod +from types import ModuleType +from typing import Optional import paddle from paddle import nn from paddleformers.utils.log import logger -try: - from paddle.distributed.communication import deep_ep -except: - logger.warning("import deep_ep Failed!") - -from typing import Optional - import fastdeploy +from fastdeploy import envs from fastdeploy.config import MoEPhase from fastdeploy.utils import singleton +def load_deep_ep() -> ModuleType: + """ + Load DeepEP module according to FastDeploy env switch. + + Returns: + Imported deep_ep module object. + """ + + try: + if envs.FD_USE_PFCC_DEEP_EP: + # Enable torch proxy before importing deep_ep (required by PFCC/PaddleFleet variants) + paddle.compat.enable_torch_proxy(scope={"deep_ep"}) + try: + import paddlefleet.ops.deep_ep as deep_ep # type: ignore + + logger.info("FD use PaddleFleet/DeepEP now.") + return deep_ep + except ModuleNotFoundError: + import deep_ep # type: ignore + + logger.info("FD use PFCCLab/DeepEP now.") + return deep_ep + else: + from paddle.distributed.communication import deep_ep # type: ignore + + logger.info("FD use Paddle/DeepEP now.") + return deep_ep + except Exception as e: + logger.error( + "import deep_ep failed! FD_USE_PFCC_DEEP_EP=%s. type=%s, err=%s", + envs.FD_USE_PFCC_DEEP_EP, + type(e).__name__, + e, + ) + logger.error("Traceback:\n%s", traceback.format_exc()) + raise + + +deep_ep = load_deep_ep() + + class DeepEPBufferManager: _engine: Optional["DeepEPEngine"] = None @@ -280,23 +320,40 @@ def low_latency_dispatch( if self.deepep_engine is None: raise RuntimeError("DeepEP buffer not initialized!") - ( - packed_recv_x, - recv_expert_count, - handle, - _, - dispatch_hook, - ) = self.deepep_engine.low_latency_dispatch( - hidden_states, - topk_idx, - expertwise_scale, - self.buffer.num_max_dispatch_tokens_per_rank, - self.num_experts, - use_fp8=use_fp8, - async_finish=False, - return_recv_hook=True, - num_per_channel=quant_group_size, - ) + if envs.FD_USE_PFCC_DEEP_EP: + ( + packed_recv_x, + recv_expert_count, + handle, + _, + dispatch_hook, + ) = self.deepep_engine.low_latency_dispatch( + hidden_states, + topk_idx, + self.buffer.num_max_dispatch_tokens_per_rank, + self.num_experts, + use_fp8=use_fp8, + async_finish=False, + return_recv_hook=True, + ) + else: + ( + packed_recv_x, + recv_expert_count, + handle, + _, + dispatch_hook, + ) = self.deepep_engine.low_latency_dispatch( + hidden_states, + topk_idx, + expertwise_scale, + self.buffer.num_max_dispatch_tokens_per_rank, + self.num_experts, + use_fp8=use_fp8, + async_finish=False, + return_recv_hook=True, + num_per_channel=quant_group_size, + ) return packed_recv_x, recv_expert_count, handle, dispatch_hook diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index b34291a96f4..729295d9244 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -15,6 +15,7 @@ """ 
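# Sketch of how the selection above is consumed: load_deep_ep() resolves the DeepEP backend once,
# at import time of fastdeploy.model_executor.layers.moe.ep, so callers import the symbol from
# there instead of from paddle.distributed.communication directly. Illustrative only; the import
# and the Buffer.capture() call mirror what the DeepGEMM backend does later in this patch.
from fastdeploy.model_executor.layers.moe.ep import deep_ep

event = deep_ep.Buffer.capture()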
from abc import abstractmethod +from typing import Callable import paddle from paddle import nn @@ -22,9 +23,8 @@ from fastdeploy.model_executor.utils import ( TensorTracker, default_weight_loader, - free_tensor, + process_weight_transpose, set_weight_attrs, - weight_fully_copied, ) from fastdeploy.platforms import current_platform @@ -163,6 +163,7 @@ def apply_ep_prefill( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP prefill method. @@ -175,6 +176,7 @@ def apply_ep_decode( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP decoder method. @@ -187,6 +189,7 @@ def apply_tp( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle Cutlass compute Fused MoE. @@ -198,6 +201,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle Cutlass compute Fused MoE. @@ -207,13 +211,13 @@ def apply( if layer.fd_config.model_config.moe_phase.phase == "prefill": if layer.fd_config.scheduler_config.splitwise_role == "mixed" and is_moe_start_layer: self.ep_prefill_runner.clean_low_latency_buffer() - return self.apply_ep_prefill(layer, x, gate) + return self.apply_ep_prefill(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc) else: if layer.fd_config.scheduler_config.splitwise_role == "mixed" and is_moe_start_layer: self.ep_decoder_runner.clean_low_latency_buffer() - return self.apply_ep_decode(layer, x, gate) + return self.apply_ep_decode(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc) else: - return self.apply_tp(layer, x, gate) + return self.apply_tp(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc) class UnquantizedFusedMoEMethod(MoEMethodBase): @@ -307,25 +311,5 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): def process_weights_after_loading(self, layer): if self.model_format != "torch": return - if not weight_fully_copied(layer.up_gate_proj_weight) or not weight_fully_copied(layer.down_proj_weight): - return - up_gate_proj_weight_transpose = layer.up_gate_proj_weight.transpose([0, 2, 1]) - down_proj_weight_transpose = layer.down_proj_weight.transpose([0, 2, 1]) - up_gate_proj = layer.create_parameter( - shape=up_gate_proj_weight_transpose.shape, - dtype=up_gate_proj_weight_transpose.dtype, - default_initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.02), - is_bias=False, - ) - up_gate_proj.copy_(up_gate_proj_weight_transpose, False) - free_tensor(layer.up_gate_proj_weight) - layer.up_gate_proj_weight = up_gate_proj - down_proj = layer.create_parameter( - shape=down_proj_weight_transpose.shape, - dtype=down_proj_weight_transpose.dtype, - default_initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.02), - is_bias=False, - ) - down_proj.copy_(down_proj_weight_transpose, False) - free_tensor(layer.down_proj_weight) - layer.down_proj_weight = down_proj + process_weight_transpose(layer, "up_gate_proj_weight") + process_weight_transpose(layer, "down_proj_weight") diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index e45ad63b19c..c3dbfc9ba5f 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -14,6 +14,8 @@ # limitations under the License. 
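# The two process_weight_transpose calls above replace the inline transpose/copy/free sequence
# that this hunk removes. A rough, simplified equivalent of that removed logic (the real helper
# lives in fastdeploy.model_executor.utils and may differ in detail):
import paddle

def transpose_weight_inplace(layer, name: str) -> None:
    old = getattr(layer, name)
    transposed = old.transpose([0, 2, 1])  # swap the last two dims, as the removed code did
    new_param = layer.create_parameter(
        shape=transposed.shape,
        dtype=transposed.dtype,
        is_bias=False,
    )
    new_param.copy_(transposed, False)
    setattr(layer, name, new_param)  # the removed code also freed the old tensor at this point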
""" +from typing import Callable + import paddle from paddle import nn from paddle.nn.quant import weight_quantize @@ -132,6 +134,7 @@ def apply_ep_prefill( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP prefill method. @@ -148,8 +151,13 @@ def apply_ep_prefill( handle, event, ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + if self.ep_prefill_runner.ep_engine.async_finish: event.current_stream_wait() + token_all_num = sum(recv_num_tokens_per_expert_list) # 3. Compute ffn @@ -217,6 +225,7 @@ def apply_ep_decode( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP decoder method. @@ -225,6 +234,10 @@ def apply_ep_decode( estimate_total_token_nums = gate_out.shape[0] * layer.top_k # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + expertwise_scale = None if hasattr(layer, "up_gate_proj_in_scale_all_experts"): # only use in w4a8 expertwise_scale = getattr(layer, "up_gate_proj_in_scale_all_experts", None) @@ -269,6 +282,7 @@ def apply_tp( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle Cutlass compute Fused MoE. @@ -369,6 +383,9 @@ def apply_tp( if hasattr(layer, "up_gate_proj_in_scale"): dequant_scale = None + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + if not layer.with_bias and self.moe_quant_type != "w4a8" and self.moe_quant_type != "w4afp8": # only w4a8 need expert_idx_per_token # Other need not this tensor, so we make it None. diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 1245cddcebc..0a0440e487d 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -14,12 +14,14 @@ # limitations under the License. """ +from typing import Callable + import paddle from paddle import nn -from paddle.distributed.communication import deep_ep from paddleformers.utils.log import logger import fastdeploy +from fastdeploy.model_executor.layers.moe.ep import deep_ep from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func, deep_gemm from fastdeploy.worker.tbo import let_another_thread_run @@ -139,6 +141,7 @@ def apply_ep_prefill( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP prefill method. @@ -147,10 +150,15 @@ def apply_ep_prefill( # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + # 2. 
Dynamic compute blockwise quantization scales - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - x, self.quant_config.weight_block_size[0] + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=False ) + x_scale_tensor = x_scale_tensor[: x.shape[0]] event = deep_ep.Buffer.capture() let_another_thread_run() @@ -218,11 +226,10 @@ def apply_ep_prefill( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0] + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, using_pow2_scale=False ) - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), @@ -264,6 +271,7 @@ def apply_ep_decode( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Apply the EP decoder method. @@ -271,6 +279,10 @@ def apply_ep_decode( gate_out = gate(x.cast("float32")) # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + # 2. EP Dispatch permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch( x, topk_idx, topk_weights, use_fp8=True @@ -335,6 +347,7 @@ def apply_tp( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Paddle Use DeepGemm compute Fused MoE. @@ -363,9 +376,17 @@ def apply_tp( False, ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) + tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) - recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) + recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False, + output_scale_transpose=False, + ) + recv_x_scale = recv_x_scale[: recv_x.shape[0]] ( permute_input, @@ -406,12 +427,10 @@ def apply_tp( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0] + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, using_pow2_scale=False ) - - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] ffn_out = paddle.empty( (ffn_out.shape[0], getattr(layer, self.added_weight_attrs[1]).shape[1]), diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py index 094d3df8f1a..cd836dbaf09 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py @@ -14,6 +14,8 @@ # limitations under the License. 
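# The custom per_token_quant op is swapped above for paddle.incubate.nn.functional.fp8_quant_blockwise
# (argument names taken from this patch). Sketch of the recurring call pattern; the scale tensor is
# sliced back to the real row count because the op may pad its scale output:
import paddle

def per_token_fp8(x: paddle.Tensor):
    x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
        x, using_pow2_scale=False, output_scale_transpose=False
    )
    return x_q, x_scale[: x.shape[0]]
# When output_scale_transpose is left at its default, the patch transposes first: x_scale.T[: x_q.shape[0]].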
""" +from typing import Callable + import paddle from paddle import nn @@ -239,6 +241,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Marlin compute Fused MoE. @@ -273,6 +276,9 @@ def apply( False, ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) + block_size_m = 64 for m in [8, 16, 32, 48, 64]: diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index 3c14859375c..922729d91bd 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -14,6 +14,8 @@ # limitations under the License. """ +from typing import Callable + import paddle from paddle import nn @@ -282,6 +284,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Triton compute Fused MoE. @@ -314,6 +317,10 @@ def apply( True, # apply_norm_weight, False, ) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) + up_gate_proj_out = paddle.empty( [token_num * top_k, moe_intermediate_size * 2], dtype=x.dtype, @@ -664,6 +671,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Triton compute Fused MoE. @@ -724,6 +732,9 @@ def apply( * ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) + up_gate_proj_out = paddle.empty( [token_num * top_k, moe_intermediate_size * 2], dtype=x.dtype, @@ -953,6 +964,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Triton compute Fused MoE. @@ -974,6 +986,9 @@ def apply( False, ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) + up_gate_proj_out = paddle.empty( [token_num * top_k, moe_intermediate_size * 2], dtype=x.dtype, @@ -1228,6 +1243,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): down_proj_attrs, ) else: + # offline quant # 1.init shape extra_weight_attrs = {**extra_weight_attrs} if layer.fd_config.load_config.load_choices == "default_v1": @@ -1243,17 +1259,9 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): down_proj_scale_shape = self.down_proj_scale_shape[:1] + self.down_proj_scale_shape[1:][::-1] up_gate_proj_attrs = { **extra_weight_attrs, - "tensor_track": TensorTracker( - shape=up_gate_proj_weight_shape, - output_dim=False, - ), } down_proj_attrs = { **extra_weight_attrs, - "tensor_track": TensorTracker( - shape=down_proj_weight_shape, - output_dim=False, - ), } else: up_gate_proj_weight_shape = self.up_gate_proj_weight_shape @@ -1466,6 +1474,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Triton compute Fused MoE. 
@@ -1488,6 +1497,8 @@ def apply( True, # apply_norm_weight False, ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) config = { "BLOCK_SIZE_M": 64, @@ -1514,7 +1525,10 @@ def apply( from .triton_moe_kernels import fused_moe_kernel_paddle - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, self.quant_config.weight_block_size[0]) + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=False + ) + x_scale = x_scale[: x.shape[0]] fused_moe_kernel_paddle[grid]( x_q, @@ -1567,9 +1581,10 @@ def apply( ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]), ) - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( - intermediate_cache2, self.quant_config.weight_block_size[0] + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False ) + x_scale = x_scale[: x_q.shape[0]] fused_moe_kernel_paddle[grid]( x_q, diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py index f75e36bcbdd..3c548ba57c8 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py @@ -14,6 +14,8 @@ # limitations under the License. """ +from typing import Callable + import paddle from paddle import nn @@ -261,6 +263,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Use Wint2 Triton Fusedmoe compute Fused MoE. @@ -288,6 +291,9 @@ def apply( topk_only_mode=False, ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + ffn_out = fastdeploy.model_executor.ops.gpu.moe_expert_ffn_wint2( permute_input, token_nums_per_expert, @@ -328,6 +334,7 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, + topk_ids_hookfunc: Callable = None, ) -> paddle.Tensor: """ Use Wint2 Triton Fusedmoe compute Fused MoE. @@ -343,6 +350,9 @@ def apply( False, ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_ids) + num_tokens, K = x.shape E, _, N = layer.up_gate_proj_weight.shape M = num_tokens diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 743e05031f6..683a95fa767 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -14,7 +14,8 @@ # limitations under the License. 
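# Quick sanity check of the kernel launch-grid arithmetic used above (illustrative numbers only):
def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

max_num_tokens_padded, hidden_size = 512, 7168
BLOCK_SIZE_M, BLOCK_SIZE_N = 64, 64
grid = (ceil_div(max_num_tokens_padded, BLOCK_SIZE_M) * ceil_div(hidden_size, BLOCK_SIZE_N),)
assert grid == (8 * 112,)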
""" -from typing import Optional +from functools import partial +from typing import Callable, Optional import paddle from paddle import nn @@ -26,6 +27,9 @@ tensor_model_parallel_all_reduce_custom, ) from fastdeploy.model_executor.forward_meta import ForwardMeta +from fastdeploy.model_executor.layers.moe.routing_indices_cache import ( + save_routing_to_buffer, +) from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.model_executor.utils import h2d_copy, slice_fn from fastdeploy.platforms import current_platform @@ -226,7 +230,7 @@ def __init__( self.is_rearrange = False if self.ep_size > 1: self.quant_method.init_ep(self) - + self.enable_routing_replay = fd_config.routing_replay_config.enable_routing_replay # Merge normal and RL build model if gate_correction_bias is not None: self.gate_correction_bias = gate_correction_bias @@ -270,10 +274,13 @@ def weight_loader( if not param._is_initialized(): param.initialize() weight_need_transpose = getattr(param, "weight_need_transpose", False) + + if self.ep_size > 1 or weight_need_transpose: + loaded_weight = get_tensor(loaded_weight) + if shard_id is None: # 1.gate up fused in disk if weight_need_transpose: - loaded_weight = get_tensor(loaded_weight) loaded_weight = loaded_weight.transpose([1, 0]) output_size = param[expert_id - self.expert_id_offset].shape[SHARD_ID_TO_SHARDED_DIM["gate"]] shard_offsets = [ @@ -289,7 +296,6 @@ def weight_loader( self.weight_loader(param, loaded_weight_shard, expert_id, shard_id, "fused") else: if weight_need_transpose and source != "fused": - loaded_weight = get_tensor(loaded_weight) loaded_weight = loaded_weight.transpose([1, 0]) # 2.gate up splited in disk assert shard_id in ["gate", "down", "up"] @@ -600,7 +606,7 @@ def load_state_dict(self, state_dict, is_rearrange: bool = False): else: self.quant_method.process_loaded_weights(self, state_dict) - def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer): + def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer, topk_ids_hookfunc: Callable = None): """ Forward split allgather function. """ @@ -615,14 +621,14 @@ def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer): if end_offset > token_num: end_offset = token_num part_x[: (end_offset - start_offset), :] = x[start_offset:end_offset, :] - out = self.quant_method.apply(self, part_x, gate) + out = self.quant_method.apply(self, part_x, gate, topk_ids_hookfunc=topk_ids_hookfunc) multi_outs = paddle.zeros([token_num_per_rank * self.attn_tp_size, x.shape[1]], dtype=x.dtype) paddle.distributed.all_gather(multi_outs, out, self.tp_group) out = multi_outs[:token_num, :] return out - def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta): + def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta = None): """ Defines the forward computation of the moe layer. @@ -633,6 +639,23 @@ def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta): Tensor: Output tensor.s """ + topk_ids_hookfunc = None + if self.enable_routing_replay: + # When execute empty_input_forward forward_meta is None. When execute mtp layer routing_replay_table is None. 
+ if forward_meta is not None and forward_meta.routing_replay_table is not None: + moe_layer_idx = self.layer_idx - self.fd_config.model_config.moe_layer_start_index + topk_ids_hookfunc = partial( + save_routing_to_buffer, + routing_replay_table=forward_meta.routing_replay_table, + batch_id_per_token=forward_meta.batch_id_per_token, + seq_lens_decoder=forward_meta.seq_lens_decoder, + cu_seqlens_q=forward_meta.cu_seqlens_q, + layer_idx=moe_layer_idx, + tp_size=self.fd_config.parallel_config.tensor_parallel_size, + ep_size=self.fd_config.parallel_config.expert_parallel_size, + tp_group=self.fd_config.parallel_config.tp_group, + ) + token_num = x.shape[0] if ( self.ep_size > 1 @@ -640,11 +663,16 @@ def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta): and (not self.fd_config.parallel_config.use_sequence_parallel_moe) and token_num >= self.attn_tp_size ): - out = self.forward_split_allgather(x, gate) + out = self.forward_split_allgather(x, gate, topk_ids_hookfunc=topk_ids_hookfunc) elif self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.enable_chunked_moe: - out = self.forward_chunked_moe(x, gate, forward_meta) + out = self.forward_chunked_moe( + x, + gate, + forward_meta, + topk_ids_hookfunc=topk_ids_hookfunc, + ) else: - out = self.forward_normal(x, gate) + out = self.forward_normal(x, gate, forward_meta, topk_ids_hookfunc=topk_ids_hookfunc) if self.reduce_results and self.tp_size > 1: if current_platform.is_intel_hpu(): @@ -653,7 +681,9 @@ def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta): out = tensor_model_parallel_all_reduce(out, self.tp_group) return out - def forward_chunked_moe(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta): + def forward_chunked_moe( + self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta, topk_ids_hookfunc: Callable = None + ): """ Split input to multi chunk to reduce the memory usage of moe. @@ -677,21 +707,25 @@ def forward_chunked_moe(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: Fo for i in range(forward_meta.max_moe_num_chunk): if i < forward_meta.moe_num_chunk: - out_split_list[i] = self.quant_method.apply(self, x_split_list[i], gate) + out_split_list[i] = self.quant_method.apply( + self, x_split_list[i], gate, topk_ids_hookfunc=topk_ids_hookfunc + ) else: # just need to use real data to infer max_moe_num_chunk times. - self.quant_method.apply(self, fake_x, gate) + self.quant_method.apply(self, fake_x, gate, topk_ids_hookfunc=topk_ids_hookfunc) out = paddle.concat(out_split_list, axis=0) else: # when only one chunk, just need to use real data to infer once. - out = self.quant_method.apply(self, x, gate) + out = self.quant_method.apply(self, x, gate, topk_ids_hookfunc=topk_ids_hookfunc) for i in range(forward_meta.max_moe_num_chunk - 1): - self.quant_method.apply(self, fake_x, gate) + self.quant_method.apply(self, fake_x, gate, topk_ids_hookfunc=topk_ids_hookfunc) return out - def forward_normal(self, x: paddle.Tensor, gate: nn.Layer): + def forward_normal( + self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta, topk_ids_hookfunc: Callable = None + ): """ Normal mode of forward. 
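# With routing replay enabled, the functools.partial above pre-binds every argument of
# save_routing_to_buffer except topk_ids, so the MoE backends only ever call hook(topk_ids=...).
# Minimal illustration with a stand-in function (not the real one):
from functools import partial

def save(routing_replay_table, topk_ids, layer_idx):
    return routing_replay_table, topk_ids, layer_idx

hook = partial(save, routing_replay_table="table", layer_idx=3)
assert hook(topk_ids="ids") == ("table", "ids", 3)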
@@ -702,5 +736,5 @@ def forward_normal(self, x: paddle.Tensor, gate: nn.Layer): Tensor: Output tensor.s """ - out = self.quant_method.apply(self, x, gate) + out = self.quant_method.apply(self, x, gate, topk_ids_hookfunc=topk_ids_hookfunc) return out diff --git a/fastdeploy/model_executor/layers/moe/routing_indices_cache.py b/fastdeploy/model_executor/layers/moe/routing_indices_cache.py new file mode 100644 index 00000000000..795fbafeb8f --- /dev/null +++ b/fastdeploy/model_executor/layers/moe/routing_indices_cache.py @@ -0,0 +1,830 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import asyncio +import atexit +import functools +import multiprocessing +import os +import shutil +import threading +import time +import traceback +from abc import ABC, abstractmethod +from concurrent.futures import ThreadPoolExecutor +from multiprocessing import Process, Queue +from typing import Dict, Optional, TypedDict + +import numpy as np +import paddle +import paddle.distributed as dist +import triton +import triton.language as tl +from paddleformers.utils.log import logger + +from fastdeploy.config import FDConfig, RoutingReplayConfig + + +@triton.jit +def _save_routing_kernel( + ROUTING_REPLAY_TABLE_PTR, + TOPK_IDS_PTR, + BATCH_ID_PER_TOKEN_PTR, + CU_SEQLENS_Q_PTR, + SEQ_LENS_DECODER_PTR, + LAYER_IDX, + TOKEN_NUM, + TOP_K, + NUM_HIDDEN_LAYERS, + MAX_MODEL_LEN, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + pid_m = tl.program_id(axis=0) + + token_offsets = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + token_mask = token_offsets < TOKEN_NUM + + k_offsets = tl.arange(0, BLOCK_SIZE_K) + + k_mask = k_offsets < TOP_K + + topk_ids_ptrs = TOPK_IDS_PTR + token_offsets[:, None] * TOP_K + k_offsets[None, :] + # [BLOCK_SIZE_M, BLOCK_SIZE_K] + + load_mask = token_mask[:, None] & k_mask[None, :] + topk_vals = tl.load(topk_ids_ptrs, mask=load_mask) + + batch_ids = tl.load(BATCH_ID_PER_TOKEN_PTR + token_offsets, mask=token_mask) + pad_mask = token_mask & (batch_ids != -1) + # [0, 3, 4, 10, 12][0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3] + # -> [0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 10, 10] + # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] - [0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 10, 10] + # -> [0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 0, 1] + start_offsets = tl.load(CU_SEQLENS_Q_PTR + batch_ids, mask=pad_mask) + token_relative_index = token_offsets - start_offsets + + # [BLOCK_SIZE_M] + len_decoder = tl.load(SEQ_LENS_DECODER_PTR + batch_ids, mask=pad_mask) + token_seq_pos = len_decoder + token_relative_index + + STRIDE_BUF_SEQ = MAX_MODEL_LEN * NUM_HIDDEN_LAYERS * TOP_K + STRIDE_BUF_TOKEN = NUM_HIDDEN_LAYERS * TOP_K + STRIDE_BUF_LAYER = TOP_K + + # [BLOCK_SIZE_M, BLOCK_SIZE_K] + output_ptrs = ( + ROUTING_REPLAY_TABLE_PTR + + batch_ids[:, None] * STRIDE_BUF_SEQ + + token_seq_pos[:, None] * STRIDE_BUF_TOKEN + + LAYER_IDX * STRIDE_BUF_LAYER + + k_offsets[None, :] + ) + + pos_mask = token_seq_pos < MAX_MODEL_LEN + pos_mask = pos_mask & pad_mask + + # [BLOCK_SIZE_M, 
BLOCK_SIZE_K] + pos_mask = pos_mask[:, None] & k_mask[None, :] + + final_mask = load_mask & pos_mask + + tl.store(output_ptrs, topk_vals, mask=final_mask) + + +def save_routing_to_buffer( + routing_replay_table: paddle.Tensor, # [max_num_seqs, num_layers, max_len, top_k] + topk_ids: paddle.Tensor, # [token_num, top_k] + batch_id_per_token: paddle.Tensor, # [token_num, 1] + seq_lens_decoder: paddle.Tensor, # [max_num_seqs, 1] + cu_seqlens_q: paddle.Tensor, # [max_num_seqs + 1, 1] + layer_idx: int, + tp_size: int, + ep_size: int, + tp_group: dist.communication.group.Group, +): + if tp_size > 1 and ep_size > 1: + token_num_per_rank = topk_ids.shape[0] + if token_num_per_rank == 0: + return + topk_ids_all = paddle.zeros([token_num_per_rank * tp_size, topk_ids.shape[1]], dtype=topk_ids.dtype) + paddle.distributed.all_gather(topk_ids_all, topk_ids, tp_group) + topk_ids = topk_ids_all[: batch_id_per_token.shape[0], :] + + token_num, top_k = topk_ids.shape + max_num_seqs, max_model_len, num_hidden_layers, _ = routing_replay_table.shape + assert token_num > 0 + assert topk_ids.shape[1] == routing_replay_table.shape[3], (topk_ids.shape[1], routing_replay_table.shape[3]) + assert batch_id_per_token.shape[0] == token_num, (batch_id_per_token.shape[0], token_num) + assert seq_lens_decoder.shape[0] == max_num_seqs, (seq_lens_decoder.shape[0], max_num_seqs) + + BLOCK_SIZE_M = 128 + BLOCK_SIZE_K = triton.next_power_of_2(top_k) # top_k + + grid = (triton.cdiv(token_num, BLOCK_SIZE_M),) + _save_routing_kernel[grid]( + routing_replay_table, + topk_ids, + batch_id_per_token, + cu_seqlens_q, + seq_lens_decoder, + LAYER_IDX=layer_idx, + TOKEN_NUM=token_num, + TOP_K=top_k, + NUM_HIDDEN_LAYERS=num_hidden_layers, + MAX_MODEL_LEN=max_model_len, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_K=BLOCK_SIZE_K, + ) + + +class RoutingReplayManager: + """Request level routing replay table manager""" + + def __init__(self, fd_config: FDConfig, block_table, total_block_num): + self.fd_config = fd_config + self.block_table = block_table + self.max_num_seqs = fd_config.scheduler_config.max_num_seqs + self.max_model_len = fd_config.model_config.max_model_len + self.num_moe_layers = fd_config.model_config.num_hidden_layers - fd_config.model_config.moe_layer_start_index + self.only_last_turn = fd_config.routing_replay_config.only_last_turn + self.use_fused_put = fd_config.routing_replay_config.use_fused_put + if fd_config.model_config.architectures[0] == "Glm4MoeForCausalLM": + self.moe_top_k = fd_config.model_config.num_experts_per_tok + else: + self.moe_top_k = fd_config.model_config.moe_k + self.tp_rank = fd_config.parallel_config.tensor_parallel_rank + + # Initialize the routing replay table and routing cache + self.routing_batch_to_request: Dict[int, str] = {} + num_experts = fd_config.model_config.moe_num_experts + fd_config.model_config.moe_num_shared_experts + self.routing_dtype = self.get_routing_dtype(num_experts=num_experts) + self._init_routing_cache(dtype=self.routing_dtype, total_block_num=total_block_num) + + # Initialize routing store wrapper + if self.tp_rank == 0: + self._store_wrapper = StoreWrapper( + fd_config=fd_config, + ) + self._store_wrapper.start_store_warpper() + + def _init_routing_cache(self, dtype: str, total_block_num: int): + """Initialize the device buffer and host buffer.""" + + max_num_kv_tokens = total_block_num * self.fd_config.cache_config.block_size + + self._host_cache = paddle.full( + shape=[max_num_kv_tokens, self.num_moe_layers, self.moe_top_k], fill_value=-1, dtype=dtype, device="cpu" + ) 
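# Worked example of the index math in _save_routing_kernel above (pure NumPy, illustrative):
# token i belongs to request b = batch_id_per_token[i]; its position inside that request is
# seq_lens_decoder[b] + (i - cu_seqlens_q[b]), which is where its top-k ids land in the table.
import numpy as np

cu_seqlens_q = np.array([0, 4, 10, 12])                 # three requests with 4, 6 and 2 new tokens
batch_id_per_token = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2])
seq_lens_decoder = np.array([7, 0, 30])                 # tokens already decoded per request

token_idx = np.arange(len(batch_id_per_token))
relative = token_idx - cu_seqlens_q[batch_id_per_token]
positions = seq_lens_decoder[batch_id_per_token] + relative
# positions -> [ 7  8  9 10  0  1  2  3  4  5 30 31]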
+ + self.routing_replay_table = paddle.full( + shape=[self.max_num_seqs, self.max_model_len, self.num_moe_layers, self.moe_top_k], + fill_value=-1, + dtype=dtype, + ) + logger.info( + f"[R3] The host cache size is:{self._host_cache.shape}, device cache size is: {self.routing_replay_table.shape}" + ) + + def get_routing_dtype(self, num_experts: int, reserved_fill_value: int = 1) -> str: + """Calculate the minimum number of bits required for storage routing.""" + if num_experts <= 0: + raise ValueError(f"num_experts must be greater than 0 but got {num_experts}, please check model config.") + dtype = "uint8" + total_number = num_experts + reserved_fill_value + if total_number <= 255: # uint8: 0~255 + dtype = "uint8" + elif total_number <= 65535: # uint16: 0~65,535 + dtype = "uint16" + elif total_number <= 4294967295: # uint32: 0~4,294,967,295 + dtype = "uint32" + else: + raise ValueError( + f"The number of experts {num_experts} exceeds the representation range of uint32, please check model config." + ) + logger.info(f"[R3] Routing replay table dtype: {dtype}") + return dtype + + def update_host_cache(self, positions: paddle.Tensor, slot_mapping: paddle.Tensor): + """Update the host cache with new tokens""" + for batch_id, position in enumerate(positions): + if len(position) > 0 and len(slot_mapping[batch_id]) > 0: + routing_ids = self.routing_replay_table[batch_id, position, :, :].contiguous() + routing_ids = routing_ids.cpu() + + self._host_cache[slot_mapping[batch_id], :, :] = routing_ids + + def get_token_positions(self, seq_lens_decoder, seq_lens_this_time): + """Get token position of each sequence in a batch.""" + starts = seq_lens_decoder.numpy()[:, 0] + increase_num = seq_lens_this_time.numpy()[:, 0] + + positions = [] + for i in range(self.max_num_seqs): + if seq_lens_this_time[i] == 0: + positions.append([]) + continue + repeated_base = np.repeat(starts[i], increase_num[i]) + positions.append(repeated_base + np.arange(0, increase_num[i])) + + return positions + + def compute_slot_mapping(self, positions: np.ndarray): + """Compute the mapping between token ids and kvcache slots""" + slot_mapping = [] + for batch_id, position in enumerate(positions): + if len(position) == 0: + slot_mapping.append([]) + continue + block_table_indices = position // self.fd_config.cache_config.block_size + token_block_ids = self.block_table[batch_id, block_table_indices] + block_offset = position % self.fd_config.cache_config.block_size + + token_cache_ids = np.array(token_block_ids) * self.fd_config.cache_config.block_size + block_offset + slot_mapping.append(token_cache_ids) + + return slot_mapping + + def _get_routing_from_cache(self, finished_batch_ids, seq_lens_decoder): + """ + When request is finished or cleared the length of the request is recorded at seq_lens_decoder + 1. finish the step: after update input, lens = seq_lens_decoder_buffer + 2. clear parameter: after update input, lens = seq_lens_decoder_buffer + """ + # Get the slot mapping of the request cache. 
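# Tiny numeric example of compute_slot_mapping above: a token position is mapped to a kv-cache
# slot via block_table[batch][pos // block_size] * block_size + pos % block_size (made-up values):
import numpy as np

block_size = 4
block_table_row = np.array([7, 2, 9])    # cache blocks assigned to one request
positions = np.array([0, 1, 5, 9])
slots = block_table_row[positions // block_size] * block_size + positions % block_size
# slots -> [28 29  9 37]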
+ current_token_nums = seq_lens_decoder.numpy()[:, 0] + positions = [] + for batch_id in range(self.max_num_seqs): + position = [] + if batch_id in finished_batch_ids: + position = np.arange(0, current_token_nums[batch_id]) + positions.append(position) + + # Collection the cached routing information + token_cache_ids = self.compute_slot_mapping(positions=positions) + for slot_map in token_cache_ids: + if len(slot_map) > 0: + token_cached_routing = self._host_cache[slot_map, :, :] + return paddle.transpose(token_cached_routing, [1, 0, 2]) + raise ValueError("No cached routing found") + + def put_finished_batch( + self, + finished_batch_ids, + seq_lens_decoder, + ): + finished_batch_ids_list = finished_batch_ids.cpu().tolist() + for batch_id, finished in enumerate(finished_batch_ids_list): + if finished: + assert batch_id in self.routing_batch_to_request.keys() + # Deregister the request + request_id = self._deregister_request(batch_id) + # Put the routing of finished request to store + self._put_request_to_store( + batch_id=batch_id, + request_id=request_id, + seq_lens_decoder=seq_lens_decoder, + ) + # Clear the slot of the finished batch + self._clear_table_slot(batch_id) + + def register_request(self, batch_id: int, request_id: str): + """ + Register a new request to routing replay table + Args: + batch_id: The batch ID of this request + request_id: The global ID of the request is usually executed by the training process in RL + """ + # The chunked prefill tasks will be registered repeatedly + if batch_id in self.routing_batch_to_request: + if self.routing_batch_to_request[batch_id] == request_id: + logger.warning(f"[R3] Request {request_id} has been registered at {batch_id}.") + return + else: + raise RuntimeError( + f"[R3] The Batch {batch_id} has been registered by request {self.routing_batch_to_request[batch_id]}, now robed by {request_id}," + ) + + # Register the new request + self.routing_batch_to_request[batch_id] = request_id + logger.info(f"[R3] Register request {request_id} with batch id {batch_id}") + + def _deregister_request(self, batch_id: int) -> str: + """ + Deregister a request from routing replay table + """ + assert batch_id in self.routing_batch_to_request + return self.routing_batch_to_request.pop(batch_id) + + def _put_request_to_store( + self, + batch_id: int, + request_id: str, + seq_lens_decoder, + ): + if self.tp_rank == 0: + before_put_request_time = time.perf_counter() + + # Collect the routing of finished request + batch_buffer = self._get_routing_from_cache( + finished_batch_ids=[batch_id], seq_lens_decoder=seq_lens_decoder + ) + rollout_id = self.split_request_id(request_id) + # TODO(gongshaotian): Delete pad func after trainer support dynamic len + paded_batch_buffer = self.pad_routing_cache(routing_indices=batch_buffer) + + if self.use_fused_put: + self._store_wrapper.submit_put_task(routing_indices=paded_batch_buffer, rollout_id=rollout_id) + else: + for layer_id in range(self.num_moe_layers): + layer_buffer = batch_buffer[layer_id] + self._store_wrapper.submit_put_task( + routing_indices=layer_buffer, rollout_id=rollout_id, layer_idx=layer_id + ) + + # Only store the routing of last turn + if self.only_last_turn: + self._store_wrapper.submit_clear_prefix_batch_task(rollout_id=rollout_id) + + logger.info(f"[R3] Submit {request_id} time cost: {time.perf_counter() - before_put_request_time}") + + def _clear_table_slot(self, batch_id: int): + assert 0 <= batch_id < self.max_num_seqs + self.routing_replay_table[batch_id].fill_(-1) + + def 
get_routing_table(self) -> paddle.Tensor: + return self.routing_replay_table + + def split_request_id(self, request_id: str): + """ + Split the request id to get rollout id. + + request_id: "chatcmpl-request.user-uuid" + rollout_id: "request.user" + example: "chatcmpl-xxx_xxx_epoch_15:2:2:1-d9f16c5c-65f6-4815-b44d-14e2c581907c_0" -> "xxx_xxx_epoch_15:2:2:1" + """ + chat_type, tmp_str = request_id.split("-", 1) + # NOTE(gongshaotian): only support chatcmpl now + assert ( + chat_type == "chatcmpl" + ), "Rollout Routing Replay only supports chatcmpl. Please check whether the request type and userid settings are correct." + reversed_tmp_str = tmp_str[::-1].split("-", 5) + rollout_id = reversed_tmp_str[-1][::-1] + return rollout_id + + def pad_routing_cache(self, routing_indices) -> paddle.Tensor: + """Pad routing indices of the request levevl to max model len""" + current_shape = routing_indices.shape[1] + pad_tensor = paddle.full( + shape=[self.num_moe_layers, (self.max_model_len - current_shape), self.moe_top_k], + fill_value=-1, + dtype=self.routing_dtype, + ) + return paddle.concat([routing_indices, pad_tensor], axis=1) + + +class StoreWrapper(object): + def __init__(self, fd_config: False) -> None: + super().__init__() + self.fd_config = fd_config + + # Initialize task queue + layer_num = 61 + max_request = 200 + self.queue_max_size = layer_num * max_request + + self.manager = multiprocessing.Manager() + self._task_queue = self.manager.Queue(maxsize=self.queue_max_size) + + self._monitor_thread: threading.Thread = None + self._stop_monitor = threading.Event() + + # Initialize consumer process + self._routing_store_process = StoreProcess( + task_queue=self._task_queue, + routing_replay_config=self.fd_config.routing_replay_config, + ) + self._sotre_process_running = False + + # Register atexit handler + atexit.register(self.shutdown) + + def shutdown(self): + """ """ + if not self._sotre_process_running: + return + self._sotre_process_running = False + + # Stop the monitor thread + self._stop_monitor.set() + if self._monitor_thread and self._monitor_thread.is_alive(): + self._monitor_thread.join(timeout=3.0) + + # Put a sentinel value to signal the consumer to stop + if self._routing_store_process and self._routing_store_process.is_alive(): + try: + self._task_queue.put_nowait(None) + except Exception as e: + logger.info(f"Could not put sentinel into queue: {e}") + + if self._routing_store_process and self._routing_store_process.is_alive(): + # Wait for all tasks to be processed + self._routing_store_process.join(timeout=10.0) + if self._routing_store_process.is_alive(): + self._routing_store_process.close() + self._routing_store_process.join() + + self._task_queue.join() + self.manager.shutdown() + self._sotre_process_running = False + + def start_store_warpper(self): + """ """ + if self._sotre_process_running: + return + self._sotre_process_running = True + + # Start monitor thread + self._stop_monitor.clear() + self._monitor_thread = threading.Thread(target=self._monitor_queue_load, daemon=True) + self._monitor_thread.start() + + # Start Routing Store Wrapper in sub process + self._routing_store_process.start() + + def _monitor_queue_load(self): + """ """ + while not self._stop_monitor.is_set(): + time.sleep(2.0) + if not self._sotre_process_running: + break + qsize = self._task_queue.qsize() + + # Alarm when the task exceeds 80% of the queue capacity + if qsize > self.queue_max_size * 0.8: + logger.warning( + f"[Monitor] Queue load is HIGH: {qsize}/{self.queue_max_size}. 
" + f"Dropped tasks so far: {self._dropped_tasks}. " + "Consider increasing max_workers or queue_max_size." + ) + logger.info(f"[Monitor] Queue load: {qsize}/{self.queue_max_size}") + + def submit_put_task(self, routing_indices: paddle.Tensor, rollout_id: str, layer_idx: int = None) -> None: + """Submit a put task to the task queue""" + if not self._sotre_process_running: + raise RuntimeError("Store not started.") + + start_time = time.perf_counter() + if layer_idx is not None: + rdma_rollout_key = f"{rollout_id}_{layer_idx}" + else: + rdma_rollout_key = rollout_id + + routing_indices_np = np.array(routing_indices.numpy(), copy=True) + + task: StoreTask = {"task_type": "put", "key": rdma_rollout_key, "data": routing_indices_np} + + try: + self._task_queue.put_nowait(task) + except Exception: + raise RuntimeError(f"Queue is FULL. Dropping put task for key: {rdma_rollout_key}. ") + logger.info(f"[R3] Submit put task for key: {rdma_rollout_key}, cost time: {time.perf_counter()-start_time} s") + + def submit_clear_store_task(self) -> None: + """Submit clear store task""" + if not self._sotre_process_running: + raise RuntimeError("Store not started.") + + start_time = time.perf_counter() + task: StoreTask = {"task_type": "clear_store", "key": None, "data": None} + + try: + self._task_queue.put_nowait(task) + # Wait for the task to be processed + self._task_queue.join() + except Exception: + raise RuntimeError("Queue is FULL. Dropping put task for key: clear_store. ") + logger.info(f"[R3] Submit clear task, cost time: {time.perf_counter()-start_time} s") + + def submit_clear_prefix_batch_task(self, rollout_id) -> None: + """Submit clear prefix batch task""" + if not self._sotre_process_running: + raise RuntimeError("Store not started.") + prefix_batch = self.get_needed_clear_ids(rollout_id) + + if prefix_batch is None: + return + start_time = time.perf_counter() + task: StoreTask = {"task_type": "clear_prefix_batch", "key": prefix_batch, "data": None} + try: + self._task_queue.put_nowait(task) + except Exception: + raise RuntimeError("Queue is FULL. Dropping put task for key: clear_store. ") + logger.info( + f"[R3] Submit clear prefix batch task for key: {prefix_batch}, cost time: {time.perf_counter()-start_time} s" + ) + + def get_needed_clear_ids(self, roullout_id: str) -> Optional[str]: + """ + Generate the prefix IDs for all closed multi-round tasks. 
+ rollout_id: "xxx_xxx_epoch_15:2:2:1" + example: xxx_xxx_data_id:gen_id:turn_id:segment_id + """ + reversed_segment_id, reversed_turn_id, reversed_prefix_gen_id = roullout_id[::-1].split(":", 2) + prefix_gen_id = reversed_prefix_gen_id[::-1] + turn_id = eval(reversed_turn_id[::-1]) + segment_id = eval(reversed_segment_id[::-1]) + + assert turn_id >= 0 and segment_id >= 0 + prefix_batch = None + if turn_id > 0: + prefix_batch = f"{prefix_gen_id}:{(turn_id-1)}:{segment_id}" + return prefix_batch + + +class StoreTask(TypedDict): + task_type: str + key: str + data: np.ndarray + + +class StoreProcess(Process): + def __init__(self, task_queue: Queue, routing_replay_config: RoutingReplayConfig) -> None: + super().__init__() + + self._task_queue = task_queue + self.routing_replay_config = routing_replay_config + self.max_workers = 1 + self._closed = False + # Note: _routing_store and _event_loop_thread must be initialized in run() + # because they cannot be properly inherited after fork() + self._routing_store = None + self._event_loop_thread = None + + def run(self): + logger.info(f"[R3] Start Running Store Wrapper in sub process {os.getpid()}") + + # Initialize routing store in subprocess + self._routing_store = get_routing_store(routing_replay_config=self.routing_replay_config) + + # Initialize event loop thread in subprocess + self._event_loop_thread = AsyncEventLoopThread() + self._event_loop_thread.start() + if not self._event_loop_thread._started_event.wait(timeout=5.0): + raise RuntimeError("Failed to start async event loop thread in subprocess") + + clear_store_task = StoreTask({"task_type": "clear_store", "key": None, "data": None}) + self._task_queue.put_nowait(clear_store_task) + + logger.info(f"[R3] Event loop thread started in subprocess {os.getpid()}") + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + while not self._closed: + try: + task = StoreTask(self._task_queue.get()) + logger.info(f"[R3] Receive {task['task_type']} task, key: {task['key']}") + if task is None: # Sentinel + self._task_queue.task_done() + break + + if task["task_type"] == "put": + logger.info(f"[R3] before process put task, key: {task['key']}") + future = executor.submit(self.process_put_task, task) + future.add_done_callback(lambda f: self._task_queue.task_done()) + elif task["task_type"] == "clear_store": + future = executor.submit(self.process_clear_store_task, task) + future.add_done_callback(lambda f: self._task_queue.task_done()) + elif task["task_type"] == "clear_prefix_batch": + future = executor.submit(self.process_clear_prefix_batch_task, task) + future.add_done_callback(lambda f: self._task_queue.task_done()) + except Exception as e: + self._task_queue.task_done() + raise RuntimeError(f"Error during processing task. 
{e}") + + logger.info(f"[Consumer Process {Process.current_process().pid}] Shutdown.") + + def process_put_task(self, store_task: StoreTask) -> None: + try: + coro_obj = self._routing_store.put(routing_key=store_task["key"], routing_indices=store_task["data"]) + future = self._event_loop_thread.submit_coroutine( + coro_obj, callback=functools.partial(self._on_async_task_completed, store_task) + ) + return future + except Exception as e: + logger.error(f"Error submitting put task: {e}") + traceback.print_exc() + raise + + def process_clear_store_task(self, store_task: StoreTask) -> None: + try: + coro_obj = self._routing_store.clear_store() + future = self._event_loop_thread.submit_coroutine( + coro_obj, callback=functools.partial(self._on_async_task_completed, store_task) + ) + return future + except Exception as e: + logger.error(f"Error during processing clear store task. {e}") + traceback.print_exc() + raise + + def process_clear_prefix_batch_task(self, store_task: StoreTask) -> None: + try: + coro_obj = self._routing_store.clear_prefix_batch(routing_prefix_key=store_task["key"]) + future = self._event_loop_thread.submit_coroutine( + coro_obj, callback=functools.partial(self._on_async_task_completed, store_task) + ) + return future + except Exception as e: + logger.error(f"Error submitting clear_prefix_batch task: {e}") + traceback.print_exc() + raise + + def _on_async_task_completed(self, task, future): + """ """ + try: + # result = future.result() + logger.info(f"[R3] Async task completed: {task['task_type']}, key: {task.get('key')}") + except Exception as e: + logger.error(f"[R3] Async task failed: {task['task_type']}, key: {task.get('key')}, error: {e}") + traceback.print_exc() + raise + + def close(self): + """Close the store process""" + self._closed = True + if hasattr(self, "_event_loop_thread"): + self._event_loop_thread.stop() + + +class AsyncEventLoopThread(threading.Thread): + def __init__(self): + super().__init__(daemon=True) + self._loop = None + self._started_event = threading.Event() + self._closed = False + + def run(self): + """Run the async event loop""" + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + + # Set the event loop to be started + self._started_event.set() + logger.info("[EventLoopThread] Event loop started, running forever...") + + try: + self._loop.run_forever() + logger.info("[EventLoopThread] Event loop stopped") + except Exception as e: + logger.error(f"[EventLoopThread] Event loop exception: {e}") + traceback.print_exc() + finally: + logger.info("[EventLoopThread] Closing event loop") + self._loop.close() + + def submit_coroutine(self, coro, callback=None): + """Thread safely submit coroutine to event loop""" + if self._closed: + raise RuntimeError("Event loop thread is closed") + if not self._started_event.wait(timeout=5.0): + raise RuntimeError("Event loop failed to start within 5 seconds") + + future = asyncio.run_coroutine_threadsafe(coro, self._loop) + + if callback: + + def wrapped_callback(f): + try: + callback(f) + except Exception as e: + print(f"Error in callback: {e}") + traceback.print_exc() + + future.add_done_callback(wrapped_callback) + logger.info("coro add callback func") + return future + + def stop(self): + """Stop the event loop""" + if not self._closed: + self._closed = True + if self._loop: + self._loop.call_soon_threadsafe(self._loop.stop) + + +class RoutingStoreBase(ABC): + """Base class for routing store""" + + def __init__(self, routing_replay_config: RoutingReplayConfig) -> None: + 
self.routing_replay_config = routing_replay_config + + @abstractmethod + async def put(self, routing_key: str, routing_indices: np.ndarray) -> None: + """Put the routing indices into store""" + raise NotImplementedError + + @abstractmethod + async def clear_store( + self, + ): + """Clear the routing indices store""" + raise NotImplementedError + + @abstractmethod + async def clear_prefix_batch(self, routing_prefix_key: str): + """Clear the routing indices""" + raise NotImplementedError + + +class RoutingStoreLocal(RoutingStoreBase): + """Routing store backed by the local filesystem""" + + def __init__(self, routing_replay_config) -> None: + super().__init__(routing_replay_config=routing_replay_config) + self.local_store_dir = routing_replay_config.local_store_dir + os.makedirs(self.local_store_dir, exist_ok=True) + + async def put( + self, + routing_key: str, + routing_indices: np.ndarray, + ) -> None: + """Put the routing indices into store""" + # TODO(gongshaotian) convert ./store_dir/routing_key/layer_id.pdtensor to ./store_dir/routing_key.pt + time_before_put = time.perf_counter() + file_path = os.path.join(self.local_store_dir, f"{routing_key}.pdtensor") + paddle.save(routing_indices, file_path) + logger.info(f"[R3] The routing key {routing_key} put cost is {time.perf_counter()-time_before_put}s") + + async def clear_store(self): + """Clear the routing indices store""" + if os.path.isdir(self.local_store_dir): + shutil.rmtree(self.local_store_dir) + + logger.info("[R3] Clear routing store.") + + async def clear_prefix_batch(self, routing_prefix_key: str): + """Clear the routing indices""" + raise NotImplementedError + + +class RoutingStoreRDMA(RoutingStoreBase): + """Routing Store using RDMA""" + + def __init__(self, routing_replay_config) -> None: + super().__init__(routing_replay_config=routing_replay_config) + try: + # Only used in RLHF + from p2pstore import P2PClient, P2PConfig + except ModuleNotFoundError: + raise ModuleNotFoundError(" RoutingStoreRDMA and p2pstore are only supported in RLHF. 
") + + rdma_store_server = routing_replay_config.rdma_store_server + p2pConfig = P2PConfig(metadata_server=rdma_store_server) + self.p2p_client = P2PClient(p2pConfig) + + async def put(self, routing_key: str, routing_indices: np.ndarray) -> None: + """Put the routing indices into store""" + time_before_put = time.perf_counter() + result = await self.p2p_client.put(routing_key, routing_indices) + logger.info(f"[R3] The routing key {routing_key}, put cost is {time.perf_counter()-time_before_put}s") + return result + + async def clear_prefix_batch(self, routing_prefix_key: str): + time_before_clear = time.perf_counter() + result = await self.p2p_client.delete_prefix_batch([routing_prefix_key]) + logger.info( + f"[R3] The clear routing prefix key {routing_prefix_key}, cost is {time.perf_counter()-time_before_clear}s" + ) + return result + + async def clear_store(self): + """Clear the routing indices store""" + time_before_clear = time.perf_counter() + result = await self.p2p_client.clear() + logger.info(f"[R3] Clear routing store cost is {time.perf_counter()-time_before_clear}s.") + return result + + +def get_routing_store(routing_replay_config: RoutingReplayConfig) -> RoutingStoreBase: + if routing_replay_config.routing_store_type == "local": + return RoutingStoreLocal(routing_replay_config=routing_replay_config) + elif routing_replay_config.routing_store_type == "rdma": + return RoutingStoreRDMA(routing_replay_config=routing_replay_config) + else: + raise ValueError( + f"Invalid routing store type: '{routing_replay_config.routing_store_type}'. " + "Valid types are: 'local', 'rdma'" + ) diff --git a/fastdeploy/model_executor/layers/mtp_linear.py b/fastdeploy/model_executor/layers/mtp_linear.py index b1699720bdd..e1f52d73899 100644 --- a/fastdeploy/model_executor/layers/mtp_linear.py +++ b/fastdeploy/model_executor/layers/mtp_linear.py @@ -86,6 +86,9 @@ def __init__( ) if self.tp_size > 1: set_weight_attrs(self.linear.weight, {"output_dim": True}) + if self.bias_key is not None: + set_weight_attrs(self.linear.bias, {"output_dim": True}) + else: self.linear = RowParallelLinear( embedding_dim, diff --git a/fastdeploy/model_executor/layers/normalization.py b/fastdeploy/model_executor/layers/normalization.py index ec1f0e65891..a66172fc1b5 100644 --- a/fastdeploy/model_executor/layers/normalization.py +++ b/fastdeploy/model_executor/layers/normalization.py @@ -105,14 +105,14 @@ def __init__( self.tp_rank = self.fd_config.parallel_config.tensor_parallel_rank self.tp_group = self.fd_config.parallel_config.tp_group is_input_norm = prefix.endswith(".input_layernorm") - is_last_norm = prefix.endswith(".norm") + self.is_last_norm = prefix.endswith(".norm") self.split_x = ( self.fd_config.parallel_config.use_sequence_parallel_moe and self.layer_id == self.fd_config.model_config.moe_layer_start_index and is_input_norm ) self.allgather_out = self.fd_config.parallel_config.use_sequence_parallel_moe and ( - (self.layer_id > self.fd_config.model_config.moe_layer_start_index and is_input_norm) or is_last_norm + (self.layer_id > self.fd_config.model_config.moe_layer_start_index and is_input_norm) ) self.init_weight() @@ -130,6 +130,10 @@ def init_weight(self): dtype=self._norm_weight_dtype, ) + def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): + loaded_weight = get_tensor(loaded_weight).astype(self._norm_weight_dtype) + param.copy_(loaded_weight, False) + def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): """ Load the checkpoint state dictionary into 
the layer. diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index a7b61fc0ef8..c13b429095a 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -18,7 +18,6 @@ import paddle -import fastdeploy from fastdeploy import envs from fastdeploy.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -138,7 +137,9 @@ def create_weights(self, layer, **extra_weight_attrs): weight_shape = layer.weight_shape weight_scale_inv_shape = weight_scale_inv_shape extra_weight_attrs["output_dim"] = ( - not extra_weight_attrs["output_dim"] if extra_weight_attrs["output_dim"] is not None else None + not extra_weight_attrs["output_dim"] + if extra_weight_attrs.get("output_dim", None) is not None + else None ) layer.weight_dtype = "float8_e4m3fn" @@ -224,9 +225,10 @@ def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = Fal layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( - x, self.quant_config.weight_block_size[0] + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=True ) + x_scale_tensor = x_scale_tensor.T linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) from fastdeploy.model_executor.ops.gpu import deep_gemm diff --git a/fastdeploy/model_executor/layers/quantization/kv_cache.py b/fastdeploy/model_executor/layers/quantization/kv_cache.py index cd461fde799..2ccc06f0e45 100644 --- a/fastdeploy/model_executor/layers/quantization/kv_cache.py +++ b/fastdeploy/model_executor/layers/quantization/kv_cache.py @@ -263,10 +263,11 @@ def process_weights_after_loading(self, layer: nn.Layer): """ use for loader v1 """ - if layer.cache_k_scale._is_initialized(): - layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) - if layer.cache_v_scale._is_initialized(): - layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale) + if "block_wise" not in layer.cache_quant_type_str: + if layer.cache_k_scale._is_initialized(): + layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) + if layer.cache_v_scale._is_initialized(): + layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale) def apply(self, layer): """ diff --git a/fastdeploy/model_executor/layers/sample/logprobs.py b/fastdeploy/model_executor/layers/sample/logprobs.py new file mode 100644 index 00000000000..affaf10346c --- /dev/null +++ b/fastdeploy/model_executor/layers/sample/logprobs.py @@ -0,0 +1,82 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import paddle +import triton +import triton.language as tl + +from fastdeploy.platforms import current_platform + + +@triton.jit +def count_greater_kernel( + x_ptr, # [num_tokens, n_elements] + y_ptr, # [num_tokens, 1] + out_ptr, # [num_tokens, 1] + n_elements, + BLOCK_SIZE: tl.constexpr, +): + b = tl.program_id(0) + sum_val = 0.0 + y = tl.load(y_ptr + b * 1 + 0) + for col_start_idx in range(0, tl.cdiv(n_elements, BLOCK_SIZE)): + col_ids = col_start_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + col_mask = col_ids < n_elements + x = tl.load(x_ptr + b * n_elements + col_ids, mask=col_mask, other=-float("inf")) + compare_mask = x >= y + cmp_mask = tl.where(compare_mask & col_mask, 1, 0) + sum_val += tl.sum(cmp_mask, axis=0) + tl.store(out_ptr + b, sum_val.to(tl.int64)) + + +def batched_count_greater_than(x: paddle.Tensor, y: paddle.Tensor) -> paddle.Tensor: + """ + Triton implementation: (x >= y).sum(-1) + + Args: + x (paddle.Tensor): 2D tensor,shape [num_tokens, n_elements],float32. + y (paddle.Tensor): 2D tensor,shape [num_tokens, 1],float32. + + Returns: + paddle.Tensor: 1D tensor,shape [num_tokens]. + """ + assert x.dim() == 2, f"x must be 2D, got {x.dim()}D" + assert y.dim() == 2 and y.shape[1] == 1, f"y must be 2D with shape [num_tokens, 1], got {y.shape}" + assert x.shape[0] == y.shape[0], f"shape[0] mismatch: x has {x.shape[0]}, y has {y.shape[0]}" + assert x.dtype == y.dtype, f"dtype mismatch: x is {x.dtype}, y is {y.dtype}" + + if current_platform.is_cuda(): + + num_tokens, n_elements = x.shape + dtype = paddle.int64 + + out = paddle.empty([num_tokens], dtype=dtype, device=x.place) + + config = {"BLOCK_SIZE": 4096, "num_warps": 16} + grid = (num_tokens,) + + count_greater_kernel[grid]( + x_ptr=x, + y_ptr=y, + out_ptr=out, + n_elements=n_elements, + BLOCK_SIZE=config["BLOCK_SIZE"], + num_warps=config["num_warps"], + ) + else: + out = (x >= y).sum(-1) + + return out diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py index 9418ae506e3..f629bfc17d9 100644 --- a/fastdeploy/model_executor/layers/sample/meta_data.py +++ b/fastdeploy/model_executor/layers/sample/meta_data.py @@ -53,6 +53,8 @@ class SamplingMetadata: stop_flags: Optional[paddle.Tensor] = None prompt_ids: Optional[paddle.Tensor] = None prompt_lens: Optional[paddle.Tensor] = None + temp_scaled_logprobs_flag: Optional[bool] = None + top_p_normalized_logprobs_flag: Optional[bool] = None temp_scaled_logprobs: Optional[paddle.Tensor] = None top_p_normalized_logprobs: Optional[paddle.Tensor] = None share_inputs: Optional[Dict[str, paddle.Tensor]] = None diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index f65d314d8d8..84abe02d45b 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -30,6 +30,7 @@ from fastdeploy.model_executor.layers.sample.early_stopper import ( get_early_stopper_cls_from_stragegy, ) +from fastdeploy.model_executor.layers.sample.logprobs import batched_count_greater_than from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.ops import ( apply_penalty_multi_scores, @@ -56,12 +57,40 @@ def top_p_normalize_probs_paddle( return paddle.zeros_like(probs_sort).put_along_axis_(indices=probs_idx, values=probs_sort, axis=-1) -def padding_sampling_params(top_p, top_k, seq_lens_this_time, seq_lens_encoder): +def padding_sampling_params(top_p, 
top_k, infer_seed, seq_lens_this_time, seq_lens_encoder): real_bsz = seq_lens_this_time.shape[0] repeats = paddle.where(seq_lens_encoder[:real_bsz] == 0, seq_lens_this_time, paddle.ones_like(seq_lens_this_time)) top_p_padding = paddle.repeat_interleave(top_p[:real_bsz], repeats).unsqueeze(1) top_k_padding = paddle.repeat_interleave(top_k[:real_bsz], repeats).unsqueeze(1) - return top_p_padding, top_k_padding + topp_seed = paddle.repeat_interleave(infer_seed[:real_bsz], repeats).unsqueeze(1) + + MAX_INFER_SEED = 9223372036854775806 + + token_lens = paddle.where( + seq_lens_encoder[:real_bsz] == 0, + seq_lens_this_time, + paddle.ones_like(seq_lens_this_time), + ) + + batch_start = (paddle.cumsum(token_lens, axis=0) - token_lens.astype("int64")).reshape(-1) # [B] + token_batch_ids = paddle.repeat_interleave( + paddle.arange(token_lens.shape[0], dtype="int64"), + token_lens, + ) + token_pos = paddle.arange(topp_seed.shape[0], dtype="int64") + local_pos = token_pos - paddle.gather(batch_start, token_batch_ids) + + is_decoder = paddle.gather(seq_lens_encoder[:real_bsz] == 0, token_batch_ids).reshape(-1) + + offsets = paddle.where( + is_decoder, + local_pos * 4, + paddle.zeros_like(local_pos), + ) + + topp_seed[:, 0] = (topp_seed[:, 0] + offsets) % MAX_INFER_SEED + + return top_p_padding, top_k_padding, topp_seed class GuidedDecoding: @@ -375,7 +404,7 @@ def compute_logprobs( temp_scaled_logprobs = sampling_metadata.temp_scaled_logprobs top_p_normalized_logprobs = sampling_metadata.top_p_normalized_logprobs share_inputs = sampling_metadata.share_inputs - if temp_scaled_logprobs is not None: + if temp_scaled_logprobs is not None and sampling_metadata.temp_scaled_logprobs_flag: real_bsz_temp_scaled = temp_scaled_logprobs[:real_bsz] temperature = sampling_metadata.temperature[:real_bsz] temp_temperature = paddle.where(real_bsz_temp_scaled, temperature, paddle.ones_like(temperature)) @@ -385,7 +414,11 @@ def compute_logprobs( top_p_logprob = None top_p_req_mask = None - if top_p_normalized_logprobs is not None and share_inputs is not None: + if ( + top_p_normalized_logprobs is not None + and share_inputs is not None + and sampling_metadata.top_p_normalized_logprobs_flag + ): seq_lens_this_time = share_inputs["seq_lens_this_time"].reshape([-1, 1])[:real_bsz] seq_lens_encoder = share_inputs["seq_lens_encoder"].reshape([-1, 1])[:real_bsz] seq_lens_decoder = share_inputs["seq_lens_decoder"].reshape([-1, 1])[:real_bsz] @@ -434,7 +467,7 @@ def gather_logprobs( token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1) # Compute the ranks of the actual token. - token_ranks = (logprobs >= token_logprobs).sum(-1) + token_ranks = batched_count_greater_than(logprobs, token_logprobs) if num_logprobs >= 1: # Find the topK values. @@ -497,7 +530,7 @@ def forward_cuda( sampling_metadata.top_p, sampling_metadata.top_k, sampling_metadata.top_k_list, - seed=sampling_metadata.seed[0, 0], + topp_seed=sampling_metadata.seed, ) logprobs_tensors = ( @@ -514,6 +547,7 @@ def forward_cuda( # token per request. 
sampled_token_ids=next_tokens, logprobs_tensors=logprobs_tensors, + logits=logits, ) return sampler_output @@ -626,7 +660,11 @@ def compute_logprobs( top_p_logprob = None top_p_token_mask = None - if top_p_normalized_logprobs is not None and share_inputs is not None: + if ( + top_p_normalized_logprobs is not None + and share_inputs is not None + and sampling_metadata.top_p_normalized_logprobs_flag + ): real_token_top_p = ( sampling_metadata.top_p[:real_bsz].squeeze(1).repeat_interleave(batch_token_num).unsqueeze(1) ) @@ -676,7 +714,7 @@ def gather_logprobs( token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1) # Compute the ranks of the actual token. - token_ranks = (logprobs >= token_logprobs).sum(-1) + token_ranks = batched_count_greater_than(logprobs, token_logprobs) if num_logprobs >= 1: # Find the topK values. @@ -721,13 +759,14 @@ def forward_cuda( probs = F.softmax(logits) - top_p, top_k = padding_sampling_params( + top_p, top_k, topp_seed = padding_sampling_params( sampling_metadata.top_p, sampling_metadata.top_k, + sampling_metadata.seed, share_inputs["seq_lens_this_time"], share_inputs["seq_lens_encoder"], ) - _, sampled_token_ids = top_k_top_p_sampling(probs, top_p=top_p, top_k=top_k, seed=sampling_metadata.seed[0, 0]) + _, sampled_token_ids = top_k_top_p_sampling(probs, top_p=top_p, top_k=top_k, topp_seed=topp_seed) verify_scores, verify_tokens, actual_candidate_len = top_p_candidates( probs, @@ -800,18 +839,19 @@ def forward_cuda( raw_logprobs = target_logits.clone() logprobs_tensors = None - token_ids = share_inputs["accept_tokens"] if num_logprobs is not None: - token_ids = paddle.concat( - [share_inputs["accept_tokens"][i, : share_inputs["accept_num"][i]] for i in range(real_bsz)] - ) + token_ids = share_inputs["accept_tokens"] + idx = paddle.arange(share_inputs["accept_tokens"].shape[1], dtype="int32") + mask = idx < share_inputs["accept_num"].unsqueeze(1) + token_ids = paddle.masked_select(share_inputs["accept_tokens"], mask) logprobs_tensors = self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=token_ids) sampler_output = SamplerOutput( - sampled_token_ids=token_ids, + sampled_token_ids=share_inputs["accept_tokens"], logprobs_tensors=logprobs_tensors, token_num_per_batch=share_inputs["accept_num"], cu_batch_token_offset=share_inputs["cu_batch_token_offset"], + logits=logits, ) return sampler_output @@ -904,6 +944,7 @@ def __init__(self, fd_config: FDConfig): else: raise NotImplementedError self.logprobs_mode = fd_config.model_config.logprobs_mode + self.enable_draft_logprob = fd_config.speculative_config.enable_draft_logprob def pre_process(self, skip_idx_list: List[int] = []): """pre process before running""" @@ -956,7 +997,11 @@ def compute_logprobs( top_p_logprob = None top_p_token_mask = None - if top_p_normalized_logprobs is not None and share_inputs is not None: + if ( + top_p_normalized_logprobs is not None + and share_inputs is not None + and sampling_metadata.top_p_normalized_logprobs_flag + ): real_token_top_p = ( sampling_metadata.top_p[:real_bsz] .squeeze(1) @@ -1010,7 +1055,7 @@ def gather_logprobs( token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1) # Compute the ranks of the actual token. - token_ranks = (logprobs >= token_logprobs).sum(-1) + token_ranks = batched_count_greater_than(logprobs, token_logprobs) if num_logprobs >= 1: # Find the topK values. 
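The gather_logprobs hunks above replace the eager reduction (logprobs >= token_logprobs).sum(-1) with the Triton kernel introduced in logprobs.py. A minimal cross-check sketch of the two paths, assuming a CUDA build of Paddle with Triton installed and the patched fastdeploy on the import path; shapes follow the docstring of batched_count_greater_than:

import paddle

from fastdeploy.model_executor.layers.sample.logprobs import batched_count_greater_than

# logprobs: [num_tokens, vocab_size]; token_logprobs: [num_tokens, 1]
logprobs = paddle.randn([8, 32000], dtype="float32")
token_ids = paddle.randint(0, 32000, [8, 1])
token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1)

reference = (logprobs >= token_logprobs).sum(-1)              # original rank computation
ranks = batched_count_greater_than(logprobs, token_logprobs)  # Triton kernel on CUDA, Paddle fallback elsewhere
assert bool(paddle.all(reference.astype("int64") == ranks))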
@@ -1033,7 +1078,7 @@ def forward_cuda( """ """ num_logprobs = sampling_metadata.max_num_logprobs real_bsz = share_inputs["seq_lens_this_time"].shape[0] - if num_logprobs is not None and share_inputs["substep"] == 0: + if self.enable_draft_logprob and num_logprobs is not None and share_inputs["substep"] == 0: real_token_num = share_inputs["batch_token_num"][:real_bsz].sum() if self.logprobs_mode == "raw_logprobs": raw_logprobs = self.compute_logprobs( @@ -1060,17 +1105,11 @@ def forward_cuda( ) probs = F.softmax(logits) - top_p, top_k = padding_sampling_params( - sampling_metadata.top_p, - sampling_metadata.top_k, - share_inputs["seq_lens_this_time"], - share_inputs["seq_lens_encoder"], - ) - _, next_tokens = top_k_top_p_sampling(probs, top_p=top_p, top_k=top_k, seed=sampling_metadata.seed[0, 0]) + next_tokens = paddle.argmax(probs, axis=-1) token_ids = None logprobs_tensors = None - if num_logprobs is not None and share_inputs["substep"] == 0: + if self.enable_draft_logprob and num_logprobs is not None and share_inputs["substep"] == 0: token_ids = paddle.empty(real_token_num, dtype="int64") speculate_insert_first_token( token_ids, diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index c18f062457e..fd55846aba7 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -220,6 +220,35 @@ def group_wise_int4_weight_quantize(weight: paddle.Tensor, group_size: int = 128 return quant_weight.astype(paddle.int8), weight_scale +def scale_wrapper(x_amax: paddle.Tensor, eps: float = 0.0) -> paddle.Tensor: + """ + Paddle implementation of CUDA ScaleWrapper logic. + Args: + x_amax (paddle.Tensor): amax tensor (float32 recommended) + eps (float): epsilon to avoid division by zero + Returns: + paddle.Tensor: scale tensor, same shape as x_amax + """ + fp8_max = 448.0 + float_max = paddle.finfo(paddle.float32).max + amax_mod = paddle.maximum( + x_amax, + paddle.full_like(x_amax, eps), + ) + scale = fp8_max / amax_mod + scale = paddle.where( + amax_mod == 0, + paddle.ones_like(scale), + scale, + ) + scale = paddle.where( + paddle.isinf(scale), + paddle.full_like(scale, float_max), + scale, + ) + return scale + + def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Tensor, Tensor]: """ Only used in deep_gemm block wise quant weight. 
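The scale_wrapper helper added above mirrors the CUDA ScaleWrapper logic: scale = 448.0 / amax, with an all-zero amax mapped to a scale of 1.0 and any infinite result clamped to the float32 maximum. The per_block_cast_to_fp8 hunk that follows multiplies by this scale when quantizing and returns 1.0 / scale as the per-block dequantization scale. A small illustrative sketch, assuming the patched fastdeploy is importable; the amax values are only for illustration:

import paddle

from fastdeploy.model_executor.layers.utils import scale_wrapper

amax = paddle.to_tensor([4.48, 0.0], dtype="float32")
scale = scale_wrapper(amax)    # [100.0, 1.0]: 448.0 / 4.48, and 1.0 for the all-zero block
dequant_scale = 1.0 / scale    # what per_block_cast_to_fp8 now returns instead of amax / 448.0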
@@ -244,11 +273,10 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten x_abs = paddle.abs(x_view).astype(paddle.float32) x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) - x_amax = paddle.clip(x_amax, min=1e-4) - x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) - + scale = scale_wrapper(x_amax) + x_scaled = (x_view * scale).astype(paddle.float8_e4m3fn) return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) + paddle.view(1.0 / scale, (x_view.shape[0], x_view.shape[2])) ) diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index a795a9e0304..83ba492ee3b 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -21,7 +21,9 @@ import json import os import pickle +import re import time +from contextlib import ExitStack from functools import wraps from pathlib import Path @@ -39,6 +41,10 @@ from fastdeploy.model_executor.utils import multi_switch_config_context +def natural_key(s: str): + return [int(t) if t.isdigit() else t for t in re.split(r"(\d+)", s)] + + def pdparams_weight_iterator(paddle_file_list: list[str]): for pdparams_file in tqdm( paddle_file_list, @@ -71,9 +77,12 @@ def load_weights_from_cache(model, weights_iterator): def get_weight_iterator(model_path: str): - _, files_list, use_safetensors = get_all_weights_file(model_path) + files_list, ordered_weight_map, use_safetensors, is_key_ordered = get_all_weights_file(model_path) if use_safetensors: - weights_iterator = safetensors_weights_iterator(files_list) + if is_key_ordered: + weights_iterator = safetensors_weights_iterator(files_list) + else: + weights_iterator = safetensors_weights_iterator_ordered(ordered_weight_map) else: weights_iterator = pdparams_weight_iterator(files_list) return weights_iterator @@ -333,6 +342,26 @@ def safetensors_weights_iterator(safe_tensor_list: list[str]): yield name, param +def safetensors_weights_iterator_ordered(ordered_weight_map: dict[str, str]): + """ + safetensors_weights_iterator_ordered + """ + with ExitStack() as stack: + current_file = None + current_handle = None + + for key, st_file in tqdm( + ordered_weight_map.items(), + desc="Loading safetensors weights", + ): + if st_file != current_file: + stack.close() + current_handle = stack.enter_context(safe_open(st_file, framework="paddle", device="cpu")) + current_file = st_file + + yield key, current_handle.get_tensor(key) + + def fast_weights_iterator(safe_tensor_list: list[str]): """ paddleformers' iterator for safetensors @@ -353,7 +382,7 @@ def load_pre_sharded_checkpoint(model_path: str, local_rank: int): """ state_dict = {} - _, safetensor_files, _ = get_all_weights_file(os.path.join(model_path, f"rank{local_rank}")) + safetensor_files, _, _, _ = get_all_weights_file(os.path.join(model_path, f"rank{local_rank}")) weights_iterator = safetensors_weights_iterator(safetensor_files) for name, weight in weights_iterator: state_dict[name] = weight.clone() @@ -368,23 +397,31 @@ def get_all_weights_file(model_path: str): use_safetensors = True files_list = [str(file) for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"] if len(files_list) > 0: - key_name_list = [] + ordered_weight_map = {} use_safetensors = False + # dont care about the order of the files + return files_list, {}, use_safetensors, False else: safe_model_path = model_path / "model.safetensors" if 
safe_model_path.exists(): - files_list = [str(safe_model_path)] with safe_open(safe_model_path, framework="np", device="cpu") as f: - key_name_list = f.keys() - return key_name_list, files_list, use_safetensors + key_name_list = sorted(f.keys(), key=natural_key) + ordered_weight_map = {key: "model.safetensors" for key in key_name_list} + is_key_ordered = True + files_list = [str(safe_model_path)] + return files_list, ordered_weight_map, use_safetensors, is_key_ordered else: index_file = model_path / "model.safetensors.index.json" with index_file.open("r") as f: weight_map = json.load(f)["weight_map"] + keys = list(weight_map.keys()) + is_key_ordered = keys == sorted(keys, key=natural_key) + ordered_weight_map = { + key: str(model_path / weight_map[key]) for key in sorted(weight_map.keys(), key=natural_key) + } weight_files_in_index = {str(model_path / weight_map[name]) for name in weight_map} - key_name_list = list(weight_map.keys()) files_list = sorted(weight_files_in_index) - return key_name_list, files_list, use_safetensors + return files_list, ordered_weight_map, use_safetensors, is_key_ordered def deal_state_dict(state_dict): diff --git a/fastdeploy/model_executor/model_loader/default_loader.py b/fastdeploy/model_executor/model_loader/default_loader.py index ca0dfa84f92..bd813c804c4 100644 --- a/fastdeploy/model_executor/model_loader/default_loader.py +++ b/fastdeploy/model_executor/model_loader/default_loader.py @@ -95,3 +95,31 @@ def load_model(self, fd_config: FDConfig) -> nn.Layer: # TODO(gongshaotian): Now, only support safetensor self.load_weights(model, fd_config, architectures) return model + + def load_rl_mock_model(self, fd_config: FDConfig) -> nn.Layer: + """use for rl model load""" + # (TODO:gaoziyuan) optimze + original_architectures = fd_config.model_config.architectures[0] + logger.info(f"Starting to load model {original_architectures}.") + + import fastdeploy.rl # noqa + + if fd_config.speculative_config.model_type != "mtp": + model_architectures = original_architectures.replace("Ernie5ForCausalLM", "Ernie5MoeForCausalLM") + else: + model_architectures = original_architectures.replace("Ernie5ForCausalLM", "Ernie5MTPForCausalLM") + + model_architectures += "RL" + context = paddle.LazyGuard() + + with context: + model_cls = ModelRegistry.get_class(model_architectures) + model = model_cls(fd_config) + + model.eval() + + if fd_config.load_config.load_strategy == "normal": + # normal strategy need load weight and architectures need without "RL" + self.load_weights(model, fd_config, original_architectures) + # RL model not need set_state_dict + return model diff --git a/fastdeploy/model_executor/model_loader/default_loader_v1.py b/fastdeploy/model_executor/model_loader/default_loader_v1.py index 8fb0ebf3881..92f8b773868 100644 --- a/fastdeploy/model_executor/model_loader/default_loader_v1.py +++ b/fastdeploy/model_executor/model_loader/default_loader_v1.py @@ -56,8 +56,8 @@ def load_weights(self, model, fd_config: FDConfig, enable_cache: bool = False) - load_weights_from_cache(model, weights_iterator) else: model.load_weights(weights_iterator) - if fd_config.speculative_config.model_type != "mtp": - process_final_after_loading(model, fd_config) + + process_final_after_loading(model, fd_config) self.clean_memory_fragments() @@ -98,3 +98,30 @@ def load_model(self, fd_config: FDConfig) -> nn.Layer: return model self.load_weights(model, fd_config, enable_cache) return model + + def load_rl_mock_model(self, fd_config: FDConfig) -> nn.Layer: + """use for rl model load""" + # 
(TODO:gaoziyuan) optimze + original_architectures = fd_config.model_config.architectures[0] + + import fastdeploy.rl # noqa + + if fd_config.speculative_config.model_type != "mtp": + model_architectures = original_architectures.replace("Ernie5ForCausalLM", "Ernie5MoeForCausalLM") + else: + model_architectures = original_architectures.replace("Ernie5ForCausalLM", "Ernie5MTPForCausalLM") + + model_architectures += "RL" + context = paddle.LazyGuard() + + with context: + model_cls = ModelRegistry.get_class(model_architectures) + model = model_cls(fd_config) + + model.eval() + + if fd_config.load_config.load_strategy == "normal": + # normal strategy need load weight and architectures need without "RL" + self.load_weights(model, fd_config, original_architectures) + # RL model not need set_state_dict + return model diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index 680d565e176..aa07467e566 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -594,6 +594,9 @@ def forward( ) out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe: + out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0]) + return out @@ -803,7 +806,7 @@ def _get_tensor_parallel_mappings(cls, config, is_split=True): fn = split_or_merge_func( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, ) diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 3c042e17fdc..15df90dd3f0 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -459,6 +459,9 @@ def forward( out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe: + out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0]) + if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed: out = forward_meta.attn_backend.reverse_transpose(out) @@ -609,8 +612,7 @@ def load_weights(self, weights_iterator) -> None: r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name ) process_weights_after_loading_fn(model_sublayer_name, param) - - if self.tie_word_embeddings: + if getattr(self, "tie_word_embeddings", False): self.lm_head.linear.weight.set_value( self.ernie.embed_tokens.embeddings.weight.transpose([1, 0]).astype(self.lm_head.linear.weight.dtype) ) @@ -791,7 +793,7 @@ def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_split=True): fn = split_or_merge_func_v1( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, num_key_value_heads=config.num_key_value_heads, diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 2d57ed504cb..40cfe0b170e 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -16,7 +16,6 @@ from __future__ import 
annotations -import re from functools import partial from typing import Dict, Union @@ -69,14 +68,14 @@ def _get_tensor_parallel_mappings(cls, config, is_split=True): fn = split_or_merge_func( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, ) def gqa_qkv_split_func( weight, - tensor_parallel_degree, + tensor_model_parallel_size, tensor_parallel_rank, num_attention_heads, num_key_value_heads, @@ -109,9 +108,9 @@ def split_tensor(tensor, degree): else: return np.split(tensor, degree, axis=-1) - q_list = split_tensor(q, tensor_parallel_degree) - k_list = split_tensor(k, tensor_parallel_degree) - v_list = split_tensor(v, tensor_parallel_degree) + q_list = split_tensor(q, tensor_model_parallel_size) + k_list = split_tensor(k, tensor_model_parallel_size) + v_list = split_tensor(v, tensor_model_parallel_size) if tensor_parallel_rank is None: return [np.concatenate([q_i, k_i, v_i], axis=-1) for q_i, k_i, v_i in zip(q_list, k_list, v_list)] @@ -126,9 +125,9 @@ def split_tensor(tensor, degree): ) def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_heads, head_dim): - tensor_parallel_degree = len(weight_list) - num_attention_heads = num_attention_heads // tensor_parallel_degree - num_key_value_heads = num_key_value_heads // tensor_parallel_degree + tensor_model_parallel_size = len(weight_list) + num_attention_heads = num_attention_heads // tensor_model_parallel_size + num_key_value_heads = num_key_value_heads // tensor_model_parallel_size is_paddle_tensor = not isinstance(weight_list[0], np.ndarray) @@ -170,7 +169,7 @@ def slice_tensor(tensor, start, end): if is_split: qkv_fn = partial( gqa_qkv_split_func, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, num_key_value_heads=config.num_key_value_heads, @@ -326,7 +325,10 @@ def forward( for i in range(self.num_layers): hidden_states, residual = self.mtp_block[i](forward_meta, hidden_states, residual) - hidden_states = self.norm(hidden_states, residual)[0] + hidden_states = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + + if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe: + hidden_states = self.norm.allgather(hidden_states, forward_meta.ids_remove_padding.shape[0]) return hidden_states @@ -354,7 +356,6 @@ def __init__(self, fd_config: FDConfig): self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = fd_config.speculative_config.sharing_model.lm_head - self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings @classmethod def name(self): @@ -372,11 +373,6 @@ def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, paddle.Tensor]] and values are NumPy arrays or PaddlePaddle tensors. """ self.ernie.load_state_dict(state_dict) - # if self.tie_word_embeddings: - # self.lm_head.linear.weight.set_value( - # self.ernie.embed_tokens.embeddings.weight.transpose([1, 0])) - # else: - # self.lm_head.load_state_dict(state_dict) @paddle.no_grad() def load_weights(self, weights_iterator) -> None: @@ -386,47 +382,24 @@ def load_weights(self, weights_iterator) -> None: Args: weights_iterator (Iterator): An iterator yielding (name, weight) pairs. 
""" - - from fastdeploy.model_executor.utils import ( - default_weight_loader, - process_weights_after_loading, + from fastdeploy.model_executor.models.ernie4_5_moe import ( + Ernie4_5_MoeForCausalLM, + ) + from fastdeploy.model_executor.utils import remap_weight_keys + + Ernie4_5_MoeForCausalLM.load_weights( + self, + remap_weight_keys( + weights_iterator, + { + "mtp_emb_norm.0": "enorm", + "mtp_hidden_norm.0": "hnorm", + "mtp_linear_proj.0": "eh_proj.linear", + }, + ), ) - all_param_mapping = [ - # (param_name, weight_name, expert_id, shard_id) - ("embed_tokens.embeddings", "embed_tokens", None, None), - ("lm_head.linear", "lm_head", None, None), - ("enorm", "mtp_emb_norm.0", None, None), - ("hnorm", "mtp_hidden_norm.0", None, None), - ("eh_proj.linear", "mtp_linear_proj.0", None, None), - ] - - params_dict = dict(self.named_parameters()) - shard_id = None - process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()), self.fd_config) - for loaded_weight_name, loaded_weight in weights_iterator: - for param_name, weight_name, exp_id, shard_id in all_param_mapping: - if weight_name not in loaded_weight_name: - continue - model_param_name = loaded_weight_name.replace(weight_name, param_name) - param = params_dict[model_param_name] - shard_id = shard_id - break - else: - if loaded_weight_name not in params_dict.keys(): - continue - model_param_name = loaded_weight_name - param = params_dict[loaded_weight_name] - - # Get weight loader from parameter and set weight - weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) - weight_loader(param, loaded_weight) - model_sublayer_name = re.sub( - r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name - ) - process_weights_after_loading_fn(model_sublayer_name, param) - - def compute_logits(self, hidden_states: paddle.Tensor): + def compute_logits(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta): """ compute logits """ diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py index 2d8c53b2218..b4dd3aa26f0 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py @@ -14,7 +14,6 @@ # limitations under the License. 
""" -from functools import partial from typing import Optional import numpy as np @@ -160,15 +159,15 @@ def __init__( self, dim: int, num_heads: int = 16, - tensor_parallel_degree: int = 1, + tensor_model_parallel_size: int = 1, tensor_parallel_rank: int = 0, model_format: str = "", ) -> None: super().__init__() self.num_heads = num_heads - self.tensor_parallel_degree = tensor_parallel_degree + self.tensor_model_parallel_size = tensor_model_parallel_size self.tensor_parallel_rank = tensor_parallel_rank - if tensor_parallel_degree > 1: + if tensor_model_parallel_size > 1: use_fuse_matmul_bias = False if current_platform.is_maca() or current_platform.is_iluvatar() else True self.qkv = ColumnParallelLinear( dim, @@ -200,7 +199,7 @@ def __init__( self.head_dim = dim // num_heads # must added self.num_heads = num_heads self.hidden_size = dim - self.num_heads_per_rank = divide(self.num_heads, self.tensor_parallel_degree) + self.num_heads_per_rank = divide(self.num_heads, self.tensor_model_parallel_size) def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): weight_need_transpose = getattr(param, "weight_need_transpose", False) @@ -210,7 +209,9 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N if load_bias: head_dim = self.hidden_size // self.num_heads shard_weight = loaded_weight[...].reshape([3, self.num_heads, head_dim]) - shard_weight = paddle.split(shard_weight, self.tensor_parallel_degree, axis=-2)[self.tensor_parallel_rank] + shard_weight = paddle.split(shard_weight, self.tensor_model_parallel_size, axis=-2)[ + self.tensor_parallel_rank + ] shard_weight = shard_weight.reshape([-1]) else: shard_weight = loaded_weight[...].reshape( @@ -221,7 +222,9 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N self.head_dim, ] ) - shard_weight = paddle.split(shard_weight, self.tensor_parallel_degree, axis=-2)[self.tensor_parallel_rank] + shard_weight = paddle.split(shard_weight, self.tensor_model_parallel_size, axis=-2)[ + self.tensor_parallel_rank + ] shard_weight = shard_weight.reshape([self.hidden_size, -1]) shard_weight = get_tensor(shard_weight) shard_weight = fd_cast(shard_weight, param) @@ -253,7 +256,7 @@ def forward( [ seq_length, 3, - self.num_heads // self.tensor_parallel_degree, + self.num_heads // self.tensor_model_parallel_size, -1, ] ) @@ -333,13 +336,13 @@ def __init__( dim: int, hidden_dim: int, hidden_act: str, - tensor_parallel_degree: int = 1, + tensor_model_parallel_size: int = 1, model_format: str = "", ) -> None: super().__init__() - self.tensor_parallel_degree = tensor_parallel_degree + self.tensor_model_parallel_size = tensor_model_parallel_size - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: self.fc1 = ColumnParallelLinear( dim, hidden_dim, @@ -419,7 +422,7 @@ class DFNRopeVisionBlock(nn.Layer): def __init__( self, config, - tensor_parallel_degree: int, + tensor_model_parallel_size: int, tensor_parallel_rank: int, attn_implementation: str = "sdpa", model_format: str = "", @@ -438,7 +441,7 @@ def __init__( self.attn = VisionFlashAttention2( config.embed_dim, num_heads=config.num_heads, - tensor_parallel_degree=tensor_parallel_degree, + tensor_model_parallel_size=tensor_model_parallel_size, tensor_parallel_rank=tensor_parallel_rank, model_format=model_format, ) @@ -446,7 +449,7 @@ def __init__( dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act, - tensor_parallel_degree=tensor_parallel_degree, + 
tensor_model_parallel_size=tensor_model_parallel_size, model_format=model_format, ) self.config = config @@ -543,7 +546,7 @@ def __init__(self, config, prefix_name: str = "") -> None: [ DFNRopeVisionBlock( config.vision_config, - config.pretrained_config.tensor_parallel_degree, + config.pretrained_config.tensor_model_parallel_size, config.pretrained_config.tensor_parallel_rank, model_format=model_format, ) @@ -664,63 +667,6 @@ def extract_feature(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tensor) """ return self.forward(hidden_states, grid_thw) - @classmethod - def _get_tensor_parallel_mappings(cls, config, is_split=True): - """ - dummy - """ - - from paddleformers.transformers.conversion_utils import split_or_merge_func - - fn = split_or_merge_func( - is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, - tensor_parallel_rank=config.tensor_parallel_rank, - ) - vision_config = config.vision_config - - def split_qkv_weight(x): - head_dim = vision_config.hidden_size // vision_config.num_heads - x = x.reshape( - [ - vision_config.hidden_size, - 3, - vision_config.num_heads, - head_dim, - ] - ) - x = np.split(x, vision_config.tensor_parallel_degree, axis=-2)[vision_config.tensor_parallel_rank] - x = x.reshape([vision_config.hidden_size, -1]) - return x - - def split_qkv_bias(x): - head_dim = vision_config.hidden_size // vision_config.num_heads - x = x.reshape([3, vision_config.num_heads, head_dim]) - x = np.split(x, vision_config.tensor_parallel_degree, axis=-2)[vision_config.tensor_parallel_rank] - x = x.reshape([-1]) - return x - - def get_tensor_parallel_split_mappings(depth): - final_actions = {} - base_actions = { - "vision_model.blocks.0.attn.proj.weight": partial(fn, is_column=False), - "vision_model.blocks.0.fc1.weight": partial(fn, is_column=True), - "vision_model.blocks.0.fc1.bias": partial(fn, is_column=True), - "vision_model.blocks.0.fc2.weight": partial(fn, is_column=False), - "vision_model.blocks.0.qkv.weight": split_qkv_weight, - "vision_model.blocks.0.qkv.bias": split_qkv_bias, - } - - for key, action in base_actions.items(): - if "blocks.0." 
in key: - for i in range(depth): - newkey = key.replace("blocks.0.", f"blocks.{i}.") - final_actions[newkey] = action - return final_actions - - mappings = get_tensor_parallel_split_mappings(vision_config.depth) - return mappings - def load_state_dict(self, state_dict): params_dict = dict(self.named_parameters()) for param_name, param in params_dict.items(): diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 804d058bf7c..a0e67ee821a 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -548,6 +548,10 @@ def forward( ) out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + + if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe: + out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0]) + return out @@ -724,7 +728,7 @@ def load_weights(self, weights_iterator) -> None: r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name ) process_weights_after_loading_fn(model_sublayer_name, param) - if self.tie_word_embeddings: + if getattr(self, "tie_word_embeddings", False): self.lm_head.linear.weight.set_value( self.ernie.embed_tokens.embeddings.weight.transpose([1, 0]).astype(self.lm_head.linear.weight.dtype) ) @@ -963,7 +967,7 @@ def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_split=True): fn = split_or_merge_func_v1( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, num_key_value_heads=config.num_key_value_heads, @@ -971,7 +975,7 @@ def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_split=True): ) vision_fn = split_or_merge_func_v1( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.vision_config.get("num_heads"), num_key_value_heads=config.vision_config.get("num_heads"), diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py b/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py index dfc0644e556..ff0d7e5e0fa 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py @@ -15,7 +15,6 @@ """ from copy import deepcopy -from functools import partial import numpy as np import paddle @@ -156,7 +155,7 @@ def __init__( self.temporal_conv_size = temporal_conv_size self.use_recompute_resampler = False self.use_temporal_conv = True - self.tensor_parallel_degree = config.pretrained_config.tensor_parallel_degree + self.tensor_model_parallel_size = config.pretrained_config.tensor_model_parallel_size self.prefix_name = prefix_name # for 空间四合一 @@ -175,7 +174,7 @@ def __init__( has_bias=True, fuse_matmul_bias=use_fuse_matmul_bias, ) - if self.tensor_parallel_degree > 1 + if self.tensor_model_parallel_size > 1 else nn.Linear(self.spatial_dim, self.spatial_dim) ), nn.GELU(), @@ -207,7 +206,7 @@ def __init__( out_config.hidden_size = out_dim self.after_norm = RMSNorm(out_config) - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: 
set_weight_attrs(self.spatial_linear[0].weight, {"output_dim": False}) def spatial_conv_reshape(self, x, spatial_conv_size): @@ -237,17 +236,17 @@ def fwd_spatial(x): x = self.spatial_conv_reshape(x, self.spatial_conv_size) num_pad = 0 - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: num_pad = ( - x.shape[0] + self.tensor_parallel_degree - 1 - ) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[0] + x.shape[0] + self.tensor_model_parallel_size - 1 + ) // self.tensor_model_parallel_size * self.tensor_model_parallel_size - x.shape[0] if num_pad > 0: x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0]) x = self.spatial_linear(x) - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: x = AllGatherOp.apply(x) if num_pad > 0: @@ -303,13 +302,13 @@ def fwd_placeholder(x, grid_thw, to_tensor=False): def fwd_temporal(x): num_pad = 0 - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: num_pad = ( - x.shape[0] + self.tensor_parallel_degree - 1 - ) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[0] + x.shape[0] + self.tensor_model_parallel_size - 1 + ) // self.tensor_model_parallel_size * self.tensor_model_parallel_size - x.shape[0] if num_pad > 0: x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0]) - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: x = ScatterOp.apply(x, axis=0) x = self.temporal_linear(x) @@ -321,7 +320,7 @@ def fwd_temporal(x): def fwd_mlp(x): x = self.mlp(x) x = self.after_norm(x) - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: x = AllGatherOp.apply(x) return x @@ -355,31 +354,3 @@ def load_state_dict(self, state_dict): raise ValueError(f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}") else: param.copy_(tensor, False) - - @classmethod - def _get_tensor_parallel_mappings(cls, config, is_split=True): - - from paddleformers.transformers.conversion_utils import split_or_merge_func - - fn = split_or_merge_func( - is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, - tensor_parallel_rank=config.tensor_parallel_rank, - num_attention_heads=config.num_attention_heads, - ) - res = {"spatial_linear.0.weight": partial(fn, is_column=False)} - for k in ( - "spatial_linear.0.bias", # row linear bias - "spatial_linear.2.weight", - "spatial_linear.2.bias", # linear - "spatial_linear.3.weight", - "spatial_linear.3.bias", # layernorm - "temporal_linear.0.weight", - "temporal_linear.0.weight", # linear - "temporal_linear.2.weight", - "temporal_linear.2.bias", # linear - "temporal_linear.3.weight", - "temporal_linear.3.bias", # bias - ): - res.update({k: lambda x: x}) - return res diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index d5ad6e3916b..1efecb2db70 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -161,7 +161,7 @@ def __init__( reduce_results=False, ) - def forward(self, x, forward_meta): + def forward(self, x, forward_meta: ForwardMeta = None): shared_experts_out = self.shared_experts(x) out = self.experts(x, self.gate, forward_meta) out = out + shared_experts_out @@ -249,6 +249,7 @@ def __init__( self, fd_config: FDConfig, prefix: str = "", + is_mtp: bool = False, ) -> None: super().__init__() @@ -259,9 +260,8 @@ def __init__( prefix=f"{prefix}.self_attn", ) - if ( - fd_config.model_config.n_routed_experts is not None - and layer_id >= 
fd_config.model_config.first_k_dense_replace + if fd_config.model_config.n_routed_experts is not None and ( + layer_id >= fd_config.model_config.first_k_dense_replace or is_mtp ): self.mlp = Glm4Moe(fd_config, layer_id, prefix=f"{prefix}.mlp") else: @@ -306,10 +306,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp( - hidden_states, - forward_meta, - ) + hidden_states = self.mlp(hidden_states, forward_meta) return hidden_states, residual @@ -373,6 +370,9 @@ def forward( out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe: + out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0]) + return out @@ -552,7 +552,7 @@ def _get_tensor_parallel_mappings(cls, config, is_split=True): fn = split_or_merge_func_v1( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, num_key_value_heads=config.num_key_value_heads, diff --git a/fastdeploy/model_executor/models/glm4_mtp.py b/fastdeploy/model_executor/models/glm4_mtp.py new file mode 100644 index 00000000000..c28023202d2 --- /dev/null +++ b/fastdeploy/model_executor/models/glm4_mtp.py @@ -0,0 +1,371 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from __future__ import annotations + +from functools import partial + +import paddle +from paddle import nn +from paddleformers.transformers import PretrainedModel +from paddleformers.utils.log import logger + +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) +from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection +from fastdeploy.model_executor.layers.normalization import RMSNorm +from fastdeploy.model_executor.models.glm4_moe import Glm4MoeDecoderLayer +from fastdeploy.model_executor.models.model_base import ( + ModelCategory, + ModelForCasualLM, + ModelRegistry, +) + + +class Glm4MTPPretrainedModel(PretrainedModel): + """ + Glm4MTPPretrainedModel + """ + + config_class = FDConfig + + def _init_weights(self, layer): + return None + + @classmethod + def arch_name(self): + return "Glm4MTPForCausalLM" + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + logger.info("Glm4MTP inference model _get_tensor_parallel_mappings") + + from fastdeploy.model_executor.models.tp_utils import split_or_merge_func_v1 + + fn = split_or_merge_func_v1( + is_split=is_split, + tensor_model_parallel_size=config.tensor_model_parallel_size, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + head_dim=config.head_dim, + ) + + def get_tensor_parallel_split_mappings(num_mtp_layers, mtp_start_layer_idx): + final_actions = {} + + base_actions = { + "layers.0.embed_tokens.weight": partial(fn, is_column=True), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } + + # Self Attention Layer which are need TP. + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + # Moe Layer + for expert_idx in range(config.n_routed_experts): + base_actions[f"layers.0.mlp.experts.{expert_idx}.up_proj.weight"] = partial(fn, is_column=True) + base_actions[f"layers.0.mlp.experts.{expert_idx}.gate_proj.weight"] = partial(fn, is_column=True) + base_actions[f"layers.0.mlp.experts.{expert_idx}.down_proj.weight"] = partial(fn, is_column=False) + + base_actions["layers.0.eh_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.shared_head.head.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." 
in key: + for i in range(mtp_start_layer_idx, mtp_start_layer_idx + num_mtp_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_nextn_predict_layers, config.start_layer_index) + return mappings + + +class SharedHead(nn.Module): + def __init__( + self, + fd_config: FDConfig, + prefix: str = "", + ) -> None: + super().__init__() + self.norm = RMSNorm( + fd_config, + hidden_size=fd_config.model_config.hidden_size, + eps=fd_config.model_config.rms_norm_eps, + prefix=f"{prefix}.shared_head.norm", + ) + if fd_config.speculative_config.sharing_model is not None: + self.head = fd_config.speculative_config.sharing_model.lm_head + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + # NOTE(wangyanpeng04): Just for compute logits + hidden_states = self.norm(hidden_states)[0] + return self.head(hidden_states) + + +class Glm4MTPLayer(nn.Layer): + """ + Glm4MTPLayer + """ + + def __init__( + self, + fd_config: FDConfig = None, + prefix: str = "", + ) -> None: + """ + Initializer for the Glm4MTPLayer class. + """ + super().__init__() + + self.enorm = RMSNorm( + fd_config, + hidden_size=fd_config.model_config.hidden_size, + eps=fd_config.model_config.rms_norm_eps, + prefix=f"{prefix}.enorm", + ) + self.hnorm = RMSNorm( + fd_config, + hidden_size=fd_config.model_config.hidden_size, + eps=fd_config.model_config.rms_norm_eps, + prefix=f"{prefix}.hnorm", + ) + self.eh_proj = ParallelEHProjection( + fd_config, + num_embeddings=fd_config.model_config.hidden_size, + embedding_dim=fd_config.model_config.hidden_size * 2, + prefix=f"{prefix}.eh_proj", + ) + self.shared_head = SharedHead( + fd_config, + prefix=prefix, + ) + self.mtp_block = Glm4MoeDecoderLayer( + fd_config, + prefix=prefix, + is_mtp=True, + ) + + def forward( + self, + ids_remove_padding: paddle.Tensor, + previous_hidden_states: paddle.Tensor, + inputs_embedding: paddle.Tensor, + forward_meta: ForwardMeta, + ): + """ + forward + """ + assert inputs_embedding is not None + + inputs_embedding = paddle.concat( + [self.enorm(inputs_embedding)[0], self.hnorm(previous_hidden_states)[0]], + axis=-1, + ) + + hidden_states = self.eh_proj(inputs_embedding) + hidden_states, residual = self.mtp_block(forward_meta, hidden_states, residual=None) + + hidden_states = residual + hidden_states + return hidden_states + + +@support_graph_optimization +class Glm4MTPModel(nn.Layer): + """ + Glm4MTPModel + """ + + def __init__( + self, + fd_config: FDConfig = None, + ) -> None: + super().__init__() + + self.mtp_start_layer_idx = fd_config.model_config.start_layer_index + self.num_mtp_layers = fd_config.model_config.num_nextn_predict_layers + + assert self.num_mtp_layers == 1, f"Currently only supports single MTP layer, but got {self.num_mtp_layers}" + + if fd_config.speculative_config.sharing_model is not None: + self.embed_tokens = fd_config.speculative_config.sharing_model.model.embed_tokens + + self.layers = nn.LayerDict( + { + str(i): Glm4MTPLayer( + fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", + ) + for i in range(0, self.num_mtp_layers) + } + ) + + def forward( + self, + ids_remove_padding: paddle.Tensor, + previous_hidden_states: paddle.Tensor, + forward_meta: ForwardMeta, + inputs_embedding: paddle.Tensor = None, + ): + if inputs_embedding is None: + inputs_embedding = self.embed_tokens(ids_remove_padding) + + # NOTE(wangyanpeng04): Currently only supports single 
MTP layer + hidden_states = self.layers[str(0)]( + ids_remove_padding, + previous_hidden_states, + inputs_embedding, + forward_meta, + ) + + return hidden_states + + +@ModelRegistry.register_model_class( + architecture="Glm4MTPForCausalLM", + module_name="glm4_mtp", + category=ModelCategory.TEXT_GENERATION, + primary_use=ModelCategory.TEXT_GENERATION, +) +class Glm4MTPForCausalLM(ModelForCasualLM): + """ + Glm4MTPForCausalLM + """ + + def __init__(self, fd_config: FDConfig): + """ + Args: + fd_config (FDConfig): Configurations for the LLM model. + """ + super(Glm4MTPForCausalLM, self).__init__(fd_config) + self.fd_config = fd_config + self.model = Glm4MTPModel(fd_config) + self.ori_vocab_size = fd_config.model_config.ori_vocab_size + + self.mtp_start_layer_idx = fd_config.model_config.start_layer_index + self.num_mtp_layers = fd_config.model_config.num_nextn_predict_layers + + @classmethod + def name(self): + return "Glm4MTPForCausalLM" + + @paddle.no_grad() + def load_weights(self, weights_iterator): + """ + Load model parameters from a given weights_iterator object. + + Args: + weights_iterator (Iterator): An iterator yielding (name, weight) pairs. + """ + + from fastdeploy.model_executor.models.glm4_moe import Glm4MoeForCausalLM + from fastdeploy.model_executor.utils import remap_weight_keys + + template = { + "enorm": "enorm", + "hnorm": "hnorm", + "eh_proj": "eh_proj.linear", + "shared_head.norm": "shared_head.norm", + "shared_head.head": "shared_head.head.linear", + "self_attn.q_proj": "mtp_block.self_attn.q_proj", + "self_attn.k_proj": "mtp_block.self_attn.k_proj", + "self_attn.v_proj": "mtp_block.self_attn.v_proj", + "self_attn.o_proj": "mtp_block.self_attn.o_proj", + "mlp": "mtp_block.mlp", + "input_layernorm": "mtp_block.input_layernorm", + "post_attention_layernorm": "mtp_block.post_attention_layernorm", + } + remap = { + f"layers.{self.mtp_start_layer_idx}.embed_tokens": "embed_tokens.embeddings", + } + + # NOTE (wangyanpeng) Here we need to map the layer_id of MTP weights to start from 0, + # otherwise there will be out-of-bounds when accessing kv_cache in Attention + for key, value in template.items(): + for mtp_layer_id in range(self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers): + remap[f"layers.{mtp_layer_id}.{key}"] = f"layers.{mtp_layer_id - self.mtp_start_layer_idx}.{value}" + + weights_iterator = remap_weight_keys( + weights_iterator, + remap, + include_keys=[ + f"layers.{mtp_layer_id}" + for mtp_layer_id in range(self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers) + ], + ) + + Glm4MoeForCausalLM.load_weights( + self, + weights_iterator, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + """ + glm4_mtp only support loader_v1. + """ + assert False, "glm4_mtp only support --load-choices default_v1." 
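The remap built in load_weights above shifts every checkpoint key from the MTP layer index used in the checkpoint down to index 0, so the single Glm4MTPLayer and its kv_cache slots are addressed consistently, as the NOTE explains. A toy illustration of the mapping only, with a hypothetical start_layer_index of 46 and one MTP layer (the real values come from fd_config.model_config):

# Hypothetical values, for illustration only.
mtp_start_layer_idx, num_mtp_layers = 46, 1
template = {
    "self_attn.q_proj": "mtp_block.self_attn.q_proj",
    "eh_proj": "eh_proj.linear",
    "shared_head.head": "shared_head.head.linear",
}
remap = {f"layers.{mtp_start_layer_idx}.embed_tokens": "embed_tokens.embeddings"}
for key, value in template.items():
    for layer_id in range(mtp_start_layer_idx, mtp_start_layer_idx + num_mtp_layers):
        remap[f"layers.{layer_id}.{key}"] = f"layers.{layer_id - mtp_start_layer_idx}.{value}"

# remap["layers.46.self_attn.q_proj"] == "layers.0.mtp_block.self_attn.q_proj"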
+ + def compute_logits(self, hidden_state: paddle.Tensor, forward_meta: ForwardMeta): + """ + compute_logits + """ + logits = self.model.layers[str(0)].shared_head(hidden_state) + logits = logits.astype(paddle.float32) + logits[:, self.ori_vocab_size :] = -float("inf") + + return logits + + def empty_input_forward(self, forward_meta): + """ + empty_input_forward + """ + fake_hidden_states = paddle.empty( + shape=[0, self.fd_config.model_config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + self.model.layers[str(0)].mtp_block.mlp.experts( + fake_hidden_states, + self.model.layers[str(0)].mtp_block.mlp.gate, + forward_meta, + ) + + def forward( + self, + ids_remove_padding: paddle.Tensor, + previous_hidden_states: paddle.Tensor, + forward_meta: ForwardMeta, + ): + """ + forward + """ + hidden_states = self.model( + ids_remove_padding=ids_remove_padding, + previous_hidden_states=previous_hidden_states, + forward_meta=forward_meta, + ) + + return hidden_states diff --git a/fastdeploy/model_executor/models/gpt_oss.py b/fastdeploy/model_executor/models/gpt_oss.py index 682c9f5f1ec..60f41965cf2 100644 --- a/fastdeploy/model_executor/models/gpt_oss.py +++ b/fastdeploy/model_executor/models/gpt_oss.py @@ -214,8 +214,12 @@ def forward(self, ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta): for i in range(self.num_layers): hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual) - hidden_states = self.norm(hidden_states, residual)[0] - return hidden_states + out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + + if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe: + out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0]) + + return out @ModelRegistry.register_model_class( diff --git a/fastdeploy/model_executor/models/paddleocr_vl/projector.py b/fastdeploy/model_executor/models/paddleocr_vl/projector.py index f1b5ef60928..434e416fc52 100644 --- a/fastdeploy/model_executor/models/paddleocr_vl/projector.py +++ b/fastdeploy/model_executor/models/paddleocr_vl/projector.py @@ -20,6 +20,8 @@ import paddle import paddle.nn as nn +from fastdeploy.model_executor.utils import h2d_copy + class GELUActivation(nn.Layer): """ @@ -97,6 +99,8 @@ def forward(self, image_features, image_grid_thw): def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): loaded_weight = loaded_weight.transpose([1, 0]) + if not param._is_initialized(): + param.initialize() assert param.shape == loaded_weight.shape, ( f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})" ) @@ -106,4 +110,4 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N loaded_weight = loaded_weight.view(param.dtype) else: loaded_weight = loaded_weight.cast(param.dtype) - param.copy_(loaded_weight, False) + h2d_copy(param, loaded_weight) diff --git a/fastdeploy/model_executor/models/paddleocr_vl/siglip.py b/fastdeploy/model_executor/models/paddleocr_vl/siglip.py index 0bb256cd51f..452d8dd1f76 100644 --- a/fastdeploy/model_executor/models/paddleocr_vl/siglip.py +++ b/fastdeploy/model_executor/models/paddleocr_vl/siglip.py @@ -100,6 +100,8 @@ def qkv_weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] def out_proj_weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): loaded_weight = loaded_weight.transpose([1, 0]) + if not param._is_initialized(): + param.initialize() assert param.shape == 
loaded_weight.shape, ( f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})" ) @@ -109,7 +111,7 @@ def out_proj_weight_loader(self, param, loaded_weight, loaded_shard_id: Optional loaded_weight = loaded_weight.view(param.dtype) else: loaded_weight = loaded_weight.cast(param.dtype) - param.copy_(loaded_weight, False) + h2d_copy(param, loaded_weight) def forward( self, @@ -287,6 +289,8 @@ def __init__(self, config): def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): loaded_weight = loaded_weight.transpose([1, 0]) + if not param._is_initialized(): + param.initialize() assert param.shape == loaded_weight.shape, ( f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})" ) @@ -296,7 +300,7 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N loaded_weight = loaded_weight.view(param.dtype) else: loaded_weight = loaded_weight.cast(param.dtype) - param.copy_(loaded_weight, False) + h2d_copy(param, loaded_weight) def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: hidden_states = self.fc1(hidden_states) diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 59164985c8f..e513492965f 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -375,7 +375,7 @@ def load_weights(self, weights_iterator) -> None: weight_loader(param, loaded_weight) model_sublayer_name = re.sub(r"\.(weight)$", "", model_param_name) process_weights_after_loading_fn(model_sublayer_name, param) - if self.tie_word_embeddings: + if getattr(self, "tie_word_embeddings", False): self.lm_head.linear.weight.set_value( self.qwen2.embed_tokens.embeddings.weight.transpose([1, 0]).astype(self.lm_head.linear.weight.dtype) ) @@ -445,7 +445,7 @@ def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): fn = split_or_merge_func( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, ) @@ -468,7 +468,7 @@ def get_tensor_parallel_split_mappings(num_layers): base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) # if we have enough num_key_value_heads to split, then split it. - if config.num_key_value_heads % config.tensor_parallel_degree == 0: + if config.num_key_value_heads % config.tensor_model_parallel_size == 0: base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) diff --git a/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py b/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py index 4414eb91712..f2f49605c0e 100644 --- a/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py +++ b/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py @@ -14,10 +14,8 @@ # limitations under the License. 
""" -from functools import partial from typing import Optional -import numpy as np import paddle import paddle.nn.functional as F from paddle import nn @@ -80,16 +78,16 @@ def __init__( self, dim: int, num_heads: int = 16, - tensor_parallel_degree: int = 1, + tensor_model_parallel_size: int = 1, tensor_parallel_rank: int = 0, model_format: str = "", ) -> None: super().__init__() self.num_heads = num_heads - self.tensor_parallel_degree = tensor_parallel_degree + self.tensor_model_parallel_size = tensor_model_parallel_size self.tensor_parallel_rank = tensor_parallel_rank - if tensor_parallel_degree > 1: + if tensor_model_parallel_size > 1: self.qkv = ColumnParallelLinear( dim, dim * 3, @@ -124,7 +122,7 @@ def __init__( self.head_dim = dim // num_heads # must added self.num_heads = num_heads self.hidden_size = dim - self.num_heads_per_rank = divide(self.num_heads, self.tensor_parallel_degree) + self.num_heads_per_rank = divide(self.num_heads, self.tensor_model_parallel_size) def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): weight_need_transpose = getattr(param, "weight_need_transpose", False) @@ -134,7 +132,9 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N if load_bias: head_dim = self.hidden_size // self.num_heads shard_weight = loaded_weight[...].reshape([3, self.num_heads, head_dim]) - shard_weight = paddle.split(shard_weight, self.tensor_parallel_degree, axis=-2)[self.tensor_parallel_rank] + shard_weight = paddle.split(shard_weight, self.tensor_model_parallel_size, axis=-2)[ + self.tensor_parallel_rank + ] shard_weight = shard_weight.reshape([-1]) else: shard_weight = loaded_weight[...].reshape( @@ -145,7 +145,9 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N self.head_dim, ] ) - shard_weight = paddle.split(shard_weight, self.tensor_parallel_degree, axis=-2)[self.tensor_parallel_rank] + shard_weight = paddle.split(shard_weight, self.tensor_model_parallel_size, axis=-2)[ + self.tensor_parallel_rank + ] shard_weight = shard_weight.reshape([self.hidden_size, -1]) shard_weight = fd_cast(shard_weight, param) assert param.shape == shard_weight.shape, ( @@ -178,7 +180,7 @@ def forward( [ seq_length, 3, - self.num_heads // self.tensor_parallel_degree, + self.num_heads // self.tensor_model_parallel_size, -1, ] ) @@ -267,13 +269,13 @@ def __init__( hidden_dim: int, bias: bool = False, hidden_act: str = "gelu", - tensor_parallel_degree: int = 1, + tensor_model_parallel_size: int = 1, model_format: str = "", ) -> None: super().__init__() - self.tensor_parallel_degree = tensor_parallel_degree + self.tensor_model_parallel_size = tensor_model_parallel_size - if self.tensor_parallel_degree > 1: + if self.tensor_model_parallel_size > 1: self.gate_proj = ColumnParallelLinear( dim, hidden_dim, @@ -416,7 +418,7 @@ def __init__( num_heads: int, mlp_hidden_dim: int, hidden_act: str = "gelu", - tensor_parallel_degree: int = 1, + tensor_model_parallel_size: int = 1, tensor_parallel_rank: int = 0, attn_implementation: str = "sdpa", model_format: str = "", @@ -434,7 +436,7 @@ def __init__( self.attn = VisionFlashAttention2( dim=dim, num_heads=num_heads, - tensor_parallel_degree=tensor_parallel_degree, + tensor_model_parallel_size=tensor_model_parallel_size, tensor_parallel_rank=tensor_parallel_rank, model_format=model_format, ) @@ -444,7 +446,7 @@ def __init__( hidden_dim=mlp_hidden_dim, bias=True, hidden_act=hidden_act, - tensor_parallel_degree=tensor_parallel_degree, + 
tensor_model_parallel_size=tensor_model_parallel_size, model_format=model_format, ) @@ -560,7 +562,7 @@ def __init__(self, config, prefix_name: str = "") -> None: num_heads=config.vision_config.num_heads, mlp_hidden_dim=config.vision_config.intermediate_size, hidden_act=config.vision_config.hidden_act, - tensor_parallel_degree=config.pretrained_config.tensor_parallel_degree, + tensor_model_parallel_size=config.pretrained_config.tensor_model_parallel_size, tensor_parallel_rank=config.pretrained_config.tensor_parallel_rank, model_format=model_format, ) @@ -731,65 +733,6 @@ def extract_feature(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tensor) """ return self.forward(hidden_states, grid_thw) - @classmethod - def _get_tensor_parallel_mappings(cls, config, is_split=True): - """ - dummy - """ - - from paddleformers.transformers.conversion_utils import split_or_merge_func - - fn = split_or_merge_func( - is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, - tensor_parallel_rank=config.tensor_parallel_rank, - ) - vision_config = config.vision_config - - def split_qkv_weight(x): - head_dim = vision_config.hidden_size // vision_config.num_heads - x = x.reshape( - [ - vision_config.hidden_size, - 3, - vision_config.num_heads, - head_dim, - ] - ) - x = np.split(x, vision_config.tensor_parallel_degree, axis=-2)[vision_config.tensor_parallel_rank] - x = x.reshape([vision_config.hidden_size, -1]) - return x - - def split_qkv_bias(x): - head_dim = vision_config.hidden_size // vision_config.num_heads - x = x.reshape([3, vision_config.num_heads, head_dim]) - x = np.split(x, vision_config.tensor_parallel_degree, axis=-2)[vision_config.tensor_parallel_rank] - x = x.reshape([-1]) - return x - - def get_tensor_parallel_split_mappings(depth): - final_actions = {} - base_actions = { - "visual.blocks.0.attn.proj.weight": partial(fn, is_column=False), - "visual.blocks.0.mlp.gate_proj.weight": partial(fn, is_column=True), - "visual.blocks.0.mlp.gate_proj.bias": partial(fn, is_column=True), - "visual.blocks.0.mlp.up_proj.weight": partial(fn, is_column=True), - "visual.blocks.0.mlp.up_proj.bias": partial(fn, is_column=True), - "visual.blocks.0.mlp.down_proj.weight": partial(fn, is_column=False), - "visual.blocks.0.qkv.weight": split_qkv_weight, - "visual.blocks.0.qkv.bias": split_qkv_bias, - } - - for key, action in base_actions.items(): - if "blocks.0." 
in key: - for i in range(depth): - newkey = key.replace("blocks.0.", f"blocks.{i}.") - final_actions[newkey] = action - return final_actions - - mappings = get_tensor_parallel_split_mappings(vision_config.depth) - return mappings - def load_state_dict(self, state_dict): params_dict = dict(self.named_parameters()) for param_name, param in params_dict.items(): diff --git a/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py b/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py index 4e751ca9e1a..91345c8a53b 100644 --- a/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py +++ b/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py @@ -383,7 +383,7 @@ def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_split=True): fn = split_or_merge_func_v1( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, num_key_value_heads=config.num_key_value_heads, @@ -392,7 +392,7 @@ def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_split=True): vision_fn = split_or_merge_func_v1( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.vision_config.get("num_heads"), num_key_value_heads=config.vision_config.get("num_heads"), diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index 67bccc35872..9fb0ebcf4c1 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -386,7 +386,7 @@ def _get_tensor_parallel_mappings(cls, config, is_split=True): fn = split_or_merge_func( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, ) @@ -407,7 +407,7 @@ def get_tensor_parallel_split_mappings(num_layers): base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) # if we have enough num_key_value_heads to split, then split it. 
- if config.num_key_value_heads % config.tensor_parallel_degree == 0: + if config.num_key_value_heads % config.tensor_model_parallel_size == 0: base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 0e7f26f9dda..8fb480e3c41 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -282,6 +282,9 @@ def forward( out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe: + out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0]) + return out @@ -470,7 +473,7 @@ def _get_tensor_parallel_mappings(cls, config, is_split=True): fn = split_or_merge_func( is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, + tensor_model_parallel_size=config.tensor_model_parallel_size, tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, ) @@ -493,7 +496,7 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) # if we have enough num_key_value_heads to split, then split it. - if config.num_key_value_heads % config.tensor_parallel_degree == 0: + if config.num_key_value_heads % config.tensor_model_parallel_size == 0: base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) diff --git a/fastdeploy/model_executor/models/tp_utils.py b/fastdeploy/model_executor/models/tp_utils.py index 2283d1b3f53..48c4ec98d42 100644 --- a/fastdeploy/model_executor/models/tp_utils.py +++ b/fastdeploy/model_executor/models/tp_utils.py @@ -202,7 +202,7 @@ def build_expanded_keys( def gqa_qkv_split_func( - tensor_parallel_degree, + tensor_model_parallel_size, tensor_parallel_rank, num_attention_heads, num_key_value_heads, @@ -258,15 +258,17 @@ def split_tensor(tensor, degree): else: return np.split(tensor, degree, axis=0) - q_list = split_tensor(q, tensor_parallel_degree) - repeat_kv = num_key_value_heads < tensor_parallel_degree and tensor_parallel_degree % num_key_value_heads == 0 - repeat_num = tensor_parallel_degree // num_key_value_heads if repeat_kv else 1 + q_list = split_tensor(q, tensor_model_parallel_size) + repeat_kv = ( + num_key_value_heads < tensor_model_parallel_size and tensor_model_parallel_size % num_key_value_heads == 0 + ) + repeat_num = tensor_model_parallel_size // num_key_value_heads if repeat_kv else 1 if repeat_kv: k_list = split_tensor(k, num_key_value_heads) v_list = split_tensor(v, num_key_value_heads) else: - k_list = split_tensor(k, tensor_parallel_degree) - v_list = split_tensor(v, tensor_parallel_degree) + k_list = split_tensor(k, tensor_model_parallel_size) + v_list = split_tensor(v, tensor_model_parallel_size) if tensor_parallel_rank is None: res = [] @@ -332,9 +334,9 @@ def gqa_qkv_merge_func(num_attention_heads, num_key_value_heads, head_dim): def fn(weight_list, is_column=True): """fn""" - tensor_parallel_degree = len(weight_list) - local_num_attention_heads = num_attention_heads // tensor_parallel_degree - 
local_num_key_value_heads = num_key_value_heads // tensor_parallel_degree + tensor_model_parallel_size = len(weight_list) + local_num_attention_heads = num_attention_heads // tensor_model_parallel_size + local_num_key_value_heads = num_key_value_heads // tensor_model_parallel_size is_paddle_tensor = not isinstance(weight_list[0], np.ndarray) @@ -391,7 +393,7 @@ def slice_tensor(tensor, start, end): def split_or_merge_qkv_func( is_split, - tensor_parallel_degree, + tensor_model_parallel_size, tensor_parallel_rank, num_attention_heads, num_key_value_heads, @@ -402,7 +404,7 @@ def split_or_merge_qkv_func( """ if is_split: return gqa_qkv_split_func( - tensor_parallel_degree=tensor_parallel_degree, + tensor_model_parallel_size=tensor_model_parallel_size, tensor_parallel_rank=tensor_parallel_rank, num_attention_heads=num_attention_heads, num_key_value_heads=num_key_value_heads, @@ -418,7 +420,7 @@ def split_or_merge_qkv_func( def split_or_merge_func_v1( is_split, - tensor_parallel_degree, + tensor_model_parallel_size, tensor_parallel_rank, num_attention_heads=None, num_key_value_heads=None, @@ -435,14 +437,14 @@ def fn(x, **kwargs): if is_tp_row_bias: tensor = x[:, ...] if isinstance(tensor, paddle.Tensor): - res = tensor / tensor_parallel_degree + res = tensor / tensor_model_parallel_size else: - res = paddle.to_tensor(tensor, paddle.get_default_dtype()) / tensor_parallel_degree + res = paddle.to_tensor(tensor, paddle.get_default_dtype()) / tensor_model_parallel_size return res elif is_gqa: func = split_or_merge_qkv_func( is_split=is_split, - tensor_parallel_degree=tensor_parallel_degree, + tensor_model_parallel_size=tensor_model_parallel_size, tensor_parallel_rank=tensor_parallel_rank, num_attention_heads=num_attention_heads, num_key_value_heads=num_key_value_heads, @@ -453,7 +455,7 @@ def fn(x, **kwargs): else: func = split_or_merge_func( is_split=is_split, - tensor_parallel_degree=tensor_parallel_degree, + tensor_model_parallel_size=tensor_model_parallel_size, tensor_parallel_rank=tensor_parallel_rank, num_attention_heads=num_attention_heads, ) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 4a4132597f0..d7786ef19d6 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -92,6 +92,11 @@ speculate_limit_thinking_content_length_v2, ) +from fastdeploy.model_executor.entropy_utils import ( + calculate_logits_entropy, + speculate_calculate_logits_entropy, +) +from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.output.pooler import PoolerOutput, PoolingSequenceGroupOutput from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, SamplerOutput @@ -144,7 +149,6 @@ def speculate_limit_thinking_content_length( step_idx: paddle.Tensor, limit_think_status: paddle.Tensor, accept_num: paddle.Tensor, - seq_lens_decoder: paddle.Tensor, stop_flags: paddle.Tensor, eos_token_ids: paddle.Tensor, think_end_id: int, @@ -158,7 +162,6 @@ def speculate_limit_thinking_content_length( step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, # 处理由于模型效果问题导致思考过程中输出eos token的问题 think_end_id, @@ -172,7 +175,6 @@ def speculate_limit_thinking_content_length( step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -316,12 +318,14 @@ def post_process_normal( 
sampler_output: SamplerOutput, model_output: ModelOutputData, share_inputs: Dict[str, paddle.Tensor], + sampling_metadata: SamplingMetadata, block_size: int = 64, save_each_rank: bool = False, skip_save_output: bool = False, async_output_queue: queue.Queue = None, think_end_id: int = -1, line_break_id: int = -1, + enable_entropy: bool = False, ): """Post-processing steps after completing a single token generation.""" if think_end_id > 0: @@ -387,6 +391,9 @@ def post_process_normal( False, ) + if enable_entropy: + calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature) + # 2. Update the input buffer of the model with paddle.framework._no_check_dy2st_diff(): if envs.ENABLE_V1_KVCACHE_SCHEDULER: @@ -452,10 +459,12 @@ def post_process_specualate( sampler_output: SamplerOutput, model_output: ModelOutputData, share_inputs: Dict[str, paddle.Tensor], + sampling_metadata: SamplingMetadata, save_each_rank: bool = False, skip_save_output: bool = False, think_end_id: int = -1, line_break_id: int = -1, + enable_entropy: bool = False, ): if think_end_id > 0: speculate_limit_thinking_content_length( @@ -465,7 +474,8 @@ def post_process_specualate( step_idx=share_inputs["step_idx"], limit_think_status=share_inputs["limit_think_status"], accept_num=share_inputs["accept_num"], - seq_lens_decoder=share_inputs["seq_lens_decoder"], + stop_flags=share_inputs["stop_flags"], + eos_token_ids=share_inputs["eos_token_id"], think_end_id=think_end_id, line_break_id=line_break_id, ) @@ -480,6 +490,10 @@ def post_process_specualate( model_output.stop_seqs_len, model_output.eos_token_id, ) + + if enable_entropy: + speculate_calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature) + speculate_update( model_output.seq_lens_encoder, model_output.seq_lens_decoder, @@ -541,6 +555,7 @@ def post_process( sampler_or_pooler_output: Union[SamplerOutput, PoolerOutput], model_output: ModelOutputData, share_inputs: Dict[str, paddle.Tensor], + sampling_metadata: SamplingMetadata = None, block_size: int = 64, save_each_rank: bool = False, speculative_decoding: bool = False, @@ -548,6 +563,7 @@ def post_process( async_output_queue: queue.Queue = None, think_end_id: int = -1, line_break_id: int = -1, + enable_entropy: bool = False, ) -> None: """Post-processing steps after completing a single token generation.""" @@ -567,22 +583,26 @@ def post_process( sampler_or_pooler_output, model_output, share_inputs, + sampling_metadata, save_each_rank, skip_save_output, think_end_id, line_break_id, + enable_entropy, ) else: post_process_normal( sampler_or_pooler_output, model_output, share_inputs, + sampling_metadata, block_size, save_each_rank, skip_save_output, async_output_queue, think_end_id, line_break_id, + enable_entropy, ) diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 971ee58ae8a..284e9d22598 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -140,7 +140,7 @@ def process_weight_transpose(layer, weight_name): default_initializer=paddle.nn.initializer.Constant(0), is_bias=False, ) - if layer.fd_config.load_config.dynamic_load_weight or layer.fd_config.model_config.enable_cache: + if layer.fd_config.load_config.dynamic_load_weight or getattr(layer.fd_config.model_config, "enable_cache", False): free_tensor(weight) setattr(layer, weight_name, weight_tmp) return @@ -209,6 +209,16 @@ def apply(self, weight_name): return self._map_name(weight_name) +def remap_weight_keys(weights_iterator, 
mapper: dict, include_keys: Optional[List[str]] = None): + if include_keys is not None: + weights_iterator = filter(lambda item: any(key in item[0] for key in include_keys), weights_iterator) + + return ( + (next((key.replace(k, v) for k, v in mapper.items() if k in key), key), value) + for key, value in weights_iterator + ) + + def process_weights_before_loading( *, skip_prefixes: Optional[List[str]] = None, mapper: Optional[WeightsMapper] = None ): @@ -351,6 +361,9 @@ def is_paddle_support_new_h2d(): code = """ import paddle +import resource + +resource.setrlimit(resource.RLIMIT_CORE, (0, 0)) try: dst = paddle.zeros([2, 4], dtype='bfloat16') src = paddle.ones([2, 2], dtype='bfloat16', device='cpu') diff --git a/fastdeploy/model_executor/xpu_pre_and_post_process.py b/fastdeploy/model_executor/xpu_pre_and_post_process.py index 2673af27684..60620ce7671 100644 --- a/fastdeploy/model_executor/xpu_pre_and_post_process.py +++ b/fastdeploy/model_executor/xpu_pre_and_post_process.py @@ -14,15 +14,18 @@ # limitations under the License. """ -from typing import Dict, Optional +import queue +from typing import Dict, List, Optional +import numpy as np import paddle from fastdeploy import envs from fastdeploy.model_executor.forward_meta import XPUForwardMeta from fastdeploy.model_executor.layers.sample.sampler import Sampler +from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData from fastdeploy.platforms import current_platform -from fastdeploy.worker.output import ModelOutputData +from fastdeploy.worker.output import LogprobsTensors, ModelOutputData if current_platform.is_xpu(): from fastdeploy.model_executor.ops.xpu import ( @@ -49,6 +52,43 @@ ) +def _build_stream_transfer_data( + output_tokens: paddle.Tensor, + pooler_outputs: List = None, + logprobs: Optional[LogprobsTensors] = None, + prompt_logprobs_list: Optional[LogprobsTensors] = None, +): + """Split output_tokens and output""" + stream_transfer_datas = [] + if output_tokens is not None: + output_tokens = output_tokens.reshape([-1]).numpy() + output_tokens_lists = np.split(output_tokens, output_tokens.shape[0]) + + for bid, output_token_per_sample in enumerate(output_tokens_lists): + stream_transfer_data = StreamTransferData( + decoder_state=DecoderState.TEXT, tokens=output_token_per_sample, batch_id=bid + ) + if logprobs: + stream_transfer_data.logprobs = logprobs.slice_rows(bid, bid + 1) + if prompt_logprobs_list: + stream_transfer_data.prompt_logprobs = prompt_logprobs_list[bid] + stream_transfer_datas.append(stream_transfer_data) + elif pooler_outputs is not None: + for bid, pooler_output in enumerate(pooler_outputs): + if pooler_output is None: + continue + if pooler_output.dtype == paddle.bfloat16: + pooler_output = pooler_output.astype("float32") + + pooler_output = pooler_output.numpy() + + stream_transfer_data = StreamTransferData( + decoder_state=DecoderState.TEXT, pooler_output=pooler_output, batch_id=bid + ) + stream_transfer_datas.append(stream_transfer_data) + return stream_transfer_datas + + def xpu_pre_process( input_ids: paddle.Tensor, seq_lens_this_time: int, @@ -217,6 +257,8 @@ def xpu_post_process_normal( share_inputs: Dict[str, paddle.Tensor], block_size: int = 64, skip_save_output: bool = False, + save_each_rank: bool = False, + async_output_queue: queue.Queue = None, think_end_id: int = None, line_break_id: int = None, ) -> None: @@ -314,27 +356,37 @@ def xpu_post_process_normal( # 3. Transmit the model's output and stop generation signal via message queue. 
# In the future, we will abandon this approach. if not skip_save_output: - if sampler_output.logprobs_tensors is None: - save_output( - sampled_token_ids, - model_output.not_need_stop, - model_output.mp_rank, - False, # use_ep - ) + if envs.FD_USE_GET_SAVE_OUTPUT_V1: + if save_each_rank or model_output.mp_rank == 0: + output = _build_stream_transfer_data( + sampled_token_ids, + logprobs=sampler_output.logprobs_tensors, + prompt_logprobs_list=model_output.prompt_logprobs_list, + ) + if async_output_queue is not None: + async_output_queue.put(output) else: - if save_output_topk is None: - raise ImportError( - "save_output_topk operator is not available. " - "Please rebuild the XPU operators with the new get_output_msg_with_topk.cc and save_output_msg_with_topk.cc files." + if sampler_output.logprobs_tensors is None: + save_output( + sampled_token_ids, + model_output.not_need_stop, + model_output.mp_rank, + False, # use_ep + ) + else: + if save_output_topk is None: + raise ImportError( + "save_output_topk operator is not available. " + "Please rebuild the XPU operators with the new get_output_msg_with_topk.cc and save_output_msg_with_topk.cc files." + ) + save_output_topk( + sampled_token_ids, + sampler_output.logprobs_tensors.logprob_token_ids, + sampler_output.logprobs_tensors.logprobs, + sampler_output.logprobs_tensors.selected_token_ranks, + model_output.not_need_stop, + model_output.mp_rank, ) - save_output_topk( - sampled_token_ids, - sampler_output.logprobs_tensors.logprob_token_ids, - sampler_output.logprobs_tensors.logprobs, - sampler_output.logprobs_tensors.selected_token_ranks, - model_output.not_need_stop, - model_output.mp_rank, - ) def xpu_post_process_specualate( diff --git a/fastdeploy/multimodal/hasher.py b/fastdeploy/multimodal/hasher.py index 1f2d01f8cf1..6d2fc4f9b91 100644 --- a/fastdeploy/multimodal/hasher.py +++ b/fastdeploy/multimodal/hasher.py @@ -19,8 +19,6 @@ import numpy as np -from fastdeploy.utils import data_processor_logger - class MultimodalHasher: @@ -28,8 +26,4 @@ class MultimodalHasher: def hash_features(cls, obj: object) -> str: if isinstance(obj, np.ndarray): return hashlib.sha256((obj.tobytes())).hexdigest() - - data_processor_logger.warning( - f"Unsupported type for hashing features: {type(obj)}" + ", use pickle for serialization" - ) return hashlib.sha256((pickle.dumps(obj))).hexdigest() diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 764028419d3..00eeb04dc76 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -35,6 +35,7 @@ Request, RequestMetrics, RequestOutput, + SpeculateMetrics, ) from fastdeploy.inter_communicator import ZmqIpcServer from fastdeploy.metrics.metrics import main_process_metrics @@ -48,12 +49,9 @@ MAX_DRAFT_TOKENS = 6 SPECULATE_MAX_BSZ = 256 -if current_platform.is_xpu(): - MAX_BSZ = 128 - K = 5 -else: - MAX_BSZ = 512 - K = 20 + +MAX_BSZ = 512 +K = 20 class TokenProcessor: @@ -78,6 +76,7 @@ def __init__(self, cfg, cached_generated_tokens, engine_worker_queue, split_conn self.speculative_decoding = self.cfg.speculative_config.method is not None self.use_logprobs = self.cfg.model_config.enable_logprob + self.enable_draft_logprob = self.cfg.speculative_config.enable_draft_logprob if self.speculative_decoding: if self.use_logprobs: @@ -112,16 +111,13 @@ def __init__(self, cfg, cached_generated_tokens, engine_worker_queue, split_conn self.num_accepted_tokens = 0 self.num_emitted_tokens = 0 self.max_num_emitted_tokens = 0 - 
self.num_rest_requests_per_head = [ - 0, - ] * MAX_DRAFT_TOKENS - self.num_accept_requests_per_head = [ - 0, - ] * MAX_DRAFT_TOKENS self.executor = ThreadPoolExecutor(max_workers=1) self.prefill_result_status = dict() self._finalizer = weakref.finalize(self, self._cleanup_resources) self._batch_result_buffer = None + self.total_step_per_request = {} + self.accept_token_num_per_head_per_request = {} + self.accept_token_num_per_head = [0] * MAX_DRAFT_TOKENS def _cleanup_resources(self): """Cleaning up shared memory resources""" @@ -180,7 +176,7 @@ def _reschedule_preempt_task(self, batch_size): if envs.ENABLE_V1_KVCACHE_SCHEDULER: need_to_be_reschedule_req_ids = list(self.resource_manager.to_be_rescheduled_request_id_set) for request_id in need_to_be_reschedule_req_ids: - if self.resource_manager.requests[request_id].idx >= ( + if self.resource_manager.requests[request_id].idx > ( batch_size - 1 ): # No more token generated for preempted request self.resource_manager.reschedule_preempt_task(request_id) @@ -422,7 +418,7 @@ def postprocess(self, batch_result: List[RequestOutput], mtype=3): batch_result (list): batch results """ try: - if self.cfg.speculative_config.method and self.use_logprobs: + if self.cfg.speculative_config.method and self.use_logprobs and self.enable_draft_logprob: if mtype == 3: # target finished_batch_result, unfinished_batch_result = [], [] for r in batch_result: @@ -506,7 +502,7 @@ def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False if task_id in self.tokens_counter: del self.tokens_counter[task_id] - def _compute_speculative_status(self): + def _compute_speculative_status(self, result: RequestOutput): # TODO(liuzichang): Supplement more statistics interval = 1 if self.speculative_stats_step % interval == 0: @@ -519,13 +515,11 @@ def _compute_speculative_status(self): if self.cfg.speculative_config.method in ["mtp"]: single_head_acceptance_rates = [] - for head in range(self.cfg.speculative_config.num_speculative_tokens): - if self.num_rest_requests_per_head[head] != 0: + for i in range(1, self.cfg.speculative_config.num_speculative_tokens + 1): + if self.accept_token_num_per_head[i - 1] != 0: single_head_acceptance_rates.append( - self.num_accept_requests_per_head[head] / self.num_rest_requests_per_head[head] + self.accept_token_num_per_head[i] / self.accept_token_num_per_head[i - 1] ) - else: - single_head_acceptance_rates.append(0) spec_logger.info(f" Single head accept ratio: {single_head_acceptance_rates}") if self.number_of_output_tokens > 1000000: @@ -533,6 +527,45 @@ def _compute_speculative_status(self): self.total_step = 0 self.speculative_stats_step += 1 + # For result + req_id = result.request_id + accept_num_list = self.accept_token_num_per_head_per_request[req_id] + req_total_step = self.total_step_per_request[req_id] + req_total_draft_tokens = req_total_step * (self.cfg.speculative_config.num_speculative_tokens + 1) + req_accepted_tokens = sum(accept_num_list) + req_rejected_tokens = req_total_draft_tokens - req_accepted_tokens + req_accept_ratio = 1 - req_total_step / req_accepted_tokens + req_avg_accept_length = req_accepted_tokens / req_total_step + + accept_ratio_per_head = [] + for i in range(1, len(accept_num_list)): + if accept_num_list[i - 1] != 0: + accept_ratio_per_head.append(accept_num_list[i] / accept_num_list[i - 1]) + else: + accept_ratio_per_head.append(0) + + result.metrics.speculate_metrics = SpeculateMetrics( + accepted_tokens=req_accepted_tokens, + rejected_tokens=req_rejected_tokens, + 
accept_ratio=req_accept_ratio, + average_accept_length=req_avg_accept_length, + accepted_tokens_per_head=accept_num_list[: self.cfg.speculative_config.num_speculative_tokens + 1], + accept_ratio_per_head=accept_ratio_per_head[: self.cfg.speculative_config.num_speculative_tokens], + ) + + # Log + spec_logger.info( + f"req_id: {result.request_id}, total_step: {req_total_step}, " + f"accept_ratio: {req_accept_ratio}, average_accept_length: {req_avg_accept_length}, " + f"accepted_tokens: {req_accepted_tokens}, rejected_tokens: {req_rejected_tokens}, " + f"accepted_tokens_per_head: {accept_num_list[: self.cfg.speculative_config.num_speculative_tokens + 1]}, " + f"accept_ratio_per_head: {accept_ratio_per_head[: self.cfg.speculative_config.num_speculative_tokens]}" + ) + + # Clear request record + self.accept_token_num_per_head_per_request.pop(req_id) + self.total_step_per_request.pop(req_id) + def _process_batch_draft_tokens(self, mtype, batch, accept_num, tokens, scores, ranks): """ Process batch draft tokens and generate corresponding request outputs @@ -620,7 +653,7 @@ def _process_batch_output(self): else: batch = self.output_tokens[1] accept_num = tokens[2 : batch + 2] - self._record_speculative_decoding_mertics(accept_num) + elif self.use_logprobs: batch = self.output_tokens[1, 0] tokens = tokens[2 : batch * (K + 1) + 2].reshape([batch, K + 1])[:, : (K + 1)] @@ -642,6 +675,7 @@ def _process_batch_output(self): task_id = task.request_id if self.cfg.speculative_config.method: + self._record_speculative_decoding_accept_num_per_request(task_id, accept_num[i]) if accept_num[i] == -3: recovery_stop = True if recovery_stop: @@ -658,7 +692,9 @@ def _process_batch_output(self): + i * MAX_DRAFT_TOKENS + accept_num[i] ].tolist() - if (not recovery_stop) and (len(token_ids) == 0 or token_ids[-1] <= 0): + if len(token_ids) > 0 and token_ids[-1] <= 0: + llm_logger.warning(f"Invalid token is generated!
token_id {token_ids[-1]} at task {task_id}") + if (not recovery_stop) and (len(token_ids) == 0 or token_ids[-1] < 0): if envs.ENABLE_V1_KVCACHE_SCHEDULER: if task_id in self.resource_manager.to_be_rescheduled_request_id_set: self.resource_manager.reschedule_preempt_task(task_id) @@ -750,7 +786,9 @@ def _process_batch_output(self): and self.cfg.cache_config.enable_prefix_caching and self.cfg.cache_config.enable_output_caching ): - if (task.num_total_tokens - 1) % self.cfg.cache_config.block_size == 0: + if (task.num_total_tokens - 1) % self.cfg.cache_config.block_size == 0 and ( + task_id not in self.resource_manager.to_be_rescheduled_request_id_set + ): self.resource_manager.cache_output_tokens( task ) # when enable prefix caching, cache kv cache for output tokens @@ -790,7 +828,7 @@ def _process_batch_output(self): ) llm_logger.info(f"{self.resource_manager.info()}") if self.cfg.speculative_config.method: - self._compute_speculative_status() + self._compute_speculative_status(result) if not is_prefill: self._record_completion_metrics(task, current_time) self._recycle_resources(task_id, i, task, result, is_prefill) @@ -799,6 +837,8 @@ def _process_batch_output(self): llm_logger.debug(f"get response from infer: {result}") batch_result.append(result) + if self.cfg.speculative_config.method: + self._record_speculative_decoding_metrics(accept_num) self.postprocess(batch_result, mtype) def _record_metrics(self, task, current_time, token_ids): @@ -832,7 +872,7 @@ def _record_completion_metrics(self, task, current_time): main_process_metrics.request_inference_time.observe(current_time - task.inference_start_time) main_process_metrics.request_generation_tokens.observe(self.tokens_counter[task.request_id]) - def _record_speculative_decoding_mertics(self, accept_num): + def _record_speculative_decoding_metrics(self, accept_num): """Record metrics of speculative decoding""" if not hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"): main_process_metrics._init_speculative_metrics( @@ -841,15 +881,13 @@ def _record_speculative_decoding_mertics(self, accept_num): ) real_accept_num = [x for x in accept_num if x > 0] - num_accepted_tokens = sum([x - 1 for x in real_accept_num]) - self.num_accepted_tokens += num_accepted_tokens - num_emitted_tokens = sum(real_accept_num) - if num_emitted_tokens == 0: + self.num_accepted_tokens = sum(self.accept_token_num_per_head[1:]) + self.num_emitted_tokens = sum(self.accept_token_num_per_head) + if self.num_emitted_tokens == 0: return - self.num_emitted_tokens += num_emitted_tokens - main_process_metrics.spec_decode_num_accepted_tokens_total.inc(num_accepted_tokens) - main_process_metrics.spec_decode_num_emitted_tokens_total.inc(num_emitted_tokens) + main_process_metrics.spec_decode_num_accepted_tokens_total.set(self.num_accepted_tokens) + main_process_metrics.spec_decode_num_emitted_tokens_total.set(self.num_emitted_tokens) if self.cfg.speculative_config.method in ["ngram"]: main_process_metrics.spec_decode_draft_acceptance_rate.set( @@ -870,25 +908,26 @@ def _record_speculative_decoding_mertics(self, accept_num): main_process_metrics.spec_decode_efficiency.set(self.num_emitted_tokens / self.max_num_emitted_tokens) main_process_metrics.spec_decode_num_draft_tokens_total.inc(num_draft_tokens) - num_rest_requests = len(real_accept_num) - for head in range(self.cfg.speculative_config.num_speculative_tokens): - num_accept_requests = len([x for x in real_accept_num if x >= head + 2]) - # Accumulate the number of requests for each head - 
self.num_accept_requests_per_head[head] += num_accept_requests - self.num_rest_requests_per_head[head] += num_rest_requests - # Update the rest requests for each head - num_rest_requests = num_accept_requests - # Calculate the acceptance rate for each head - if self.num_rest_requests_per_head[head] != 0: + for i in range(1, self.cfg.speculative_config.num_speculative_tokens + 1): + if self.accept_token_num_per_head[i - 1] != 0: single_head_acceptance_rate = ( - self.num_accept_requests_per_head[head] / self.num_rest_requests_per_head[head] + self.accept_token_num_per_head[i] / self.accept_token_num_per_head[i - 1] ) - else: - single_head_acceptance_rate = 0 - main_process_metrics.spec_decode_draft_single_head_acceptance_rate[head].set( + main_process_metrics.spec_decode_draft_single_head_acceptance_rate[i - 1].set( single_head_acceptance_rate ) + def _record_speculative_decoding_accept_num_per_request(self, req_id, accept_num): + if req_id not in self.total_step_per_request: + self.total_step_per_request[req_id] = 0 + if req_id not in self.accept_token_num_per_head_per_request: + self.accept_token_num_per_head_per_request[req_id] = [0] * MAX_DRAFT_TOKENS + + self.total_step_per_request[req_id] += 1 + for i in range(accept_num): + self.accept_token_num_per_head_per_request[req_id][i] += 1 + self.accept_token_num_per_head[i] += 1 + def clear_data(self): if envs.ENABLE_V1_KVCACHE_SCHEDULER: self.resource_manager.clear_data() diff --git a/fastdeploy/rl/dynamic_weight_manager.py b/fastdeploy/rl/dynamic_weight_manager.py index a6b61151654..b70783b54c5 100644 --- a/fastdeploy/rl/dynamic_weight_manager.py +++ b/fastdeploy/rl/dynamic_weight_manager.py @@ -24,7 +24,7 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig -from fastdeploy.inter_communicator import ModelWeightsStatus +from fastdeploy.inter_communicator import KVCacheStatus, ModelWeightsStatus class DynamicWeightManager: @@ -62,17 +62,18 @@ def _capture_model_state(self): logger.info(f"Model param: {name}, shape={param.shape}, dtype={param.dtype}") self.state_dict[name] = param - def update_parameters(self, pid: int = 0) -> None: + def update_parameters(self, pid: int = 0, restart_process_group=False) -> None: """Core method to update model parameters based on strategy.""" start_time = time.perf_counter() paddle.device.cuda.empty_cache() # step1 : restart paddle process group if not self.first_load: - paddle.distributed.restart_process_group() - paddle.distributed.restart_process_group(self.parallel_config.tp_group) - if self.parallel_config.enable_expert_parallel: - paddle.distributed.restart_process_group(self.parallel_config.ep_group) + if restart_process_group: + paddle.distributed.restart_process_group() + paddle.distributed.restart_process_group(self.parallel_config.tp_group) + if self.parallel_config.enable_expert_parallel: + paddle.distributed.restart_process_group(self.parallel_config.ep_group) # step2 : recreat deepep buffer when enable expert parallel if self.parallel_config.enable_expert_parallel and not self.first_load: @@ -86,6 +87,7 @@ def update_parameters(self, pid: int = 0) -> None: strategy_handlers = { "ipc_snapshot": self._update_ipc_snapshot, "ipc": self._update_ipc, + "normal": self._normal_load_weight, } if handler := strategy_handlers.get(self.load_config.load_strategy): @@ -100,6 +102,14 @@ def update_parameters(self, pid: int = 0) -> None: # step5: recapture cuda_graph # step6: update weight status signal + def _normal_load_weight(self): + """use for RL mock.""" + from 
fastdeploy.model_executor.model_loader import get_model_loader + + model_loader = get_model_loader(load_config=self.fd_config.load_config) + state_dict = model_loader.load_rl_mock_model(fd_config=self.fd_config).state_dict() + self._update_model_from_state(state_dict, "raw") + def _update_ipc_snapshot(self): """Update using IPC snapshot strategy for elastic recovery.""" model_path = os.path.join( @@ -108,7 +118,7 @@ def _update_ipc_snapshot(self): ) try: - ipc_state_dict = paddle.load(model_path) + ipc_state_dict = paddle.load(model_path, safetensors=True) except FileNotFoundError: fallback_path = f"/shared_ipc_meta/model_state.tp0{self.meta_src_id}.pdparams" ipc_state_dict = paddle.load(fallback_path) @@ -123,7 +133,7 @@ def _update_ipc(self): self._update_model_from_state(state_dict, "raw") logger.info(f"IPC update parameters completed from file: {self.ipc_path}") - def clear_parameters(self, pid: int = 0) -> None: + def clear_parameters(self, pid: int = 0, shutdown_process_group=False) -> None: """Clear all model parameters and free memory.""" logger.info("start clear paramaters") @@ -135,8 +145,9 @@ def clear_parameters(self, pid: int = 0) -> None: DeepEPBufferManager.clear_buffer() # ep barrier paddle.distributed.barrier(self.parallel_config.ep_group) - # shutdown ep group - paddle.distributed.shutdown_process_group(self.parallel_config.ep_group) + if shutdown_process_group: + # shutdown ep group + paddle.distributed.shutdown_process_group(self.parallel_config.ep_group) paddle.device.cuda.empty_cache() # step2: release model weight @@ -149,11 +160,14 @@ def clear_parameters(self, pid: int = 0) -> None: if self.parallel_config.tensor_parallel_size > 1: # tp barrier paddle.distributed.barrier(self.parallel_config.tp_group) - paddle.distributed.shutdown_process_group(self.parallel_config.tp_group) + if shutdown_process_group: + paddle.distributed.shutdown_process_group(self.parallel_config.tp_group) if self.parallel_config.enable_expert_parallel: paddle.distributed.barrier(self.parallel_config.ep_group) - paddle.distributed.shutdown_process_group(self.parallel_config.ep_group) - paddle.distributed.shutdown_process_group() + if shutdown_process_group: + paddle.distributed.shutdown_process_group(self.parallel_config.ep_group) + if shutdown_process_group: + paddle.distributed.shutdown_process_group() self._update_shared_status(pid, ModelWeightsStatus.CLEARED) def _update_model_from_state(self, state_dict: Dict[str, paddle.Tensor], src_type: str): @@ -253,23 +267,38 @@ def _update_shared_status(self, pid: int, status: int) -> None: value[self.rank] = status @staticmethod - def check_model_weights_status(model_weights_status, model_runner, pid): + def check_model_weights_status(model_weights_status, kv_cache_status, model_runner, pid, block): """ - check model weights status + A function to handle the state of model weights, check the model weights state, + and perform corresponding operations as needed. + + - model_weights_status (`IPCSignal`): The signal indicating the status of model weights. + - kv_cache_status (`IPCSignal`): The signal indicating the status of key-value cache. + - model_runner (`ModelRunnerBase`): The model runner instance. + - block (`bool`): Block mode keeps the worker process blocked in the status-check loop, + avoiding communication operations in the worker event loop. """ logger.info(f"dynamic weight manager is check model weights status! 
{model_weights_status.value[0]}") - while ( - model_weights_status.value[0] != ModelWeightsStatus.NORMAL - and model_weights_status.value[0] != ModelWeightsStatus.CLEARED + while model_weights_status.value[0] != ModelWeightsStatus.NORMAL and ( + block or model_weights_status.value[0] != ModelWeightsStatus.CLEARED ): if model_weights_status.value[0] == ModelWeightsStatus.UPDATING: logger.info("infer engine stopped! start to load new checkpoint...") + if kv_cache_status: + kv_cache_status.value[0] = KVCacheStatus.UPDATING model_runner.clear_requests() model_runner.update_parameters(pid) + while model_weights_status.value[0] != ModelWeightsStatus.NORMAL: + time.sleep(0.01) logger.info("finished loading new checkpoint") elif model_weights_status.value[0] == ModelWeightsStatus.CLEARING: logger.info("infer engine stopped! start to clear checkpoint...") + if kv_cache_status: + kv_cache_status.value[0] = KVCacheStatus.CLEARING model_runner.clear_requests() model_runner.clear_parameters(pid) + while model_weights_status.value[0] != ModelWeightsStatus.CLEARED: + time.sleep(0.01) logger.info("finished clearing checkpoint") - time.sleep(0.01) + else: + time.sleep(0.01) diff --git a/fastdeploy/rl/rollout_config.py b/fastdeploy/rl/rollout_config.py index 6bd3c3bcb35..a9c2ed027b0 100644 --- a/fastdeploy/rl/rollout_config.py +++ b/fastdeploy/rl/rollout_config.py @@ -65,6 +65,9 @@ def __init__( data_parallel_size: int = 1, num_nextn_predict_layers: int = 0, eplb_config: str = {}, + routing_replay_config: str = None, + load_choices: str = "default_v1", + lm_head_fp32: bool = False, ): # Required parameters self.model = model_name_or_path @@ -113,6 +116,9 @@ def __init__( self.plas_attention_config = plas_attention_config self.num_nextn_predict_layers = num_nextn_predict_layers self.eplb_config = eplb_config + self.routing_replay_config = routing_replay_config + self.load_choices = load_choices + self.lm_head_fp32 = lm_head_fp32 def __str__(self): return "\n".join(f"{k}: {v}" for k, v in self.__dict__.items()) diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py index e9410d9728b..421d8d7397e 100644 --- a/fastdeploy/rl/rollout_model.py +++ b/fastdeploy/rl/rollout_model.py @@ -18,9 +18,12 @@ from typing import Dict import paddle +import paddle.distributed as dist from paddle import nn +from fastdeploy import envs from fastdeploy.config import FDConfig +from fastdeploy.model_executor.model_loader import get_model_loader from fastdeploy.model_executor.models.ernie4_5_moe import ( Ernie4_5_MoeForCausalLM, Ernie4_5_MoePretrainedModel, @@ -33,6 +36,10 @@ Glm4MoeForCausalLM, Glm4MoePretrainedModel, ) +from fastdeploy.model_executor.models.glm4_mtp import ( + Glm4MTPForCausalLM, + Glm4MTPPretrainedModel, +) from fastdeploy.model_executor.models.model_base import ModelRegistry from fastdeploy.model_executor.models.qwen2 import ( Qwen2ForCausalLM, @@ -50,6 +57,10 @@ Qwen3MoeForCausalLM, Qwen3MoePretrainedModel, ) +from fastdeploy.model_executor.utils import ( + multi_switch_config_context, + process_final_after_loading, +) from fastdeploy.rl.rollout_config import RolloutModelConfig @@ -64,13 +75,34 @@ def __init__(self, rollout_model_config: RolloutModelConfig): def _init_model(self) -> nn.Layer: """Load model from loader based on config.""" + model_loader = get_model_loader(load_config=self.fd_config.load_config) + return model_loader.load_model(fd_config=self.fd_config) + + def load_weights(self, weights_iterator): + """Load weights_iterator.""" + context = paddle.LazyGuard() architectures = 
f"{self.fd_config.model_config.architectures[0]}RL" - with context: - model_cls = ModelRegistry.get_class(architectures) - model = model_cls(self.fd_config) - model.eval() - return model + if self.fd_config.quant_config is not None: + quantization_context = multi_switch_config_context( + (self.fd_config.quant_config, "is_checkpoint_bf16", True), + (self.fd_config.load_config, "dynamic_load_weight", False), + ) + else: + # bf16 + quantization_context = multi_switch_config_context( + (self.fd_config.load_config, "dynamic_load_weight", False) + ) + with quantization_context: + with context: + model_cls = ModelRegistry.get_class(architectures) + self.tmp_model = model_cls(self.fd_config) + self.tmp_model.eval() + self.tmp_model.load_weights(weights_iterator) + if self.fd_config.speculative_config.model_type != "mtp": + process_final_after_loading(self.tmp_model, self.fd_config) + self.rollout_model = self.tmp_model + self.tmp_model = None def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: """Get parameter name mappings between rollout and training models.""" @@ -548,12 +580,42 @@ def __init__(self, fd_config: FDConfig): fd_config (FDConfig): Configurations for the LLM model. """ super(Glm4MoeForCausalLMRL, self).__init__(fd_config) + self.num_nextn_predict_layers = fd_config.model_config.num_nextn_predict_layers + + if self.num_nextn_predict_layers > 0: + fd_config.parallel_config.tp_group = None + fd_config.parallel_config.ep_group = None + self.mtp_fd_config = copy.deepcopy(fd_config) + fd_config.parallel_config.tp_group = dist.get_group( + fd_config.parallel_config.data_parallel_rank + envs.FD_TP_GROUP_GID_OFFSET + ) + fd_config.parallel_config.ep_group = dist.get_group( + fd_config.parallel_config.data_parallel_size + envs.FD_TP_GROUP_GID_OFFSET + ) + self.fd_config.parallel_config.tp_group = dist.get_group( + fd_config.parallel_config.data_parallel_rank + envs.FD_TP_GROUP_GID_OFFSET + ) + self.fd_config.parallel_config.ep_group = dist.get_group( + fd_config.parallel_config.data_parallel_size + envs.FD_TP_GROUP_GID_OFFSET + ) + self.update_mtp_config(self.mtp_fd_config) + self.mtp_layers = Glm4MTPForCausalLMRL(self.mtp_fd_config) @classmethod def name(self) -> str: """name""" return "Glm4MoeForCausalLMRL" + def update_mtp_config(self, mtp_fd_config): + mtp_fd_config.model_config.architectures[0] = mtp_fd_config.model_config.architectures[0].replace("Moe", "MTP") + mtp_fd_config.speculative_config.sharing_model = None + mtp_fd_config.model_config.start_layer_index = mtp_fd_config.model_config.num_hidden_layers + mtp_fd_config.model_config.num_hidden_layers = 1 + mtp_fd_config.model_config.model = mtp_fd_config.speculative_config.model + if mtp_fd_config.speculative_config.quantization != "": + mtp_fd_config.model_config.quantization = mtp_fd_config.speculative_config.quantization + mtp_fd_config.speculative_config.model_type = "mtp" + def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: """Generate mapping between inference and training parameter for RL(donot delete!).""" if self._mappings_built: @@ -607,9 +669,106 @@ def _add_layer_mappings(layer_idx: int): _add_layer_mappings(layer_idx) self._complete_missing_mappings() + + # extra for mtp + if self.num_nextn_predict_layers > 0: + mtp_infer_to_train_mapping = self.mtp_layers.get_name_mappings_to_training(trainer_degree) + self.infer_to_train_mapping.update(mtp_infer_to_train_mapping) + infer_to_train_mapping_copy = copy.deepcopy(self.infer_to_train_mapping) for key in 
infer_to_train_mapping_copy.keys(): if "mlp.experts.gate_correction_bias" in key: self.infer_to_train_mapping.pop(key) return self.infer_to_train_mapping + + +class Glm4MTPForCausalLMRL(Glm4MTPForCausalLM, BaseRLModel): + """ + Glm4MTPForCausalLMRL + """ + + _get_tensor_parallel_mappings = Glm4MTPPretrainedModel._get_tensor_parallel_mappings + + def __init__(self, fd_config: FDConfig): + """ + Args: + fd_config (FDConfig): Configurations for the LLM model. + """ + super(Glm4MTPForCausalLMRL, self).__init__(fd_config) + + @classmethod + def name(self) -> str: + """name""" + return "Glm4MTPForCausalLMRL" + + def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: + """Generate mapping between inference and training parameter for RL(donot delete!).""" + if self._mappings_built: + return self.infer_to_train_mapping + + self.infer_to_train_mapping = {} + self._mappings_built = True + + # Prepare placeholders + place_holders = ["weight"] + + base_name = "model.layers" + + # Helper function to add layer mappings + def _add_layer_mappings(layer_idx: int): + # MTP specific mappings + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.shared_head.head.weight"] = ( + f"{base_name}.{layer_idx}.shared_head.head.weight" + ) + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.shared_head.norm.weight"] = ( + f"{base_name}.{layer_idx}.shared_head.norm.weight" + ) + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.eh_proj.weight"] = ( + f"{base_name}.{layer_idx}.eh_proj.weight" + ) + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.enorm.weight"] = ( + f"{base_name}.{layer_idx}.enorm.weight" + ) + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.hnorm.weight"] = ( + f"{base_name}.{layer_idx}.hnorm.weight" + ) + + # MoE specific mappings + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate.weight"] = ( + f"{base_name}.{layer_idx}.mlp.gate.weight" + ) + + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate.e_score_correction_bias"] = ( + f"{base_name}.{layer_idx}.mlp.gate.e_score_correction_bias" + ) + + # MoE experts mappings + for expert_idx in range(self.fd_config.model_config.n_routed_experts): + for ph in place_holders: + # up_gate_proj (up_gate_proj) + up_gate_proj_key = f"{base_name}.{layer_idx}.mlp.experts.up_gate_proj_weight" + if up_gate_proj_key not in self.infer_to_train_mapping: + self.infer_to_train_mapping[up_gate_proj_key] = [] + self.infer_to_train_mapping[up_gate_proj_key].append( + f"{base_name}.{layer_idx}.mlp.experts.{expert_idx}.up_gate_proj.{ph}" + ) + + # down_proj (down_proj) + down_proj_key = f"{base_name}.{layer_idx}.mlp.experts.down_proj_weight" + if down_proj_key not in self.infer_to_train_mapping: + self.infer_to_train_mapping[down_proj_key] = [] + self.infer_to_train_mapping[down_proj_key].append( + f"{base_name}.{layer_idx}.mlp.experts.{expert_idx}.down_proj.{ph}" + ) + + # Process MoE layers + for layer_idx in range( + self.fd_config.model_config.start_layer_index, + self.fd_config.model_config.start_layer_index + self.fd_config.model_config.num_nextn_predict_layers, + ): + _add_layer_mappings(layer_idx) + + self._complete_missing_mappings() + + return self.infer_to_train_mapping diff --git a/fastdeploy/scheduler/splitwise_scheduler.py b/fastdeploy/scheduler/splitwise_scheduler.py index 83f763ad454..5b85f64316d 100644 --- a/fastdeploy/scheduler/splitwise_scheduler.py +++ b/fastdeploy/scheduler/splitwise_scheduler.py @@ -17,6 +17,7 @@ import copy import hashlib import math +import pickle import random import 
threading import time @@ -412,8 +413,7 @@ def sync_results(self, keys): for result in results: try: # logger.info(f"Scheduler Get Results: {result.request_id}") - data = orjson.loads(result) - result = RequestOutput.from_dict(data) + result = pickle.loads(result) self.data.appendleft(result) except Exception as e: logger.error(f"Parse Result Error:{e}, {str(traceback.format_exc())}, {result}") @@ -892,7 +892,7 @@ def put_results(self, results): if self.role == "prefill" and result.outputs.send_idx == 0: result.finished = False - result_str = orjson.dumps(result.to_dict()) + result_str = pickle.dumps(result, protocol=5) # if self.role == "prefill" or result.error_code != 200 or result.finished: # logger.info(f"Infer Put Finish Result: {result_str}") groups[key].append(result_str) diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index dd7aba6c4a3..7ea09973af9 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -15,6 +15,7 @@ """ import os +import time from typing import List import numpy as np @@ -24,6 +25,7 @@ from fastdeploy import envs from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request, RequestType +from fastdeploy.inter_communicator import IPCSignal from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import ( @@ -45,6 +47,7 @@ eagle_get_self_hidden_states, mtp_save_first_token, mtp_step_paddle, + set_data_ipc, share_external_data, ) from fastdeploy.model_executor.xpu_pre_and_post_process import ( @@ -65,6 +68,8 @@ speculate_get_logits, speculate_save_output_topk, update_attn_mask_offsets, + set_data_ipc, + unset_data_ipc, ) from fastdeploy.model_executor.pre_and_post_process import pre_process, rebuild_padding @@ -94,6 +99,8 @@ def __init__( self.mtp_strategy = self.speculative_config.mtp_strategy self.hybrid_mode = self.mtp_strategy == "with_ngram" and self.max_draft_token_num > self.num_model_steps self.enable_logprob = self.model_config.enable_logprob + self.enable_draft_logprob = self.speculative_config.enable_draft_logprob + self.cache_kvs_map = {} # [mixed, prefill, decoder] self.role = self.scheduler_config.splitwise_role @@ -128,10 +135,12 @@ def _update_mtp_config(self, main_model): self.forward_meta: ForwardMeta = None self.model_config.architectures[0] = self.model_config.architectures[0].replace("Moe", "MTP") self.speculative_config.sharing_model = main_model + # TODO (wangyanpeng): The number of MTP layers should be read from model config self.model_config.num_hidden_layers = 1 self.model_config.model = self.speculative_config.model - self.model_config.pretrained_config.prefix_name = "ernie.mtp_block" - self.model_config.prefix_layer_name = "mtp_block" + if "Ernie" in self.model_config.architectures[0]: + self.model_config.pretrained_config.prefix_name = "ernie.mtp_block" + self.model_config.prefix_layer_name = "mtp_block" if self.speculative_config.quantization != "": self.model_config.quantization = self.speculative_config.quantization self.model_config.start_layer_index = self.num_main_model_layers @@ -203,19 +212,49 @@ def initialize_kv_cache(self, main_model_num_blocks, profile: bool = False): if kv_cache_quant_type == "block_wise_fp8": kv_cache_scale_shape = [key_cache_shape[0], key_cache_shape[1], key_cache_shape[2]] local_rank = self.local_rank % self.parallel_config.tensor_parallel_size - if not profile and 
self.scheduler_config.splitwise_role != "mixed": + + cache_ready_signal_data = np.zeros(shape=[self.parallel_config.tensor_parallel_size], dtype=np.int32) + cache_ready_signal = IPCSignal( + name="cache_ready_signal", + array=cache_ready_signal_data, + dtype=np.int32, + suffix=self.parallel_config.engine_worker_queue_port, + create=False, + ) + + # Check if gpu runner needs to create kv cache + # 1. During profiling, it creates its own kv cache. + # 2. If no need to profile, create kv cache if cache managers do not exist. + create_cache_tensor = profile or not ( + self.fd_config.cache_config.num_cpu_blocks > 0 or self.fd_config.scheduler_config.splitwise_role != "mixed" + ) + + if not create_cache_tensor: + logger.info(f"Waiting for cache managers to create kv cache.. {cache_ready_signal.value}") + while cache_ready_signal.value[local_rank] != 1: + time.sleep(1) + logger.info(f"OK! Stop waiting. {cache_ready_signal.value}") + + logger.info(f"Initializing kv cache for all layers. {cache_ready_signal.value}") + + if not create_cache_tensor: cache_kvs_list = [] for i in range( self.num_main_model_layers, self.num_main_model_layers + self.model_config.num_hidden_layers, ): + logger.info( + f"..attaching kv cache for mtp layer {i}: key:{key_cache_shape}, value:{value_cache_shape}" + ) key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}" val_cache_name = f"value_caches_{i}_rank{local_rank}.device{self.device_id}" key_cache = share_external_data(key_cache, key_cache_name, key_cache_shape) + self.cache_kvs_map[key_cache_name] = key_cache cache_kvs_list.append(key_cache) value_cache = paddle.empty(shape=[], dtype=cache_type) value_cache = share_external_data(value_cache, val_cache_name, value_cache_shape) + self.cache_kvs_map[val_cache_name] = value_cache cache_kvs_list.append(value_cache) if kv_cache_quant_type == "block_wise_fp8": @@ -223,40 +262,66 @@ def initialize_kv_cache(self, main_model_num_blocks, profile: bool = False): scale_val_cache_name = f"value_cache_scales_{i}_rank{local_rank}.device{self.device_id}" key_scale_cache = paddle.empty(shape=[], dtype=paddle.get_default_dtype()) key_scale_cache = share_external_data(key_scale_cache, scale_key_cache_name, kv_cache_scale_shape) + self.cache_kvs_map[scale_key_cache_name] = key_scale_cache cache_kvs_list.append(key_scale_cache) value_scale_cache = paddle.empty(shape=[], dtype=paddle.get_default_dtype()) value_scale_cache = share_external_data( value_scale_cache, scale_val_cache_name, kv_cache_scale_shape ) + self.cache_kvs_map[scale_val_cache_name] = value_scale_cache cache_kvs_list.append(value_scale_cache) self.model_inputs["caches"] = cache_kvs_list else: - for i in range(self.model_config.num_hidden_layers): - self.cache_kvs[f"key_caches_{i}"] = paddle.full( + cache_kvs_list = [] + for i in range( + self.num_main_model_layers, + self.num_main_model_layers + self.model_config.num_hidden_layers, + ): + logger.info(f"..creating kv cache for mtp layer {i}: key:{key_cache_shape}, value:{value_cache_shape}") + key_cache = paddle.full( shape=key_cache_shape, fill_value=0, dtype=cache_type, ) - self.cache_kvs[f"value_caches_{i}"] = paddle.full( + key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}" + set_data_ipc(key_cache, key_cache_name) + self.cache_kvs_map[key_cache_name] = key_cache + cache_kvs_list.append(key_cache) + + val_cache = paddle.full( shape=value_cache_shape, fill_value=0, dtype=cache_type, ) + val_cache_name = 
f"value_caches_{i}_rank{local_rank}.device{self.device_id}" + set_data_ipc(val_cache, val_cache_name) + self.cache_kvs_map[val_cache_name] = val_cache + cache_kvs_list.append(val_cache) + if kv_cache_quant_type == "block_wise_fp8": - self.cache_kvs[f"key_cache_scales_{i}"] = paddle.full( + key_cache_scales = paddle.full( shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype(), ) - self.cache_kvs[f"value_cache_scales_{i}"] = paddle.full( + key_cache_scales_name = f"key_cache_scales_{i}_rank{local_rank}.device{self.device_id}" + set_data_ipc(key_cache_scales, key_cache_scales_name) + self.cache_kvs_map[key_cache_scales_name] = key_cache_scales + cache_kvs_list.append(key_cache_scales) + + val_cache_scales = paddle.full( shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype(), ) - self.model_inputs["caches"] = list(self.cache_kvs.values()) - for value in self.cache_kvs.values(): - del value + val_cache_scales_name = f"value_cache_scales_{i}_rank{local_rank}.device{self.device_id}" + set_data_ipc(val_cache_scales, val_cache_scales_name) + self.cache_kvs_map[val_cache_scales_name] = val_cache_scales + cache_kvs_list.append(val_cache_scales) + + self.model_inputs["caches"] = cache_kvs_list + self._empty_cache() def _initialize_attn_backend( @@ -331,10 +396,17 @@ def _initialize_attn_backend( ) self.attn_backends.append(attn_backend) - def clear_mtp_cache(self): + def clear_mtp_cache(self, profile=False): """ Clear allocated cacheKV """ + create_cache_tensor = profile or not ( + self.fd_config.cache_config.num_cpu_blocks > 0 or self.fd_config.scheduler_config.splitwise_role != "mixed" + ) + if not create_cache_tensor: + for name, tensor in self.cache_kvs_map.items(): + unset_data_ipc(tensor, name, True, False) + self.cache_kvs_map.clear() del self.model_inputs["caches"] if self.forward_meta is not None: del self.forward_meta.caches @@ -413,6 +485,7 @@ def _init_model_inputs(self): position_ids=tmp_position_ids, base=self.model_config.rope_theta, model_config=self.model_config, + partial_rotary_factor=self.model_config.partial_rotary_factor, ) # self.model_inputs["caches"] = self.cache_kvs # Inherit generation hyperparameters from the main model for consistency @@ -493,6 +566,12 @@ def _init_model_inputs(self): shape=[self.max_num_seqs + 1], fill_value=0, dtype="int32" ) self.model_inputs["mask_rollback"] = paddle.full([self.max_num_seqs, 1], 0, dtype="int32") + # NOTE(liuzichang): In speculative decoding, accepted tokens' KV cache is recomputed + # using the target model's hidden states. 
+ self.model_inputs["recompute_token_num"] = paddle.full( + [self.max_num_seqs, 1], self.num_model_steps - 1, dtype="int32" + ) + # attn_mask if self.enable_mm: self.model_inputs["attn_mask_offsets"] = paddle.full( @@ -562,7 +641,9 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int): self.fd_config.scheduler_config.splitwise_role == "decode" ): # In PD, we continue to decode after P generates first token self.model_inputs["seq_lens_encoder"][idx : idx + 1] = 0 - # P-D split need rollback one step + self.model_inputs["recompute_token_num"][idx : idx + 1] = 0 + # NOTE(liuzichang): + # extra 1 : P-D split need rollback one step self.model_inputs["mask_rollback"][idx : idx + 1] = 1 # has_prefill_task = True @@ -673,7 +754,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.model_inputs["not_need_stop"][0] = True self.model_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer - def _initialize_forward_meta(self, step_use_cudagraph: bool = False): + def _initialize_forward_meta(self, step_use_cudagraph: bool = False, is_dummy_run: bool = False, substep: int = 0): """ Initialize forward meta and attention meta data """ @@ -709,7 +790,12 @@ def _initialize_forward_meta(self, step_use_cudagraph: bool = False): for attn_backend in self.attn_backends: attn_backend.init_attention_metadata(self.forward_meta) - self.forward_meta.step_use_cudagraph = step_use_cudagraph and self.draft_model_use_cudagraph + # Notes(liuzichang): + # 1. CUDA Graph capture sizes must be recorded in descending order (large → small). + # 2. In multi-step execution, only the first step should be captured. + self.forward_meta.step_use_cudagraph = ( + step_use_cudagraph and self.draft_model_use_cudagraph and not (substep > 0 and is_dummy_run) + ) def _initialize_forward_meta_xpu(self): @@ -767,6 +853,8 @@ def _prepare_inputs(self, full_hidden_states): self.model_inputs["batch_drop"], self.model_inputs["is_block_step"], self.model_inputs["pre_ids"], + self.model_inputs["mask_rollback"], + self.model_inputs["recompute_token_num"], self.target_model_inputs["accept_tokens"], self.target_model_inputs["accept_num"], self.target_model_inputs["seq_lens_this_time"], @@ -892,7 +980,9 @@ def _propose_cuda(self, step_use_cudagraph: bool = False, is_dummy_run: bool = F self.model_inputs["output_padding_offset"].copy_(output_padding_offset, False) # Initialize forward meta data - self._initialize_forward_meta(step_use_cudagraph=step_use_cudagraph) + self._initialize_forward_meta( + step_use_cudagraph=step_use_cudagraph, is_dummy_run=is_dummy_run, substep=substep + ) self.forward_meta.batch_id_per_token.copy_(batch_id_per_token, False) # Padding inputs for cuda graph @@ -917,9 +1007,10 @@ def _propose_cuda(self, step_use_cudagraph: bool = False, is_dummy_run: bool = F top_p_normalized_logprobs=self.model_inputs["top_p_normalized_logprobs"], share_inputs=self.model_inputs, ) - + # Note(liuzichang): + # paddle.clone would raise error 700 in cudaGraph mode if self.num_model_steps > 1: - self.last_seq_lens_this_time = paddle.clone(self.model_inputs["seq_lens_this_time"]) + self.last_seq_lens_this_time.copy_(self.model_inputs["seq_lens_this_time"], False) model_output = self.model( ids_remove_padding=self.model_inputs["ids_remove_padding"], @@ -941,9 +1032,11 @@ def _propose_cuda(self, step_use_cudagraph: bool = False, is_dummy_run: bool = F ) # 4. 
Compute logits, Sample - logits = self.model.compute_logits(hidden_states) - if self.enable_logprob and substep == 0: - first_token_logits = self.model.compute_logits(self.model_inputs["first_token_hidden_states"]) + logits = self.model.compute_logits(hidden_states, forward_meta=self.forward_meta) + if self.enable_logprob and self.enable_draft_logprob and substep == 0: + first_token_logits = self.model.compute_logits( + self.model_inputs["first_token_hidden_states"], forward_meta=self.forward_meta + ) speculate_get_logits( self.model_inputs["draft_logits"], @@ -1054,7 +1147,7 @@ def _propose_xpu(self, step_use_cudagraph: bool = False, is_dummy_run: bool = Fa model_output, self.model_inputs["cum_offsets"], self.forward_meta, self.model_inputs ) # 4. Compute logits, Sample - logits = self.model.compute_logits(hidden_states) + logits = self.model.compute_logits(hidden_states, forward_meta=self.forward_meta) sampled_token_ids, sampler_output = self.sampler( logits, self.sampling_metadata, diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index d82fbec849f..0f324fb7cf9 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -73,8 +73,8 @@ def _init_network(self): self.router_socket.setsockopt(zmq.LINGER, 0) self.router_socket.setsockopt(zmq.SNDHWM, 1000) self.router_socket.setsockopt(zmq.ROUTER_MANDATORY, 1) - self.router_socket.bind(f"tcp://*:{self.cfg.cache_config.pd_comm_port[0]}") - self.logger.info(f"_init_network: bind {self.cfg.cache_config.pd_comm_port}") + self.router_socket.bind(f"tcp://*:{self.cfg.cache_config.pd_comm_port[self.local_data_parallel_id]}") + self.logger.info(f"_init_network: bind {self.cfg.cache_config.pd_comm_port[self.local_data_parallel_id]}") self.poller = zmq.Poller() self.poller.register(self.router_socket, zmq.POLLIN) diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index a0878fa7c73..c557b1b492e 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -970,7 +970,6 @@ def check_download_links(bos_client, links, timeout=1): def init_bos_client(): from baidubce.auth.bce_credentials import BceCredentials from baidubce.bce_client_configuration import BceClientConfiguration - from baidubce.exception import BceHttpClientError, BceServerError from baidubce.services.bos.bos_client import BosClient cfg = BceClientConfiguration( @@ -981,14 +980,12 @@ def init_bos_client(): try: client = BosClient(cfg) client.list_buckets() - except BceServerError as e: - if e.status_code == 403: - raise Exception("BOS authentication failed: Invalid AK/SK") from e - raise Exception(f"BOS connection failed: {str(e)}") from e - except BceHttpClientError as e: - raise Exception(f"Invalid BOS endpoint configuration: {str(e)}") from e except Exception as e: - raise Exception(f"BOS client validation error: {str(e)}") from e + raise Exception( + "Create BOSClient Error, Please check your ENV [ ENCODE_FEATURE_BOS_AK, ENCODE_FEATURE_BOS_SK, ENCODE_FEATURE_ENDPOINT ] \n" + f"Current ENV AK: {envs.ENCODE_FEATURE_BOS_AK}, SK: {envs.ENCODE_FEATURE_BOS_SK}, Endpoint: {envs.ENCODE_FEATURE_ENDPOINT} \n" + f"{str(e)}" + ) return client @@ -1049,7 +1046,7 @@ def _bos_download(bos_client, link): console_logger = get_logger("console", "console.log", print_to_console=True) spec_logger = get_logger("speculate", "speculate.log") zmq_client_logger = get_logger("zmq_client", "zmq_client.log") -trace_logger = FastDeployLogger().get_trace_logger("trace_logger", "trace_logger.log") +trace_logger = 
FastDeployLogger().get_trace_logger("trace", "trace.log") router_logger = get_logger("router", "router.log") diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 2d5f47def46..0eeacdf5445 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -45,6 +45,9 @@ from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, ) +from fastdeploy.model_executor.layers.moe.routing_indices_cache import ( + RoutingReplayManager, +) from fastdeploy.model_executor.layers.rotary_embedding import get_rope, get_rope_3d from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler @@ -71,6 +74,7 @@ share_external_data, speculate_schedule_cache, set_data_ipc, + unset_data_ipc, ) from fastdeploy.model_executor.pre_and_post_process import ( @@ -123,11 +127,19 @@ def __init__( self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop self.is_pooling_model = self.fd_config.model_config.runner_type == "pooling" self.ori_vocab_size = self.fd_config.model_config.ori_vocab_size - self.max_logprobs = ( - self.ori_vocab_size if fd_config.model_config.max_logprobs == -1 else fd_config.model_config.max_logprobs - ) + self.max_logprobs = None + if self.enable_logprob: + self.max_logprobs = ( + self.ori_vocab_size + if fd_config.model_config.max_logprobs == -1 + else fd_config.model_config.max_logprobs + ) + self.temp_scaled_logprobs = True + self.top_p_normalized_logprobs = True self.prompt_logprobs_reqs: dict[str, Request] = {} self.in_progress_prompt_logprobs: dict[str, LogprobsTensors] = {} + self.forward_batch_reqs_list: list[Request] = [None for _ in range(self.scheduler_config.max_num_seqs)] + self.cache_kvs_map: dict = {} # VL model config: if self.enable_mm: @@ -180,9 +192,13 @@ def __init__( # Initialize share inputs self._init_share_inputs(self.scheduler_config.max_num_seqs) + increment_value = ( + 4 if not self.speculative_decoding else (self.speculative_config.num_speculative_tokens + 1) * 4 + ) + self.infer_seed_increment = paddle.full( shape=[self.scheduler_config.max_num_seqs, 1], - fill_value=4, + fill_value=increment_value, dtype="int64", ).cpu() @@ -202,6 +218,9 @@ def __init__( os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port) logger.info(f"queue id is {str(self.parallel_config.engine_worker_queue_port)}") + # Rollout routing replay config + self.routing_replay_manager = None + self.zmq_client = None self.async_output_queue = None if envs.FD_USE_GET_SAVE_OUTPUT_V1: @@ -217,6 +236,18 @@ def __init__( ) self.async_output_copy_thread.start() + self.enable_entropy = self.model_config.enable_entropy + + # init signal + cache_ready_signal_data = np.zeros(shape=[self.parallel_config.tensor_parallel_size], dtype=np.int32) + self.cache_ready_signal = IPCSignal( + name="cache_ready_signal", + array=cache_ready_signal_data, + dtype=np.int32, + suffix=self.parallel_config.engine_worker_queue_port, + create=False, + ) + def _async_output_busy_loop(self): """Entrypoint for the thread which handles outputs asynchronously.""" while True: @@ -556,10 +587,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = req_dict: A list of Request dict num_running_requests: batch_size """ - # NOTE(luotingdan): Lazy initialize kv cache - if "caches" not in self.share_inputs: - self.initialize_kv_cache() - req_len = 
len(req_dicts) has_prefill_task = False has_decode_task = False @@ -584,6 +611,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = logits_info = None prefill_tokens = [] if request.task_type.value == RequestType.PREFILL.value: # prefill task + self.share_inputs["req_ids"][idx] = str(request.request_id) # guided decoding if ( request.guided_json is not None @@ -648,6 +676,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = 0 self.share_inputs["prompt_lens"][idx : idx + 1] = len(input_ids) self.share_inputs["is_block_step"][idx : idx + 1] = False + self.share_inputs["is_chunk_step"][idx : idx + 1] = prefill_end_index < len(input_ids) self.share_inputs["step_idx"][idx : idx + 1] = ( len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0 ) @@ -655,7 +684,14 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = # pooling model request.sampling_params is None if request.sampling_params is not None and request.sampling_params.prompt_logprobs is not None: self.prompt_logprobs_reqs[request.request_id] = request + self.forward_batch_reqs_list[idx] = request has_prefill_task = True + + # Routing Replay + if self.fd_config.routing_replay_config.enable_routing_replay: + # 1.prefix task(need regist) 2. chunkend task(not need regist) + self.routing_replay_manager.register_request(batch_id=idx, request_id=request.request_id) + if ( self.fd_config.scheduler_config.splitwise_role == "decode" ): # In PD, we continue to decode after P generate first token @@ -670,6 +706,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = ) if self.share_inputs["is_block_step"][idx]: # has tasks to continue to decode has_decode_task = True + continue else: # preempted task logger.info(f"Handle preempted request {request} at idx {idx}") @@ -681,6 +718,12 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = self.share_inputs["is_block_step"][idx : idx + 1] = False self.prompt_logprobs_reqs.pop(request.request_id, None) self.in_progress_prompt_logprobs.pop(request.request_id, None) + self.forward_batch_reqs_list[idx] = None + + # Routing Replay + if self.fd_config.routing_replay_config.enable_routing_replay: + self.routing_replay_manager.clear_request(batch_id=idx) + continue assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens @@ -1005,14 +1048,10 @@ def get_input_length_list( """ # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token max_dec_len = expected_decode_len + 1 - if batch_size == 0: - # Note(ZKK): divided by 0 is invalid, here we give a input_length = 1 - input_length = 1 - else: - input_length = min( - num_tokens // (1 if capture_prefill else batch_size), - self.model_config.max_model_len - max_dec_len, - ) + input_length = min( + num_tokens // (1 if capture_prefill else batch_size), + self.model_config.max_model_len - max_dec_len, + ) # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan. # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP. 
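Note on the kv-cache ownership check: the same predicate (create_cache_tensor = profile or not (num_cpu_blocks > 0 or splitwise_role != "mixed")) is introduced in mtp.py above and reused later in gpu_model_runner.py. A minimal standalone sketch of that predicate follows; the helper name and the example values are hypothetical and for illustration only, not part of the patch.

def should_create_cache_tensor(profile: bool, num_cpu_blocks: int, splitwise_role: str) -> bool:
    # Mirrors the expression used by this patch in initialize_kv_cache/clear_cache:
    # create the kv cache locally during profiling, or when there is no cache
    # manager to attach to (no CPU cache blocks and the instance runs in "mixed" role).
    return profile or not (num_cpu_blocks > 0 or splitwise_role != "mixed")

# Illustrative cases (hypothetical values):
# should_create_cache_tensor(True,  0,  "mixed")   -> True  (profiling: allocate own kv cache)
# should_create_cache_tensor(False, 0,  "mixed")   -> True  (no cache manager exists: allocate and publish via set_data_ipc)
# should_create_cache_tensor(False, 64, "mixed")   -> False (attach to cache-manager tensors via share_external_data)
# should_create_cache_tensor(False, 0,  "decode")  -> False (P/D disaggregation: attach to existing tensors)

When the predicate is false, the runner waits on cache_ready_signal and attaches to tensors the cache managers have already published, instead of allocating its own.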
@@ -1140,6 +1179,7 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.seq_lens_routing_buffer = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["step_seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["prompt_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") @@ -1152,6 +1192,7 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["bad_tokens_len"] = paddle.full([max_num_seqs], 1, dtype="int64") self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool") + self.share_inputs["is_chunk_step"] = paddle.full([max_num_seqs], False, dtype="bool").cpu() self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32") self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32") @@ -1235,6 +1276,9 @@ def _init_share_inputs(self, max_num_seqs: int): -1, dtype="int64", ) + self.share_inputs["req_ids"] = [""] * max_num_seqs + self.share_inputs["entropy_list"] = [[] for _ in range(max_num_seqs)] + if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( @@ -1329,6 +1373,28 @@ def _prepare_inputs(self, is_dummy_or_profile_run=False) -> None: self.cache_config.block_size, self.speculative_config.num_speculative_tokens if self.speculative_decoding else 0, ) + logprobs_reqs = [ + req + for req in self.forward_batch_reqs_list + if req is not None and req.sampling_params is not None and req.sampling_params.logprobs is not None + ] + if len(logprobs_reqs): + self.max_logprobs = ( + max( + [ + self.ori_vocab_size if req.sampling_params.logprobs < 0 else req.sampling_params.logprobs + for req in logprobs_reqs + ] + ) + if not self.speculative_decoding + else 20 + ) + self.temp_scaled_logprobs = any(req.sampling_params.temp_scaled_logprobs for req in logprobs_reqs) + self.top_p_normalized_logprobs = any( + req.sampling_params.top_p_normalized_logprobs for req in logprobs_reqs + ) + elif self.enable_logprob: + self.max_logprobs = None if not self.speculative_decoding else 0 # Remove padding ( @@ -1384,9 +1450,11 @@ def _prepare_inputs(self, is_dummy_or_profile_run=False) -> None: min_dec_lens=self.share_inputs["min_dec_len"], bad_words_token_ids=self.share_inputs["bad_tokens"][:, :max_bad_tokens_len], eos_token_ids=self.share_inputs["eos_token_id"], - max_num_logprobs=self.max_logprobs if self.enable_logprob else None, + max_num_logprobs=self.max_logprobs, enable_early_stop=self.enable_early_stop, stop_flags=self.share_inputs["stop_flags"], + temp_scaled_logprobs_flag=self.temp_scaled_logprobs, + top_p_normalized_logprobs_flag=self.top_p_normalized_logprobs, temp_scaled_logprobs=self.share_inputs["temp_scaled_logprobs"], top_p_normalized_logprobs=self.share_inputs["top_p_normalized_logprobs"], logits_processors=self.share_inputs["logits_processors"], @@ -1422,6 +1490,9 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False): Initialize forward 
meta, attention meta data and update some config. """ # Initialize forward meta + routing_replay_table = None + if self.routing_replay_manager is not None: + routing_replay_table = self.routing_replay_manager.get_routing_table() self.forward_meta = ForwardMeta( ids_remove_padding=self.share_inputs["ids_remove_padding"], rotary_embs=self.share_inputs["rope_emb"], @@ -1448,6 +1519,7 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False): kv_batch_ids=self.share_inputs["kv_batch_ids"], kv_tile_ids_per_batch=self.share_inputs["kv_tile_ids_per_batch"], kv_num_blocks_x_cpu=self.share_inputs["kv_num_blocks_x_cpu"], + routing_replay_table=routing_replay_table, ) dist_status = self.collect_distributed_status() @@ -1463,14 +1535,16 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False): if self.fd_config.parallel_config.use_ep and self.fd_config.scheduler_config.splitwise_role == "mixed": self.fd_config.model_config.moe_phase.phase = "decode" if if_only_decode else "prefill" if self.speculative_decoding: - self.proposer.fd_config.parallel_config.moe_phase.phase = "decode" if if_only_decode else "prefill" + self.proposer.fd_config.model_config.moe_phase.phase = "decode" if if_only_decode else "prefill" # Update Batch type for cuda graph for only_prefill_batch only_prefill_use_cudagraph = self.use_cudagraph and self.cudagraph_only_prefill and self.only_prefill() # When support capture both prefill-only and decode-only, this will use [only_prefill_use_cudagraph or only_decode_use_cudagraph] self.forward_meta.step_use_cudagraph = ( - only_prefill_use_cudagraph if self.cudagraph_only_prefill else only_decode_use_cudagraph + only_prefill_use_cudagraph + if self.cudagraph_only_prefill + else only_decode_use_cudagraph and self.forward_meta.ids_remove_padding.shape[0] > 0 ) # Set forward_meta.is_dummy_or_profile_run to True to skip init_kv_signal_per_query for attention backends @@ -1506,20 +1580,14 @@ def initialize_kv_cache(self, profile: bool = False) -> None: kv_cache_scale_shape = [key_cache_shape[0], key_cache_shape[1], key_cache_shape[2]] local_rank = self.local_rank % self.parallel_config.tensor_parallel_size - cache_ready_signal_data = np.zeros(shape=[self.parallel_config.tensor_parallel_size], dtype=np.int32) - cache_ready_signal = IPCSignal( - name="cache_ready_signal", - array=cache_ready_signal_data, - dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, - create=False, - ) - # Check if gpu runner needs to create kv cache # 1. During profiling, it creates its own kv cache. - # 2. GPU runner creates kv cache tensor unless p/d disaggregation is enabled. - create_cache_tensor = profile or self.scheduler_config.splitwise_role == "mixed" + # 2. If no need to profile, create kv cache if cache managers do not exist. + create_cache_tensor = profile or not ( + self.fd_config.cache_config.num_cpu_blocks > 0 or self.fd_config.scheduler_config.splitwise_role != "mixed" + ) + cache_ready_signal = self.cache_ready_signal if not create_cache_tensor: logger.info(f"Waiting for cache managers to create kv cache.. {cache_ready_signal.value}") while cache_ready_signal.value[local_rank] != 1: @@ -1529,10 +1597,6 @@ def initialize_kv_cache(self, profile: bool = False) -> None: logger.info(f"Initializing kv cache for all layers. {cache_ready_signal.value}") cache_kvs_list = [] - # NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention, - # To rationalize the allocation of kvcache. 
- from fastdeploy import envs - self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN" for i in range(self.model_config.num_hidden_layers): # init key cache @@ -1545,9 +1609,11 @@ def initialize_kv_cache(self, profile: bool = False) -> None: logger.info(f"..creating kv cache for layer {i}: key:{key_cache_shape}, value:{value_cache_shape}") key_cache = paddle.full(shape=key_cache_shape, fill_value=0, dtype=cache_type) set_data_ipc(key_cache, key_cache_name) + self.cache_kvs_map[key_cache_name] = key_cache if value_cache_shape: val_cache = paddle.full(shape=value_cache_shape, fill_value=0, dtype=cache_type) set_data_ipc(val_cache, val_cache_name) + self.cache_kvs_map[val_cache_name] = val_cache cache_kvs_list.extend([key_cache, val_cache]) else: cache_kvs_list.extend([key_cache]) @@ -1555,10 +1621,14 @@ def initialize_kv_cache(self, profile: bool = False) -> None: key_cache_scales = paddle.full( shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype() ) + set_data_ipc(key_cache_scales, key_cache_scales_name) + self.cache_kvs_map[key_cache_scales_name] = key_cache_scales if value_cache_shape: val_cache_scales = paddle.full( shape=kv_cache_scale_shape, fill_value=0, dtype=paddle.get_default_dtype() ) + set_data_ipc(val_cache_scales, value_cache_scales_name) + self.cache_kvs_map[value_cache_scales_name] = val_cache_scales cache_kvs_list.extend([key_cache_scales, val_cache_scales]) else: cache_kvs_list.extend([key_cache_scales]) @@ -1566,20 +1636,24 @@ def initialize_kv_cache(self, profile: bool = False) -> None: logger.info(f"..attaching kv cache for layer {i}: key:{key_cache_shape}, value:{value_cache_shape}") key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache = share_external_data(key_cache, key_cache_name, key_cache_shape) + self.cache_kvs_map[key_cache_name] = key_cache if kv_cache_quant_type == "block_wise_fp8": key_cache_scales = paddle.empty(shape=[], dtype=paddle.get_default_dtype()) key_cache_scales = share_external_data( key_cache_scales, key_cache_scales_name, kv_cache_scale_shape ) + self.cache_kvs_map[key_cache_scales_name] = key_cache_scales if value_cache_shape: val_cache = paddle.empty(shape=[], dtype=cache_type) val_cache = share_external_data(val_cache, val_cache_name, value_cache_shape) + self.cache_kvs_map[val_cache_name] = val_cache cache_kvs_list.extend([key_cache, val_cache]) if kv_cache_quant_type == "block_wise_fp8": val_cache_scales = paddle.empty(shape=[], dtype=paddle.get_default_dtype()) val_cache_scales = share_external_data( val_cache_scales, value_cache_scales_name, kv_cache_scale_shape ) + self.cache_kvs_map[value_cache_scales_name] = val_cache_scales cache_kvs_list.extend([key_cache_scales, val_cache_scales]) else: cache_kvs_list.extend([key_cache]) @@ -1725,6 +1799,7 @@ def _dummy_pooler_run( sampler_or_pooler_output=pooler_output, model_output=model_output_data, share_inputs=self.share_inputs, + sampling_metadata=self.sampling_metadata, block_size=self.cache_config.block_size, speculative_decoding=self.speculative_decoding, skip_save_output=True, @@ -1761,7 +1836,7 @@ def _dummy_sampler_run( group=self.parallel_config.tp_group, ) else: - self.sampler( + sampler_output = self.sampler( logits, self.sampling_metadata, self.model_config.max_model_len, @@ -1769,7 +1844,6 @@ def _dummy_sampler_run( accept_all_drafts, reject_all_drafts, ) - sampler_output = None if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], @@ -1826,12 +1900,14 @@ def _dummy_sampler_run( 
sampler_or_pooler_output=sampler_output, model_output=model_output_data, share_inputs=self.share_inputs, + sampling_metadata=self.sampling_metadata, block_size=self.cache_config.block_size, speculative_decoding=self.speculative_decoding, skip_save_output=True, async_output_queue=self.async_output_queue, think_end_id=self.model_config.think_end_id, line_break_id=self.model_config.line_break_id, + enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0, ) if self.speculative_decoding: if self.speculative_method == "mtp": @@ -2026,64 +2102,21 @@ def capture_model(self) -> None: ) elif self.speculative_decoding and self.speculative_method == "mtp": # Capture Target Model without bsz 1 - for batch_size in sorted(capture_sizes, reverse=True): - if batch_size == 1: - logger.info("Skip token_num = 1, when capture target model for mtp") - else: - assert batch_size % 2 == 0 - self._dummy_run( - num_tokens=( - self.scheduler_config.max_num_seqs - * (self.speculative_config.num_speculative_tokens + 1) - if self.scheduler_config.splitwise_role == "decode" - else self.scheduler_config.max_num_batched_tokens - ), - batch_size=int(batch_size / 2), - in_capturing=True, - expected_decode_len=1, - ) - logger.info( - f"Warm up the Target model with the num_tokens:{batch_size}, expected_decode_len:{1}" - ) - if self.graph_opt_config.draft_model_use_cudagraph: - # Capture Draft Model without bsz 1 - # NOTE(liujundong): expected_decode_len = 1, will affect mtp capture in cudagraph - for batch_size in sorted(capture_sizes, reverse=True): - if batch_size == 1: - logger.info("Skip token_num = 1, when capture Draft model for mtp") - else: - assert batch_size % 2 == 0 - self._dummy_run( - num_tokens=( - self.scheduler_config.max_num_seqs - if self.scheduler_config.splitwise_role == "decode" - else self.scheduler_config.max_num_batched_tokens - ), - batch_size=int(batch_size / 2), - in_capturing=True, - expected_decode_len=3, - accept_all_drafts=True, - ) - logger.info( - f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}" - ) - # Capture Draft Model with bsz 1 - if 1 in capture_sizes: - self._dummy_run( - num_tokens=( - self.scheduler_config.max_num_seqs - if self.scheduler_config.splitwise_role == "decode" - else self.scheduler_config.max_num_batched_tokens - ), - batch_size=int(1), - in_capturing=True, - expected_decode_len=3, - accept_all_drafts=False, - reject_all_drafts=True, - ) - logger.info( - f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}" - ) + for capture_size in sorted(capture_sizes, reverse=True): + self._dummy_run( + num_tokens=( + self.scheduler_config.max_num_seqs * (self.speculative_config.num_speculative_tokens + 1) + if self.scheduler_config.splitwise_role == "decode" + else self.scheduler_config.max_num_batched_tokens + ), + batch_size=int(capture_size / (self.speculative_config.num_speculative_tokens + 1)), + in_capturing=True, + expected_decode_len=self.speculative_config.num_speculative_tokens * 2 + 1, + accept_all_drafts=True, + ) + logger.info( + f"Warm up the model with the num_tokens:{capture_size}, expected_decode_len:{self.speculative_config.num_speculative_tokens}" + ) else: for batch_size in sorted(capture_sizes, reverse=True): self._dummy_run( @@ -2199,6 +2232,11 @@ class at the server level, which is too granular for ModelRunner. 
self._prepare_inputs() self.sampler.pre_process(p_done_idxs) + if self.fd_config.routing_replay_config.enable_routing_replay: + self.positions = self.routing_replay_manager.get_token_positions( + seq_lens_decoder=self.share_inputs["seq_lens_decoder"], + seq_lens_this_time=self.seq_lens_this_time_buffer, + ) # 1.1 Update state of logits processor for proc in self.sampling_metadata.logits_processors: @@ -2268,11 +2306,13 @@ class at the server level, which is too granular for ModelRunner. sampler_or_pooler_output=pooler_output, model_output=model_output_data, share_inputs=self.share_inputs, + sampling_metadata=self.sampling_metadata, block_size=self.cache_config.block_size, save_each_rank=self.parallel_config.use_ep, speculative_decoding=self.speculative_decoding, skip_save_output=False, async_output_queue=self.async_output_queue, + enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0, ) return None @@ -2305,6 +2345,19 @@ class at the server level, which is too granular for ModelRunner. self.sampling_metadata, p_done_idxs, ) + + if ( + self.enable_logprob + and not envs.FD_USE_GET_SAVE_OUTPUT_V1 + and sampler_output.logprobs_tensors is None + ): + sampler_output.logprobs_tensors = LogprobsTensors( + logprob_token_ids=sampler_output.sampled_token_ids, + logprobs=paddle.empty_like(sampler_output.sampled_token_ids, device="cpu", dtype="float32"), + selected_token_ranks=paddle.empty( + [sampler_output.sampled_token_ids.shape[0]], device="cpu", dtype="int64" + ), + ) if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( sampler_output.sampled_token_ids, @@ -2380,6 +2433,7 @@ class at the server level, which is too granular for ModelRunner. sampler_or_pooler_output=sampler_output, model_output=model_output_data, share_inputs=self.share_inputs, + sampling_metadata=self.sampling_metadata, block_size=self.cache_config.block_size, save_each_rank=self.parallel_config.use_ep, speculative_decoding=self.speculative_decoding, @@ -2387,6 +2441,7 @@ class at the server level, which is too granular for ModelRunner. async_output_queue=self.async_output_queue, think_end_id=self.model_config.think_end_id, line_break_id=self.model_config.line_break_id, + enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0, ) if self.guided_backend is not None and sampler_output is not None: self.sampler.post_process(sampler_output.sampled_token_ids) @@ -2434,7 +2489,29 @@ class at the server level, which is too granular for ModelRunner. self.speculative_config.num_speculative_tokens, ) - return None + # Routing replay + if self.fd_config.routing_replay_config.enable_routing_replay: + # Update host cache + slot_mapping = self.routing_replay_manager.compute_slot_mapping(positions=self.positions) + self.routing_replay_manager.update_host_cache(positions=self.positions, slot_mapping=slot_mapping) + + # Put routing of finished requests to store + is_empty_batch = paddle.equal(self.seq_lens_routing_buffer[:, 0], 0) # 1.empty batch 2. 
preempted request + not_block_chunk_empty = paddle.logical_not( + paddle.logical_or( + is_empty_batch, + paddle.logical_or(self.share_inputs["is_block_step"], self.share_inputs["is_chunk_step"]), + ) + ) + finished_batch_ids = paddle.logical_and(self.share_inputs["stop_flags"][:, 0], not_block_chunk_empty) + self.routing_replay_manager.put_finished_batch( + finished_batch_ids=finished_batch_ids, + seq_lens_decoder=self.seq_lens_routing_buffer, + ) + + paddle.assign(self.share_inputs["seq_lens_decoder"], self.seq_lens_routing_buffer) + + return None def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Optional[ModelRunnerOutput]: @@ -2514,9 +2591,9 @@ def profile_run(self) -> None: ) # 3. gc - self.clear_cache() if self.speculative_method in ["mtp"]: - self.proposer.clear_mtp_cache() + self.proposer.clear_mtp_cache(profile=True) + self.clear_cache(profile=True) def update_share_input_block_num(self, num_gpu_blocks: int) -> None: """ @@ -2597,8 +2674,18 @@ def not_need_stop(self) -> bool: """Stop decoding if the tensor meets the termination condition""" return self.share_inputs["not_need_stop"][0] - def clear_cache(self): + def clear_cache(self, profile=False): """Clear cached data from shared inputs and forward metadata""" + create_cache_tensor = profile or not ( + self.fd_config.cache_config.num_cpu_blocks > 0 or self.fd_config.scheduler_config.splitwise_role != "mixed" + ) + local_rank = self.local_rank % self.parallel_config.tensor_parallel_size + + if not create_cache_tensor: + for name, tensor in self.cache_kvs_map.items(): + unset_data_ipc(tensor, name, True, False) + self.cache_ready_signal.value[local_rank] = 0 + self.cache_kvs_map.clear() self.share_inputs.pop("caches", None) if self.forward_meta is not None: self.forward_meta.clear_caches() @@ -2610,7 +2697,11 @@ def clear_parameters(self, pid): if self.use_cudagraph: self.model.clear_grpah_opt_backend() # Clear parameters and Send single - self.dynamic_weight_manager.clear_parameters(pid) + self.dynamic_weight_manager.clear_parameters( + pid, self.fd_config.parallel_config.shutdown_comm_group_if_worker_idle + ) + if self.speculative_method in ["mtp"]: + self.proposer.clear_mtp_cache() self.clear_cache() paddle.device.cuda.empty_cache() @@ -2622,11 +2713,16 @@ def clear_requests(self): # prompt_logprobs self.prompt_logprobs_reqs.clear() self.in_progress_prompt_logprobs.clear() + self.forward_batch_reqs_list = [None for _ in range(self.scheduler_config.max_num_seqs)] def update_parameters(self, pid): """Dynamic model loader use to update parameters use for RL""" # Update parameters - self.dynamic_weight_manager.update_parameters(pid) + self.dynamic_weight_manager.update_parameters( + pid, self.fd_config.parallel_config.shutdown_comm_group_if_worker_idle + ) + if self.speculative_method in ["mtp"]: + self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks) self.initialize_kv_cache() # Recapture CUDAGraph if self.use_cudagraph: @@ -2901,3 +2997,12 @@ def _get_prompt_logprobs_list( del self.prompt_logprobs_reqs[req.request_id] del self.in_progress_prompt_logprobs[req.request_id] return prompt_logprobs_list + + def initialize_routing_replay_manager(self): + """Initialize the routing replay manager after initialize the KVCache""" + # Use updated block number + self.routing_replay_manager = RoutingReplayManager( + fd_config=self.fd_config, + block_table=self.share_inputs["block_tables"], + total_block_num=self.num_gpu_blocks, + ) diff --git a/fastdeploy/worker/gpu_worker.py 
b/fastdeploy/worker/gpu_worker.py index 9fcf9efcc9a..2744b51a8e0 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -184,6 +184,10 @@ def initialize_cache(self, num_gpu_blocks: int) -> None: # accurate cache size self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks) + # Initialize routing replay manager + if self.fd_config.routing_replay_config.enable_routing_replay: + self.model_runner.initialize_routing_replay_manager() + def execute_model( self, model_forward_batch: Optional[List[Request]] = None, diff --git a/fastdeploy/worker/output.py b/fastdeploy/worker/output.py index 2b66ce4e138..fdcb291f535 100644 --- a/fastdeploy/worker/output.py +++ b/fastdeploy/worker/output.py @@ -128,6 +128,43 @@ def slice_rows(self, start: int, end: int): PromptLogprobs = LogprobsTensors | list[dict[int, Logprob] | None] +@dataclass +class SpeculateMetrics: + """ + Speculative decoding metrics + """ + + """ + The number of accepted tokens in the current request + """ + accepted_tokens: int + + """ + The number of rejected tokens in the current request + """ + rejected_tokens: int + + """ + The acceptance rate of the current request + """ + accept_ratio: float + + """ + Average number of accepted tokens per step for the current request + """ + average_accept_length: float + + """ + The number of accepted tokens of each head in the current request + """ + accepted_tokens_per_head: list[int] + + """ + Average acceptance rate of each head in the current request + """ + accept_ratio_per_head: list[float] + + @dataclass class SamplerOutput: """ """ @@ -140,6 +177,7 @@ class SamplerOutput: logprobs_tensors: Optional[LogprobsTensors] token_num_per_batch: Optional[paddle.Tensor] = None cu_batch_token_offset: Optional[paddle.Tensor] = None + logits: Optional[paddle.Tensor] = None @dataclass diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 02d66f4bc53..14ba5c95b3e 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -38,9 +38,11 @@ ModelConfig, ParallelConfig, PlasAttentionConfig, + RoutingReplayConfig, SpeculativeConfig, StructuredOutputsConfig, ) +from fastdeploy.engine.request import RequestType from fastdeploy.eplb.async_expert_loader import ( MODEL_MAIN_NAME, REARRANGE_EXPERT_MAGIC_NUM, @@ -128,7 +130,7 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]: def update_fd_config_for_mm(fd_config: FDConfig) -> None: architectures = fd_config.model_config.architectures if fd_config.model_config.enable_mm and ErnieArchitectures.contains_ernie_arch(architectures): - fd_config.model_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size + fd_config.model_config.tensor_model_parallel_size = fd_config.parallel_config.tensor_parallel_size fd_config.model_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank fd_config.model_config.vision_config.dtype = fd_config.model_config.dtype @@ -233,6 +235,16 @@ def init_health_status(self) -> None: create=False, ) + # init kv_cache_status + kv_cache_status_data = np.zeros(shape=[1], dtype=np.int32) + self.kv_cache_status = IPCSignal( + name="kv_cache_status", + array=kv_cache_status_data, + dtype=np.int32, + suffix=self.parallel_config.engine_worker_queue_port, + create=False, + ) + # init exist_task_signal workers_exist_task = np.zeros([1], dtype=np.int32) self.exist_task_signal = IPCSignal( @@ -286,7 +298,8 @@ def update_weights_from_tensor(self, mmap_infos): def 
_broadcast_model_weights_signal(self, src: int, group) -> int: model_weights_signal_tensor = paddle.full(shape=[1], fill_value=self.model_weights_signal[0], dtype="int32") paddle.distributed.broadcast(model_weights_signal_tensor, src=src, group=group) - return model_weights_signal_tensor.item() + value = model_weights_signal_tensor.numpy()[0] + return int(value) def _tp_barrier_wait(self): if current_platform.is_xpu(): @@ -413,25 +426,19 @@ def event_loop_normal(self) -> None: tp_size = self.parallel_config.tensor_parallel_size # Currently, only support single node self.nnode = int((tp_size + 7) // 8) - req_ids = [] - num_running_requests = 0 + max_occupied_batch_index = 0 tp_rank = self.local_rank % tp_size + # TODO: Unify status variables model_weights_status (shared memory) and model_weights_signal (numpy array) to one self.model_weights_signal = np.zeros([1], dtype=np.int32) while True: # run eplb self._run_eplb(tp_rank) - if tp_rank == 0: - if self.model_weights_status.value[0] != ModelWeightsStatus.NORMAL: - self.model_weights_signal[0] = int(self.model_weights_status.value[0]) - if self.fd_config.load_config.dynamic_load_weight and self.parallel_config.enable_expert_parallel: - self.model_weights_signal[0] = self._broadcast_model_weights_signal( - src=0, group=self.parallel_config.ep_group - ) - if self.fd_config.load_config.dynamic_load_weight and tp_size > 1: - self.model_weights_signal[0] = self._broadcast_model_weights_signal( - src=0, group=self.parallel_config.tp_group - ) + + if self.fd_config.load_config.dynamic_load_weight: + self.model_weights_signal[0] = int(self.model_weights_status.value[0]) + if self.ranks > 1: + self.model_weights_signal[0] = self._broadcast_model_weights_signal(src=0, group=None) self.insert_step = False req_dicts = None @@ -453,11 +460,9 @@ def event_loop_normal(self) -> None: self._tp_barrier_wait() if self.fd_config.load_config.dynamic_load_weight: - if self.parallel_config.enable_expert_parallel: - paddle.distributed.barrier(self.parallel_config.ep_group) - else: - paddle.distributed.barrier(self.parallel_config.tp_group) if self.model_weights_signal[0] != ModelWeightsStatus.NORMAL: + if self.ranks > 1: + paddle.distributed.barrier() logger.info( f"Rank: {self.local_rank} to update or clear parameters, signal is {self.model_weights_signal[0]}, [-1:clear, 1:update]" ) @@ -466,16 +471,42 @@ def event_loop_normal(self) -> None: ) self.model_weights_status.value[0] = self.model_weights_signal[0] + self.kv_cache_status.value[0] = self.model_weights_signal[0] DynamicWeightManager.check_model_weights_status( self.model_weights_status, + self.kv_cache_status if self.fd_config.cache_config.num_cpu_blocks > 0 else None, # model_weights_signal self.worker.model_runner, self.parallel_config.engine_worker_queue_port, + self.parallel_config.shutdown_comm_group_if_worker_idle, ) logger.info(f"current task queue data: {self.task_queue.num_tasks()}") self.task_queue.clear_data() - self.model_weights_signal[0] = ModelWeightsStatus.NORMAL - logger.info(f"Rank: {self.local_rank} has updated or cleared parameters.") + + if self.model_weights_signal[0] == ModelWeightsStatus.UPDATING: + logger.info( + f"Rank: {self.local_rank} has updated parameters. {self.model_weights_status.value[0]}" + ) + self.model_weights_signal[0] = ModelWeightsStatus.NORMAL + elif self.model_weights_signal[0] == ModelWeightsStatus.CLEARING: + logger.info( + f"Rank: {self.local_rank} has cleared parameters. 
{self.model_weights_status.value[0]}" + ) + # If the communication group is not shut down after the weights are cleared, keep the inference processes blocked in the loop below, otherwise the signals may get out of sync; they are woken up at the next weight update. + if not self.fd_config.parallel_config.shutdown_comm_group_if_worker_idle: + if self.ranks > 1: # All ranks go to sleep together and listen for the next update signal + paddle.distributed.barrier() + while self.model_weights_signal[0] != ModelWeightsStatus.UPDATING: + self.model_weights_signal[0] = self.model_weights_status.value[0] + if self.ranks > 1: + self.model_weights_signal[0] = self._broadcast_model_weights_signal( + src=0, group=None + ) + time.sleep(1) + self.model_weights_status.value[0] = ( + ModelWeightsStatus.UPDATING + ) # All ranks have woken up in sync; start the weight update process + continue if self.exist_task_signal.value[0] == ExistTaskStatus.EXIST or self.task_queue.read_finish_flag.get() == 1: logger.info(f"Rank: {self.local_rank} Detected new requests.") @@ -489,17 +520,22 @@ def event_loop_normal(self) -> None: req_dicts = [] for req_dict, bsz in tasks: - num_running_requests = int(bsz) + max_occupied_batch_index = int(bsz) req_dicts.extend(req_dict) - req_ids = [req.request_id for req in req_dicts] + # Count prefill requests in current batch + num_prefill_requests = sum(1 for req in req_dicts if req.task_type == RequestType.PREFILL) + num_scheduled_requests = len(req_dicts) + scheduled_request_ids = [req.request_id for req in req_dicts] logger.info( - f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " - f"num_insert_requests: {len(req_dicts)}, req_ids: {req_ids}" + f"Rank: {self.local_rank}, num_prefill_requests: {num_prefill_requests}, " + f"max_occupied_batch_index: {max_occupied_batch_index}, " + f"num_scheduled_requests: {num_scheduled_requests}, " + f"scheduled_request_ids: {scheduled_request_ids}" ) # Process prefill inputs - self.worker.preprocess_new_task(req_dicts, num_running_requests) + self.worker.preprocess_new_task(req_dicts, max_occupied_batch_index) if (not self.parallel_config.use_ep) and (not self.worker.model_runner.not_need_stop()): if self.ranks > 1: @@ -511,7 +547,7 @@ def event_loop_normal(self) -> None: # Execute model to generate token. The generated token will be written to the buffer. # These generated tokens can be obtained through get_output op. start_execute_time = time.time() - self.worker.execute_model(req_dicts, num_running_requests) + self.worker.execute_model(req_dicts, max_occupied_batch_index) self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill() logger.debug(f"execute model cost: {time.time()-start_execute_time:.5f} s") @@ -545,7 +581,7 @@ def initialize_kv_cache(self) -> None: if num_blocks_local <= 0: raise ValueError( - "The total number of blocks cannot be less than zero. " + f"The total number of blocks cannot be less than zero but got {num_blocks_local}. " "Please increase gpu_memory_utilization " "Or decrease max_num_batched_tokens(max model length)." ) @@ -820,8 +856,8 @@ def parse_args(): parser.add_argument( "--load_choices", type=str, - default="default", - help="The format of the model weights to load. default/new_loader.", + default="default_v1", + help="The format of the model weights to load. 
default/default_v1.", ) parser.add_argument( @@ -885,6 +921,32 @@ def parse_args(): help="EPLB Configuration.", ) + parser.add_argument( + "--routing_replay_config", + type=json.loads, + default=None, + help="Configation of Rollout Routing Replay.", + ) + + parser.add_argument( + "--shutdown_comm_group_if_worker_idle", + action="store_true", + help="Shutdown comm group if worker idle.", + ) + + parser.add_argument( + "--enable_entropy", + action="store_true", + help="Enable output of token-level entropy.", + ) + + parser.add_argument( + "--num_cpu_blocks", + type=int, + default=0, + help="Number of cpu blocks.", + ) + args = parser.parse_args() return args @@ -944,10 +1006,11 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: eplb_config = EPLBConfig(args.eplb_config) structured_outputs_config: StructuredOutputsConfig = StructuredOutputsConfig(args=vars(args)) + routing_replay_config = RoutingReplayConfig(args.routing_replay_config) # Note(tangbinhan): used for load_checkpoint model_config.pretrained_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank - model_config.pretrained_config.tensor_parallel_degree = parallel_config.tensor_parallel_size + model_config.pretrained_config.tensor_model_parallel_size = parallel_config.tensor_parallel_size model_config.pretrained_config.is_mtp = False model_config.pretrained_config.head_dim = model_config.head_dim @@ -1003,6 +1066,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: plas_attention_config=plas_attention_config, structured_outputs_config=structured_outputs_config, eplb_config=eplb_config, + routing_replay_config=routing_replay_config, ) update_fd_config_for_mm(fd_config) if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config): diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index f9bbb4ea95d..99688ba425e 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -15,19 +15,22 @@ """ import os +import queue import random import time +from threading import Thread from typing import List, Optional import numpy as np import paddle +import zmq from paddle import nn from fastdeploy import envs from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request, RequestType from fastdeploy.input.ernie4_5_vl_processor import DataProcessor -from fastdeploy.inter_communicator import IPCSignal +from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.utils import ( profile_run_guard, @@ -59,7 +62,7 @@ from fastdeploy.spec_decode import MTPProposer from fastdeploy.utils import get_logger from fastdeploy.worker.model_runner_base import ModelRunnerBase -from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput +from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, ModelRunnerOutput logger = get_logger("xpu_model_runner", "xpu_model_runner.log") @@ -118,7 +121,7 @@ def __init__( self.speculative_decoding = self.speculative_method is not None # used by SamplingMetadata - self.enable_logprob = False # fd_config.model_config.enable_logprob + self.enable_logprob = fd_config.model_config.enable_logprob # fd_config.model_config.enable_logprob self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop # Sampler @@ -156,6 +159,106 @@ def __init__( self.pd_disaggregation_mode: str = 
self.fd_config.parallel_config.pd_disaggregation_mode + # Initialize ZMQ client for async output + self.zmq_client = None + self.async_output_queue = None + if envs.FD_USE_GET_SAVE_OUTPUT_V1: + logger.info(f"zmq client get_save_output_rank{local_rank}") + self.zmq_client = ZmqIpcClient(name=f"get_save_output_rank{local_rank}", mode=zmq.PUSH) + self.zmq_client.connect() + self.zmq_client.socket.SNDTIMEO = 3000 + self.async_output_queue: queue.Queue = queue.Queue() + self.async_output_copy_thread = Thread( + target=self._async_output_busy_loop, + daemon=True, + name="WorkerAsyncOutputCopy", + ) + self.async_output_copy_thread.start() + # prompt logprobs state + self.prompt_logprobs_reqs: dict[str, Request] = {} + self.in_progress_prompt_logprobs: dict[str, LogprobsTensors] = {} + + def _async_output_busy_loop(self): + """Entrypoint for the thread which handles outputs asynchronously.""" + while True: + try: + if self.async_output_queue is None or self.zmq_client is None: + break + output = self.async_output_queue.get() + if self.zmq_client is not None: + self.zmq_client.send_pyobj(output) + except Exception as e: + logger.exception("Exception in async output loop: %s", e) + + def _get_prompt_logprobs_list(self, hidden_states: paddle.Tensor) -> list[Optional[LogprobsTensors]]: + """ + Build prompt_logprobs for requests that asked for it. + """ + if len(self.prompt_logprobs_reqs) > 0: + assert ( + not self.fd_config.cache_config.enable_prefix_caching + ), "prompt_logprobs must disable prefix caching, --no-enable-prefix-caching." + + if len(self.prompt_logprobs_reqs) == 0: + return self.scheduler_config.max_num_seqs * [None] + + logprobs_mode = self.fd_config.model_config.logprobs_mode + prompt_logprobs_list: list[Optional[LogprobsTensors]] = self.scheduler_config.max_num_seqs * [None] + completed_prefill_reqs: list[Request] = [] + + for req_id, request in self.prompt_logprobs_reqs.items(): + if not hasattr(request, "sampling_params") or request.sampling_params is None: + continue + num_prompt_logprobs = request.sampling_params.prompt_logprobs + if request.prompt_token_ids is None or num_prompt_logprobs is None: + continue + if num_prompt_logprobs == -1: + num_prompt_logprobs = self.ori_vocab_size + + num_tokens = request.prefill_end_index - request.prefill_start_index + num_prompt_tokens = len(request.prompt_token_ids) + + logprobs_tensors = self.in_progress_prompt_logprobs.get(req_id) + if not logprobs_tensors: + logprobs_tensors = LogprobsTensors.empty_cpu(num_prompt_tokens - 1, num_prompt_logprobs + 1) + self.in_progress_prompt_logprobs[req_id] = logprobs_tensors + + start_idx = request.prefill_start_index + start_tok = start_idx + 1 + num_remaining_tokens = num_prompt_tokens - start_tok + if num_tokens <= num_remaining_tokens: + num_logits = num_tokens + else: + num_logits = num_remaining_tokens + completed_prefill_reqs.append(request) + prompt_logprobs_list[request.idx] = logprobs_tensors + if num_logits <= 0: + continue + + offset = self.share_inputs["cu_seqlens_q"][request.idx] + prompt_hidden_states = hidden_states[offset : offset + num_logits] + logits = self.model.compute_logits(prompt_hidden_states) + prompt_token_ids = request.prompt_token_ids[start_tok : start_tok + num_logits] + prompt_token_ids_tensor = paddle.to_tensor(prompt_token_ids, dtype="int64") + if logprobs_mode == "raw_logprobs": + raw_logprobs = self.sampler.compute_logprobs(logits) + elif logprobs_mode == "raw_logits": + raw_logprobs = logits + else: + raw_logprobs = self.sampler.compute_logprobs(logits) + token_ids, 
logprobs, ranks = self.sampler.gather_logprobs( + raw_logprobs, num_prompt_logprobs, prompt_token_ids_tensor + ) + chunk_slice = slice(start_idx, start_idx + num_logits) + logprobs_tensors.logprob_token_ids[chunk_slice].copy_(token_ids, False) + logprobs_tensors.logprobs[chunk_slice].copy_(logprobs, False) + logprobs_tensors.selected_token_ranks[chunk_slice].copy_(ranks, False) + + for req in completed_prefill_reqs: + del self.prompt_logprobs_reqs[req.request_id] + del self.in_progress_prompt_logprobs[req.request_id] + return prompt_logprobs_list + def exist_prefill(self): """ check whether prefill stage exist @@ -405,6 +508,13 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int): self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1 self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + if ( + hasattr(request, "sampling_params") + and request.sampling_params is not None + and request.sampling_params.prompt_logprobs is not None + ): + self.prompt_logprobs_reqs[request.request_id] = request + if len(request.output_token_ids) == 0: input_ids = request.prompt_token_ids else: @@ -704,7 +814,9 @@ def _init_share_inputs(self, max_num_seqs: int): dtype="int64", ) self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64") - self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") + # self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") + # self.share_inputs["top_p"] default to 0.0 on XPU for consideration of the performance + self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32") self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["top_k_list"] = [0] * max_num_seqs self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32") @@ -1294,6 +1406,10 @@ class at the server level, which is too granular for ModelRunner. # 5. Speculative decode # 6. Post Process + prompt_logprobs_list = None + if not self.speculative_decoding: + prompt_logprobs_list = self._get_prompt_logprobs_list(model_output) + model_output_data = ModelOutputData( next_tokens=self.share_inputs["next_tokens"], stop_flags=self.share_inputs["stop_flags"], @@ -1321,6 +1437,7 @@ class at the server level, which is too granular for ModelRunner. accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], + prompt_logprobs_list=prompt_logprobs_list, ) if self.speculative_decoding: # base model post process @@ -1332,6 +1449,7 @@ class at the server level, which is too granular for ModelRunner. 
share_inputs=self.share_inputs, block_size=self.cache_config.block_size, skip_save_output=is_dummy_run, + async_output_queue=self.async_output_queue, think_end_id=self.model_config.think_end_id, line_break_id=self.model_config.line_break_id, ) diff --git a/requirements.txt b/requirements.txt index 50f95e00a3e..f97b5f59028 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn>=0.38.0 fastapi -paddleformers>=0.3.1 +paddleformers==0.4.1 redis etcd3 httpx @@ -45,3 +45,4 @@ msgspec einops setproctitle aistudio_sdk +p2pstore diff --git a/requirements_dcu.txt b/requirements_dcu.txt index 714e0ae1d63..1f0a20f2d44 100644 --- a/requirements_dcu.txt +++ b/requirements_dcu.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn==0.29.0 fastapi -paddleformers +paddleformers @ https://paddle-qa.bj.bcebos.com/ernie/paddleformers-0.4.0.post20251222-py3-none-any.whl redis etcd3 httpx diff --git a/requirements_iluvatar.txt b/requirements_iluvatar.txt index d91cf1639b0..fb0d702c4fa 100644 --- a/requirements_iluvatar.txt +++ b/requirements_iluvatar.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn==0.29.0 fastapi -paddleformers==0.3.1 +paddleformers @ https://paddle-qa.bj.bcebos.com/ernie/paddleformers-0.4.0.post20251222-py3-none-any.whl redis etcd3 httpx diff --git a/requirements_metaxgpu.txt b/requirements_metaxgpu.txt index d49339b0f6d..96f1c458472 100644 --- a/requirements_metaxgpu.txt +++ b/requirements_metaxgpu.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn==0.29.0 fastapi -paddleformers==0.3.2 +paddleformers @ https://paddle-qa.bj.bcebos.com/ernie/paddleformers-0.4.0.post20251222-py3-none-any.whl redis etcd3 httpx diff --git a/run_r3_test.sh b/run_r3_test.sh new file mode 100644 index 00000000000..b1fbc2d8154 --- /dev/null +++ b/run_r3_test.sh @@ -0,0 +1,27 @@ +unset http_proxy +unset https_proxy +export ENABLE_V1_KVCACHE_SCHEDULER=1 +export FD_DEBUG=1 +export PYTHONPATH=/root/paddlejob/workspace/env_run/output/gongshaotian/baidu/paddle_internal/FastDeploy:$PYTHONPATH +export CUDA_VISIBLE_DEVICES=0 +export SPECULATE_VERIFY_USE_TARGET_SAMPLING=1 + +rm -rf log +rm -rf core.* + +config_yaml=./benchmarks/yaml/eb45-32k-wint2-tp4.yaml +model_path=/root/paddlejob/workspace/env_run/output/models/paddle/ERNIE-4.5-21B-A3B-Paddle +python -m fastdeploy.entrypoints.openai.api_server --config ${config_yaml} --model ${model_path} \ + --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 64 \ + --enable-chunked-prefill --enable-prefix-caching --port 8888 --max-num-batched-tokens 64 --metrics-port 8889 --engine-worker-queue-port 9999 \ + --graph-optimization-config '{"use_cudagraph": true}' \ + --routing-replay-config '{"enable_routing_replay":true, "routing_store_type":"rdma", "local_store_dir":"./routing_replay_output", "use_fused_put":true, "rdma_store_server":"redis://10.95.239.155:6379"}' \ + # --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "num_model_steps": 1,"model": "'$model_path'/mtp"}' \ + + +curl -X POST "http://0.0.0.0:8888/v1/chat/completions" -H "Content-Type: application/json" -d '{ + "messages": [ + {"role": "system", "content": "你是谁"} + ] , + "temperature":0 + }' diff --git a/scripts/calculate_avg_entropy.py b/scripts/calculate_avg_entropy.py new file mode 100644 index 00000000000..e2e272d0c5e --- /dev/null +++ b/scripts/calculate_avg_entropy.py @@ -0,0 +1,70 @@ +import argparse +import glob +import os +import re +from typing import List, Optional + + +def extract_entropy_values(log_path: str) -> List[float]: + pattern = 
r"entropy:\s*([0-9]+\.?[0-9]*(?:[eE][+-]?[0-9]+)?)" + + entropy_values = [] + with open(log_path, "r") as f: + lines = f.readlines() + for line in lines: + match = re.search(pattern, line) + if match: + try: + entropy_value = float(match.group(1)) + entropy_values.append(entropy_value) + except ValueError: + continue + + return entropy_values + + +def calculate_average(entropy_values: List[float], drop_ratio: float = 0.1) -> Optional[float]: + if not entropy_values: + return None + sorted_vals = sorted(entropy_values) + n = len(sorted_vals) + drop_count = int(n * drop_ratio) + filtered_vals = sorted_vals[drop_count : n - drop_count] if drop_count > 0 else sorted_vals + if not filtered_vals: + return None, [] + avg = sum(filtered_vals) / len(filtered_vals) + return avg, filtered_vals + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--log-dir", type=str, required=True) + parser.add_argument("--drop-ratio", "-d", type=float, default=0.1) + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument("--start-id", "-s", type=int) + parser.add_argument("--end-id", "-e", type=int) + args = parser.parse_args() + + log_files = glob.glob(os.path.join(args.log_dir, "data_processor.log.*")) + if not log_files: + print(f"No log files found in {args.log_dir}") + return + + entropy_values = [] + for log_file in log_files: + entropy_values.extend(extract_entropy_values(log_file)) + if args.start_id and args.end_id: + entropy_values = entropy_values[args.start_id : args.end_id] + average_entropy, filtered_vals = calculate_average(entropy_values, args.drop_ratio) + + print(f"{len(entropy_values)} entropy values were found") + print(f"effective entropy values: {len(filtered_vals)} (drop ratio {args.drop_ratio})") + print(f"Average entropy: {average_entropy:.10f}") + if args.verbose: + print("\nentropy details:") + for i, value in enumerate(filtered_vals, 1): + print(f" {i}. {value}") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_xpu_ci_pytest.sh b/scripts/run_xpu_ci_pytest.sh index a7175350be0..f57e096f71e 100644 --- a/scripts/run_xpu_ci_pytest.sh +++ b/scripts/run_xpu_ci_pytest.sh @@ -72,14 +72,14 @@ echo "卸载旧版本..." python -m pip uninstall paddlepaddle-xpu -y python -m pip uninstall fastdeploy-xpu -y -# 安装PaddlePaddle -echo "安装PaddlePaddle..." -python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/ +# 安装PaddlePaddle Release分支安装对应的paddle +echo "安装release分支PaddlePaddle..." 
+python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Xpu-P800-SelfBuiltPypiUse/latest/paddlepaddle_xpu-0.0.0-cp310-cp310-linux_x86_64.whl # ============ 编译项目 ============ echo "============================编译项目============================" -bash custom_ops/xpu_ops/download_dependencies.sh develop +bash custom_ops/xpu_ops/download_dependencies.sh stable export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm bash build.sh || exit 1 diff --git a/setup.py b/setup.py index 8e6037aeb27..e6cbf436f90 100644 --- a/setup.py +++ b/setup.py @@ -251,7 +251,7 @@ def get_name(): cmdclass_dict = {"bdist_wheel": CustomBdistWheel} cmdclass_dict["build_ext"] = CMakeBuild -FASTDEPLOY_VERSION = os.environ.get("FASTDEPLOY_VERSION", "2.3.0-dev") +FASTDEPLOY_VERSION = os.environ.get("FASTDEPLOY_VERSION", "2.4.0") cmdclass_dict["build_optl"] = PostInstallCommand diff --git a/subprocess_test_case.py b/subprocess_test_case.py new file mode 100644 index 00000000000..825845e0a31 --- /dev/null +++ b/subprocess_test_case.py @@ -0,0 +1,206 @@ +import numpy +from typing import List, Dict, Any, TypedDict +from concurrent.futures import ThreadPoolExecutor +from multiprocessing import Process, Queue +import asyncio +import time +import numpy as np +import paddle +from concurrent.futures import ThreadPoolExecutor +from typing import List, Dict, Any, TypedDict +import atexit +import threading +import os + +class RoutingManager(object): + def __init__(self) -> None: + + # Initialize routing store + self._routing_store = RoutingStoreLocal() + + # Initialize routing store wrapper + self._routing_store_process = StoreWrapper( + routing_store=self._routing_store + ) + +class StoreTask(TypedDict): + task_type: str + key: str + data: np.ndarray + +class StoreProcess(Process): + def __init__(self, task_queue: Queue, routing_store: object) -> None: + self._task_quequ = task_queue + self._routing_store = routing_store + + def run(self): + print(f"[R3] Start Running Store Wrapper in sub process {os.getpid()}") + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + while True: + try: + + task = self._task_quequ.get() + + if task is None: # Sentinel + self._task_quequ.task_done() + break + + if task['task_type'] == 'put': + executor.submit(self.process_put_task, task) + elif task['task_type'] == 'clear_store': + executor.submit(self.process_clear_store_task, task) + self._task_quequ.task_done() + elif task['task_type'] == 'clear_prefix_batch': + executor.submit(self.process_clear_prefix_batch_task, task) + else: + raise ValueError(f'Unknown task type: {task["task_type"]}') + + except Exception as e: + self._task_quequ.task_done() + raise ValueError(f'{e}') + + print(f"[Consumer Process {Process.current_process().pid}] Shutdown.") + + def process_put_task(self, store_task: StoreTask) -> None: + """ """ + self._routing_store.put(store_task.key, store_task.data) + + def process_clear_store_task(self, store_task: StoreTask) -> None: + """ """ + self._routing_store.clear() + + def process_clear_prefix_batch_task(self, store_task: StoreTask) -> None: + """ """ + self._routing_store.delete_prefix_batch(store_task.key) + +class StoreWrapper(object): + def __init__(self) -> None: + # Initialize task queue + layer_num = 61 + max_request = 200 + self.queue_max_size = layer_num * max_request + self._task_queue = Queue(maxsize=self.queue_max_size) + self._monitor_thread: threading.Thread = None + self._stop_monitor = 
threading.Event() + + # Initialize consumer process + self._routing_store_process = StoreProcess( + task_queue=self._task_queue, + routing_store=self._routing_store + ) + self._is_running = False + + # Register atexit handler + atexit.register(self.shutdown) + + def shutdown(self): + """ """ + if not self._is_running: + return + print.info("Shutting down...") + self._is_running = False + + # Put a sentinel value to signal the consumer to stop + try: + self._task_queue.put_nowait(None) + except: + pass + if self._consumer_process and self._consumer_process.is_alive(): + # Wait for all tasks to be processed + self._consumer_process.join(timeout=5.0) + if self._consumer_process.is_alive(): + self._consumer_process.terminate() + self._is_running = False + + def start_store_warpper(self): + """ """ + if self._wrapper_is_running: + return + self._is_running = True + + # Start monitor thread + self._stop_monitor.clear() + self._monitor_thread = threading.Thread(target=self._monitor_queue_load, daemon=True) + self._monitor_thread.start() + + # Start Routing Store Wrapper in sub process + self._routing_store_process.run() + + def _monitor_queue_load(self): + """ """ + while not self._stop_monitor.is_set(): + time.sleep(2.0) + qsize = self._task_queue.qsize() + + # Alarm when the task exceeds 80% of the queue capacity + if qsize > self.queue_max_size * 0.8: + print( + f"[Monitor] Queue load is HIGH: {qsize}/{self.queue_max_size}. " + f"Dropped tasks so far: {self._dropped_tasks}. " + "Consider increasing max_workers or queue_max_size." + ) + else: + print(f"[Monitor] Queue load: {qsize}/{self.queue_max_size}. Healthy.") + + def submit_put_task(self, routing_indices: paddle.Tensor, rollout_id: str, layer_idx: int) -> None: + """ Submit a put task to the task queue""" + if not self._is_running: + raise RuntimeError("Store not started.") + + rdma_rollout_key = f"{rollout_id}_{layer_idx}" + routing_indices_cpu = routing_indices.cpu() + routing_indices_np = np.array(routing_indices_cpu.numpy(), copy=True) + + task: StoreTask = { + "type": "put", + "key": rdma_rollout_key, + "data": routing_indices_np + } + + try: + self._task_queue.put_nowait(task) + except Exception: + raise RuntimeError( + f"Queue is FULL. Dropping put task for key: {rdma_rollout_key}. " + ) + + def submit_clear_task(self) -> None: + """ Submit clear store task """ + if not self._is_running: + raise RuntimeError("Store not started.") + + task: StoreTask = { + "type": "clear_store", + "key": None, + "data": None + } + + try: + self._task_queue.put_nowait(task) + # Wait for the task to be processed + self._task_queue.join() + except Exception: + raise RuntimeError( + f"Queue is FULL. Dropping put task for key: clear_store. " + ) + + def submit_clear_prefix_batch_task(self, rollout_id) -> None: + """ Submit clear prefix batch task""" + if not self._is_running: + raise RuntimeError("Store not started.") + prefix_batch = self.get_needed_clear_ids(rollout_id) + + if prefix_batch is None: + return + + task :StoreTask = { + "type": "clear_prefix_batch", + "key": prefix_batch, + "data": None + } + try: + self._task_queue.put_nowait(task) + except Exception: + raise RuntimeError( + f"Queue is FULL. Dropping put task for key: clear_store. 
") + diff --git a/test_subprocess.py b/test_subprocess.py new file mode 100644 index 00000000000..7c5853e47e6 --- /dev/null +++ b/test_subprocess.py @@ -0,0 +1,251 @@ +import asyncio +import time +import numpy as np +import paddle +import logging +from multiprocessing import Process, Queue +from concurrent.futures import ThreadPoolExecutor +from typing import List, Dict, Any, TypedDict +import atexit +import threading + +# ... (省略之前的 Mock 和基础类定义,保持不变) ... + +class PutTask(TypedDict): + type: str + key: str + data: np.ndarray + +class RoutingStoreRDMA(RoutingStoreBase): + """ + Producer-Consumer RDMA Store with NON-BLOCKING producer. + Goal: Main process never waits for IO. + """ + + def __init__(self, fd_config: FDConfig, max_workers: int = 4, queue_max_size: int = 10000) -> None: + super().__init__(fd_config=fd_config) + try: + from p2pstore import P2PClient, P2PConfig + except ModuleNotFoundError: + raise ModuleNotFoundError("RoutingStoreRDMA and p2pstore only supported in RLHF environment.") + + self.max_workers = max_workers + self.queue_max_size = queue_max_size + + # 使用更大的队列减少丢弃概率 + self._task_queue: Queue = Queue(maxsize=self.queue_max_size) + + self._consumer_process: Process = None + self._monitor_thread: threading.Thread = None + self._stop_monitor = threading.Event() + + self.p2p_config = P2PConfig(metadata_server=fd_config.routing_replay_config.rdma_store_server) + self.p2p_client = None # 将在子进程中初始化 + + self._is_running = False + self._dropped_tasks = 0 + + atexit.register(self.shutdown) + + # --- 消费者侧逻辑 (子进程) --- + + def _consumer_worker(self, task: PutTask): + """工作线程执行实际的 put""" + key = task['key'] + data = task['data'] + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(self.p2p_client.put(key, data)) + except Exception as e: + logger.error(f"Worker failed for key {key}: {e}") + finally: + loop.close() + + def _consumer_process_main(self, task_queue: Queue, p2p_config: P2PConfig): + """消费者进程主循环""" + print(f"[Consumer Process {Process.current_process().pid}] Started with {self.max_workers} workers.") + self.p2p_client = P2PClient(p2p_config) + + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + while True: + try: + # 阻塞等待任务,这是消费者进程该做的事 + task = task_queue.get() + if task is None: # Sentinel + break + + # 提交给线程池异步执行 + executor.submit(self._consumer_worker, task) + + except Exception as e: + logger.error(f"Consumer loop error: {e}") + break + + print(f"[Consumer Process {Process.current_process().pid}] Shutdown.") + + # --- 生产者侧逻辑 (主进程) --- + + def _monitor_queue_load(self): + """后台监控线程:仅用于观察,绝不阻塞主逻辑""" + while not self._stop_monitor.is_set(): + time.sleep(2.0) + qsize = self._task_queue.qsize() + # 如果队列长度超过 80%,说明消费者跟不上了,需要告警 + if qsize > self.queue_max_size * 0.8: + logger.warning( + f"[Monitor] Queue load is HIGH: {qsize}/{self.queue_max_size}. " + f"Dropped tasks so far: {self._dropped_tasks}. " + "Consider increasing max_workers or queue_max_size." + ) + else: + logger.info(f"[Monitor] Queue load: {qsize}/{self.queue_max_size}. 
Healthy.") + + def start(self): + """启动消费者进程和监控线程""" + if self._is_running: + return + + self._is_running = True + self._consumer_process = Process( + target=self._consumer_process_main, + args=(self._task_queue, self.p2p_config), + daemon=True + ) + self._consumer_process.start() + + # 启动监控线程(守护线程) + self._stop_monitor.clear() + self._monitor_thread = threading.Thread(target=self._monitor_queue_load, daemon=True) + self._monitor_thread.start() + + logger.info(f"RoutingStoreRDMA started. Consumer PID: {self._consumer_process.pid}") + + async def put(self, routing_indices: paddle.Tensor, rollout_id: str, layer_idx: int) -> None: + """ + 【非阻塞】生产者接口:极速入队,立即返回。 + 如果队列满了,直接丢弃并计数(也可以选择抛异常或其他策略)。 + """ + if not self._is_running: + raise RuntimeError("Store not started.") + + rdma_rollout_key = f"{rollout_id}_{layer_idx}" + + # 数据准备(这部分在主进程做,因为需要访问 Tensor) + routing_indices_cpu = routing_indices.cpu() + routing_indices_np = np.array(routing_indices_cpu.numpy(), copy=True) + + task: PutTask = { + "type": "put", + "key": rdma_rollout_key, + "data": routing_indices_np + } + + try: + # 核心:put_nowait 绝对不阻塞 + self._task_queue.put_nowait(task) + except Exception: + # 队列满了 + self._dropped_tasks += 1 + logger.warning( + f"Queue is FULL. Dropping put task for key: {rdma_rollout_key}. " + f"Total dropped: {self._dropped_tasks}" + ) + # 这里不抛异常,不阻塞,仅仅记录日志 + + async def fused_put(self, routing_indices: paddle.Tensor, rollout_id: str) -> None: + """【非阻塞】生产者接口:极速入队""" + if not self._is_running: + raise RuntimeError("Store not started.") + + rdma_rollout_key = f"{rollout_id}" + routing_indices_cpu = routing_indices.cpu() + routing_indices_np = routing_indices_cpu.numpy() + + task: PutTask = { + "type": "fused_put", + "key": rdma_rollout_key, + "data": routing_indices_np + } + + try: + self._task_queue.put_nowait(task) + except Exception: + self._dropped_tasks += 1 + logger.warning( + f"Queue is FULL. Dropping fused_put task for key: {rdma_rollout_key}. " + f"Total dropped: {self._dropped_tasks}" + ) + + # --- 同步/管理接口 --- + + def wait_completion(self, timeout: float = 30.0): + """ + 【可选同步】等待所有队列中的任务被处理完。 + 仅在程序退出前调用,平时不要调用。 + """ + if not self._is_running: + return + + logger.info("Waiting for consumer to finish remaining tasks...") + start = time.time() + + # 1. 发送停止信号给消费者进程 + self._task_queue.put(None) + + # 2. 等待消费者进程结束 + self._consumer_process.join(timeout=timeout) + + if self._consumer_process.is_alive(): + logger.error("Consumer did not finish in time. Terminating.") + self._consumer_process.terminate() + + # 3. 停止监控 + self._stop_monitor.set() + if self._monitor_thread: + self._monitor_thread.join(timeout=2.0) + + logger.info(f"Wait completed in {time.time() - start:.2f}s. Total dropped tasks: {self._dropped_tasks}") + + def shutdown(self): + """优雅关闭""" + if not self._is_running: + return + + logger.info("Shutting down...") + self._is_running = False + + # 确保队列里有东西让消费者醒来(如果之前空了) + try: + self._task_queue.put_nowait(None) + except: + pass + + if self._consumer_process and self._consumer_process.is_alive(): + self._consumer_process.join(timeout=5.0) + if self._consumer_process.is_alive(): + self._consumer_process.terminate() + + self._is_running = False + logger.info("Shutdown complete.") + + # ... (get, clear 等同步方法保持不变,它们直接创建临时 client) ... 
+ def get(self, rollout_id: str, layer_idx: int = None) -> paddle.Tensor: + rdma_rollout_key = f"{rollout_id}_{layer_idx}" if layer_idx is not None else rollout_id + # 临时创建 client 用于同步读 + tmp_client = P2PClient(self.p2p_config) + tmp_routing = asyncio.run(tmp_client.get(rdma_rollout_key)) + return paddle.to_tensor(tmp_routing) + + def clear(self, rollout_id: str, layer_idx: int = None) -> None: + rdma_rollout_key = f"{rollout_id}_{layer_idx}" if layer_idx is not None else rollout_id + tmp_client = P2PClient(self.p2p_config) + asyncio.run(tmp_client.delete(rdma_rollout_key)) + + async def clear_prefix_batch(self, roullout_id_prefixes: List[str]): + tmp_client = P2PClient(self.p2p_config) + await tmp_client.delete_prefix_batch(roullout_id_prefixes) + + def clear_store(self): + tmp_client = P2PClient(self.p2p_config) + asyncio.run(tmp_client.clear()) \ No newline at end of file diff --git a/tests/cache_manager/test_cache_transfer_manager.py b/tests/cache_manager/test_cache_transfer_manager.py index f09fc603325..c52471a72e7 100644 --- a/tests/cache_manager/test_cache_transfer_manager.py +++ b/tests/cache_manager/test_cache_transfer_manager.py @@ -26,6 +26,7 @@ class Args: value_cache_shape = "" create_cache_tensor = False cache_dtype = "bfloat16" + default_dtype = "bfloat16" # ========================== diff --git a/tests/ce/deploy/deploy.py b/tests/ce/deploy/deploy.py index be6a4f0bf7d..856a7b594ad 100644 --- a/tests/ce/deploy/deploy.py +++ b/tests/ce/deploy/deploy.py @@ -89,7 +89,7 @@ def build_command(config): # 添加配置参数 for key, value in config.items(): - if "--enable" in key: + if "--enable" in key or "--no-enable" in key: value = bool(value if isinstance(value, bool) else eval(value)) if value: cmd.append(key) diff --git a/tests/ce/server/test_logprobs.py b/tests/ce/server/test_logprobs.py index 83ca89486c9..3674b3a6b96 100644 --- a/tests/ce/server/test_logprobs.py +++ b/tests/ce/server/test_logprobs.py @@ -25,10 +25,10 @@ def test_unstream_with_logprobs(): # 校验返回内容与概率信息 assert resp_json["choices"][0]["message"]["content"] == "牛顿的" assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], "top_logprobs": None, } @@ -102,10 +102,10 @@ def test_stream_with_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], } @@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953 + assert 
result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.006811376195400953, + "logprob": -0.0068125599063932896, "bytes": [231, 137, 155, 233, 161, 191], } diff --git a/tests/ce/stable_cases/launch_model.sh b/tests/ce/stable_cases/launch_model.sh index 3b758a15a2a..7975a847873 100644 --- a/tests/ce/stable_cases/launch_model.sh +++ b/tests/ce/stable_cases/launch_model.sh @@ -1,46 +1,44 @@ #!/bin/bash MODEL_PATH="${1}/TP2" -FD_API_PORT=${FD_API_PORT:-8000} -FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT:-8001} -FD_METRICS_PORT=${FD_METRICS_PORT:-8002} -FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT:-8003} - - +FD_API_PORT=${FD_API_PORT:-8180} +FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT:-8181} +FD_METRICS_PORT=${FD_METRICS_PORT:-8182} +FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT:-8183} if [ -z "$MODEL_PATH" ]; then - echo "❌ 用法: $0 <模型路径>" + echo "❌ Usage: $0 " exit 1 fi if [ ! -d "$MODEL_PATH" ]; then - echo "❌ 错误:模型目录不存在: $MODEL_PATH" + echo "❌ Error: Model directory does not exist: $MODEL_PATH" exit 1 fi -echo "使用模型: $MODEL_PATH" - +echo "Using model: $MODEL_PATH" -# 清理日志 +# Clean logs rm -rf log/* mkdir -p log -# 环境变量 +# Environment variables export CUDA_VISIBLE_DEVICES=0,1 export INFERENCE_MSG_QUEUE_ID=${FD_INFERENCE_MSG_QUEUE_ID:-7679} export ENABLE_V1_KVCACHE_SCHEDULER=1 - +echo "Starting API server" python -m fastdeploy.entrypoints.openai.api_server \ --tensor-parallel-size 2 \ --port ${FD_API_PORT} \ --engine-worker-queue-port ${FD_ENGINE_QUEUE_PORT} \ --metrics-port ${FD_METRICS_PORT} \ --cache-queue-port ${FD_CACHE_QUEUE_PORT} \ - --quantization wint8 \ --max-model-len 32768 \ --max-num-seqs 1 \ --gpu-memory-utilization 0.9 \ --model "$MODEL_PATH" \ + --no-shutdown-comm-group-if-worker-idle \ + --swap-space 10 \ --load-strategy ipc_snapshot \ --dynamic-load-weight & @@ -48,12 +46,13 @@ success=0 for i in $(seq 1 300); do if (echo > /dev/tcp/127.0.0.1/$FD_API_PORT) >/dev/null 2>&1; then - echo "API server is up on port $FD_API_PORT on iteration $i" + echo "API server is up on port $FD_API_PORT at iteration $i" success=1 break fi sleep 1 done + if [ $success -eq 0 ]; then - echo "超时: API 服务在 300 秒内未启动 (端口 $FD_API_PORT)" + echo "Timeout: API server did not start within 300 seconds (port $FD_API_PORT)" fi diff --git a/tests/ce/stable_cases/run.sh b/tests/ce/stable_cases/run.sh index 81197253ba5..e0d77eafbae 100644 --- a/tests/ce/stable_cases/run.sh +++ b/tests/ce/stable_cases/run.sh @@ -1,18 +1,18 @@ #!/bin/bash # ================== Configuration Parameters ================== -FD_API_PORT=${FD_API_PORT:-8000} -FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT:-8001} -FD_METRICS_PORT=${FD_METRICS_PORT:-8002} -FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT:-8003} +FD_API_PORT=${FD_API_PORT:-8180} +FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT:-8181} +FD_METRICS_PORT=${FD_METRICS_PORT:-8182} +FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT:-8183} HOST="0.0.0.0" -PORT="${FD_API_PORT}" # 这里需要配合启动脚本那个URL PORT +PORT="${FD_API_PORT}" BASE_URL="http://$HOST:$PORT" -TOTAL_ROUNDS=30 -CHAT_REQUESTS_PER_ROUND=1 +TOTAL_ROUNDS=6 +CHAT_REQUESTS_PER_ROUND=3 export CUDA_VISIBLE_DEVICES=0,1 MAX_MEMORY_MB=10240 # 10GB @@ -79,24 +79,72 @@ check_gpu_memory() { local gpu_ids gpu_ids=($(get_visible_gpu_ids)) + echo "========== GPU Memory Check ==========" + echo "CUDA_VISIBLE_DEVICES = $CUDA_VISIBLE_DEVICES" + echo "MAX_MEMORY_MB = $MAX_MEMORY_MB" + echo 
"======================================" + if [ ${#gpu_ids[@]} -eq 0 ]; then echo "Assertion failed: No valid GPU IDs in CUDA_VISIBLE_DEVICES='$CUDA_VISIBLE_DEVICES'" >&2 exit 1 fi for gpu_id in "${gpu_ids[@]}"; do - local memory_used - memory_used=$(nvidia-smi -i "$gpu_id" --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null) || \ - assert_success $? "Failed to query GPU $gpu_id memory usage" - - if ! [[ "$memory_used" =~ ^[0-9]+ ]]; then - echo "Assertion failed: Invalid memory value for GPU $gpu_id: $memory_used" >&2 + echo + echo "---- GPU $gpu_id ----" + + # Query summary + local summary + summary=$(nvidia-smi -i "$gpu_id" \ + --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu \ + --format=csv,noheader,nounits) || { + echo "Failed to query GPU $gpu_id summary" >&2 + exit 1 + } + + # Parse fields + IFS=',' read -r idx name mem_total mem_used mem_free util <<< "$summary" + + echo "GPU $idx: $name" + echo "Total Memory : ${mem_total} MB" + echo "Used Memory : ${mem_used} MB" + echo "Free Memory : ${mem_free} MB" + echo "GPU Util : ${util} %" + + # --- Hard assertions --- + assert_true "$(( mem_used <= MAX_MEMORY_MB ))" \ + "GPU $gpu_id memory.used ${mem_used} MB > MAX_MEMORY_MB ${MAX_MEMORY_MB} MB" + + # --- Soft safety check: usage ratio --- + local used_ratio + used_ratio=$(( mem_used * 100 / mem_total )) + + echo "Used Ratio : ${used_ratio} %" + + if [ "$used_ratio" -gt 90 ]; then + echo "Assertion failed: GPU $gpu_id memory usage > 90% (${used_ratio}%)" >&2 exit 1 fi - assert_true "$(( memory_used <= MAX_MEMORY_MB ))" \ - "GPU $gpu_id memory $memory_used MB > $MAX_MEMORY_MB MB" + # --- Process-level attribution --- + echo "Processes on GPU $gpu_id:" + local proc_info + proc_info=$(nvidia-smi -i "$gpu_id" \ + --query-compute-apps=pid,process_name,used_memory \ + --format=csv,noheader,nounits) + + if [ -z "$proc_info" ]; then + echo " (No active compute processes)" + else + echo "$proc_info" | while IFS=',' read -r pid pname pmem; do + echo " PID=$pid NAME=$pname MEM=${pmem}MB" + done + fi + + echo "GPU $gpu_id memory check PASSED" done + + echo "========== GPU Memory Check DONE ==========" } # ==================================================== @@ -108,6 +156,7 @@ for round in $(seq 1 $TOTAL_ROUNDS); do echo "[Step 1] Clearing load weight..." curl_get_status -i "$BASE_URL/clear_load_weight" assert_eq "$http_code" "200" "/clear_load_weight failed with HTTP $http_code" + sleep 5 # Step 2: Check GPU memory usage echo "[Step 2] Checking GPU memory..." 
diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index e51018f201e..acbf7872e66 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24") else: - base_file = "ernie-4_5-vl-base-tp2-dev" + base_file = "ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() diff --git a/tests/ci_use/GLM-45-AIR/baseline.txt b/tests/ci_use/GLM-45-AIR/baseline.txt index bddb29fdace..4ebb05f0ce9 100644 --- a/tests/ci_use/GLM-45-AIR/baseline.txt +++ b/tests/ci_use/GLM-45-AIR/baseline.txt @@ -2,12 +2,26 @@ lm_head.linear.weight lm_head.linear.weight:lm_head.weight model.embed_tokens.embeddings.weight model.embed_tokens.embeddings.weight:model.embed_tokens.weight +model.layers.0.eh_proj.linear.weight:model.layers.0.eh_proj.linear.weight +model.layers.0.enorm.weight:model.layers.0.enorm.weight +model.layers.0.hnorm.weight:model.layers.0.hnorm.weight model.layers.0.input_layernorm.weight model.layers.0.input_layernorm.weight:model.layers.0.input_layernorm.weight model.layers.0.mlp.down_proj.weight model.layers.0.mlp.down_proj.weight:model.layers.0.mlp.down_proj.weight model.layers.0.mlp.up_gate_proj.weight model.layers.0.mlp.up_gate_proj.weight:model.layers.0.mlp.up_gate_proj.weight +model.layers.0.mtp_block.input_layernorm.weight:model.layers.0.mtp_block.input_layernorm.weight +model.layers.0.mtp_block.mlp.experts.down_proj_weight:model.layers.0.mtp_block.mlp.experts.down_proj_weight +model.layers.0.mtp_block.mlp.experts.up_gate_proj_weight:model.layers.0.mtp_block.mlp.experts.up_gate_proj_weight +model.layers.0.mtp_block.mlp.gate.e_score_correction_bias:model.layers.0.mtp_block.mlp.gate.e_score_correction_bias +model.layers.0.mtp_block.mlp.gate.weight:model.layers.0.mtp_block.mlp.gate.weight +model.layers.0.mtp_block.mlp.shared_experts.down_proj.weight:model.layers.0.mtp_block.mlp.shared_experts.down_proj.weight +model.layers.0.mtp_block.mlp.shared_experts.up_gate_proj.weight:model.layers.0.mtp_block.mlp.shared_experts.up_gate_proj.weight +model.layers.0.mtp_block.post_attention_layernorm.weight:model.layers.0.mtp_block.post_attention_layernorm.weight +model.layers.0.mtp_block.self_attn.o_proj.weight:model.layers.0.mtp_block.self_attn.o_proj.weight +model.layers.0.mtp_block.self_attn.qkv_proj.bias:model.layers.0.mtp_block.self_attn.qkv_proj.bias +model.layers.0.mtp_block.self_attn.qkv_proj.weight:model.layers.0.mtp_block.self_attn.qkv_proj.weight model.layers.0.post_attention_layernorm.weight model.layers.0.post_attention_layernorm.weight:model.layers.0.post_attention_layernorm.weight model.layers.0.self_attn.o_proj.weight @@ -16,6 +30,7 @@ model.layers.0.self_attn.qkv_proj.bias model.layers.0.self_attn.qkv_proj.bias:model.layers.0.self_attn.qkv_proj.bias model.layers.0.self_attn.qkv_proj.weight model.layers.0.self_attn.qkv_proj.weight:model.layers.0.self_attn.qkv_proj.weight +model.layers.0.shared_head.norm.weight:model.layers.0.shared_head.norm.weight model.layers.1.input_layernorm.weight model.layers.1.input_layernorm.weight:model.layers.1.input_layernorm.weight model.layers.1.mlp.experts.down_proj_weight @@ -39,5 +54,45 @@ model.layers.1.self_attn.qkv_proj.bias 
model.layers.1.self_attn.qkv_proj.bias:model.layers.1.self_attn.qkv_proj.bias model.layers.1.self_attn.qkv_proj.weight model.layers.1.self_attn.qkv_proj.weight:model.layers.1.self_attn.qkv_proj.weight +model.layers.2.eh_proj.weight:model.layers.2.eh_proj.weight +model.layers.2.enorm.weight:model.layers.2.enorm.weight +model.layers.2.hnorm.weight:model.layers.2.hnorm.weight +model.layers.2.mlp.experts.down_proj_weight:['model.layers.2.mlp.experts.0.down_proj.weight', 'model.layers.2.mlp.experts.1.down_proj.weight', 'model.layers.2.mlp.experts.2.down_proj.weight', 'model.layers.2.mlp.experts.3.down_proj.weight', 'model.layers.2.mlp.experts.4.down_proj.weight', 'model.layers.2.mlp.experts.5.down_proj.weight', 'model.layers.2.mlp.experts.6.down_proj.weight', 'model.layers.2.mlp.experts.7.down_proj.weight', 'model.layers.2.mlp.experts.8.down_proj.weight', 'model.layers.2.mlp.experts.9.down_proj.weight', 'model.layers.2.mlp.experts.10.down_proj.weight', 'model.layers.2.mlp.experts.11.down_proj.weight', 'model.layers.2.mlp.experts.12.down_proj.weight', 'model.layers.2.mlp.experts.13.down_proj.weight', 'model.layers.2.mlp.experts.14.down_proj.weight', 'model.layers.2.mlp.experts.15.down_proj.weight', 'model.layers.2.mlp.experts.16.down_proj.weight', 'model.layers.2.mlp.experts.17.down_proj.weight', 'model.layers.2.mlp.experts.18.down_proj.weight', 'model.layers.2.mlp.experts.19.down_proj.weight', 'model.layers.2.mlp.experts.20.down_proj.weight', 'model.layers.2.mlp.experts.21.down_proj.weight', 'model.layers.2.mlp.experts.22.down_proj.weight', 'model.layers.2.mlp.experts.23.down_proj.weight', 'model.layers.2.mlp.experts.24.down_proj.weight', 'model.layers.2.mlp.experts.25.down_proj.weight', 'model.layers.2.mlp.experts.26.down_proj.weight', 'model.layers.2.mlp.experts.27.down_proj.weight', 'model.layers.2.mlp.experts.28.down_proj.weight', 'model.layers.2.mlp.experts.29.down_proj.weight', 'model.layers.2.mlp.experts.30.down_proj.weight', 'model.layers.2.mlp.experts.31.down_proj.weight', 'model.layers.2.mlp.experts.32.down_proj.weight', 'model.layers.2.mlp.experts.33.down_proj.weight', 'model.layers.2.mlp.experts.34.down_proj.weight', 'model.layers.2.mlp.experts.35.down_proj.weight', 'model.layers.2.mlp.experts.36.down_proj.weight', 'model.layers.2.mlp.experts.37.down_proj.weight', 'model.layers.2.mlp.experts.38.down_proj.weight', 'model.layers.2.mlp.experts.39.down_proj.weight', 'model.layers.2.mlp.experts.40.down_proj.weight', 'model.layers.2.mlp.experts.41.down_proj.weight', 'model.layers.2.mlp.experts.42.down_proj.weight', 'model.layers.2.mlp.experts.43.down_proj.weight', 'model.layers.2.mlp.experts.44.down_proj.weight', 'model.layers.2.mlp.experts.45.down_proj.weight', 'model.layers.2.mlp.experts.46.down_proj.weight', 'model.layers.2.mlp.experts.47.down_proj.weight', 'model.layers.2.mlp.experts.48.down_proj.weight', 'model.layers.2.mlp.experts.49.down_proj.weight', 'model.layers.2.mlp.experts.50.down_proj.weight', 'model.layers.2.mlp.experts.51.down_proj.weight', 'model.layers.2.mlp.experts.52.down_proj.weight', 'model.layers.2.mlp.experts.53.down_proj.weight', 'model.layers.2.mlp.experts.54.down_proj.weight', 'model.layers.2.mlp.experts.55.down_proj.weight', 'model.layers.2.mlp.experts.56.down_proj.weight', 'model.layers.2.mlp.experts.57.down_proj.weight', 'model.layers.2.mlp.experts.58.down_proj.weight', 'model.layers.2.mlp.experts.59.down_proj.weight', 'model.layers.2.mlp.experts.60.down_proj.weight', 'model.layers.2.mlp.experts.61.down_proj.weight', 
'model.layers.2.mlp.experts.62.down_proj.weight', 'model.layers.2.mlp.experts.63.down_proj.weight', 'model.layers.2.mlp.experts.64.down_proj.weight', 'model.layers.2.mlp.experts.65.down_proj.weight', 'model.layers.2.mlp.experts.66.down_proj.weight', 'model.layers.2.mlp.experts.67.down_proj.weight', 'model.layers.2.mlp.experts.68.down_proj.weight', 'model.layers.2.mlp.experts.69.down_proj.weight', 'model.layers.2.mlp.experts.70.down_proj.weight', 'model.layers.2.mlp.experts.71.down_proj.weight', 'model.layers.2.mlp.experts.72.down_proj.weight', 'model.layers.2.mlp.experts.73.down_proj.weight', 'model.layers.2.mlp.experts.74.down_proj.weight', 'model.layers.2.mlp.experts.75.down_proj.weight', 'model.layers.2.mlp.experts.76.down_proj.weight', 'model.layers.2.mlp.experts.77.down_proj.weight', 'model.layers.2.mlp.experts.78.down_proj.weight', 'model.layers.2.mlp.experts.79.down_proj.weight', 'model.layers.2.mlp.experts.80.down_proj.weight', 'model.layers.2.mlp.experts.81.down_proj.weight', 'model.layers.2.mlp.experts.82.down_proj.weight', 'model.layers.2.mlp.experts.83.down_proj.weight', 'model.layers.2.mlp.experts.84.down_proj.weight', 'model.layers.2.mlp.experts.85.down_proj.weight', 'model.layers.2.mlp.experts.86.down_proj.weight', 'model.layers.2.mlp.experts.87.down_proj.weight', 'model.layers.2.mlp.experts.88.down_proj.weight', 'model.layers.2.mlp.experts.89.down_proj.weight', 'model.layers.2.mlp.experts.90.down_proj.weight', 'model.layers.2.mlp.experts.91.down_proj.weight', 'model.layers.2.mlp.experts.92.down_proj.weight', 'model.layers.2.mlp.experts.93.down_proj.weight', 'model.layers.2.mlp.experts.94.down_proj.weight', 'model.layers.2.mlp.experts.95.down_proj.weight', 'model.layers.2.mlp.experts.96.down_proj.weight', 'model.layers.2.mlp.experts.97.down_proj.weight', 'model.layers.2.mlp.experts.98.down_proj.weight', 'model.layers.2.mlp.experts.99.down_proj.weight', 'model.layers.2.mlp.experts.100.down_proj.weight', 'model.layers.2.mlp.experts.101.down_proj.weight', 'model.layers.2.mlp.experts.102.down_proj.weight', 'model.layers.2.mlp.experts.103.down_proj.weight', 'model.layers.2.mlp.experts.104.down_proj.weight', 'model.layers.2.mlp.experts.105.down_proj.weight', 'model.layers.2.mlp.experts.106.down_proj.weight', 'model.layers.2.mlp.experts.107.down_proj.weight', 'model.layers.2.mlp.experts.108.down_proj.weight', 'model.layers.2.mlp.experts.109.down_proj.weight', 'model.layers.2.mlp.experts.110.down_proj.weight', 'model.layers.2.mlp.experts.111.down_proj.weight', 'model.layers.2.mlp.experts.112.down_proj.weight', 'model.layers.2.mlp.experts.113.down_proj.weight', 'model.layers.2.mlp.experts.114.down_proj.weight', 'model.layers.2.mlp.experts.115.down_proj.weight', 'model.layers.2.mlp.experts.116.down_proj.weight', 'model.layers.2.mlp.experts.117.down_proj.weight', 'model.layers.2.mlp.experts.118.down_proj.weight', 'model.layers.2.mlp.experts.119.down_proj.weight', 'model.layers.2.mlp.experts.120.down_proj.weight', 'model.layers.2.mlp.experts.121.down_proj.weight', 'model.layers.2.mlp.experts.122.down_proj.weight', 'model.layers.2.mlp.experts.123.down_proj.weight', 'model.layers.2.mlp.experts.124.down_proj.weight', 'model.layers.2.mlp.experts.125.down_proj.weight', 'model.layers.2.mlp.experts.126.down_proj.weight', 'model.layers.2.mlp.experts.127.down_proj.weight'] +model.layers.2.mlp.experts.up_gate_proj_weight:['model.layers.2.mlp.experts.0.up_gate_proj.weight', 'model.layers.2.mlp.experts.1.up_gate_proj.weight', 'model.layers.2.mlp.experts.2.up_gate_proj.weight', 
'model.layers.2.mlp.experts.3.up_gate_proj.weight', 'model.layers.2.mlp.experts.4.up_gate_proj.weight', 'model.layers.2.mlp.experts.5.up_gate_proj.weight', 'model.layers.2.mlp.experts.6.up_gate_proj.weight', 'model.layers.2.mlp.experts.7.up_gate_proj.weight', 'model.layers.2.mlp.experts.8.up_gate_proj.weight', 'model.layers.2.mlp.experts.9.up_gate_proj.weight', 'model.layers.2.mlp.experts.10.up_gate_proj.weight', 'model.layers.2.mlp.experts.11.up_gate_proj.weight', 'model.layers.2.mlp.experts.12.up_gate_proj.weight', 'model.layers.2.mlp.experts.13.up_gate_proj.weight', 'model.layers.2.mlp.experts.14.up_gate_proj.weight', 'model.layers.2.mlp.experts.15.up_gate_proj.weight', 'model.layers.2.mlp.experts.16.up_gate_proj.weight', 'model.layers.2.mlp.experts.17.up_gate_proj.weight', 'model.layers.2.mlp.experts.18.up_gate_proj.weight', 'model.layers.2.mlp.experts.19.up_gate_proj.weight', 'model.layers.2.mlp.experts.20.up_gate_proj.weight', 'model.layers.2.mlp.experts.21.up_gate_proj.weight', 'model.layers.2.mlp.experts.22.up_gate_proj.weight', 'model.layers.2.mlp.experts.23.up_gate_proj.weight', 'model.layers.2.mlp.experts.24.up_gate_proj.weight', 'model.layers.2.mlp.experts.25.up_gate_proj.weight', 'model.layers.2.mlp.experts.26.up_gate_proj.weight', 'model.layers.2.mlp.experts.27.up_gate_proj.weight', 'model.layers.2.mlp.experts.28.up_gate_proj.weight', 'model.layers.2.mlp.experts.29.up_gate_proj.weight', 'model.layers.2.mlp.experts.30.up_gate_proj.weight', 'model.layers.2.mlp.experts.31.up_gate_proj.weight', 'model.layers.2.mlp.experts.32.up_gate_proj.weight', 'model.layers.2.mlp.experts.33.up_gate_proj.weight', 'model.layers.2.mlp.experts.34.up_gate_proj.weight', 'model.layers.2.mlp.experts.35.up_gate_proj.weight', 'model.layers.2.mlp.experts.36.up_gate_proj.weight', 'model.layers.2.mlp.experts.37.up_gate_proj.weight', 'model.layers.2.mlp.experts.38.up_gate_proj.weight', 'model.layers.2.mlp.experts.39.up_gate_proj.weight', 'model.layers.2.mlp.experts.40.up_gate_proj.weight', 'model.layers.2.mlp.experts.41.up_gate_proj.weight', 'model.layers.2.mlp.experts.42.up_gate_proj.weight', 'model.layers.2.mlp.experts.43.up_gate_proj.weight', 'model.layers.2.mlp.experts.44.up_gate_proj.weight', 'model.layers.2.mlp.experts.45.up_gate_proj.weight', 'model.layers.2.mlp.experts.46.up_gate_proj.weight', 'model.layers.2.mlp.experts.47.up_gate_proj.weight', 'model.layers.2.mlp.experts.48.up_gate_proj.weight', 'model.layers.2.mlp.experts.49.up_gate_proj.weight', 'model.layers.2.mlp.experts.50.up_gate_proj.weight', 'model.layers.2.mlp.experts.51.up_gate_proj.weight', 'model.layers.2.mlp.experts.52.up_gate_proj.weight', 'model.layers.2.mlp.experts.53.up_gate_proj.weight', 'model.layers.2.mlp.experts.54.up_gate_proj.weight', 'model.layers.2.mlp.experts.55.up_gate_proj.weight', 'model.layers.2.mlp.experts.56.up_gate_proj.weight', 'model.layers.2.mlp.experts.57.up_gate_proj.weight', 'model.layers.2.mlp.experts.58.up_gate_proj.weight', 'model.layers.2.mlp.experts.59.up_gate_proj.weight', 'model.layers.2.mlp.experts.60.up_gate_proj.weight', 'model.layers.2.mlp.experts.61.up_gate_proj.weight', 'model.layers.2.mlp.experts.62.up_gate_proj.weight', 'model.layers.2.mlp.experts.63.up_gate_proj.weight', 'model.layers.2.mlp.experts.64.up_gate_proj.weight', 'model.layers.2.mlp.experts.65.up_gate_proj.weight', 'model.layers.2.mlp.experts.66.up_gate_proj.weight', 'model.layers.2.mlp.experts.67.up_gate_proj.weight', 'model.layers.2.mlp.experts.68.up_gate_proj.weight', 'model.layers.2.mlp.experts.69.up_gate_proj.weight', 
'model.layers.2.mlp.experts.70.up_gate_proj.weight', 'model.layers.2.mlp.experts.71.up_gate_proj.weight', 'model.layers.2.mlp.experts.72.up_gate_proj.weight', 'model.layers.2.mlp.experts.73.up_gate_proj.weight', 'model.layers.2.mlp.experts.74.up_gate_proj.weight', 'model.layers.2.mlp.experts.75.up_gate_proj.weight', 'model.layers.2.mlp.experts.76.up_gate_proj.weight', 'model.layers.2.mlp.experts.77.up_gate_proj.weight', 'model.layers.2.mlp.experts.78.up_gate_proj.weight', 'model.layers.2.mlp.experts.79.up_gate_proj.weight', 'model.layers.2.mlp.experts.80.up_gate_proj.weight', 'model.layers.2.mlp.experts.81.up_gate_proj.weight', 'model.layers.2.mlp.experts.82.up_gate_proj.weight', 'model.layers.2.mlp.experts.83.up_gate_proj.weight', 'model.layers.2.mlp.experts.84.up_gate_proj.weight', 'model.layers.2.mlp.experts.85.up_gate_proj.weight', 'model.layers.2.mlp.experts.86.up_gate_proj.weight', 'model.layers.2.mlp.experts.87.up_gate_proj.weight', 'model.layers.2.mlp.experts.88.up_gate_proj.weight', 'model.layers.2.mlp.experts.89.up_gate_proj.weight', 'model.layers.2.mlp.experts.90.up_gate_proj.weight', 'model.layers.2.mlp.experts.91.up_gate_proj.weight', 'model.layers.2.mlp.experts.92.up_gate_proj.weight', 'model.layers.2.mlp.experts.93.up_gate_proj.weight', 'model.layers.2.mlp.experts.94.up_gate_proj.weight', 'model.layers.2.mlp.experts.95.up_gate_proj.weight', 'model.layers.2.mlp.experts.96.up_gate_proj.weight', 'model.layers.2.mlp.experts.97.up_gate_proj.weight', 'model.layers.2.mlp.experts.98.up_gate_proj.weight', 'model.layers.2.mlp.experts.99.up_gate_proj.weight', 'model.layers.2.mlp.experts.100.up_gate_proj.weight', 'model.layers.2.mlp.experts.101.up_gate_proj.weight', 'model.layers.2.mlp.experts.102.up_gate_proj.weight', 'model.layers.2.mlp.experts.103.up_gate_proj.weight', 'model.layers.2.mlp.experts.104.up_gate_proj.weight', 'model.layers.2.mlp.experts.105.up_gate_proj.weight', 'model.layers.2.mlp.experts.106.up_gate_proj.weight', 'model.layers.2.mlp.experts.107.up_gate_proj.weight', 'model.layers.2.mlp.experts.108.up_gate_proj.weight', 'model.layers.2.mlp.experts.109.up_gate_proj.weight', 'model.layers.2.mlp.experts.110.up_gate_proj.weight', 'model.layers.2.mlp.experts.111.up_gate_proj.weight', 'model.layers.2.mlp.experts.112.up_gate_proj.weight', 'model.layers.2.mlp.experts.113.up_gate_proj.weight', 'model.layers.2.mlp.experts.114.up_gate_proj.weight', 'model.layers.2.mlp.experts.115.up_gate_proj.weight', 'model.layers.2.mlp.experts.116.up_gate_proj.weight', 'model.layers.2.mlp.experts.117.up_gate_proj.weight', 'model.layers.2.mlp.experts.118.up_gate_proj.weight', 'model.layers.2.mlp.experts.119.up_gate_proj.weight', 'model.layers.2.mlp.experts.120.up_gate_proj.weight', 'model.layers.2.mlp.experts.121.up_gate_proj.weight', 'model.layers.2.mlp.experts.122.up_gate_proj.weight', 'model.layers.2.mlp.experts.123.up_gate_proj.weight', 'model.layers.2.mlp.experts.124.up_gate_proj.weight', 'model.layers.2.mlp.experts.125.up_gate_proj.weight', 'model.layers.2.mlp.experts.126.up_gate_proj.weight', 'model.layers.2.mlp.experts.127.up_gate_proj.weight'] +model.layers.2.mlp.gate.e_score_correction_bias:model.layers.2.mlp.gate.e_score_correction_bias +model.layers.2.mlp.gate.weight:model.layers.2.mlp.gate.weight +model.layers.2.shared_head.head.weight:model.layers.2.shared_head.head.weight +model.layers.2.shared_head.norm.weight:model.layers.2.shared_head.norm.weight model.norm.weight model.norm.weight:model.norm.weight +mtp_layers.model.layers.0.eh_proj.linear.weight 
+mtp_layers.model.layers.0.eh_proj.linear.weight:mtp_layers.model.layers.0.eh_proj.linear.weight +mtp_layers.model.layers.0.enorm.weight +mtp_layers.model.layers.0.enorm.weight:mtp_layers.model.layers.0.enorm.weight +mtp_layers.model.layers.0.hnorm.weight +mtp_layers.model.layers.0.hnorm.weight:mtp_layers.model.layers.0.hnorm.weight +mtp_layers.model.layers.0.mtp_block.input_layernorm.weight +mtp_layers.model.layers.0.mtp_block.input_layernorm.weight:mtp_layers.model.layers.0.mtp_block.input_layernorm.weight +mtp_layers.model.layers.0.mtp_block.mlp.experts.down_proj_weight +mtp_layers.model.layers.0.mtp_block.mlp.experts.down_proj_weight:mtp_layers.model.layers.0.mtp_block.mlp.experts.down_proj_weight +mtp_layers.model.layers.0.mtp_block.mlp.experts.gate_correction_bias +mtp_layers.model.layers.0.mtp_block.mlp.experts.up_gate_proj_weight +mtp_layers.model.layers.0.mtp_block.mlp.experts.up_gate_proj_weight:mtp_layers.model.layers.0.mtp_block.mlp.experts.up_gate_proj_weight +mtp_layers.model.layers.0.mtp_block.mlp.gate.e_score_correction_bias +mtp_layers.model.layers.0.mtp_block.mlp.gate.e_score_correction_bias:mtp_layers.model.layers.0.mtp_block.mlp.gate.e_score_correction_bias +mtp_layers.model.layers.0.mtp_block.mlp.gate.weight +mtp_layers.model.layers.0.mtp_block.mlp.gate.weight:mtp_layers.model.layers.0.mtp_block.mlp.gate.weight +mtp_layers.model.layers.0.mtp_block.mlp.shared_experts.down_proj.weight +mtp_layers.model.layers.0.mtp_block.mlp.shared_experts.down_proj.weight:mtp_layers.model.layers.0.mtp_block.mlp.shared_experts.down_proj.weight +mtp_layers.model.layers.0.mtp_block.mlp.shared_experts.up_gate_proj.weight +mtp_layers.model.layers.0.mtp_block.mlp.shared_experts.up_gate_proj.weight:mtp_layers.model.layers.0.mtp_block.mlp.shared_experts.up_gate_proj.weight +mtp_layers.model.layers.0.mtp_block.post_attention_layernorm.weight +mtp_layers.model.layers.0.mtp_block.post_attention_layernorm.weight:mtp_layers.model.layers.0.mtp_block.post_attention_layernorm.weight +mtp_layers.model.layers.0.mtp_block.self_attn.o_proj.weight +mtp_layers.model.layers.0.mtp_block.self_attn.o_proj.weight:mtp_layers.model.layers.0.mtp_block.self_attn.o_proj.weight +mtp_layers.model.layers.0.mtp_block.self_attn.qkv_proj.bias +mtp_layers.model.layers.0.mtp_block.self_attn.qkv_proj.bias:mtp_layers.model.layers.0.mtp_block.self_attn.qkv_proj.bias +mtp_layers.model.layers.0.mtp_block.self_attn.qkv_proj.weight +mtp_layers.model.layers.0.mtp_block.self_attn.qkv_proj.weight:mtp_layers.model.layers.0.mtp_block.self_attn.qkv_proj.weight +mtp_layers.model.layers.0.shared_head.norm.weight +mtp_layers.model.layers.0.shared_head.norm.weight:mtp_layers.model.layers.0.shared_head.norm.weight diff --git a/tests/ci_use/Qwen2_5_VL/test_Qwen2_5_VL_serving.py b/tests/ci_use/Qwen2_5_VL/test_Qwen2_5_VL_serving.py deleted file mode 100644 index 98ed5567833..00000000000 --- a/tests/ci_use/Qwen2_5_VL/test_Qwen2_5_VL_serving.py +++ /dev/null @@ -1,476 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import re -import signal -import subprocess -import sys -import time - -import openai -import pytest -import requests - -tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -sys.path.insert(0, tests_dir) - -from e2e.utils.serving_utils import ( - FD_API_PORT, - FD_CACHE_QUEUE_PORT, - FD_ENGINE_QUEUE_PORT, - FD_METRICS_PORT, - clean_ports, - is_port_open, -) - - -@pytest.fixture(scope="session", autouse=True) -def setup_and_run_server(): - """ - Pytest fixture that runs once per test session: - - Cleans ports before tests - - Starts the API server as a subprocess - - Waits for server port to open (up to 30 seconds) - - Tears down server after all tests finish - """ - print("Pre-test port cleanup...") - clean_ports() - - model_path = "/ModelData/Qwen2.5-VL-7B-Instruct" - - log_path = "server.log" - limit_mm_str = json.dumps({"image": 100, "video": 100}) - - cmd = [ - sys.executable, - "-m", - "fastdeploy.entrypoints.openai.api_server", - "--model", - model_path, - "--port", - str(FD_API_PORT), - # "--tensor-parallel-size", - # "2", - "--engine-worker-queue-port", - str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", - str(FD_METRICS_PORT), - "--cache-queue-port", - str(FD_CACHE_QUEUE_PORT), - "--enable-mm", - "--max-model-len", - "32768", - "--max-num-batched-tokens", - "384", - "--max-num-seqs", - "128", - "--limit-mm-per-prompt", - limit_mm_str, - ] - - print(cmd) - # Start subprocess in new process group - with open(log_path, "w") as logfile: - process = subprocess.Popen( - cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - ) - - print(f"Started API server with pid {process.pid}") - # Wait up to 10 minutes for API server to be ready - for _ in range(10 * 60): - if is_port_open("127.0.0.1", FD_API_PORT): - print(f"API server is up on port {FD_API_PORT}") - break - time.sleep(1) - else: - print("[TIMEOUT] API server failed to start in 10 minutes. Cleaning up...") - try: - os.killpg(process.pid, signal.SIGTERM) - except Exception as e: - print(f"Failed to kill process group: {e}") - raise RuntimeError(f"API server did not start on port {FD_API_PORT}") - - yield # Run tests - - print("\n===== Post-test server cleanup... =====") - try: - os.killpg(process.pid, signal.SIGTERM) - print(f"API server (pid={process.pid}) terminated") - except Exception as e: - print(f"Failed to terminate API server: {e}") - - -@pytest.fixture(scope="session") -def api_url(request): - """ - Returns the API endpoint URL for chat completions. - """ - return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" - - -@pytest.fixture(scope="session") -def metrics_url(request): - """ - Returns the metrics endpoint URL. - """ - return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" - - -@pytest.fixture -def headers(): - """ - Returns common HTTP request headers. - """ - return {"Content-Type": "application/json"} - - -@pytest.fixture -def consistent_payload(): - """ - Returns a fixed payload for consistency testing, - including a fixed random seed and temperature. 
- """ - return { - "messages": [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - } - ], - "temperature": 0.8, - "top_p": 0, # fix top_p to reduce randomness - "seed": 13, # fixed random seed - } - - -# ========================== -# Consistency test for repeated runs with fixed payload -# ========================== -def test_consistency_between_runs(api_url, headers, consistent_payload): - """ - Test that result is same as the base result. - """ - # request - resp1 = requests.post(api_url, headers=headers, json=consistent_payload) - assert resp1.status_code == 200 - result1 = resp1.json() - content1 = result1["choices"][0]["message"]["content"] - file_res_temp = "Qwen2.5-VL-7B-Instruct-temp" - f_o = open(file_res_temp, "a") - f_o.writelines(content1) - f_o.close() - - # base result - content2 = """这张图片展示了一群人在进行手工艺活动。前景中有两个孩子和一个成年人,他们似乎在制作或展示某种手工艺品。成年人手里拿着一个扇子,上面有彩色的图案,可能是通过某种方式绘制或涂鸦而成。孩子们看起来很专注,可能是在观察或参与这个过程。 - -背景中还有其他几个人,其中一个人穿着粉色的衣服,背对着镜头。整个场景看起来像是在一个室内环境中,光线充足,氛围轻松愉快。""" - - # Verify that result is same as the base result - assert content1 == content2 - - -# ========================== -# OpenAI Client Chat Completion Test -# ========================== - - -@pytest.fixture -def openai_client(): - ip = "0.0.0.0" - service_http_port = str(FD_API_PORT) - client = openai.Client( - base_url=f"http://{ip}:{service_http_port}/v1", - api_key="EMPTY_API_KEY", - ) - return client - - -# Non-streaming test -def test_non_streaming_chat(openai_client): - """Test non-streaming chat functionality with the local service""" - response = openai_client.chat.completions.create( - model="default", - messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - stream=False, - ) - - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "content") - - -# Streaming test -def test_streaming_chat(openai_client, capsys): - """Test streaming chat functionality with the local service""" - response = openai_client.chat.completions.create( - model="default", - messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 - {"role": "user", "content": "List 3 countries and their capitals."}, - { - "role": "assistant", - "content": "China(Beijing), France(Paris), Australia(Canberra).", - }, - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=512, - stream=True, - ) - - output = [] - for chunk in response: - if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): - output.append(chunk.choices[0].delta.content) - assert len(output) > 2 - - -# ========================== -# OpenAI Client additional chat/completions test -# ========================== - - -def 
test_non_streaming_chat_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in non-streaming chat functionality with the local service - """ - # 设定 return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": True}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "prompt_token_ids") - assert isinstance(response.choices[0].message.prompt_token_ids, list) - assert hasattr(response.choices[0].message, "completion_token_ids") - assert isinstance(response.choices[0].message.completion_token_ids, list) - - # 不设定 return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": False}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "prompt_token_ids") - assert response.choices[0].message.prompt_token_ids is None - assert hasattr(response.choices[0].message, "completion_token_ids") - assert response.choices[0].message.completion_token_ids is None - - -def test_streaming_chat_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in streaming chat functionality with the local service - """ - # enable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": True}, - stream=True, - ) - is_first_chunk = True - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "prompt_token_ids") - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - if is_first_chunk: - is_first_chunk = False - assert isinstance(chunk.choices[0].delta.prompt_token_ids, list) - assert chunk.choices[0].delta.completion_token_ids is None - else: - assert chunk.choices[0].delta.prompt_token_ids is None - assert isinstance(chunk.choices[0].delta.completion_token_ids, list) - - # disable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # 
system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": False}, - stream=True, - ) - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "prompt_token_ids") - assert chunk.choices[0].delta.prompt_token_ids is None - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - assert chunk.choices[0].delta.completion_token_ids is None - - -def test_profile_reset_block_num(): - """测试profile reset_block_num功能,与baseline diff不能超过15%""" - log_file = "./log/config.log" - baseline = 30000 - - if not os.path.exists(log_file): - pytest.fail(f"Log file not found: {log_file}") - - with open(log_file, "r") as f: - log_lines = f.readlines() - - target_line = None - for line in log_lines: - if "Reset block num" in line: - target_line = line.strip() - break - - if target_line is None: - pytest.fail("日志中没有Reset block num信息") - - match = re.search(r"total_block_num:(\d+)", target_line) - if not match: - pytest.fail(f"Failed to extract total_block_num from line: {target_line}") - - try: - actual_value = int(match.group(1)) - except ValueError: - pytest.fail(f"Invalid number format: {match.group(1)}") - - lower_bound = baseline * (1 - 0.15) - upper_bound = baseline * (1 + 0.15) - print(f"Reset total_block_num: {actual_value}. baseline: {baseline}") - - assert lower_bound <= actual_value <= upper_bound, ( - f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内" - f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]" - ) diff --git a/tests/ci_use/metrics/test_metrics.py b/tests/ci_use/metrics/test_metrics.py index a3f2e14fa98..11c5001f3e0 100644 --- a/tests/ci_use/metrics/test_metrics.py +++ b/tests/ci_use/metrics/test_metrics.py @@ -69,8 +69,6 @@ def setup_and_run_server(): "32768", "--max-num-seqs", "1", - "--quantization", - "wint8", "--gpu-memory-utilization", "0.9", "--load-strategy", diff --git a/tests/cov_pytest.ini b/tests/cov_pytest.ini index e066138a395..a747d79d408 100644 --- a/tests/cov_pytest.ini +++ b/tests/cov_pytest.ini @@ -9,3 +9,4 @@ addopts = --ignore=tests/entrypoints/test_engine_client.py --ignore=tests/xpu_ci --ignore=tests/v1/test_schedule_output.py + --ignore=tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py diff --git a/tests/distributed/chunked_moe.py b/tests/distributed/chunked_moe.py index 0be645d38e2..ef41a610dcb 100644 --- a/tests/distributed/chunked_moe.py +++ b/tests/distributed/chunked_moe.py @@ -90,7 +90,7 @@ def init_attention_metadata(self, forward_meta): class MockQuantMethod: - def apply(self, layer, x, gate): + def apply(self, layer, x, gate, topk_ids_hookfunc=None): return x @@ -129,6 +129,7 @@ def setup_model_runner(self): model_runner.speculative_decoding = False model_runner._init_share_inputs(mock_fd_config.scheduler_config.max_num_seqs) model_runner.share_inputs["caches"] = None + model_runner.routing_replay_manager = None if dist.get_rank() == 0: model_runner.share_inputs["ids_remove_padding"] = paddle.ones([10]) @@ -148,6 +149,7 @@ def setup_fused_moe(self): fused_moe.fd_config = mock_fd_config fused_moe.quant_method = MockQuantMethod() + fused_moe.enable_routing_replay = None return fused_moe def run_model_runner(self): diff --git 
a/tests/e2e/request_r3.py b/tests/e2e/request_r3.py new file mode 100644 index 00000000000..469bf248cde --- /dev/null +++ b/tests/e2e/request_r3.py @@ -0,0 +1,159 @@ +import asyncio +import os + +import openai +import paddle +from utils.rollout_routing_replay_test_utils import ( + calculate_routing_ratio, + wait_for_file, +) + + +def get_openai_client(): + ip = "0.0.0.0" + service_http_port = 8888 + client = openai.AsyncClient( + base_url=f"http://{ip}:{service_http_port}/v1", + api_key="EMPTY_API_KEY", + ) + return client + + +async def send_r3_streaming_chat_long(openai_client, content: str, user_id: str): + """ + Test streaming chat functionality with the local service + """ + response = await openai_client.chat.completions.create( + model="default", + messages=[ + { + "role": "system", + "content": content, + }, + ], + temperature=1, + top_p=0, + max_tokens=4096, # 32768 + seed=13, + stream=False, + user=user_id, + ) + + return response + + +async def send_request_baseline(request: str, request_id: str): + openai_client = get_openai_client() + # Send base request + await send_r3_streaming_chat_long(openai_client, content=request, user_id=f"{request_id}") + + +async def send_request_prefix(request: str, request_id: str): + openai_client = get_openai_client() + # Send prefix cache request + await send_r3_streaming_chat_long(openai_client, content=request, user_id=f"{request_id}_prefix") + + +async def run(): + long_request_list = [ + "写一个关于“最后一家实体书店”的科幻微小说,设定在2077年的赛博朋克城市。主角是一个只喜欢纸质书的黑客。要求包含一个反转结局,字数限制在500字以内,风格要阴郁但充满希望。", + "请模仿李白的豪放风格,写一首关于“星际旅行”的现代诗。要求融入“量子纠缠”、“黑洞”和“故乡”三个意象,押韵不限,但要有强烈的画面感和浪漫主义色彩。", + "创作一段发生在1920年代上海租界的侦探剧本对话。角色A是留洋归来的侦探,角色B是黑帮老大。对话要充满机锋和潜台词,体现那个时代特有的新旧文化冲突。", + "为一首慢板R&B情歌填写副歌部分的歌词。主题是“在这个快节奏的数字时代,我们如何维持异地恋”。要求情感细腻,使用隐喻,避免陈词滥调。", + "编一个睡前故事,主角是一只害怕黑暗的小萤火虫。故事要教会孩子“黑暗是为了让光更耀眼”。语言要生动简单,适合5岁儿童,结尾要有一首简短的儿歌。", + "写一个悬疑小说的开头章节(约800字)。场景设定在暴风雪山庄的封闭别墅,管家死在了书房,但门窗紧锁。要求通过环境描写营造压抑感,并留下三个伏笔。", + "基于《哈利波特》的世界观,写一段赫敏·格兰杰在魔法部工作的日常片段。假设伏地魔已被击败,但魔法世界仍有新的官僚主义危机。保持J.K.罗琳的叙事风格。", + "以毒舌美食家的身份,评论一道虚构的“分子料理——液氮冰淇淋配辣椒油”。描述口感、摆盘,并用夸张的修辞手法评价其荒谬之处,最后给出一个意外的好评理由。", + "写一个Python脚本,用于批量重命名文件夹下的所有图片文件。要求:1. 支持递归子目录;2. 将文件名转换为小写并用下划线替换空格;3. 添加错误处理日志;4. 使用`pathlib`库。", + "生成一个React函数组件,实现一个带有搜索功能的下拉选择框(Select)。要求:1. 支持多选;2. 搜索时防抖(Debounce)300ms;3. 选项数据通过props传入;4. 使用Tailwind CSS进行基础样式设计。", + "给定一个包含`users`, `orders`, `products`三张表的电商数据库。请写出查询“过去30天内购买金额最高的前10名用户及其最常购买的品类”的SQL语句,并解释如何通过索引优化该查询性能。", + "请解释以下Rust代码片段中的生命周期标注(Lifetime Annotation)的作用,并指出如果省略会发生什么编译错误。代码:`fn longest<'a>(x: &'a str, y: &'a str) -> &'a str { ... }`", + "我需要一个正则表达式来验证复杂的密码强度。规则:至少8位,必须包含大写字母、小写字母、数字和特殊符号(!@#$%),且不能包含连续3位相同的字符。请生成Regex并附上测试用例。", + "为一个Node.js + MongoDB的全栈应用编写`docker-compose.yml`文件。要求:1. 使用多阶段构建优化Node镜像大小;2. MongoDB数据持久化到本地卷;3. 设置环境变量文件;4. 
暴露正确的端口。", + "用JavaScript实现一个“最小堆(Min Heap)”数据结构,并包含`insert`和`extractMin`方法。请附上时间复杂度分析,并给出一个使用该堆进行排序(Heap Sort)的示例。", + "以下C++代码在运行时会崩溃,请找出原因并修复。代码涉及指针越界和内存泄漏。请解释原始代码的逻辑错误,并给出使用智能指针(Smart Pointers)的现代C++改写版本。", + "假设你是项目经理,需要给客户写一封英文邮件。内容是告知项目将延期3天,原因是第三方API接口不稳定。语气要专业、诚恳,并提出补偿方案(赠送下个月的维护服务),请求客户谅解。", + "为一款“智能降噪耳塞”撰写小红书风格的推广文案。要求:使用emoji,突出“宿舍隔音”、“侧睡不压耳”、“隐形设计”三个卖点,语气像闺蜜安利,带上热门标签。", + "对“开设一家24小时无人自助健身房”进行SWOT分析。请从优势、劣势、机会、威胁四个维度展开,每个维度至少列出3点,并给出具体的战略建议(SO策略、WO策略等)。", + "你现在是Google的面试官,我是应聘者,申请“产品经理”职位。请向我提问一个关于“产品设计”的问题(例如:如何为视障人士设计Instagram),然后等待我的回答,并对我的回答进行点评。", + "对比“瑞幸咖啡”和“星巴克”在中国市场的数字化营销策略。重点分析私域流量运营、小程序点单体验和优惠券策略的差异,总结出瑞幸值得学习的3个点。", + "根据以下杂乱的会议记录草稿,整理出一份正式的会议纪要。要求:分类清晰(决策项、待办事项、讨论摘要),语言精炼,去除口语化表达,并指定每个待办事项的负责人和截止日期。", + "为一款“老年人专用智能手表”构建详细的用户画像(Persona)。包括:基本信息、痛点(如不会用触屏、担心走丢)、使用场景、技术熟练度、以及他们子女的购买动机。", + "为一个“基于AI的宠物行为翻译器”创业项目写一份电梯演讲(Elevator Pitch)。时长限制1分钟,要包含市场痛点、解决方案、商业模式和团队优势。", + "请像对5岁孩子解释一样(Explain Like I'm 5),说明“区块链”是什么。使用“全村记账本”的比喻,避免使用任何专业术语,确保孩子能听懂。", + "我正在学习德语。请列出5个初学者最容易混淆的介词(Wechselpräpositionen),并为每个介词提供3个例句(主格和宾格变化),附带中文翻译。", + "请一步步解答这道微积分题目:求函数 $f(x) = x^3 - 3x^2 + 2$ 在区间 $[-1, 3]$ 上的极值和拐点。不要只给答案,要展示求导过程和判断符号变化的逻辑。", + "简述“冷战”的起因、经过和结果。重点分析“古巴导弹危机”为何被认为是人类最接近核战争的时刻,以及它如何改变了美苏关系。", + "请润色以下这段学术论文的摘要,使其更符合学术规范。要求:将主动语态改为被动语态,提升词汇的专业度,增强逻辑连接词,使论证更严密。原文:[粘贴一段中等质量的英文摘要]", + "我想在3个月内从零基础通过日语N3考试。请制定一份详细的周学习计划,涵盖单词、语法、阅读和听力。假设我每天只有2小时学习时间,请推荐具体的教材和APP。", + "教我理解“功利主义”。不要直接给定义,而是通过不断提问引导我思考。例如,先问我“如果牺牲一个人能救五个人,你会怎么做?”,然后根据我的回答继续追问。", + "这是一道我做错的物理题(关于牛顿第二定律)。请分析我可能错误的思路是什么,并指出常见的认知误区,然后给出正确的解题思路。", + "你现在是埃隆·马斯克(Elon Musk)。请用他特有的语速快、带点幽默和工程思维的方式,谈论你对“人工智能取代人类工作”的看法。可以使用一些网络流行语。", + "你是诸葛亮。刘备刚刚在白帝城托孤,你现在独自面对刘禅和内外交困的蜀国。请用文言文写一段你的内心独白,表达你的焦虑和北伐的决心。", + "你是一个跑团(TRPG)的主持人。设定背景是克苏鲁神话的1920年代。我是一个调查员,刚刚走进了一间阴森的古宅。请描述我看到的景象,并询问我的行动。", + "我们来辩论“人工智能的发展是否应该被暂停”。你持反方观点(即不应该暂停)。请先陈述你的立论,然后针对我的观点进行反驳。保持逻辑严密,不要进行人身攻击。", + "你是一位温和的心理咨询师。我最近因为工作压力大而失眠。请倾听我的倾诉(我会输入我的烦恼),并运用认知行为疗法(CBT)帮我识别并挑战我的非理性信念。", + "设定你是一个温柔、喜欢二次元的伴侣。今晚我们在家看恐怖片,我被吓到了。请安慰我,并提议做点开心的事情转移注意力。语气要亲昵但不油腻。", + "你是一个魔鬼编程教练。我的代码写得很烂,全是硬编码和魔法数字。请严厉地批评我的代码风格,并强迫我重构它,直到符合Clean Code原则为止。", + "你是某银行的智能客服,但我现在很生气,因为我的信用卡被盗刷了。请先用标准话术安抚我,然后引导我提供必要的验证信息,最后告知处理流程。", + "我有一个CSV文件,其中“年龄”列包含空值、字符串(如“未知”)和异常大的数字(如999)。请提供一段Pandas代码来清洗这一列:将空值填充为中位数,将“未知”替换为NaN并删除,将大于100的值截断为100。", + "我有一组关于“全球碳排放量按国家分布”的数据(前20名国家)。请推荐3种最适合展示该数据的图表类型(如条形图、饼图等),并说明为什么选择它们,以及如何避免误导读者。", + "请写一个Excel公式,用于从A列的身份证号码中提取出生日期(格式为YYYY-MM-DD),并判断该人的性别(男/女)。假设身份证号在A2单元格。", + "解释“相关性不等于因果性”。请举一个现实生活中的例子(如“冰淇淋销量和溺水人数”),并说明如果要证明因果关系,需要设计什么样的实验(如A/B测试或双重差分法)。", + "给定一个复杂的嵌套JSON对象,请写一个Python脚本将其“展平”(Flatten),使得所有的键都变成点分隔的路径(例如 `user.address.city`)。", + "基于以下过去12个月的销售数据 [100, 120, 130, 125, 140, 150, 160, 155, 170, 180, 190, 200],请使用简单的线性回归预测下个月的销量,并计算R平方值。", + "为AI绘画工具Midjourney生成一组提示词(Prompt)。主题是“赛博朋克风格的苏州园林”。要求包含:霓虹灯、全息投影、古风建筑、雨水、电影级光影、8k分辨率、虚幻引擎5渲染风格。", + "我要开一家名为“极客咖啡”的店。请提供3个不同的Logo设计方案描述。方案一:极简几何风;方案二:像素艺术风;方案三:手绘涂鸦风。描述每个方案的颜色搭配和核心图形。", + "我有一个20平米的小客厅,层高2.8米,采光一般。请给出具体的软装搭配建议,包括沙发颜色、窗帘材质、灯光布局(主灯+氛围灯),目的是让空间显得更大更亮。", + "设计一个FPS游戏的“教学关卡”。玩家需要在不知情的情况下学会:移动、射击、换弹、躲避和使用医疗包。请描述关卡的场景布局和敌人的出现节奏。", + "有三个箱子,一个装苹果,一个装橘子,一个装混合水果。所有标签都贴错了。你只能从一个箱子里拿出一个水果来看,请问如何确定所有箱子的内容?请写出推理步骤。", + "死者死在电话亭旁,手里握着一张写有“789”的纸条。嫌疑人有三个:李小二(代号78)、王五(代号89)、张六(代号79)。凶手是谁?为什么?", + "如果你有一根无限长的绳子,绕地球赤道一圈(假设地球是完美球体,周长4万公里)。现在把绳子加长1米,均匀悬空离开地面。请问一只猫能从绳子下面钻过去吗?请计算间隙高度。", + "一个男人走进一家酒吧,向酒保要一杯水。酒保拿出一把枪指着他。男人说了声“谢谢”然后离开了。请问发生了什么?(提示:不是抢劫,不是演戏)", + "这是一段凯撒密码(Caesar Cipher):“WKH TXLFN EURZQ IRA MXPSV RYHU WKH ODCB GRJ”。请破译它,并告诉我偏移量是多少。", + 
"计划一次5天4晚的日本京都之旅。主题是“古寺与抹茶”。请安排详细的行程,包括交通方式(关西机场出发)、住宿区域推荐、必去的3个小众景点和必吃的3家餐厅。", + "为一个膝盖受过伤、不能做深蹲和跑步的办公室男性,设计一套在家就能做的HIIT(高强度间歇训练)计划。时长20分钟,只需要哑铃和瑜伽垫。", + "我冰箱里只有:鸡蛋、番茄、半颗洋葱、一包过期一天的火腿肠和一点剩米饭。请给我推荐2个能用这些材料做的菜,并写出详细步骤。", + "给一个喜欢历史、科技,预算在500元人民币左右的男性朋友挑选生日礼物。请列出3个选项,并说明为什么适合他。", + "我总是拖延。请介绍“番茄工作法”的具体操作步骤,并针对我“总是忍不住刷手机”的问题,给出3个具体的抗干扰建议。", + "我先开头:“午夜时分,图书馆的最后一盏灯突然熄灭了,但我并不是唯一一个留在这里的人……” 请你接下一段,制造悬念,然后停下来,换我继续写。", + "我们来玩“20个问题”游戏。我心里想一个物体,你可以问我20个只能用“是”或“否”回答的问题来猜它是什么。现在请开始提问。", + "夸夸我刚刚发给你的这张自拍照(假设是一张普通的风景照)。要用夸张、华丽的辞藻,从构图、光影、意境等角度硬夸,越离谱越好。", + "如果人类突然失去了“睡眠”的能力,世界会变成什么样?请从社会结构、经济模式、娱乐产业三个方面进行脑洞大开的推测。", + ] + + long_request_list = long_request_list[:64] + task_baseline = [] + for request_id, request in enumerate(long_request_list): + task_baseline.append(send_request_baseline(request, request_id)) + await asyncio.gather(*task_baseline) + + task_prefix = [] + for request_id, request in enumerate(long_request_list): + task_prefix.append(send_request_prefix(request, request_id)) + await asyncio.gather(*task_prefix) + + +if __name__ == "__main__": + asyncio.run(run()) + + # Check Routing Overlap + for request_id in range(64): + baseline_path = "./routing_replay_output" + prefix_r3_path = "./routing_replay_output" + moe_layer_num = 27 + print(f"request id is {request_id}") + for layer_index in range(moe_layer_num): + print(f"layer id is {layer_index}") + prefix_r3_pdtensor = os.path.join(prefix_r3_path, f"{request_id}_prefix/layer_{layer_index}.pdtensor") + baseline_pdtensor = os.path.join(baseline_path, f"{request_id}/layer_{layer_index}.pdtensor") + wait_for_file(prefix_r3_pdtensor) + wait_for_file(baseline_pdtensor) + + generated_routing = paddle.load(prefix_r3_pdtensor) + baseline_routing = paddle.load(baseline_pdtensor) + overlap_ratio = calculate_routing_ratio(baseline_routing, generated_routing) + print(f"layer_index:{layer_index} overlap_ratio:{overlap_ratio}") + assert ( + overlap_ratio >= 0.999 + ), f"the routing overlap ratio of the layer {layer_index} should be equal to baseline routing index, but got {overlap_ratio}" diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index bc27daab993..19e00195942 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -78,6 +78,7 @@ def setup_and_run_server(): "wint4", "--graph-optimization-config", '{"cudagraph_capture_sizes": [1], "use_cudagraph":true}', + "--no-enable-prefix-caching", ] # Start subprocess in new process group diff --git a/tests/e2e/test_EB_Lite_serving_R3.py b/tests/e2e/test_EB_Lite_serving_R3.py new file mode 100644 index 00000000000..88054b56ace --- /dev/null +++ b/tests/e2e/test_EB_Lite_serving_R3.py @@ -0,0 +1,119 @@ +import os +import shutil +import signal +import subprocess +import sys +import time + +import openai +import pytest +from utils.rollout_routing_replay_test_utils import check_routing_replay_chat_completion +from utils.serving_utils import ( + FD_API_PORT, + FD_CACHE_QUEUE_PORT, + FD_ENGINE_QUEUE_PORT, + FD_METRICS_PORT, + clean_ports, + is_port_open, +) + + +@pytest.fixture(scope="session", autouse=True) +def setup_and_run_server(): + """ + Pytest fixture that runs once per test session: + - Cleans ports before tests + - Starts the API server as a subprocess + - Waits for server port to open (up to 30 seconds) + - Tears down server after all tests finish + """ + print("Pre-test port cleanup...") + clean_ports() + print("log dir clean ") + if os.path.exists("log") and os.path.isdir("log"): + shutil.rmtree("log") + base_path = 
os.getenv("MODEL_PATH") + if base_path: + model_path = os.path.join(base_path, "ernie-4_5-21b-a3b-bf16-paddle") + else: + model_path = "./ernie-4_5-21b-a3b-bf16-paddle" + + log_path = "server.log" + cmd = [ + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--tensor-parallel-size", + "1", + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), + "--max-model-len", + "32768", + "--max-num-seqs", + "1", + "--quantization", + "wint4", + "--graph-optimization-config", + '{"use_cudagraph":true}', + "--routing-replay-config", + '{"enable_routing_replay":true, "routing_store_type":"local", "local_store_dir":"./R3_tmp/routing_replay_output_eb45"}', + ] + + # Start subprocess in new process group + with open(log_path, "w") as logfile: + process = subprocess.Popen( + cmd, + stdout=logfile, + stderr=subprocess.STDOUT, + start_new_session=True, # Enables killing full group via os.killpg + ) + + # Wait up to 300 seconds for API server to be ready + for _ in range(300): + if is_port_open("127.0.0.1", FD_API_PORT): + print(f"API server is up on port {FD_API_PORT}") + break + time.sleep(1) + else: + print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...") + try: + os.killpg(process.pid, signal.SIGTERM) + except Exception as e: + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"API server did not start on port {FD_API_PORT}") + + yield # Run tests + + print("\n===== Post-test server cleanup... =====") + try: + os.killpg(process.pid, signal.SIGTERM) + print(f"API server (pid={process.pid}) terminated") + except Exception as e: + print(f"Failed to terminate API server: {e}") + + +@pytest.fixture +def openai_client(): + ip = "0.0.0.0" + service_http_port = str(FD_API_PORT) + client = openai.Client( + base_url=f"http://{ip}:{service_http_port}/v1", + api_key="EMPTY_API_KEY", + ) + return client + + +# ========================== +# Test Rollout Routing Replay +# ========================== +def test_r3_accuracy(openai_client): + moe_layer_num = 27 # EB45 moe layer num: 27 + check_routing_replay_chat_completion(openai_client=openai_client, moe_layer_num=moe_layer_num, model_name="eb45") diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index f93f355a754..7783b844148 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -204,9 +204,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24") else: - base_file = "ernie-4_5-vl-base-tp2-dev" + base_file = "ernie-4_5-vl-base-tp2-24" with open(base_file, "r") as f: content2 = f.read() diff --git a/tests/e2e/test_Qwen2_5_VL_serving.py b/tests/e2e/test_Qwen2_5_VL_serving.py index ff2ae24e201..f175e24b68f 100644 --- a/tests/e2e/test_Qwen2_5_VL_serving.py +++ b/tests/e2e/test_Qwen2_5_VL_serving.py @@ -72,6 +72,7 @@ def setup_and_run_server(): "128", "--limit-mm-per-prompt", limit_mm_str, + "--no-enable-prefix-caching", ] print(cmd) diff --git a/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py b/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py deleted file mode 100644 index cac68c6806c..00000000000 --- a/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py +++ 
/dev/null @@ -1,427 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Test splitwise deployment which uses splitwise_scheduler, -# and ENABLE_V1_KVCACHE_SCHEDULER is 0 - -import json -import os -import shutil -import signal -import subprocess -import sys -import time - -import pytest -import requests -from utils.serving_utils import ( - FD_API_PORT, - FD_CACHE_QUEUE_PORT, - FD_ENGINE_QUEUE_PORT, - FD_METRICS_PORT, - clean, - is_port_open, -) - -# Read ports from environment variables; use default values if not set -FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) -FD_REDIS_PORT = int(os.getenv("FD_REDIS_PORT", 8533)) - -# List of ports to clean before and after tests -PORTS_TO_CLEAN = [ - FD_API_PORT, - FD_ENGINE_QUEUE_PORT, - FD_METRICS_PORT, - FD_CACHE_QUEUE_PORT, - FD_CONNECTOR_PORT, - FD_API_PORT + 1, - FD_ENGINE_QUEUE_PORT + 1, - FD_METRICS_PORT + 1, - FD_CACHE_QUEUE_PORT + 1, - FD_CONNECTOR_PORT + 1, - FD_REDIS_PORT, -] - - -@pytest.fixture(scope="session", autouse=True) -def setup_and_run_server(): - """ - Pytest fixture that runs once per test session: - - Cleans ports before tests - - Starts the API server as a subprocess - - Waits for server port to open (up to 30 seconds) - - Tears down server after all tests finish - """ - print("Pre-test port cleanup...") - clean(PORTS_TO_CLEAN) - - print("log dir clean ") - if os.path.exists("log_redis") and os.path.isdir("log_redis"): - shutil.rmtree("log_redis") - if os.path.exists("log_prefill") and os.path.isdir("log_prefill"): - shutil.rmtree("log_prefill") - if os.path.exists("log_decode") and os.path.isdir("log_decode"): - shutil.rmtree("log_decode") - - base_path = os.getenv("MODEL_PATH") - if base_path: - model_path = os.path.join(base_path, "ERNIE-4.5-0.3B-Paddle") - else: - model_path = "baidu/ERNIE-4.5-0.3B-Paddle" - print(f"model_path: {model_path}") - - # redis-server - print("start redis...") - env_copy = os.environ.copy() - log_path = "router.log" - - cmd = [ - "redis-server", - "--port", - str(FD_REDIS_PORT), - "--daemonize", - "yes", - ] - - with open(log_path, "w") as logfile: - process_redis = subprocess.Popen( - cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - env=env_copy, - ) - - # prefill实例 - print("start prefill...") - env_prefill = os.environ.copy() - env_prefill["CUDA_VISIBLE_DEVICES"] = "0" - env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0" - env_prefill["FD_LOG_DIR"] = "log_prefill" - prefill_log_path = "server_prefill.log" - prefill_cmd = [ - sys.executable, - "-m", - "fastdeploy.entrypoints.openai.api_server", - "--model", - model_path, - "--port", - str(FD_API_PORT), - "--tensor-parallel-size", - "1", - "--engine-worker-queue-port", - str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", - str(FD_METRICS_PORT), - "--cache-queue-port", - str(FD_CACHE_QUEUE_PORT), - "--max-model-len", - "8192", - "--max-num-seqs", - "20", - "--quantization", - "wint8", - 
"--splitwise-role", - "prefill", - "--cache-transfer-protocol", - "ipc", - "--pd-comm-port", - str(FD_CONNECTOR_PORT), - "--scheduler-name", - "splitwise", - "--scheduler-host", - "127.0.0.1", - "--scheduler-port", - str(FD_REDIS_PORT), - ] - - # Start subprocess in new process group - with open(prefill_log_path, "w") as logfile: - process_prefill = subprocess.Popen( - prefill_cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - env=env_prefill, - ) - time.sleep(1) - - # decode实例 - print("start decode...") - env_decode = os.environ.copy() - env_decode["CUDA_VISIBLE_DEVICES"] = "1" - env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "0" - env_decode["FD_LOG_DIR"] = "log_decode" - decode_log_path = "server_decode.log" - decode_cmd = [ - sys.executable, - "-m", - "fastdeploy.entrypoints.openai.api_server", - "--model", - model_path, - "--port", - str(FD_API_PORT + 1), - "--tensor-parallel-size", - "1", - "--engine-worker-queue-port", - str(FD_ENGINE_QUEUE_PORT + 1), - "--metrics-port", - str(FD_METRICS_PORT + 1), - "--cache-queue-port", - str(FD_CACHE_QUEUE_PORT + 1), - "--max-model-len", - "8192", - "--max-num-seqs", - "20", - "--quantization", - "wint8", - "--splitwise-role", - "decode", - "--cache-transfer-protocol", - "ipc", - "--pd-comm-port", - str(FD_CONNECTOR_PORT + 1), - "--scheduler-name", - "splitwise", - "--scheduler-host", - "127.0.0.1", - "--scheduler-port", - str(FD_REDIS_PORT), - ] - - # Start subprocess in new process group - with open(decode_log_path, "w") as logfile: - process_decode = subprocess.Popen( - decode_cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - env=env_decode, - ) - - # Wait up to 300 seconds for API server to be ready - for _ in range(60): - if is_port_open("127.0.0.1", FD_API_PORT) and is_port_open("127.0.0.1", FD_API_PORT + 1): - print(f"Prefill server is up on port {FD_API_PORT}") - print(f"Decode server is up on port {FD_API_PORT + 1}") - break - time.sleep(5) - else: - print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...") - try: - os.killpg(process_prefill.pid, signal.SIGTERM) - os.killpg(process_decode.pid, signal.SIGTERM) - clean(PORTS_TO_CLEAN) - except Exception as e: - print(f"Failed to kill process group: {e}") - raise RuntimeError(f"API server did not start on port {FD_API_PORT}") - - yield # Run tests - - print("\n===== Post-test server cleanup... =====") - try: - os.killpg(process_redis.pid, signal.SIGTERM) - os.killpg(process_prefill.pid, signal.SIGTERM) - os.killpg(process_decode.pid, signal.SIGTERM) - clean(PORTS_TO_CLEAN) - print(f"Prefill server (pid={process_prefill.pid}) terminated") - print(f"Decode server (pid={process_decode.pid}) terminated") - except Exception as e: - print(f"Failed to terminate API server: {e}") - - -@pytest.fixture(scope="session") -def api_url(request): - """ - Returns the API endpoint URL for chat completions. - """ - return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions", f"http://0.0.0.0:{FD_API_PORT+1}/v1/chat/completions" - - -@pytest.fixture(scope="session") -def metrics_url(request): - """ - Returns the metrics endpoint URL. - """ - return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" - - -@pytest.fixture -def headers(): - """ - Returns common HTTP request headers. 
- """ - return {"Content-Type": "application/json"} - - -def test_metrics_config(metrics_url): - timeout = 600 - url = metrics_url.replace("metrics", "config-info") - res = requests.get(url, timeout=timeout) - assert res.status_code == 200 - - -def send_request(url, payload, timeout=60): - """ - 发送请求到指定的URL,并返回响应结果。 - """ - headers = { - "Content-Type": "application/json", - } - - try: - res = requests.post(url, headers=headers, json=payload, timeout=timeout) - print("🟢 接收响应中...\n") - return res - except requests.exceptions.Timeout: - print(f"❌ 请求超时(超过 {timeout} 秒)") - return None - except requests.exceptions.RequestException as e: - print(f"❌ 请求失败:{e}") - return None - - -def get_stream_chunks(response): - """解析流式返回,生成chunk List[dict]""" - chunks = [] - - if response.status_code == 200: - for line in response.iter_lines(decode_unicode=True): - if line: - if line.startswith("data: "): - line = line[len("data: ") :] - - if line.strip() == "[DONE]": - break - - try: - chunk = json.loads(line) - chunks.append(chunk) - except Exception as e: - print(f"解析失败: {e}, 行内容: {line}") - else: - print(f"请求失败,状态码: {response.status_code}") - print("返回内容:", response.text) - - return chunks - - -def test_chat_usage_stream(api_url): - """测试流式chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "牛顿的三大运动定律是什么?"}, - ], - "max_tokens": 50, - "stream": True, - "stream_options": {"include_usage": True, "continuous_usage_stats": True}, - "metadata": {"min_tokens": 10}, - } - p_url, d_url = api_url - response = send_request(url=p_url, payload=payload) - chunks = get_stream_chunks(response) - result = "".join([x["choices"][0]["delta"]["content"] for x in chunks[:-1]]) - print("Decode Response:", result) - assert result != "", "结果为空" - usage = chunks[-1]["usage"] - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" - - -def test_chat_usage_non_stream(api_url): - """测试非流式chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "牛顿的三大运动定律是什么?"}, - ], - "max_tokens": 50, - "stream": False, - "metadata": {"min_tokens": 10}, - } - - p_url, d_url = api_url - response = send_request(url=p_url, payload=payload).json() - usage = response["usage"] - result = response["choices"][0]["message"]["content"] - assert result != "", "结果为空" - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" - - -def test_non_chat_usage_stream(api_url): - """测试流式非chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "prompt": "牛顿的三大运动定律是什么?", - "max_tokens": 50, - "stream": True, - "stream_options": {"include_usage": True, "continuous_usage_stats": True}, - "metadata": {"min_tokens": 
10}, - } - p_url, d_url = api_url - p_url = p_url.replace("chat/completions", "completions") - - response = send_request(url=p_url, payload=payload) - chunks = get_stream_chunks(response) - result = "".join([x["choices"][0]["text"] for x in chunks[:-1]]) - # print("Decode Response:", result) - assert result != "", "结果为空" - usage = chunks[-1]["usage"] - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" - - -def test_non_chat_usage_non_stream(api_url): - """测试非流式非chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "prompt": "牛顿的三大运动定律是什么?", - "max_tokens": 50, - "stream": False, - "metadata": {"min_tokens": 10}, - } - p_url, d_url = api_url - p_url = p_url.replace("chat/completions", "completions") - - response = send_request(url=p_url, payload=payload).json() - usage = response["usage"] - result = response["choices"][0]["text"] - # print("Decode Response:", result) - assert result != "", "结果为空" - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" diff --git a/tests/e2e/test_ernie_21b_mtp.py b/tests/e2e/test_ernie_21b_mtp.py index dd05cdd6a36..a26f4060cea 100644 --- a/tests/e2e/test_ernie_21b_mtp.py +++ b/tests/e2e/test_ernie_21b_mtp.py @@ -147,7 +147,7 @@ def headers(): return {"Content-Type": "application/json"} -def send_request(url, payload, timeout=600): +def send_request(url, payload, timeout=60): """ 发送请求到指定的URL,并返回响应结果。 """ diff --git a/tests/e2e/test_fake_Glm45_AIR_serving.py b/tests/e2e/test_fake_Glm45_AIR_serving.py index 236fd2560b2..a45f4c670a9 100644 --- a/tests/e2e/test_fake_Glm45_AIR_serving.py +++ b/tests/e2e/test_fake_Glm45_AIR_serving.py @@ -20,8 +20,10 @@ import sys import time +import openai import pytest import requests +from utils.rollout_routing_replay_test_utils import check_routing_replay_chat_completion from utils.serving_utils import ( FD_API_PORT, FD_CACHE_QUEUE_PORT, @@ -72,7 +74,7 @@ def setup_and_run_server(): "--max-model-len", "32768", "--max-num-seqs", - "32", + "1", "--graph-optimization-config", '{"use_cudagraph":true}', "--load-choices", @@ -80,6 +82,8 @@ def setup_and_run_server(): "--lm_head-fp32", "--quantization", "wfp8afp8", + "--routing-replay-config", + '{"enable_routing_replay":true, "routing_store_type":"local", "local_store_dir":"./R3_tmp/routing_replay_output_glm45air"}', ] env = os.environ.copy() # Start subprocess in new process group @@ -176,4 +180,25 @@ def test_lm_head_fp32(api_url, headers, consistent_payload): assert ( resp_json["choices"][0]["message"]["content"] == "ichertsorbulkdeployment confusedreraoux Carter pat firingCompatraspectiveidis Verse corporaonych commissionsilk" + ), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}." 
+ + +# ========================== +# Test for Rollout Routing Replay +# ========================== +@pytest.fixture +def openai_client(): + ip = "0.0.0.0" + service_http_port = str(FD_API_PORT) + client = openai.Client( + base_url=f"http://{ip}:{service_http_port}/v1", + api_key="EMPTY_API_KEY", + ) + return client + + +def test_r3_accuracy(openai_client): + moe_layer_num = 1 # GLM45 AIR moe layer num: 45, Fake GLM AIR moe layer num: 1 + check_routing_replay_chat_completion( + openai_client=openai_client, moe_layer_num=moe_layer_num, model_name="glm45air" ) diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py new file mode 100644 index 00000000000..8d646cdb514 --- /dev/null +++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py @@ -0,0 +1,208 @@ +import os +import shutil +import time + +import paddle + + +# ========================== +# Test Rollout Routing Replay +# ========================== +def calculate_routing_ratio(expected_routing: paddle.Tensor, actual_routing: paddle.Tensor) -> float: + """Caculate routing overlap ratio""" + assert ( + expected_routing.shape == actual_routing.shape + ), "Routing shapes not equal. Expected shape {expected_routing.shap} actual shape {actual_routing.shape}." + expected_routing_length = get_real_routing_length(expected_routing) + actual_routing_length = get_real_routing_length(actual_routing) + + for i in range(max(expected_routing_length, actual_routing_length)): + if not paddle.all(paddle.equal(expected_routing[i], actual_routing[i])).item(): + print(f"token index {i}:\n expected_routing:{expected_routing[i]}\n actual_routing: {actual_routing[i]}\n") + + assert ( + expected_routing_length == actual_routing_length + ), f"Routing real lengths do not match. Expected length {expected_routing_length} actual length {actual_routing_length}." 
+ total_rows, elements_per_row = expected_routing.shape + + mask1 = paddle.any(expected_routing != -1, axis=1) + mask2 = paddle.any(actual_routing != -1, axis=1) + valid_mask = mask1 & mask2 + + if paddle.sum(valid_mask.cast("int32")) == 0: + return paddle.to_tensor(0.0) + + valid_expected_routing = expected_routing[valid_mask] # [n_valid, top_k] + valid_actual_routing = actual_routing[valid_mask] # [n_valid, top_k] + + # valid_expected_routing: [n_valid, top_k, 1], valid_actual_routing: [n_valid, 1, top_k] + # -> equals: [n_valid, top_k, top_k] + equals = valid_expected_routing.unsqueeze(2) == valid_actual_routing.unsqueeze(1) + + overlap_mask = paddle.any(equals, axis=2) # [n_valid, 8] + + overlap_counts = paddle.sum(overlap_mask.cast("float32"), axis=1) # [n_valid] + overlap_ratios = overlap_counts / elements_per_row # [n_valid] + + return paddle.mean(overlap_ratios) + + +def get_real_routing_length(routing: paddle.Tensor) -> int: + mask = routing == -1 + mask_float = mask.astype(paddle.float32) + row_has_true = paddle.any(mask_float, axis=1).astype(paddle.float32) + + first_true_index = paddle.argmax(row_has_true, axis=0) + if row_has_true.any().item(): + return first_true_index.item() + else: + return -1 + + +# Streaming test +def send_r3_streaming_chat(openai_client, user_id: str = ""): + """ + Test streaming chat functionality with the local service + """ + response = openai_client.chat.completions.create( + model="default", + messages=[ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": "List 3 countries and their capitals."}, + { + "role": "assistant", + "content": "China(Beijing), France(Paris), Australia(Canberra).", + }, + {"role": "user", "content": "OK, tell more."}, + ], + temperature=1, + top_p=0, + max_tokens=1024, + seed=13, + stream=True, + user=user_id, # "r3_chat_completion_stream_test", + ) + + return response + + +def send_r3_non_streaming_chat(openai_client, user_id: str = ""): + """ + Test non-streaming chat functionality with the local service + """ + # Send test request + response = openai_client.chat.completions.create( + model="default", + messages=[ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=1, + top_p=0, + max_tokens=1024, + seed=13, + stream=False, + user=user_id, # "rollout_routing_replay_chat_completion_nonstream_test" + ) + + return response + + +def generated_base_line_routing_index(openai_client, cur_save_routing_path, baseline_path): + # Generate streaming chat routing index + send_r3_streaming_chat(openai_client, user_id="r3_chat_completion_stream") + # Generate non streaming chat routing index + send_r3_non_streaming_chat(openai_client, user_id="r3_chat_completion_nonstream") + + # Check the routing is generated correctly + stream_cur_save_routing_path = os.path.join(cur_save_routing_path, "r3_chat_completion_stream") + nonstream_cur_save_routing_path = os.path.join(cur_save_routing_path, "r3_chat_completion_nonstream") + + wait_for_file(stream_cur_save_routing_path) + wait_for_file(nonstream_cur_save_routing_path) + + # Move the baseline to the routing_replay_output_baseline folder + stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") + nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream") + shutil.move(stream_cur_save_routing_path, stream_baseline_path) + shutil.move(nonstream_cur_save_routing_path, nonstream_baseline_path) + + 
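The membership test at the heart of calculate_routing_ratio above is the broadcasted comparison `expected.unsqueeze(2) == actual.unsqueeze(1)`. A minimal standalone sketch of that step, with made-up [num_tokens, top_k] expert-id tensors (the ids, top_k, and values below are illustrative only and not taken from the patch):

    import paddle

    # Hypothetical top-k expert ids per token; shape [num_tokens, top_k].
    expected = paddle.to_tensor([[0, 3, 5], [1, 2, 4]])
    actual = paddle.to_tensor([[0, 5, 7], [1, 2, 4]])

    # [tokens, top_k, 1] == [tokens, 1, top_k] -> [tokens, top_k, top_k] pairwise equality.
    equals = expected.unsqueeze(2) == actual.unsqueeze(1)
    # An expected expert counts as a hit if it appears anywhere in the actual row.
    overlap_mask = paddle.any(equals, axis=2)
    per_token = paddle.sum(overlap_mask.cast("float32"), axis=1) / expected.shape[1]
    print(per_token.numpy())               # approximately [0.6667, 1.0]
    print(paddle.mean(per_token).numpy())  # approximately 0.8333, the quantity checked against the 0.999 threshold

Only membership overlap is scored this way; the ordering of experts inside a top-k row does not change the ratio.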
+def wait_for_file(file_path, timeout=20, check_interval=0.1): + start_time = time.perf_counter() + deadline = start_time + timeout + + while True: + # Check timeout or not + current_time = time.perf_counter() + if current_time >= deadline: + return False + + # Check file generated + if os.path.exists(file_path): + return True + + sleep_time = min(check_interval, deadline - current_time) + time.sleep(sleep_time) + + +def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, model_name: str): + """Test rollout routing replay chat completion""" + cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/" + model_path = os.getenv("MODEL_PATH") + if model_path: + baseline_path = os.path.join(model_path, f"R3_BaseLine_24_uint8/routing_replay_output_baseline_{model_name}") + else: + baseline_path = f"./R3_BaseLine_24_uint8/routing_replay_output_baseline_{model_name}" + stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") + + nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream") + + # Maybe need to generate baseline routing index + if not os.path.exists(stream_baseline_path) or not os.path.exists(nonstream_baseline_path): + generated_base_line_routing_index(openai_client, cur_save_routing_path, baseline_path) + raise FileNotFoundError(f"Not find the R3 baseline file {nonstream_baseline_path} or {stream_baseline_path} .") + + routing_layer_num_1 = len(os.listdir(stream_baseline_path)) + routing_layer_num_2 = len(os.listdir(nonstream_baseline_path)) + assert ( + routing_layer_num_1 == moe_layer_num + ), f"routing index number {routing_layer_num_1} should equal to moe layer number {moe_layer_num}" + assert ( + routing_layer_num_2 == moe_layer_num + ), f"routing index number {routing_layer_num_2} should equal to moe layer number {moe_layer_num}" + + # Test streaming chat + send_r3_streaming_chat(openai_client, user_id="r3_chat_completion_stream") + for layer_index in range(moe_layer_num): + cur_routing_path = os.path.join( + cur_save_routing_path, f"r3_chat_completion_stream/layer_{layer_index}.pdtensor" + ) + baseline_routing_path = os.path.join(stream_baseline_path, f"layer_{layer_index}.pdtensor") + wait_for_file(cur_routing_path) + + generated_routing = paddle.load(cur_routing_path) + baseline_routing = paddle.load(baseline_routing_path) + overlap_ratio = calculate_routing_ratio(baseline_routing, generated_routing) + assert ( + overlap_ratio >= 0.999 + ), f"the routing overlap ratio of the layer {layer_index} should be equal to baseline routing index, but got {overlap_ratio}" + + # Test non streaming chat + send_r3_non_streaming_chat(openai_client, user_id="r3_chat_completion_nonstream") + for layer_index in range(moe_layer_num): + cur_routing_path = os.path.join( + cur_save_routing_path, f"r3_chat_completion_nonstream/layer_{layer_index}.pdtensor" + ) + baseline_routing_path = os.path.join(nonstream_baseline_path, f"layer_{layer_index}.pdtensor") + + wait_for_file(cur_routing_path) + + generated_routing = paddle.load(cur_routing_path) + baseline_routing = paddle.load(baseline_routing_path) + overlap_ratio = calculate_routing_ratio(baseline_routing, generated_routing) + assert ( + overlap_ratio >= 0.999 + ), f"the routing overlap ratio of the layer {layer_index} should be equal to baseline routing index, but got {overlap_ratio}" + + # shutil.rmtree(cur_save_routing_path) diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py index 
679f6d8ecfc..087d159d78b 100644 --- a/tests/entrypoints/openai/test_completion_echo.py +++ b/tests/entrypoints/openai/test_completion_echo.py @@ -46,6 +46,7 @@ def test_single_str_prompt_non_streaming(self): "finished": True, }, "output_token_ids": 3, + "metrics": {}, } self.mock_engine.generate.return_value = [mock_output] @@ -80,6 +81,7 @@ def test_single_int_prompt_non_streaming(self): "finished": True, }, "output_token_ids": 3, + "metrics": {}, } self.mock_engine.generate.return_value = [mock_output] @@ -109,10 +111,12 @@ def test_multi_str_prompt_non_streaming(self): { "outputs": {"text": " response1", "token_ids": [1, 2], "top_logprobs": None, "finished": True}, "output_token_ids": 2, + "metrics": {}, }, { "outputs": {"text": " response2", "token_ids": [3, 4], "top_logprobs": None, "finished": True}, "output_token_ids": 2, + "metrics": {}, }, ] self.mock_engine.generate.return_value = mock_outputs @@ -146,10 +150,12 @@ def test_multi_int_prompt_non_streaming(self): { "outputs": {"text": " response1", "token_ids": [1, 2], "top_logprobs": None, "finished": True}, "output_token_ids": 2, + "metrics": {}, }, { "outputs": {"text": " response2", "token_ids": [3, 4], "top_logprobs": None, "finished": True}, "output_token_ids": 2, + "metrics": {}, }, ] self.mock_engine.generate.return_value = mock_outputs diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py index 48935cba838..26e91382502 100644 --- a/tests/entrypoints/openai/test_max_streaming_tokens.py +++ b/tests/entrypoints/openai/test_max_streaming_tokens.py @@ -1,3 +1,4 @@ +import inspect import json import unittest from unittest import IsolatedAsyncioTestCase @@ -301,6 +302,7 @@ async def test_completion_full_generator(self, mock_logger): ], }, "finished": True, + "metrics": {}, }, { "request_id": "test_request_id_1", @@ -314,6 +316,7 @@ async def test_completion_full_generator(self, mock_logger): ], }, "finished": True, + "metrics": {}, }, ] @@ -473,6 +476,7 @@ async def test_create_chat_completion_choice(self): prompt_logprobs_res_list=prompt_logprobs_res_list, response_processor=mock_response_processor, max_tokens=max_tokens_list[idx], + speculate_metrics=None, ) expected = case["expected"] @@ -723,6 +727,213 @@ async def test_completion_stream_usage_fields(self, mock_logger): "reasoning_tokens count mismatch", ) + @patch("fastdeploy.entrypoints.openai.serving_completion.api_server_logger") + async def test_completion_full_generator_async_process_response_dict(self, mock_logger): + final_response_data = [ + { + "request_id": "test_request_id_0", + "outputs": { + "token_ids": [7, 8, 9], + "text": " world!", + }, + "finished": True, + "metrics": {}, + }, + { + "request_id": "test_request_id_1", + "outputs": { + "token_ids": [10, 11, 12], + "text": " there!", + }, + "finished": True, + "metrics": {}, + }, + ] + + mock_response_queue = AsyncMock() + mock_response_queue.get.side_effect = [ + [final_response_data[0]], + [final_response_data[1]], + ] + + mock_dealer = Mock() + mock_dealer.write = Mock() + + self.engine_client.connection_manager.get_connection.return_value = (mock_dealer, mock_response_queue) + + expected_completion_response = Mock() + self.completion_serving.request_output_to_completion_response = Mock(return_value=expected_completion_response) + + request = CompletionRequest( + model="test_model", + prompt="Hello", + max_tokens=10, + stream=False, + n=2, + echo=False, + ) + num_choices = 2 + request_id = "test_request_id" + created_time = 1655136000 + 
model_name = "test_model" + prompt_batched_token_ids = [[1, 2, 3], [4, 5, 6]] + prompt_tokens_list = ["Hello", "Hello"] + + self.engine_client.data_processor.process_response_dict = AsyncMock() + + actual_response = await self.completion_serving.completion_full_generator( + request=request, + num_choices=num_choices, + request_id=request_id, + created_time=created_time, + model_name=model_name, + prompt_batched_token_ids=prompt_batched_token_ids, + prompt_tokens_list=prompt_tokens_list, + max_tokens_list=[100, 100], + ) + + self.assertEqual(actual_response, expected_completion_response) + self.assertTrue(inspect.iscoroutinefunction(self.engine_client.data_processor.process_response_dict)) + + self.engine_client.data_processor.process_response_dict.assert_awaited() + + actual_call_times = self.engine_client.data_processor.process_response_dict.call_count + expected_call_times = len(final_response_data) + self.assertEqual(actual_call_times, expected_call_times) + + call_args_list = self.engine_client.data_processor.process_response_dict.call_args_list + self.assertEqual(len(call_args_list), expected_call_times) + + for idx, data in enumerate(final_response_data): + args, kwargs = call_args_list[idx] + self.assertEqual(args[0], data) + self.assertEqual(kwargs.get("stream"), False) + self.assertEqual(kwargs.get("include_stop_str_in_output"), request.include_stop_str_in_output) + + @patch("fastdeploy.entrypoints.openai.serving_completion.api_server_logger") + async def test_completion_stream_generator_async_process_response_dict(self, mock_logger): + final_response_data = [ + [ + { + "request_id": "test-request-id_0", + "outputs": { + "index": 0, + "send_idx": 0, + "token_ids": [1], + "text": "a", + "top_logprobs": {"a": 0.98, "b": 0.02}, + "draft_top_logprobs": {"a": 0.98, "b": 0.02}, + }, + "finished": False, + "metrics": { + "first_token_time": 1620000000, + "inference_start_time": 1620000000, + "engine_recv_latest_token_time": 1620000000, + }, + "error_code": 200, + } + ], + [ + { + "request_id": "test-request-id_0", + "outputs": { + "index": 0, + "send_idx": 1, + "token_ids": [2], + "text": "b", + "top_logprobs": {"a": 0.98, "b": 0.02}, + "draft_top_logprobs": {"a": 0.98, "b": 0.02}, + }, + "finished": False, + "metrics": { + "first_token_time": 1620000000, + "inference_start_time": 1620000000, + "engine_recv_latest_token_time": 1620000000, + }, + "error_code": 200, + } + ], + [ + { + "request_id": "test-request-id_0", + "outputs": { + "index": 0, + "send_idx": 2, + "token_ids": [7], + "text": "g", + "top_logprobs": {"a": 0.98, "b": 0.02}, + "draft_top_logprobs": {"a": 0.98, "b": 0.02}, + }, + "finished": True, + "metrics": { + "first_token_time": 1620000000, + "inference_start_time": 1620000000, + "engine_recv_latest_token_time": 1620000000, + }, + "error_code": 200, + } + ], + ] + + mock_response_queue = AsyncMock() + mock_response_queue.get.side_effect = final_response_data + + mock_dealer = Mock() + mock_dealer.write = Mock() + + self.engine_client.connection_manager.get_connection = AsyncMock( + return_value=(mock_dealer, mock_response_queue) + ) + + request = CompletionRequest( + model="test-model", + prompt="Hello", + stream=True, + max_streaming_response_tokens=3, + n=1, + echo=False, + max_tokens=100, + ) + + self.engine_client.data_processor.process_response_dict = AsyncMock() + + generator = self.completion_serving.completion_stream_generator( + request=request, + num_choices=1, + request_id="test-request-id", + created_time=1620000000, + model_name="test-model", + 
prompt_batched_token_ids=[[1, 2, 3]], + prompt_tokens_list=["Hello"], + max_tokens_list=[100], + ) + + chunks = [] + async for chunk in generator: + chunks.append(chunk) + if "[DONE]" in chunk: + break + self.assertGreater(len(chunks), 0) + + self.assertTrue(inspect.iscoroutinefunction(self.engine_client.data_processor.process_response_dict)) + self.engine_client.data_processor.process_response_dict.assert_awaited() + + flat_response_data = [] + for sub_list in final_response_data: + flat_response_data.extend(sub_list) + expected_call_times = len(flat_response_data) + actual_call_times = self.engine_client.data_processor.process_response_dict.call_count + self.assertEqual(actual_call_times, expected_call_times) + + call_args_list = self.engine_client.data_processor.process_response_dict.call_args_list + self.assertEqual(len(call_args_list), expected_call_times) + + for idx, data in enumerate(flat_response_data): + args, kwargs = call_args_list[idx] + self.assertEqual(args[0], data) + self.assertEqual(kwargs.get("stream"), True) + self.assertEqual(kwargs.get("include_stop_str_in_output"), request.include_stop_str_in_output) + if __name__ == "__main__": unittest.main() diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 940e569e186..58dc18db512 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -89,7 +89,7 @@ def test_build_prompt_logprobs_basic(self): ) as mock_decode: mock_decode.side_effect = ["token1", "token2", "token3", "token4", "token5", "token6"] - result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True) # Verify result structure (first element is None, then actual results) self.assertEqual(len(result), num_prompt_tokens + 1) @@ -127,7 +127,7 @@ def test_build_prompt_logprobs_with_all_logprobs(self): ) as mock_decode: mock_decode.side_effect = ["hello", "world"] - result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, -1) + result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, -1, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) @@ -154,7 +154,7 @@ def test_build_prompt_logprobs_single_token(self): ) as mock_decode: mock_decode.return_value = "single_token" - result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) @@ -183,7 +183,7 @@ def test_build_prompt_logprobs_multiple_positions(self): ) as mock_decode: mock_decode.side_effect = ["t1", "t2", "t3", "t4", "t5", "t6"] - result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) @@ -217,7 +217,7 @@ def test_build_prompt_logprobs_empty_tensors(self): prompt_logprobs_tensors = LogprobsTensors(token_ids, logprobs, ranks) - result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, 
num_logprobs, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) diff --git a/tests/entrypoints/openai/test_serving_completion.py b/tests/entrypoints/openai/test_serving_completion.py index fdefd1cc3e4..761213d1d5b 100644 --- a/tests/entrypoints/openai/test_serving_completion.py +++ b/tests/entrypoints/openai/test_serving_completion.py @@ -129,6 +129,7 @@ def test_request_output_to_completion_response(self): "reasoning_token_num": 10, }, "output_token_ids": 3, + "metrics": {}, }, { "outputs": { @@ -141,6 +142,7 @@ def test_request_output_to_completion_response(self): "reasoning_token_num": 20, }, "output_token_ids": 3, + "metrics": {}, }, ] @@ -208,7 +210,7 @@ def test_build_prompt_logprobs_basic(self): ) as mock_decode: mock_decode.side_effect = ["token1", "token2", "token3", "token4", "token5", "token6"] - result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True) # Verify result structure (first element is None, then actual results) self.assertEqual(len(result), num_prompt_tokens + 1) @@ -246,7 +248,7 @@ def test_build_prompt_logprobs_with_all_logprobs(self): ) as mock_decode: mock_decode.side_effect = ["hello", "world"] - result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, -1) + result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, -1, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) @@ -273,7 +275,7 @@ def test_build_prompt_logprobs_single_token(self): ) as mock_decode: mock_decode.return_value = "single_token" - result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) @@ -302,7 +304,7 @@ def test_build_prompt_logprobs_multiple_positions(self): ) as mock_decode: mock_decode.side_effect = ["t1", "t2", "t3", "t4", "t5", "t6"] - result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) @@ -336,7 +338,7 @@ def test_build_prompt_logprobs_empty_tensors(self): prompt_logprobs_tensors = LogprobsTensors(token_ids, logprobs, ranks) - result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs) + result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True) self.assertEqual(len(result), num_prompt_tokens + 1) self.assertIsNone(result[0]) diff --git a/tests/eplb/test_eplb_utils.py b/tests/eplb/test_eplb_utils.py index 675a2daee18..7ba49b8c825 100644 --- a/tests/eplb/test_eplb_utils.py +++ b/tests/eplb/test_eplb_utils.py @@ -175,6 +175,7 @@ def setUp(self): model_cfg.moe_num_experts = 64 model_cfg.moe_layer_start_index = 1 model_cfg.model = "/test/model" + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) diff --git a/tests/eplb/test_experts_manager.py b/tests/eplb/test_experts_manager.py index 01882f71d32..24e8dbd5aac 100644 --- a/tests/eplb/test_experts_manager.py +++ b/tests/eplb/test_experts_manager.py @@ -55,6 +55,7 @@ def setUp(self): 
model_cfg.moe_num_experts = 64 model_cfg.moe_layer_start_index = 1 model_cfg.model = "/test/model" + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py index 85516f8bd34..932a1966f9f 100644 --- a/tests/graph_optimization/test_cuda_graph_recapture.py +++ b/tests/graph_optimization/test_cuda_graph_recapture.py @@ -112,6 +112,7 @@ def test_cuda_graph_recapture(self): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 5120 + model_config.architectures = ["test_model"] fd_config = FDConfig( graph_opt_config=graph_opt_config, scheduler_config=scheduler_config, diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py index 10f4237a9de..f81d4b11cf8 100644 --- a/tests/graph_optimization/test_cuda_graph_spec_decode.py +++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py @@ -105,6 +105,7 @@ def test_cuda_graph_spec_decode(self): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] # Initialize cuda graph capture list graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs) graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) diff --git a/tests/graph_optimization/test_graph_opt_backend.py b/tests/graph_optimization/test_graph_opt_backend.py index ff5d1fcd62d..e4bac358e53 100644 --- a/tests/graph_optimization/test_graph_opt_backend.py +++ b/tests/graph_optimization/test_graph_opt_backend.py @@ -97,6 +97,7 @@ def setUp(self): baseline_parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] self.baseline_fd_config = FDConfig( graph_opt_config=baseline_graph_opt_config, scheduler_config=baseline_scheduler_config, @@ -144,6 +145,7 @@ def _setup_test_config( parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] # Create FD config return FDConfig( diff --git a/tests/graph_optimization/test_static_graph_cuda_graph_split.py b/tests/graph_optimization/test_static_graph_cuda_graph_split.py index 366b35d61d1..9d2b419512e 100644 --- a/tests/graph_optimization/test_static_graph_cuda_graph_split.py +++ b/tests/graph_optimization/test_static_graph_cuda_graph_split.py @@ -97,6 +97,7 @@ def test(self): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( graph_opt_config=graph_opt_config, scheduler_config=scheduler_config, diff --git a/tests/layers/test_activation.py b/tests/layers/test_activation.py index 70f011d3964..b564c267520 100644 --- a/tests/layers/test_activation.py +++ b/tests/layers/test_activation.py @@ -84,8 +84,11 @@ def test_forward_cuda(self, mock_fused, mock_platform): layer = SiluAndMul(fd_config) x = paddle.ones([2, 2]) out = layer.forward(x) - self.assertTrue((out.numpy() == 1).all()) - mock_fused.assert_called_once() + if layer.bias is None and layer.quant_scale == -1: + self.assertTrue((out.numpy() == 0.73105854).all()) + else: + self.assertTrue((out.numpy() == 1).all()) + mock_fused.assert_called_once() # Test forward computation on GCU platform @patch( 
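For context on the new expectation in test_forward_cuda above: 0.73105854 is simply silu(1) * 1, i.e. the value SiluAndMul yields for paddle.ones([2, 2]) when layer.bias is None and layer.quant_scale == -1, so the all-ones result is only expected on the other branch. A minimal sketch that reproduces the constant (plain NumPy, an independent illustration rather than code from the repository):

import numpy as np

def silu_and_mul_ref(x: np.ndarray) -> np.ndarray:
    # Reference SiluAndMul: split the last dim in half, apply SiLU (x * sigmoid(x))
    # to the first half, and multiply element-wise by the second half.
    gate, up = np.split(x, 2, axis=-1)
    return gate / (1.0 + np.exp(-gate)) * up

x = np.ones((2, 2), dtype=np.float32)  # mirrors paddle.ones([2, 2]) in the test
print(silu_and_mul_ref(x))             # every element is ~0.73105854, i.e. silu(1) * 1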
diff --git a/tests/layers/test_attention_layer.py b/tests/layers/test_attention_layer.py index 106cb93cdc4..0acbada357b 100644 --- a/tests/layers/test_attention_layer.py +++ b/tests/layers/test_attention_layer.py @@ -71,7 +71,6 @@ def setUp(self): self.fd_config.parallel_config.tp_group = [0] # Initialize Attention Layer - os.environ["FD_ATTENTION_BACKEND"] = "APPEND_ATTN" attn_cls = get_attention_backend() self.attn_backend = attn_cls( self.fd_config, @@ -123,10 +122,10 @@ def create_model_config_json(self) -> str: "max_position_embeddings": 131072, "max_model_len": 131072, "head_dim": 128, - "hidden_size": 4096, - "num_attention_heads": 32, - "num_key_value_heads": 4, - "num_hidden_layers": 57, + "hidden_size": 8192, + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 2, } model_dir = tempfile.mkdtemp(prefix="tmp_model_config_") config_path = os.path.join(model_dir, "config.json") @@ -158,6 +157,7 @@ def create_fd_config_from_model_path(self, model_path, tensor_parallel_size=1): dense_quant_type="block_wise_fp8", moe_quant_type="block_wise_fp8", kv_cache_quant_type="float8_e4m3fn", + # kv_cache_quant_type=None, ), graph_opt_config=GraphOptimizationConfig({}), commit_config=CommitConfig(), @@ -270,7 +270,7 @@ def create_forward_meta( partial_rotary_factor=fd_config.model_config.partial_rotary_factor, ) - input_ids = paddle.zeros([batch_size, seq_len if mode == ForwardMode.EXTEND else 1], dtype="int64") + input_ids = paddle.zeros([batch_size, max_model_len], dtype="int64") token_num = paddle.sum(seq_lens_this_time) ids_remove_padding, batch_id_per_token, cu_seqlens_q, cu_seqlens_k = get_padding_offset( input_ids, token_num, seq_lens_this_time @@ -294,12 +294,13 @@ def create_forward_meta( attn_mask_offsets=None, **attn_backend_buffers, ) - return forward_meta + + hidden_states = paddle.randn([token_num, self.fd_config.model_config.hidden_size], dtype="bfloat16") + return forward_meta, hidden_states def test_decode_performance_with_prefill(self): # Test parameters test_steps = 100 - act_tensor_dtype = paddle.bfloat16 # prefill_batch_size = 1 # prefill_seq_len = 4096 @@ -356,11 +357,7 @@ def test_decode_performance_with_prefill(self): # p.step() for decode_batch_size in [32, 16, 8, 4, 2]: - decode_hidden_states = paddle.randn( - [decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype - ) - - forward_meta = self.create_forward_meta( + forward_meta, hidden_states = self.create_forward_meta( batch_size=decode_batch_size, seq_len=36 * 1024, mode=ForwardMode.DECODE, @@ -374,12 +371,12 @@ def test_decode_performance_with_prefill(self): paddle.device.synchronize() # 必须要先预热一次!因为预处理被放到了第一层再做了! - self.attn_forward(forward_meta, decode_hidden_states) + self.attn_forward(forward_meta, hidden_states) attn_cuda_graphs = graphs.CUDAGraph() attn_cuda_graphs.capture_begin() - self.attn_forward(forward_meta, decode_hidden_states) + self.attn_forward(forward_meta, hidden_states) attn_cuda_graphs.capture_end() diff --git a/tests/layers/test_batched_count_greater_than.py b/tests/layers/test_batched_count_greater_than.py new file mode 100644 index 00000000000..97ded31089e --- /dev/null +++ b/tests/layers/test_batched_count_greater_than.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +import paddle + +from fastdeploy.model_executor.layers.sample.logprobs import batched_count_greater_than + + +class TestBatchedCountGreaterThan(unittest.TestCase): + def setUp(self) -> None: + pass + + def naive_impl(self, x, y): + return (x >= y).sum(-1) + + def test_batched_count_greater_than(self): + vocab_size_list = [151552, 566] + test_token_nums = [1, 32, 128, 1024, 8192] + for idx, num_tokens in enumerate(test_token_nums): + for vocab_size in vocab_size_list: + x = paddle.randn([num_tokens, vocab_size], dtype="float32") + y = paddle.randn([num_tokens, 1], dtype="float32") + x[0, 0] = -float("inf") + y[0, 0] = -float("inf") + out = self.naive_impl(x, y) + out_triton = batched_count_greater_than(x, y) + self.assertTrue(np.allclose(out.numpy(), out_triton.numpy())) + + return out + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/layers/test_fusedmoe.py b/tests/layers/test_fusedmoe.py index ed4fe5b28b6..346afc98fd3 100644 --- a/tests/layers/test_fusedmoe.py +++ b/tests/layers/test_fusedmoe.py @@ -31,6 +31,7 @@ LoadConfig, ModelConfig, ParallelConfig, + RoutingReplayConfig, ) from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.quantization.block_wise_fp8 import ( @@ -476,6 +477,7 @@ def __init__( graph_opt_config=GraphOptimizationConfig({}), load_config=LoadConfig({}), ips=",".join(["0"] * nnodes), + routing_replay_config=RoutingReplayConfig({}), ) self.fd_config.parallel_config.tp_group = None self.fd_config.parallel_config.tensor_parallel_rank = tp_rank diff --git a/tests/layers/test_guided_decoding.py b/tests/layers/test_guided_decoding.py index 964ad1dc02b..ad592c118ed 100644 --- a/tests/layers/test_guided_decoding.py +++ b/tests/layers/test_guided_decoding.py @@ -11,10 +11,12 @@ mock_torch = MagicMock() mock_xgrammar = MagicMock() -sys.modules["torch"] = mock_torch sys.modules["xgrammar"] = mock_xgrammar +sys.modules["torch"] = None from fastdeploy.model_executor.guided_decoding import LogitsProcessorBase + +sys.modules["torch"] = mock_torch from fastdeploy.model_executor.layers.sample.sampler import GuidedDecoding from fastdeploy.reasoning import ReasoningParser diff --git a/tests/layers/test_speculative_sampler.py b/tests/layers/test_speculative_sampler.py index 32f95bfd91b..c62baa74ece 100644 --- a/tests/layers/test_speculative_sampler.py +++ b/tests/layers/test_speculative_sampler.py @@ -30,6 +30,7 @@ from fastdeploy.model_executor.layers.sample.sampler import ( MTPSampler, SpeculativeSampler, + padding_sampling_params, ) @@ -72,7 +73,7 @@ def _create_default_sampling_metadata( bad_words_token_ids=paddle.full(shape=[batch_size], fill_value=-1, dtype="int64"), eos_token_ids=paddle.full(shape=[batch_size], fill_value=-2, dtype="int64"), min_p=paddle.randn([batch_size]), - seed=paddle.to_tensor([[2025]]), + seed=paddle.full(shape=[batch_size], fill_value=0, dtype="int64"), ) if max_num_logprobs is not None: fake_sampling_metadata.max_num_logprobs = max_num_logprobs @@ -82,6 +83,7 @@ def _create_default_sampling_metadata( def _create_fd_config(max_model_len): 
model_config: Mock = Mock() model_config.max_model_len = max_model_len + model_config.architectures = ["test_model"] speculative_config = SpeculativeConfig({}) graph_opt_config = GraphOptimizationConfig({}) scheduler_config = SchedulerConfig({}) @@ -143,6 +145,19 @@ def _create_share_inputs(max_num_seqs, max_draft_token_num, max_model_len, vocab return share_inputs +def _create_padding_inputs(): + # batch_size = 3 + top_p = paddle.to_tensor([[0.9], [0.8], [0.7], [1.0]], dtype="float32") + top_k = paddle.to_tensor([[10], [20], [30], [40]], dtype="int32") + infer_seed = paddle.to_tensor([[100], [200], [300], [400]], dtype="int64") + + # decoder, encoder, decoder + seq_lens_encoder = paddle.to_tensor([[0], [5], [0], [0]], dtype="int32") + seq_lens_this_time = paddle.to_tensor([[3], [2], [0], [2]], dtype="int32") + + return top_p, top_k, infer_seed, seq_lens_this_time, seq_lens_encoder + + def test_speculative_sampler(): batch_size = 32 vocab_size = 1024 @@ -220,8 +235,52 @@ def test_mtp_sampler_logprobs(): sampler(logits, sampling_metadata, max_model_len, share_inputs) +def test_padding_sampling_params_basic(): + top_p, top_k, infer_seed, seq_lens_this_time, seq_lens_encoder = _create_padding_inputs() + + top_p_pad, top_k_pad, seed_pad = padding_sampling_params( + top_p, top_k, infer_seed, seq_lens_this_time, seq_lens_encoder + ) + + # decoder(3) + encoder(1) + decoder(2) = 6 + assert top_p_pad.shape == [6, 1] + assert top_k_pad.shape == [6, 1] + assert seed_pad.shape == [6, 1] + + # top_p padding check + expected_top_p = [0.9, 0.9, 0.9, 0.8, 1.0, 1.0] + assert paddle.allclose(top_p_pad.squeeze(), paddle.to_tensor(expected_top_p, dtype="float32")) + + # top_k padding check + expected_top_k = [10, 10, 10, 20, 40, 40] + assert paddle.equal_all(top_k_pad.squeeze(), paddle.to_tensor(expected_top_k, dtype="int32")) + + +def test_padding_sampling_params_seed_offset(): + top_p, top_k, infer_seed, seq_lens_this_time, seq_lens_encoder = _create_padding_inputs() + + _, _, seed_pad = padding_sampling_params(top_p, top_k, infer_seed, seq_lens_this_time, seq_lens_encoder) + + # decoder(0): 100 + 4*k + # encoder(1): 200 (no offset) + # null + # decoder(3): 400 + 4*k + expected_seed = [ + 100, + 104, + 108, # first decoder seq (len=3) + 200, # encoder + 400, + 404, # second decoder seq (len=2) + ] + + assert paddle.equal_all(seed_pad.squeeze(), paddle.to_tensor(expected_seed, dtype="int64")) + + if __name__ == "__main__": test_speculative_sampler() test_speculative_sampler_logprobs() test_mtp_sampler() test_mtp_sampler_logprobs() + test_padding_sampling_params_basic() + test_padding_sampling_params_seed_offset() diff --git a/tests/layers/test_w4a8_moe.py b/tests/layers/test_w4a8_moe.py index dc6dab15427..f20c27b06bf 100644 --- a/tests/layers/test_w4a8_moe.py +++ b/tests/layers/test_w4a8_moe.py @@ -13,6 +13,7 @@ LoadConfig, ModelConfig, ParallelConfig, + RoutingReplayConfig, ) from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.quantization.w4a8 import W4A8Config @@ -59,6 +60,7 @@ def __init__( graph_opt_config=GraphOptimizationConfig({}), load_config=LoadConfig({}), ips=",".join(["0"] * nnodes), + routing_replay_config=RoutingReplayConfig({}), ) self.fd_config.parallel_config.tp_group = None self.fd_config.parallel_config.tensor_parallel_rank = tp_rank diff --git a/tests/layers/test_w4afp8_moe.py b/tests/layers/test_w4afp8_moe.py index 65b7733172c..8f1ae79cd67 100644 --- a/tests/layers/test_w4afp8_moe.py +++ b/tests/layers/test_w4afp8_moe.py @@ -13,6 +13,7 @@ 
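As a reading aid for test_padding_sampling_params_basic and test_padding_sampling_params_seed_offset above, the expected tensors can be reproduced with a small stand-alone sketch of the rule those tests encode: decode requests are expanded to one row per token with the seed advanced by a fixed step per extra token, prefill (encoder) requests keep a single unmodified row, and empty slots contribute nothing. The helper below and the seed step of 4 are assumptions read off the test expectations, not the padding_sampling_params implementation:

import numpy as np

def pad_sampling_params_ref(top_p, top_k, seed, seq_lens_this_time, seq_lens_encoder, seed_step=4):
    # Expand per-request sampling params to per-token rows (hypothetical reference).
    p_rows, k_rows, s_rows = [], [], []
    for i, n in enumerate(seq_lens_this_time):
        if n == 0:  # finished / empty slot contributes nothing
            continue
        if seq_lens_encoder[i] > 0:  # prefill request: one row, seed unchanged
            p_rows.append(top_p[i])
            k_rows.append(top_k[i])
            s_rows.append(seed[i])
        else:  # decode request: one row per token, seed advanced per token
            for t in range(n):
                p_rows.append(top_p[i])
                k_rows.append(top_k[i])
                s_rows.append(seed[i] + seed_step * t)
    return (np.array(p_rows, dtype=np.float32).reshape(-1, 1),
            np.array(k_rows, dtype=np.int32).reshape(-1, 1),
            np.array(s_rows, dtype=np.int64).reshape(-1, 1))

p, k, s = pad_sampling_params_ref([0.9, 0.8, 0.7, 1.0], [10, 20, 30, 40], [100, 200, 300, 400],
                                  [3, 2, 0, 2], [0, 5, 0, 0])
print(s.ravel())  # [100 104 108 200 400 404] -- matches expected_seed in the test
print(p.ravel())  # [0.9 0.9 0.9 0.8 1.  1. ] -- matches expected_top_p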
LoadConfig, ModelConfig, ParallelConfig, + RoutingReplayConfig, ) from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.quantization.w4afp8 import W4AFP8Config @@ -65,6 +66,7 @@ def __init__( graph_opt_config=GraphOptimizationConfig({}), load_config=LoadConfig({}), ips=",".join(["0"] * nnodes), + routing_replay_config=RoutingReplayConfig({}), ) self.fd_config.parallel_config.tp_group = None self.fd_config.parallel_config.tensor_parallel_rank = tp_rank diff --git a/tests/model_executor/guided_decoding/test_guidance_checker.py b/tests/model_executor/guided_decoding/test_guidance_checker.py index 454231bfbef..574f310550a 100644 --- a/tests/model_executor/guided_decoding/test_guidance_checker.py +++ b/tests/model_executor/guided_decoding/test_guidance_checker.py @@ -51,6 +51,7 @@ def llguidance_checker_with_options(): return LLGuidanceChecker(disable_any_whitespace=True) +sys.modules["torch"] = None from fastdeploy.model_executor.guided_decoding.guidance_backend import LLGuidanceChecker diff --git a/tests/model_executor/guided_decoding/test_xgrammar_checker.py b/tests/model_executor/guided_decoding/test_xgrammar_checker.py index b911e499339..ca550655a69 100644 --- a/tests/model_executor/guided_decoding/test_xgrammar_checker.py +++ b/tests/model_executor/guided_decoding/test_xgrammar_checker.py @@ -20,10 +20,12 @@ mock_torch = MagicMock() mock_xgrammar = MagicMock() -sys.modules["torch"] = mock_torch +sys.modules["torch"] = None sys.modules["xgrammar"] = mock_xgrammar from fastdeploy.engine.request import Request + +sys.modules["torch"] = mock_torch from fastdeploy.model_executor.guided_decoding.xgrammar_backend import XGrammarChecker diff --git a/tests/model_executor/test_entropy_utils.py b/tests/model_executor/test_entropy_utils.py new file mode 100644 index 00000000000..c5df901485c --- /dev/null +++ b/tests/model_executor/test_entropy_utils.py @@ -0,0 +1,268 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
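The expected values asserted in the entropy tests that follow, for example ~0.0024676 for logits [10.0, 1.0, 1.0] at temperature 1.0 and ~0.00031872 at temperature 0.8, are the Shannon entropy (in nats) of the softmax distribution over the temperature-scaled logits. A self-contained check of that definition (an independent sketch, not the entropy_utils implementation; it also ignores the -inf clipping exercised by the test_negative_inf_clip cases):

import numpy as np

def softmax_entropy(logits, temperature=1.0):
    # Entropy in nats of softmax(logits / temperature) along the last axis.
    z = np.asarray(logits, dtype=np.float64) / temperature
    z -= z.max(axis=-1, keepdims=True)  # numerical stability
    p = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    return -(p * np.log(np.clip(p, 1e-300, None))).sum(axis=-1)

print(softmax_entropy([10.0, 1.0, 1.0]))       # ~0.0024676
print(softmax_entropy([10.0, 1.0, 1.0], 0.8))  # ~0.00031872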
+ +import unittest + +import paddle + +from fastdeploy.model_executor.entropy_utils import ( + calculate_logits_entropy, + speculate_calculate_logits_entropy, +) + + +class TestCalculateLogitsEntropy(unittest.TestCase): + + def test_basic_functionality(self): + share_inputs = { + "seq_lens_this_time": paddle.to_tensor([[1], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], []], + "stop_flags": paddle.to_tensor([[False], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3"], + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, 1.0], + [1.0, 1.0, 10.0], + ], + dtype="float32", + ) + temperature = paddle.ones([3], dtype="float32") + + calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 1) + self.assertEqual(len(share_inputs["entropy_list"][1]), 0) + self.assertEqual(len(share_inputs["entropy_list"][2]), 1) + + self.assertAlmostEqual(share_inputs["entropy_list"][0][0], 0.0024676250759512186, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][2][0], 0.0024676250759512186, places=6) + + def test_temperature_effect(self): + share_inputs = { + "seq_lens_this_time": paddle.to_tensor([[1], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], []], + "stop_flags": paddle.to_tensor([[False], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3"], + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, 1.0], + [1.0, 1.0, 10.0], + ], + dtype="float32", + ) + temperature = paddle.to_tensor([[0.8], [1.0], [0.8]], dtype="float32") + + calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 1) + self.assertEqual(len(share_inputs["entropy_list"][1]), 0) + self.assertEqual(len(share_inputs["entropy_list"][2]), 1) + + self.assertAlmostEqual(share_inputs["entropy_list"][0][0], 0.0003187173861078918, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][2][0], 0.0003187173861078918, places=6) + + def test_entropy_list_clear(self): + share_inputs = { + "seq_lens_this_time": paddle.to_tensor([[1], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], []], + "stop_flags": paddle.to_tensor([[True], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3"], + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, 1.0], + [1.0, 1.0, 10.0], + ], + dtype="float32", + ) + temperature = paddle.to_tensor([[0.8], [1.0], [0.8]], dtype="float32") + + calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 0) + self.assertEqual(len(share_inputs["entropy_list"][1]), 0) + self.assertEqual(len(share_inputs["entropy_list"][2]), 1) + + self.assertAlmostEqual(share_inputs["entropy_list"][2][0], 0.0003187173861078918, places=6) + + def test_negative_inf_clip(self): + share_inputs = { + "seq_lens_this_time": paddle.to_tensor([[1], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], []], + 
"stop_flags": paddle.to_tensor([[False], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3"], + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, -float("inf")], + [1.0, 1.0, -float("inf")], + ], + dtype="float32", + ) + temperature = paddle.ones([3], dtype="float32") + + calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 1) + self.assertEqual(len(share_inputs["entropy_list"][1]), 0) + self.assertEqual(len(share_inputs["entropy_list"][2]), 1) + + self.assertAlmostEqual(share_inputs["entropy_list"][0][0], 0.0017332095885649323, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][2][0], 1.017357349395752, places=6) + + +class TestSpeculateCalculateLogitsEntropy(unittest.TestCase): + + def test_basic_functionality(self): + share_inputs = { + "seq_lens_this_time": paddle.to_tensor([[2], [2], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], [], []], + "stop_flags": paddle.to_tensor([[False], [False], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3", "req_4"], + "accept_num": paddle.to_tensor([2, 1, 0, 0], dtype="int32"), # 推理接受数量 + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, 1.0], + [1.0, 10.0, 1.0], + [1.0, 1.0, 10.0], + [1.0, 1.0, 10.0], + ], + dtype="float32", + ) + temperature = paddle.ones([3], dtype="float32") + + speculate_calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 2) + self.assertEqual(len(share_inputs["entropy_list"][1]), 1) + self.assertEqual(len(share_inputs["entropy_list"][2]), 0) + self.assertEqual(len(share_inputs["entropy_list"][3]), 0) + + self.assertAlmostEqual(share_inputs["entropy_list"][0][0], 0.0024676250759512186, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][0][1], 0.0024676250759512186, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][1][0], 0.0024676250759512186, places=6) + + def test_temperature_effect(self): + share_inputs = { + "seq_lens_this_time": paddle.to_tensor([[2], [2], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], [], []], + "stop_flags": paddle.to_tensor([[False], [False], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3", "req_4"], + "accept_num": paddle.to_tensor([2, 1, 0, 0], dtype="int32"), # 推理接受数量 + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, 1.0], + [1.0, 10.0, 1.0], + [1.0, 1.0, 10.0], + [1.0, 1.0, 10.0], + ], + dtype="float32", + ) + temperature = paddle.to_tensor([[0.8], [0.8], [0.8], [0.8]], dtype="float32") + + speculate_calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 2) + self.assertEqual(len(share_inputs["entropy_list"][1]), 1) + self.assertEqual(len(share_inputs["entropy_list"][2]), 0) + self.assertEqual(len(share_inputs["entropy_list"][3]), 0) + + self.assertAlmostEqual(share_inputs["entropy_list"][0][0], 0.0003187173861078918, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][0][1], 0.0003187173861078918, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][1][0], 0.0003187173861078918, places=6) + + def test_entropy_list_clear(self): + share_inputs = { 
+ "seq_lens_this_time": paddle.to_tensor([[2], [2], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], [], []], + "stop_flags": paddle.to_tensor([[True], [False], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3", "req_4"], + "accept_num": paddle.to_tensor([2, 1, 0, 0], dtype="int32"), # 推理接受数量 + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, 1.0], + [1.0, 10.0, 1.0], + [1.0, 1.0, 10.0], + [1.0, 1.0, 10.0], + ], + dtype="float32", + ) + temperature = paddle.ones([3], dtype="float32") + + speculate_calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 0) + self.assertEqual(len(share_inputs["entropy_list"][1]), 1) + self.assertEqual(len(share_inputs["entropy_list"][2]), 0) + self.assertEqual(len(share_inputs["entropy_list"][3]), 0) + + self.assertAlmostEqual(share_inputs["entropy_list"][1][0], 0.0024676250759512186, places=6) + + def test_negative_inf_clip(self): + share_inputs = { + "seq_lens_this_time": paddle.to_tensor([[1], [0], [15]], dtype="int32"), + "seq_lens_encoder": paddle.to_tensor([[0], [0], [15]], dtype="int32"), + "seq_lens_decoder": paddle.to_tensor([[30], [0], [15]], dtype="int32"), + "entropy_list": [[], [], []], + "stop_flags": paddle.to_tensor([[False], [True], [False]], dtype="bool"), + "req_ids": ["req_1", "req_2", "req_3"], + } + + logits = paddle.to_tensor( + [ + [10.0, 1.0, -float("inf")], + [1.0, 1.0, -float("inf")], + ], + dtype="float32", + ) + temperature = paddle.ones([3], dtype="float32") + + calculate_logits_entropy(logits, share_inputs, temperature) + + self.assertEqual(len(share_inputs["entropy_list"][0]), 1) + self.assertEqual(len(share_inputs["entropy_list"][1]), 0) + self.assertEqual(len(share_inputs["entropy_list"][2]), 1) + + self.assertAlmostEqual(share_inputs["entropy_list"][0][0], 0.0017332095885649323, places=6) + self.assertAlmostEqual(share_inputs["entropy_list"][2][0], 1.017357349395752, places=6) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/model_executor/test_tp_utils.py b/tests/model_executor/test_tp_utils.py index 97b6427ad4d..8953bb9637d 100644 --- a/tests/model_executor/test_tp_utils.py +++ b/tests/model_executor/test_tp_utils.py @@ -106,13 +106,13 @@ def _resolve_prefix_keys(cls, keys, _safetensor_keys): conversion_utils = types.ModuleType("paddleformers.transformers.conversion_utils") - def _split_or_merge_func(is_split, tensor_parallel_degree, tensor_parallel_rank, **_kwargs): + def _split_or_merge_func(is_split, tensor_model_parallel_size, tensor_parallel_rank, **_kwargs): axis = -1 def _fn(weight, *, is_column=True, **_kwargs): current_axis = axis if is_column else 0 if is_split: - chunks = np.array_split(weight, tensor_parallel_degree, axis=current_axis) + chunks = np.array_split(weight, tensor_model_parallel_size, axis=current_axis) if tensor_parallel_rank is None: return chunks return chunks[tensor_parallel_rank] @@ -396,7 +396,7 @@ def test_invalid_placeholder_raises(self): class GQATensorOpsTest(unittest.TestCase): def test_gqa_split_returns_all_partitions(self): func = _tp_utils.gqa_qkv_split_func( - tensor_parallel_degree=2, + tensor_model_parallel_size=2, tensor_parallel_rank=None, num_attention_heads=4, num_key_value_heads=2, @@ -411,7 +411,7 @@ def test_gqa_split_returns_all_partitions(self): def test_gqa_split_with_rank_and_repeat_kv(self): func = 
_tp_utils.gqa_qkv_split_func( - tensor_parallel_degree=2, + tensor_model_parallel_size=2, tensor_parallel_rank=1, num_attention_heads=2, num_key_value_heads=1, @@ -423,7 +423,7 @@ def test_gqa_split_with_rank_and_repeat_kv(self): def test_gqa_split_on_matrix_rows(self): func = _tp_utils.gqa_qkv_split_func( - tensor_parallel_degree=2, + tensor_model_parallel_size=2, tensor_parallel_rank=None, num_attention_heads=4, num_key_value_heads=2, @@ -454,7 +454,7 @@ def test_split_or_merge_qkv_dispatch(self): def test_split_or_merge_func_v1_row_bias(self): fn = _tp_utils.split_or_merge_func_v1( is_split=True, - tensor_parallel_degree=4, + tensor_model_parallel_size=4, tensor_parallel_rank=0, ) bias = np.ones(4, dtype=np.float32) @@ -464,7 +464,7 @@ def test_split_or_merge_func_v1_row_bias(self): def test_split_or_merge_func_v1_gqa_path(self): fn = _tp_utils.split_or_merge_func_v1( is_split=True, - tensor_parallel_degree=2, + tensor_model_parallel_size=2, tensor_parallel_rank=None, num_attention_heads=4, num_key_value_heads=2, @@ -477,7 +477,7 @@ def test_split_or_merge_func_v1_gqa_path(self): def test_split_or_merge_func_v1_default_path(self): fn = _tp_utils.split_or_merge_func_v1( is_split=False, - tensor_parallel_degree=2, + tensor_model_parallel_size=2, tensor_parallel_rank=None, num_attention_heads=4, ) diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py index bc8252a4427..0170bef1da6 100644 --- a/tests/model_loader/test_torch_model.py +++ b/tests/model_loader/test_torch_model.py @@ -140,7 +140,7 @@ def test_model_against_baseline( # Get baseline suffix from config model_config = hugging_face_model_param_map.get(model_name_or_path, {}) - baseline_suffix = model_config.get("baseline_suffix", "tp2") + baseline_suffix = model_config.get("baseline_suffix", "tp2-24") baseline_filename = f"{model_name_or_path}-{baseline_suffix}" if base_path: diff --git a/tests/operators/test_draft_model_preprocess.py b/tests/operators/test_draft_model_preprocess.py index 8bd735111a6..5a4e418d317 100644 --- a/tests/operators/test_draft_model_preprocess.py +++ b/tests/operators/test_draft_model_preprocess.py @@ -87,6 +87,8 @@ def draft_model_preprocess_kernel( is_block_step, batch_drop, pre_ids, + mask_rollback, + recompute_token_num, accept_tokens, accept_num, base_model_seq_lens_this_time, @@ -114,6 +116,7 @@ def draft_model_preprocess_kernel( base_model_seq_len_decoder = base_model_seq_lens_decoder[tid] base_model_seq_len_this_time = base_model_seq_lens_this_time[tid] pre_ids_now = pre_ids[tid] + recompute_token_num_now = recompute_token_num[tid] base_model_draft_tokens_now[1:base_model_draft_tokens_len] = -1 @@ -156,8 +159,10 @@ def draft_model_preprocess_kernel( step_idx[tid] = base_model_step_idx[tid] - base_model_seq_len_this_time else: # 2: Last base model generated token and first MTP token - seq_lens_decoder[tid] -= num_model_step - 1 - step_idx[tid] -= num_model_step - 1 + seq_lens_decoder[tid] -= recompute_token_num_now + step_idx[tid] -= recompute_token_num_now + mask_rollback[tid] += recompute_token_num_now + recompute_token_num[tid] = num_model_step - 1 for i in range(accept_num_now): draft_tokens_now[i] = accept_tokens_now[i] @@ -187,6 +192,8 @@ def DispatchRunner( is_block_step, batch_drop, pre_ids, + mask_rollback, + recompute_token_num, accept_tokens, accept_num, base_model_seq_lens_this_time, @@ -244,6 +251,8 @@ def DispatchRunner( is_block_step, batch_drop, pre_ids, + mask_rollback, + recompute_token_num, accept_tokens, accept_num, 
base_model_seq_lens_this_time, @@ -273,6 +282,8 @@ def draft_model_preprocess_ref( is_block_step, batch_drop, pre_ids, + mask_rollback, + recompute_token_num, accept_tokens, accept_num, base_model_seq_lens_this_time, @@ -301,6 +312,8 @@ def draft_model_preprocess_ref( is_block_step, batch_drop, pre_ids, + mask_rollback, + recompute_token_num, accept_tokens, accept_num, base_model_seq_lens_this_time, @@ -318,7 +331,7 @@ def draft_model_preprocess_ref( ) -class TestDraftModelPreprocess: +class TestDraftModelPreprocess(unittest.TestCase): def _run_tests(self): paddle.seed(2022) @@ -343,6 +356,8 @@ def _run_tests(self): not_need_stop = paddle.zeros([1], dtype="bool").cpu() is_block_step = paddle.zeros([bsz], dtype="bool") batch_drop = paddle.zeros([bsz], dtype="bool") + mask_rollback = paddle.zeros([bsz], dtype="int32") + recompute_token_num = paddle.zeros([bsz], dtype="int32") # Output tensors accept_tokens = paddle.randint(0, 100, [bsz, 100], dtype="int64") @@ -371,6 +386,8 @@ def _run_tests(self): is_block_step, batch_drop, pre_ids, + mask_rollback, + recompute_token_num, accept_tokens, accept_num, base_model_seq_lens_this_time, @@ -393,13 +410,8 @@ def _run_tests(self): def test_draft_model_preprocess(self): results1, results2 = self._run_tests() - np.testing.assert_allclose(results1[0], results2[0]) # draft_tokens - np.testing.assert_allclose(results1[1], results2[1]) # input_ids - np.testing.assert_allclose(results1[2], results2[2]) # stop_flags - np.testing.assert_allclose(results1[3], results2[3]) # seq_lens_this_time - np.testing.assert_allclose(results1[11], results2[11]) # accept_tokens - np.testing.assert_allclose(results1[12], results2[12]) # accept_num - np.testing.assert_allclose(results1[7], results2[7]) # not_need_stop + for i in range(12): + np.testing.assert_equal(results1[i].numpy(), results2[i].numpy()) if __name__ == "__main__": diff --git a/tests/operators/test_pre_cache_len_concat.py b/tests/operators/test_pre_cache_len_concat.py index 4844c1c712d..84389a104f5 100644 --- a/tests/operators/test_pre_cache_len_concat.py +++ b/tests/operators/test_pre_cache_len_concat.py @@ -69,7 +69,10 @@ def test_smoke_shapes(self): seq_lens_decoder_t = paddle.to_tensor(seq_lens_decoder, dtype="int32") seq_lens_this_time_t = paddle.to_tensor(seq_lens_this_time, dtype="int32") - outputs = pre_cache_len_concat(seq_lens_decoder_t, seq_lens_this_time_t, max_dec_len, block_size) + seq_lens_encoder_t = seq_lens_this_time_t + outputs = pre_cache_len_concat( + seq_lens_encoder_t, seq_lens_decoder_t, seq_lens_this_time_t, max_dec_len, block_size + ) cu_seqlens_k, batch_ids, tile_ids, num_blocks, kv_token_num = [out.numpy() for out in outputs] # Shape checks @@ -91,8 +94,11 @@ def test_strict_values_with_ref(self): seq_lens_decoder_t = paddle.to_tensor(seq_lens_decoder, dtype="int32") seq_lens_this_time_t = paddle.to_tensor(seq_lens_this_time, dtype="int32") + seq_lens_encoder_t = seq_lens_this_time_t - outputs = pre_cache_len_concat(seq_lens_decoder_t, seq_lens_this_time_t, max_dec_len, block_size) + outputs = pre_cache_len_concat( + seq_lens_encoder_t, seq_lens_decoder_t, seq_lens_this_time_t, max_dec_len, block_size + ) cu_seqlens_k, batch_ids, tile_ids, num_blocks, kv_token_num = [out.numpy() for out in outputs] # Reference implementation diff --git a/tests/operators/test_speculate_limit_thinking_content_length.py b/tests/operators/test_speculate_limit_thinking_content_length.py index aa36793f6c5..2f88c1572b6 100644 --- a/tests/operators/test_speculate_limit_thinking_content_length.py +++ 
b/tests/operators/test_speculate_limit_thinking_content_length.py @@ -36,7 +36,6 @@ def test_normal_thinking_phase_no_truncation(self): step_idx = paddle.to_tensor([5, 8], dtype="int64") limit_think_status = paddle.to_tensor([0, 0], dtype="int32") accept_num = paddle.to_tensor([3, 2], dtype="int32") - seq_lens_decoder = paddle.to_tensor([5, 8], dtype="int32") stop_flags = paddle.to_tensor([False, False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -48,7 +47,6 @@ def test_normal_thinking_phase_no_truncation(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -75,7 +73,6 @@ def test_force_truncation_when_exceeding_limit(self): step_idx = paddle.to_tensor([12], dtype="int64") limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([4], dtype="int32") - seq_lens_decoder = paddle.to_tensor([12], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -87,7 +84,6 @@ def test_force_truncation_when_exceeding_limit(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -99,9 +95,8 @@ def test_force_truncation_when_exceeding_limit(self): assert next_tokens.numpy()[0, 1] == 999 # Token at step 10, replaced with think_end_id assert accept_num.numpy()[0] == 2 # Only accept first 2 tokens assert limit_think_status.numpy()[0] == 2 # Status updated to 2 - # step_idx and seq_lens_decoder should be adjusted + # step_idx should be adjusted assert step_idx.numpy()[0] == 10 # 12 - (4-2) = 10 - assert seq_lens_decoder.numpy()[0] == 10 # 12 - (4-2) = 10 def test_model_naturally_generates_think_end_id(self): """Test when model naturally generates think_end_id in accepted tokens""" @@ -110,7 +105,6 @@ def test_model_naturally_generates_think_end_id(self): step_idx = paddle.to_tensor([5], dtype="int64") # step 3-5 limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([3], dtype="int32") - seq_lens_decoder = paddle.to_tensor([5], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -122,7 +116,6 @@ def test_model_naturally_generates_think_end_id(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -140,7 +133,6 @@ def test_disabled_feature_negative_max_think_len(self): step_idx = paddle.to_tensor([100], dtype="int64") limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([3], dtype="int32") - seq_lens_decoder = paddle.to_tensor([100], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -152,7 +144,6 @@ def test_disabled_feature_negative_max_think_len(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -170,7 +161,6 @@ def test_zero_accept_num_early_return(self): step_idx = paddle.to_tensor([10], dtype="int64") limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([0], dtype="int32") # No tokens accepted - seq_lens_decoder = paddle.to_tensor([10], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -182,7 +172,6 @@ def 
test_zero_accept_num_early_return(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -199,7 +188,6 @@ def test_already_in_response_phase_status_3(self): step_idx = paddle.to_tensor([10], dtype="int64") limit_think_status = paddle.to_tensor([3], dtype="int32") # Terminal status accept_num = paddle.to_tensor([2], dtype="int32") - seq_lens_decoder = paddle.to_tensor([10], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -211,7 +199,6 @@ def test_already_in_response_phase_status_3(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -228,7 +215,6 @@ def test_status_transition_from_0_to_1_to_2(self): step_idx = paddle.to_tensor([9], dtype="int64") # base step = 9-2+1 = 8 limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([2], dtype="int32") - seq_lens_decoder = paddle.to_tensor([9], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -239,7 +225,6 @@ def test_status_transition_from_0_to_1_to_2(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -258,7 +243,6 @@ def test_mixed_batch_with_different_states(self): step_idx = paddle.to_tensor([6, 8, 50], dtype="int64") limit_think_status = paddle.to_tensor([0, 0, 0], dtype="int32") accept_num = paddle.to_tensor([3, 3, 2], dtype="int32") - seq_lens_decoder = paddle.to_tensor([6, 8, 50], dtype="int32") stop_flags = paddle.to_tensor([False, False, False], dtype="bool") eos_token_ids = paddle.to_tensor([[2], [2]], dtype="int64") think_end_id = 999 @@ -270,7 +254,6 @@ def test_mixed_batch_with_different_states(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, eos_token_ids, think_end_id, @@ -302,7 +285,6 @@ def test_normal_thinking_phase_no_truncation(self): step_idx = paddle.to_tensor([5, 8], dtype="int64") limit_think_status = paddle.to_tensor([0, 0], dtype="int32") accept_num = paddle.to_tensor([3, 2], dtype="int32") - seq_lens_decoder = paddle.to_tensor([5, 8], dtype="int32") stop_flags = paddle.to_tensor([False, False], dtype="bool") think_end_id = 999 line_break_id = 888 @@ -314,7 +296,6 @@ def test_normal_thinking_phase_no_truncation(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -335,7 +316,6 @@ def test_force_truncation_with_sequence_injection(self): step_idx = paddle.to_tensor([12], dtype="int64") limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([5], dtype="int32") - seq_lens_decoder = paddle.to_tensor([12], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") think_end_id = 999 line_break_id = 888 @@ -347,7 +327,6 @@ def test_force_truncation_with_sequence_injection(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -358,7 +337,6 @@ def test_force_truncation_with_sequence_injection(self): assert limit_think_status.numpy()[0] == 1 assert accept_num.numpy()[0] == 1 # Truncated after 1st token assert step_idx.numpy()[0] == 8 # 12 - (5-1) - assert seq_lens_decoder.numpy()[0] == 8 def test_injection_sequence_steps(self): """Test each step of the injection sequence: \n, , \n, \n""" @@ -371,7 +349,6 @@ def 
test_injection_sequence_steps(self): step_idx = paddle.to_tensor([5], dtype="int64") # base_step = 5-1+1 = 5 limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([1], dtype="int32") - seq_lens_decoder = paddle.to_tensor([5], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") speculate_limit_thinking_content_length_v2( @@ -380,7 +357,6 @@ def test_injection_sequence_steps(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -393,7 +369,6 @@ def test_injection_sequence_steps(self): step_idx = paddle.to_tensor([6], dtype="int64") # base_step = 6 limit_think_status = paddle.to_tensor([1], dtype="int32") accept_num = paddle.to_tensor([1], dtype="int32") - seq_lens_decoder = paddle.to_tensor([6], dtype="int32") speculate_limit_thinking_content_length_v2( next_tokens, @@ -401,7 +376,6 @@ def test_injection_sequence_steps(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -414,7 +388,6 @@ def test_injection_sequence_steps(self): step_idx = paddle.to_tensor([7], dtype="int64") limit_think_status = paddle.to_tensor([1], dtype="int32") accept_num = paddle.to_tensor([1], dtype="int32") - seq_lens_decoder = paddle.to_tensor([7], dtype="int32") speculate_limit_thinking_content_length_v2( next_tokens, @@ -422,7 +395,6 @@ def test_injection_sequence_steps(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -435,7 +407,6 @@ def test_injection_sequence_steps(self): step_idx = paddle.to_tensor([8], dtype="int64") limit_think_status = paddle.to_tensor([1], dtype="int32") accept_num = paddle.to_tensor([1], dtype="int32") - seq_lens_decoder = paddle.to_tensor([8], dtype="int32") speculate_limit_thinking_content_length_v2( next_tokens, @@ -443,7 +414,6 @@ def test_injection_sequence_steps(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -458,7 +428,6 @@ def test_model_naturally_generates_think_end_id(self): step_idx = paddle.to_tensor([5], dtype="int64") limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([3], dtype="int32") - seq_lens_decoder = paddle.to_tensor([5], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") think_end_id = 999 line_break_id = 888 @@ -470,7 +439,6 @@ def test_model_naturally_generates_think_end_id(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -486,7 +454,6 @@ def test_status_2_to_status_3_transition(self): step_idx = paddle.to_tensor([10], dtype="int64") limit_think_status = paddle.to_tensor([2], dtype="int32") accept_num = paddle.to_tensor([1], dtype="int32") - seq_lens_decoder = paddle.to_tensor([10], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") think_end_id = 999 line_break_id = 888 @@ -498,7 +465,6 @@ def test_status_2_to_status_3_transition(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -514,7 +480,6 @@ def test_disabled_feature_negative_max_think_len(self): step_idx = paddle.to_tensor([100], dtype="int64") limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([2], dtype="int32") - seq_lens_decoder = paddle.to_tensor([100], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") think_end_id = 999 line_break_id = 888 @@ -526,7 +491,6 @@ 
def test_disabled_feature_negative_max_think_len(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -543,7 +507,6 @@ def test_zero_accept_num_early_return(self): step_idx = paddle.to_tensor([10], dtype="int64") limit_think_status = paddle.to_tensor([0], dtype="int32") accept_num = paddle.to_tensor([0], dtype="int32") - seq_lens_decoder = paddle.to_tensor([10], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") think_end_id = 999 line_break_id = 888 @@ -555,7 +518,6 @@ def test_zero_accept_num_early_return(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, @@ -572,7 +534,6 @@ def test_already_in_response_phase_status_3(self): step_idx = paddle.to_tensor([10], dtype="int64") limit_think_status = paddle.to_tensor([3], dtype="int32") accept_num = paddle.to_tensor([1], dtype="int32") - seq_lens_decoder = paddle.to_tensor([10], dtype="int32") stop_flags = paddle.to_tensor([False], dtype="bool") think_end_id = 999 line_break_id = 888 @@ -584,7 +545,6 @@ def test_already_in_response_phase_status_3(self): step_idx, limit_think_status, accept_num, - seq_lens_decoder, stop_flags, think_end_id, line_break_id, diff --git a/tests/output/test_process_batch_output.py b/tests/output/test_process_batch_output.py index 6dd8f51356b..46fed90fb05 100644 --- a/tests/output/test_process_batch_output.py +++ b/tests/output/test_process_batch_output.py @@ -117,6 +117,7 @@ def setup_token_processor(self, speculative_decoding=False, use_logprobs=False): cfg.speculative_config.method = "mtp" if speculative_decoding else None cfg.speculative_config.num_speculative_tokens = 1 cfg.model_config.enable_logprob = use_logprobs + cfg.speculative_config.enable_draft_logprob = True processor = TokenProcessor.__new__(TokenProcessor) processor.cfg = cfg @@ -134,17 +135,15 @@ def setup_token_processor(self, speculative_decoding=False, use_logprobs=False): processor.number_of_output_tokens = 0 processor.prefill_result_status = {} processor.use_logprobs = use_logprobs + processor.enable_draft_logprob = cfg.speculative_config.enable_draft_logprob processor.num_draft_tokens = 0 processor.num_accepted_tokens = 0 processor.num_emitted_tokens = 0 processor.max_num_emitted_tokens = 0 - processor.num_rest_requests_per_head = [ - 0, - ] * MAX_DRAFT_TOKENS - processor.num_accept_requests_per_head = [ - 0, - ] * MAX_DRAFT_TOKENS processor.speculative_stats_step = 0 + processor.total_step_per_request = {} + processor.accept_token_num_per_head_per_request = {} + processor.accept_token_num_per_head = [0] * MAX_DRAFT_TOKENS # processor._recycle_resources = Mock() diff --git a/tests/utils.py b/tests/utils.py index b6bf65317a8..ff178b594e5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -60,6 +60,7 @@ def __init__(self): self.model_format = "auto" self.enable_mm = False self.max_model_len = 512 + self.architectures = ["test_model"] def get_default_test_fd_config(): diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 82f06ef0ea8..4f491052067 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -38,6 +38,7 @@ def test_fdconfig_nnode(self): scheduler_config = SchedulerConfig({}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, @@ -59,6 +60,7 @@ def test_fdconfig_ips(self): scheduler_config = SchedulerConfig({}) model_config = 
Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, @@ -80,6 +82,7 @@ def test_fdconfig_max_num_tokens(self): scheduler_config = SchedulerConfig({}) model_config: Mock = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, @@ -119,6 +122,7 @@ def test_fdconfig_init_cache(self): scheduler_config.splitwise_role = "prefill" model_config: Mock = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, diff --git a/tests/utils/test_download.py b/tests/utils/test_download.py index 50b8e99c07d..15369f4671a 100644 --- a/tests/utils/test_download.py +++ b/tests/utils/test_download.py @@ -127,7 +127,7 @@ def test_init_bos_client_missing_envs(self): with self.assertRaises(Exception) as context: init_bos_client() - self.assertIn("BOS client validation error", str(context.exception)) + self.assertIn("Create BOSClient Error, Please check your ENV", str(context.exception)) os.environ.clear() diff --git a/tests/v1/cache_manager/test_prefix_cache.py b/tests/v1/cache_manager/test_prefix_cache.py index 8107d5597b2..1d4111f681a 100644 --- a/tests/v1/cache_manager/test_prefix_cache.py +++ b/tests/v1/cache_manager/test_prefix_cache.py @@ -33,6 +33,7 @@ def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_over model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196) speculative_cfg = SimpleNamespace(method=None) model_cfg.print = print + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) scheduler_cfg = SchedulerConfig(args) diff --git a/tests/v1/cache_manager/test_revert_blocks.py b/tests/v1/cache_manager/test_revert_blocks.py deleted file mode 100644 index 5c23f4faea8..00000000000 --- a/tests/v1/cache_manager/test_revert_blocks.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -from dataclasses import asdict -from types import SimpleNamespace - -from fastdeploy.cache_manager.cache_data import BlockNode -from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager -from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig -from fastdeploy.engine.args_utils import EngineArgs -from fastdeploy.engine.request import ImagePosition, Request -from fastdeploy.scheduler import SchedulerConfig - - -def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_override=100, max_num_batched_tokens=3200): - engine_args = EngineArgs( - max_num_seqs=max_num_seqs, - num_gpu_blocks_override=num_gpu_blocks_override, - max_num_batched_tokens=max_num_batched_tokens, - ) - args = asdict(engine_args) - cache_cfg = CacheConfig(args) - model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196) - speculative_cfg = SimpleNamespace(method=None) - model_cfg.print = print - cache_cfg.bytes_per_layer_per_block = 1 - parallel_cfg = ParallelConfig(args) - scheduler_cfg = SchedulerConfig(args) - graph_opt_cfg = engine_args.create_graph_optimization_config() - fd_config = FDConfig( - model_config=model_cfg, - cache_config=cache_cfg, - parallel_config=parallel_cfg, - graph_opt_config=graph_opt_cfg, - speculative_config=speculative_cfg, - scheduler_config=scheduler_cfg, - ) - return PrefixCacheManager(config=fd_config, tensor_parallel_size=8, splitwise_role="mixed") - - -class TestIsChunkedMMInput(unittest.TestCase): - def setUp(self): - self.cache_manager = make_prefix_cache_manager(max_num_seqs=3, enable_mm=True, num_gpu_blocks_override=100) - - def test_is_chunked_mm_input_none_input(self): - result, idx = self.cache_manager.is_chunked_mm_input(None, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_no_mm_positions(self): - mm_inputs = {"other_field": "value"} - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_empty_positions(self): - mm_inputs = {"mm_positions": []} - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_matched_in_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 8) - self.assertTrue(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_matched_in_second_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 25) - self.assertTrue(result) - self.assertEqual(idx, 1) - - def test_is_chunked_mm_input_before_first_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 3) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_after_last_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 35) - self.assertFalse(result) - self.assertEqual(idx, 0) - - -class TestRevertMatchBlocks(unittest.TestCase): - def setUp(self): - self.block_size = 64 - 
self.cache_manager = make_prefix_cache_manager(max_num_seqs=3, enable_mm=True, num_gpu_blocks_override=100) - - def make_match_blocks(self, gpu_block_num, cpu_block_num): - block_num = gpu_block_num + cpu_block_num - matched_token_num = block_num * self.block_size - match_node_ids = [] - matche_nodes = [] - match_gpu_block_ids = [] - match_cpu_block_ids = [] - for idx in range(block_num): - node_id = idx + 10 - block = BlockNode(node_id, [], 0, 0, idx, 0, None, None, None) - match_node_ids.append(node_id) - matche_nodes.append(block) - match_gpu_block_ids.append(idx) - - for _ in range(cpu_block_num): - match_cpu_block_ids.append(match_gpu_block_ids.pop()) - - gpu_match_token_num = len(match_gpu_block_ids) * self.block_size - cpu_match_token_num = len(match_cpu_block_ids) * self.block_size - return ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) - - def test_revert_full_blocks(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=0, length=1200)], - "mm_hashes": ["image1"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [-1] * 1200 + [2] * 120, - "prompt_token_ids_len": 1320, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=2, cpu_block_num=0) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=0, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 0) - self.assertEqual(cpu_match_token_num, 0) - self.assertEqual(len(match_node_ids), 0) - self.assertEqual(len(match_gpu_block_ids), 0) - - def test_revert_partial_block(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=120, length=1200)], - "mm_hashes": ["image1"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [1] * 120 + [-1] * 1200 + [2] * 120, - "prompt_token_ids_len": 1440, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=20, cpu_block_num=0) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=0, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 120) - self.assertEqual(cpu_match_token_num, 0) - self.assertEqual(len(match_node_ids), 2) - self.assertEqual(len(match_gpu_block_ids), 2) - - def test_revert_with_cpu_blocks(self): - 
-        # Setup test data
-        multimodal_inputs = {
-            "mm_positions": [ImagePosition(offset=120, length=1200), ImagePosition(offset=1440, length=420)],
-            "mm_hashes": ["image1", "image2"],
-        }
-        req_dict = {
-            "request_id": "req1",
-            "prompt_token_ids": [1] * 120 + [-1] * 1200 + [2] * 120 + [-1] * 420,
-            "prompt_token_ids_len": 1860,
-            "multimodal_inputs": multimodal_inputs,
-        }
-
-        (
-            matched_token_num,
-            match_node_ids,
-            matche_nodes,
-            match_gpu_block_ids,
-            match_cpu_block_ids,
-            gpu_match_token_num,
-            cpu_match_token_num,
-        ) = self.make_match_blocks(gpu_block_num=22, cpu_block_num=6)
-
-        # Call method
-        (
-            gpu_match_token_num,
-            cpu_match_token_num,
-            current_match_node,
-        ) = self.cache_manager._revert_match_blocks(
-            request=Request.from_dict(req_dict),
-            matched_token_num=matched_token_num,
-            block_size=self.block_size,
-            chunk_idx=1,
-            match_node_ids=match_node_ids,
-            matche_nodes=matche_nodes,
-            match_gpu_block_ids=match_gpu_block_ids,
-            match_cpu_block_ids=match_cpu_block_ids,
-            gpu_match_token_num=gpu_match_token_num,
-            cpu_match_token_num=cpu_match_token_num,
-            swap_node_ids=[],
-        )
-
-        # Assertions
-        self.assertEqual(gpu_match_token_num, 22 * self.block_size)
-        self.assertEqual(cpu_match_token_num, 32)
-        self.assertEqual(len(match_node_ids), 23)
-        self.assertEqual(len(match_gpu_block_ids), 22)
-        self.assertEqual(len(match_cpu_block_ids), 1)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/v1/test_resource_manager_v1.py b/tests/v1/test_resource_manager_v1.py
index 038a18b403e..21e38ddb2fd 100644
--- a/tests/v1/test_resource_manager_v1.py
+++ b/tests/v1/test_resource_manager_v1.py
@@ -9,7 +9,7 @@
 from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig
 from fastdeploy.engine.args_utils import EngineArgs
-from fastdeploy.engine.request import Request
+from fastdeploy.engine.request import ImagePosition, Request
 from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
@@ -27,6 +27,7 @@ def setUp(self):
         model_cfg = SimpleNamespace(enable_mm=True)  # Enable multimodal for feature testing
         speculative_cfg = SimpleNamespace(method=None)
         model_cfg.print = print
+        model_cfg.architectures = ["test_model"]
         model_cfg.max_model_len = 5120
         cache_cfg.bytes_per_layer_per_block = 1
         parallel_cfg = ParallelConfig(args)
@@ -172,5 +173,110 @@ def test_download_features_retry(self):
         self.assertEqual(self.request.error_code, 530)
 
 
+class TestRevertChunkedMMInput(unittest.TestCase):
+    def setUp(self):
+        max_num_seqs = 2
+        engine_args = EngineArgs(
+            max_num_seqs=max_num_seqs,
+            num_gpu_blocks_override=102,
+            max_num_batched_tokens=3200,
+        )
+        args = asdict(engine_args)
+
+        cache_cfg = CacheConfig(args)
+        model_cfg = SimpleNamespace(enable_mm=True)  # Enable multimodal for feature testing
+        speculative_cfg = SimpleNamespace(method=None)
+        model_cfg.print = print
+        model_cfg.max_model_len = 5120
+        model_cfg.architectures = ["test_model"]
+        cache_cfg.bytes_per_layer_per_block = 1
+        cache_cfg.block_size = 64
+        parallel_cfg = ParallelConfig(args)
+        scheduler_cfg = SchedulerConfig(args)
+        graph_opt_cfg = engine_args.create_graph_optimization_config()
+
+        fd_config = FDConfig(
+            model_config=model_cfg,
+            cache_config=cache_cfg,
+            parallel_config=parallel_cfg,
+            graph_opt_config=graph_opt_cfg,
+            speculative_config=speculative_cfg,
+            scheduler_config=scheduler_cfg,
+        )
+        self.manager = ResourceManagerV1(
+            max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"
+        )
+        req_dict = {
+            "request_id": "test_request",
"multimodal_inputs": {}, + } + self.request = Request.from_dict(req_dict) + self.request.async_process_futures = [] + self.request.multimodal_inputs = {} + + def test_revert_chunked_mm_input_none_input(self): + result = self.manager.revert_chunked_mm_input(None, 64) + self.assertEqual(result, 64) + + def test_revert_chunked_mm_input_no_mm_positions(self): + mm_inputs = {"other_field": "value"} + result = self.manager.revert_chunked_mm_input(mm_inputs, 128) + self.assertEqual(result, 128) + + def test_revert_chunked_mm_input_empty_positions(self): + mm_inputs = {"mm_positions": []} + result = self.manager.revert_chunked_mm_input(mm_inputs, 128) + self.assertEqual(result, 128) + + def test_revert_chunked_mm_input_matched_in_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=40, length=100), + ImagePosition(offset=200, length=80), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 256) + self.assertEqual(result, 192) + + def test_revert_chunked_mm_input_matched_in_second_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=100, length=100), + ImagePosition(offset=200, length=80), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 256) + self.assertEqual(result, 64) + + def test_revert_chunked_mm_input_before_first_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=60, length=100), + ImagePosition(offset=180, length=100), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 256) + self.assertEqual(result, 0) + + def test_revert_chunked_mm_input_after_last_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=200, length=56), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 256) + self.assertEqual(result, 256) + + def test_revert_chunked_mm_input_match_image_offset(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=64, length=21), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 64) + self.assertEqual(result, 64) + + if __name__ == "__main__": unittest.main() diff --git a/tests/xpu_ci/conftest.py b/tests/xpu_ci/conftest.py index 402ad5cad9f..be90386daa2 100644 --- a/tests/xpu_ci/conftest.py +++ b/tests/xpu_ci/conftest.py @@ -429,3 +429,28 @@ def restore_pd_env(original_values): else: os.environ[key] = original_values[key] print(f"恢复环境变量: {key}={original_values[key]}") + + +def setup_logprobs_env(): + """ + 设置logprobs相关环境变量 + + Returns: + dict: 原始环境变量值,用于后续恢复 + """ + env_vars = { + "FD_USE_GET_SAVE_OUTPUT_V1": "1", + } + os.system("sysctl -w kernel.msgmax=131072") + os.system("sysctl -w kernel.msgmnb=33554432") + + # 保存原始值 + original_values = {} + for key in env_vars: + original_values[key] = os.environ.get(key) + + # 设置新值 + for key, value in env_vars.items(): + os.environ[key] = value + print(f"设置环境变量: {key}={value}") + return original_values diff --git a/tests/xpu_ci/test_ep4tp1_online.py b/tests/xpu_ci/test_ep4tp1_online.py index 8acb5da3a98..c248b24e263 100644 --- a/tests/xpu_ci/test_ep4tp1_online.py +++ b/tests/xpu_ci/test_ep4tp1_online.py @@ -63,6 +63,7 @@ def test_ep4tp1_online(xpu_env): "--tensor-parallel-size", "1", "--enable-expert-parallel", + "--enable-prefix-caching", "--data-parallel-size", "4", "--max-model-len", diff --git a/tests/xpu_ci/test_ep4tp4_all2all.py b/tests/xpu_ci/test_ep4tp4_all2all.py index e2fbbf227d7..8c5a81e0be1 100644 --- a/tests/xpu_ci/test_ep4tp4_all2all.py +++ b/tests/xpu_ci/test_ep4tp4_all2all.py @@ -65,6 +65,7 @@ def test_ep4tp4_all2all(xpu_env): 
"--tensor-parallel-size", "4", "--enable-expert-parallel", + "--enable-prefix-caching", "--data-parallel-size", "1", "--max-model-len", diff --git a/tests/xpu_ci/test_ep4tp4_online.py b/tests/xpu_ci/test_ep4tp4_online.py index 3850b4efb2a..6f64016be49 100644 --- a/tests/xpu_ci/test_ep4tp4_online.py +++ b/tests/xpu_ci/test_ep4tp4_online.py @@ -64,6 +64,7 @@ def test_ep4tp4_online(xpu_env): "--tensor-parallel-size", "4", "--enable-expert-parallel", + "--enable-prefix-caching", "--data-parallel-size", "1", "--max-model-len", diff --git a/tests/xpu_ci/test_logprobs_21b_tp4.py b/tests/xpu_ci/test_logprobs_21b_tp4.py new file mode 100644 index 00000000000..b45283495ad --- /dev/null +++ b/tests/xpu_ci/test_logprobs_21b_tp4.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +V1模式测试 - ERNIE-4.5-21B-A3B 模型 + +测试配置: +- 模型: ERNIE-4.5-21B-A3B-Paddle +- 量化: wint8 +- Tensor Parallel: 4 +- 特性: enable-logprob +- 调用方式: 原生 HTTP(不使用 OpenAI SDK) +""" + +import pytest +import requests +from conftest import ( + get_model_path, + get_port_num, + print_logs_on_failure, + restore_env, + setup_logprobs_env, + start_server, +) + + +def test_logprobs_mode(xpu_env): + """logprobs 测试(HTTP 直连,不使用 SDK)""" + + print("\n============================开始 logprobs 测试!============================") + + port_num = get_port_num() + model_path = get_model_path() + + original_env = setup_logprobs_env() + + server_args = [ + "--model", + f"{model_path}/ERNIE-4.5-21B-A3B-Paddle", + "--port", + str(port_num), + "--engine-worker-queue-port", + str(port_num + 1), + "--metrics-port", + str(port_num + 2), + "--cache-queue-port", + str(port_num + 47873), + "--tensor-parallel-size", + "4", + "--num-gpu-blocks-override", + "16384", + "--max-model-len", + "32768", + "--max-num-seqs", + "128", + "--quantization", + "wint8", + "--gpu-memory-utilization", + "0.9", + "--enable-logprob", + "--no-enable-prefix-caching", + ] + + if not start_server(server_args): + pytest.fail("logprobs 服务启动失败") + + try: + url = f"http://127.0.0.1:{port_num}/v1/chat/completions" + + payload = { + "model": "default", + "messages": [{"role": "user", "content": "你好,你是谁?"}], + "temperature": 1, + "top_p": 0, + "max_tokens": 64, + "stream": False, + "logprobs": True, + "top_logprobs": 1, + "prompt_logprobs": 1, + } + + resp = requests.post(url, json=payload, timeout=300) + assert resp.status_code == 200, f"HTTP 请求失败: {resp.text}" + + response = resp.json() + print("\n完整返回:\n", response) + + # ======================== + # 基本返回结构 + # ======================== + assert "choices" in response + assert isinstance(response["choices"], list) + assert len(response["choices"]) > 0 + + choice = response["choices"][0] + + # ======================== + # message 结构 + # ======================== + assert "message" in choice + assert "content" in choice["message"] + assert isinstance(choice["message"]["content"], str) + assert len(choice["message"]["content"]) > 0 + + print(f"\n模型回复: 
{choice['message']['content']}") + + # ======================== + # completion logprobs + # ======================== + assert "logprobs" in choice + assert choice["logprobs"] is not None + + assert "content" in choice["logprobs"] + assert isinstance(choice["logprobs"]["content"], list) + assert len(choice["logprobs"]["content"]) > 0 + + for token_info in choice["logprobs"]["content"]: + assert "token" in token_info + assert "logprob" in token_info + assert "bytes" in token_info + assert "top_logprobs" in token_info + + assert isinstance(token_info["token"], str) + assert isinstance(token_info["logprob"], (int, float)) + assert isinstance(token_info["bytes"], list) + assert token_info["top_logprobs"] is None or isinstance(token_info["top_logprobs"], list) + + # ======================== + # prompt_logprobs(扩展字段) + # ======================== + assert "prompt_logprobs" in choice + assert isinstance(choice["prompt_logprobs"], list) + assert len(choice["prompt_logprobs"]) > 0 + + print("\nlogprobs 测试通过!") + + except Exception as e: + print(f"\nlogprobs 测试失败: {str(e)}") + print_logs_on_failure() + pytest.fail(f"logprobs 测试失败: {str(e)}") + + finally: + restore_env(original_env) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"])