16 changes: 7 additions & 9 deletions .github/configs/amd-master.yaml
@@ -1560,13 +1560,11 @@ dsv4-fp8-mi355x-vllm:
search-space:
- { tp: 8, conc-start: 1, conc-end: 1 }

# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
# the AITER sparse-attention kernel / multi-request path lands upstream.
# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
# Day-0 DeepSeek-V4 on ATOM (ROCm/ATOM#650) with local runtime overlays.
# dsv4_fp4_mi355x_atom.sh patches PR650 to give each request persistent DSv4
# KV/compressor/indexer cache slots, unblocking CONC>1 smoke coverage. The path
# still uses eager execution and per-sequence sparse attention, but batches
# attention projections, mHC, and MoE/FFN layer-by-layer across active requests.
dsv4-fp4-mi355x-atom:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
model: deepseek-ai/DeepSeek-V4-Pro
@@ -1579,8 +1577,8 @@ dsv4-fp4-mi355x-atom:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
22 changes: 10 additions & 12 deletions .github/workflows/claude.yml
@@ -161,8 +161,8 @@ jobs:
- If jobs cannot be run, say exactly what you could not run and why
- **Important** Modify perf-changelog.yaml for any config changes affecting performance

## Profiling (SGLang only)
When asked to profile a config, dispatch the `profile.yml` workflow. **Only SGLang configs can be profiled** — the profiler uses SGLang's `/start_profile` and `/stop_profile` HTTP endpoints. Reject profiling requests for vLLM, TRT, or other frameworks.
## Profiling
When asked to profile a config, dispatch the `profile.yml` workflow. SGLang, vLLM, and ATOM single-node configs can be profiled through their `/start_profile` and `/stop_profile` HTTP endpoints when the server is launched with the corresponding torch profiler directory. Reject profiling requests for TRT, disaggregated/multi-node configs, or other frameworks.
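For reference, the toggle sequence behind those endpoints looks roughly like this (a hand sketch: the port is illustrative, and it only works when the server was launched with the matching `*_TORCH_PROFILER_DIR` environment variable set):

```
# Sketch only: port and timing are illustrative.
curl -X POST http://localhost:30000/start_profile
# ...drive a small batch of requests...
curl -X POST http://localhost:30000/stop_profile  # trace is written to the profiler dir
```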

**Syntax:**
```
@@ -172,9 +172,10 @@
workflow_id="profile.yml",
ref="main",
inputs={
"config-key": "<config-key-ending-in-sglang>",
"config-key": "<config-key>",
"config-file": "<.github/configs/nvidia-master.yaml or amd-master.yaml>",
"conc": "<concurrency>"
"conc": "<concurrency>",
"seq-len": "<1k1k or 8k1k>"
}
)
```
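For manual testing, the same dispatch can be reproduced from a shell; a sketch assuming an authenticated `gh` CLI with workflow-dispatch permission on the repo:

```
gh workflow run profile.yml --ref main \
  -f config-key=dsv4-fp4-mi355x-atom \
  -f config-file=.github/configs/amd-master.yaml \
  -f conc=4 \
  -f seq-len=8k1k
```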
@@ -184,19 +185,16 @@ jobs:
- Model: "deepseek" / "dsr1" → model-prefix `dsr1`; "gptoss" → `gptoss`; "qwen" → `qwen3.5`
- Precision: "fp4" / "fp8" / "bf16"
- Runner/hardware: "b200", "h200", "h100", "mi300x", "mi325x", "mi355x", etc.
- Framework: must be "sglang" (reject if not)
- Framework: must be "sglang", "vllm", or "atom" (reject TRT and disaggregated/multi-node)
- Concurrency: "conc=N" → `"conc": "N"`. Default to `"64"` if not specified.
- Sequence length: default to `"1k1k"` unless the user asks for `"8k1k"`.

Construct the config-key as: `{model-prefix}-{precision}-{runner}-sglang`
Construct the config-key as: `{model-prefix}-{precision}-{runner}-{framework}`
Choose config-file: NVIDIA runners (b200, h200, h100, gb200, gb300) → `nvidia-master.yaml`; AMD runners (mi300x, mi325x, mi355x) → `amd-master.yaml`

**Available SGLang config keys:**
NVIDIA: `dsr1-fp4-b200-sglang`, `dsr1-fp8-b200-sglang`, `dsr1-fp8-h200-sglang`, `qwen3.5-bf16-b200-sglang`
AMD: `dsr1-fp4-mi355x-sglang`, `dsr1-fp8-mi300x-sglang`, `dsr1-fp8-mi325x-sglang`, `dsr1-fp8-mi355x-sglang`, `qwen3.5-bf16-mi355x-sglang`, `qwen3.5-fp8-mi355x-sglang`

**Examples:**
- "profile sglang b200 deepseek fp4 conc=4" → `config-key: dsr1-fp4-b200-sglang`, `config-file: .github/configs/nvidia-master.yaml`, `conc: 4`
- "profile sglang mi355x dsr1 fp8" → `config-key: dsr1-fp8-mi355x-sglang`, `config-file: .github/configs/amd-master.yaml`, `conc: 64`
- "profile sglang b200 deepseek fp4 conc=4" → `config-key: dsr1-fp4-b200-sglang`, `config-file: .github/configs/nvidia-master.yaml`, `conc: 4`, `seq-len: 1k1k`
- "profile atom mi355x dsv4 fp4 conc=4 8k1k" → `config-key: dsv4-fp4-mi355x-atom`, `config-file: .github/configs/amd-master.yaml`, `conc: 4`, `seq-len: 8k1k`

**After dispatch:**
Monitor with `mcp__github__get_workflow_run`. The profile workflow takes ~15-30 minutes. When complete, the **Perfetto relay link** is in the workflow run's step summary. Retrieve it with:
35 changes: 30 additions & 5 deletions .github/workflows/profile.yml
@@ -17,6 +17,14 @@ on:
required: false
type: string
default: '64'
seq-len:
description: "Sequence length config to profile"
required: false
type: choice
options:
- 1k1k
- 8k1k
default: 1k1k
moe-debug:
description: "Enable MoE debug patch and log (MOE_DEBUG_LOG)"
required: false
@@ -54,7 +62,7 @@ jobs:
name: Generate matrix via script
run: |
pip install pydantic
CLI_ARGS="test-config --config-files ${{ inputs.config-file }} --config-keys ${{ inputs.config-key }} --conc ${{ inputs.conc }}"
CLI_ARGS="test-config --config-files ${{ inputs.config-file }} --config-keys ${{ inputs.config-key }} --conc ${{ inputs.conc }} --seq-lens ${{ inputs.seq-len }}"
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py $CLI_ARGS)
echo "raw=$CONFIG_JSON" >> $GITHUB_OUTPUT

@@ -148,13 +156,16 @@ jobs:
ref: ${{ inputs.ref || github.sha }}
clean: false

- name: Launch + Profile (single-node sglang/vllm)
- name: Launch + Profile (single-node)
id: run
env:
RUNNER_NAME: ${{ runner.name }}
PROFILE: '1'
SGLANG_TORCH_PROFILER_DIR: /workspace/
VLLM_TORCH_PROFILER_DIR: /workspace/
ATOM_TORCH_PROFILER_DIR: /workspace/atom_profiles
PROFILE_NUM_STEPS: '1'
PROFILE_OUTPUT_LEN: '1'
VLLM_RPC_TIMEOUT: '1800000'
shell: bash
run: |
@@ -193,32 +204,46 @@
fi
else
echo "Profile trace not found: $trace_path" >&2
exit 1
fi

- name: Process result (json -> agg)
continue-on-error: true
env:
RUNNER_TYPE: ${{ matrix.config.runner }}
run: |
python3 utils/process_result.py

- name: Upload profile diagnostics
if: ${{ always() && env.RESULT_FILENAME != '' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: profile_diagnostics_${{ env.RESULT_FILENAME }}
path: |
${{ env.RESULT_FILENAME }}.json
agg_${{ env.RESULT_FILENAME }}.json
server.log
gpu_metrics.csv
if-no-files-found: ignore

- name: Upload profile as artifact
if: ${{ steps.run.outputs.trace != '' }}
if: ${{ always() && steps.run.outputs.trace != '' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: profile_${{ env.RESULT_FILENAME }}
path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz
if-no-files-found: ignore

- name: Upload TP-0-DECODE trace as artifact
if: ${{ steps.run.outputs.tp0_decode != '' }}
if: ${{ always() && steps.run.outputs.tp0_decode != '' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: profile_${{ env.RESULT_FILENAME }}_TP0_DECODE
path: ${{ steps.run.outputs.tp0_decode }}
if-no-files-found: ignore

- name: Upload TP-0-EXTEND trace as artifact
if: ${{ steps.run.outputs.tp0_extend != '' }}
if: ${{ always() && steps.run.outputs.tp0_extend != '' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: profile_${{ env.RESULT_FILENAME }}_TP0_EXTEND
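Once a run completes, the uploaded traces can be fetched locally; a minimal sketch, assuming a run ID and the artifact names produced by the upload steps above (`RESULT_FILENAME` stands in for the value computed inside the job):

```
# Run ID and RESULT_FILENAME are placeholders.
gh run download 1234567890 -n profile_${RESULT_FILENAME}
gh run download 1234567890 -n profile_diagnostics_${RESULT_FILENAME}
gunzip profile_${RESULT_FILENAME}.trace.json.gz  # then open in Perfetto
```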
126 changes: 110 additions & 16 deletions benchmarks/benchmark_lib.sh
@@ -324,15 +324,18 @@ run_benchmark_serving() {
fi

# Profiling support: when PROFILE=1, ensure profiler dir exists, add --profile flag,
# and cap num_prompts to keep traces small.
# and cap the run to a tiny one-step window by default.
local profile_flag=()
if [[ "${PROFILE:-}" == "1" ]]; then
local _prof_dir="${SGLANG_TORCH_PROFILER_DIR:-${VLLM_TORCH_PROFILER_DIR:-}}"
if [[ -n "$_prof_dir" ]]; then
mkdir -p "$_prof_dir"
fi
local _prof_dir=""
for _prof_dir in "${SGLANG_TORCH_PROFILER_DIR:-}" "${VLLM_TORCH_PROFILER_DIR:-}" "${ATOM_TORCH_PROFILER_DIR:-}"; do
if [[ -n "$_prof_dir" ]]; then
mkdir -p "$_prof_dir"
fi
done
profile_flag+=(--profile)
num_prompts="$max_concurrency"
num_prompts="${PROFILE_NUM_PROMPTS:-$max_concurrency}"
output_len="${PROFILE_OUTPUT_LEN:-${PROFILE_NUM_STEPS:-1}}"
fi
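Taken together, a one-step profiled run can be driven entirely through environment knobs; a minimal sketch (values are illustrative, and the positional arguments to `run_benchmark_serving` are elided):

```
# Illustrative: capture a tiny trace with one generated token per request.
PROFILE=1 \
PROFILE_NUM_PROMPTS=4 \
PROFILE_OUTPUT_LEN=1 \
run_benchmark_serving ...  # usual benchmark arguments
```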

# Build benchmark command
@@ -415,6 +418,15 @@ run_benchmark_serving() {
# Profiling trace helpers
# --------------------------------

setup_atom_profile_args() {
ATOM_PROFILE_ARGS=()
if [[ "${PROFILE:-}" == "1" ]]; then
ATOM_TORCH_PROFILER_DIR=${ATOM_TORCH_PROFILER_DIR:-/workspace/atom_profiles}
mkdir -p "$ATOM_TORCH_PROFILER_DIR"
ATOM_PROFILE_ARGS+=(--torch-profiler-dir "$ATOM_TORCH_PROFILER_DIR")
fi
}
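A launch script would then splice the generated flags into the server command; a hedged sketch in which the ATOM launcher invocation itself is hypothetical:

```
# Hypothetical launcher command; only the arg-splicing pattern is the point.
setup_atom_profile_args
atom-serve "$MODEL" --tensor-parallel-size 8 "${ATOM_PROFILE_ARGS[@]}" &
```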

_find_latest_profile_trace() {
local latest=""
local dir="" candidate="" base=""
@@ -424,6 +436,9 @@ _find_latest_profile_trace() {
search_roots=()
if [[ -d "$dir" ]]; then
search_roots+=("$dir")
while IFS= read -r -d '' candidate; do
search_roots+=("$candidate")
done < <(find "$dir" -mindepth 1 -maxdepth 1 -type d -print0 2>/dev/null)
fi
if [[ -d "$dir/profiles" ]]; then
search_roots+=("$dir/profiles")
Expand Down Expand Up @@ -463,11 +478,12 @@ move_profile_trace_for_relay() {

local sglang_dir="${SGLANG_TORCH_PROFILER_DIR:-/workspace}"
local vllm_dir="${VLLM_TORCH_PROFILER_DIR:-/workspace}"
local atom_dir="${ATOM_TORCH_PROFILER_DIR:-/workspace}"
local -a search_dirs=()
local dir="" existing=""
local seen=0

for dir in "$sglang_dir" "$vllm_dir" "/workspace"; do
for dir in "$sglang_dir" "$vllm_dir" "$atom_dir" "/workspace"; do
if [[ -z "$dir" ]]; then
continue
fi
@@ -538,7 +554,7 @@ _patch_lm_eval() {
patch_dir="$(mktemp -d)"
cat > "$patch_dir/sitecustomize.py" <<'PY'
# --- Patch LocalChatCompletion.parse_generations to handle empty content with reasoning_content ---
import re, sys, unicodedata, json
import os, re, sys, unicodedata, json
from lm_eval.filters import extraction as ex
from lm_eval.models.openai_completions import LocalChatCompletion as _LCC

@@ -565,7 +581,7 @@ def _le_parse_generations(outputs, **kwargs):
# Keep staticmethod semantics
_LCC.parse_generations = staticmethod(_le_parse_generations)

# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" for TRT ---
# --- Patch TemplateAPI.apply_chat_template ---
try:
from lm_eval.models import api_models as _api_models
_TemplateAPI = _api_models.TemplateAPI
@@ -576,13 +592,65 @@

if _TemplateAPI is not None and _JsonChatStr is not None:
_orig_apply_chat_template = _TemplateAPI.apply_chat_template
_dsv4_encode_messages = None

def _content_to_text(content):
if isinstance(content, str):
return content
if isinstance(content, list):
parts = []
for item in content:
if isinstance(item, dict):
parts.append(str(item.get("text", item.get("content", ""))))
else:
parts.append(str(item))
return "\n".join(part for part in parts if part)
if content is None:
return ""
return str(content)

def _load_dsv4_encoder():
global _dsv4_encode_messages
if _dsv4_encode_messages is not None:
return _dsv4_encode_messages

roots = [
os.environ.get("INFMAX_WORKSPACE"),
os.environ.get("GITHUB_WORKSPACE"),
os.getcwd(),
"/workspace",
"/infmax-workspace",
]
for root in roots:
if not root:
continue
candidate = os.path.join(root, "utils", "bench_serving")
if os.path.exists(os.path.join(candidate, "encoding_dsv4.py")) and candidate not in sys.path:
sys.path.insert(0, candidate)

from encoding_dsv4 import encode_messages

_dsv4_encode_messages = encode_messages
return _dsv4_encode_messages

def _apply_dsv4_chat_template(chat_history):
encode_messages = _load_dsv4_encoder()
messages = []
for item in chat_history:
normalized = {**item}
normalized.pop("type", None)
normalized["content"] = _content_to_text(normalized.get("content"))
messages.append(normalized)
return encode_messages(messages, thinking_mode="thinking")

def _patched_apply_chat_template(
self,
chat_history,
add_generation_prompt: bool = True,
):
"""Applies a chat template to a list of chat history between user and model."""
if os.environ.get("EVAL_DSV4_CHAT_TEMPLATE") == "1":
return _apply_dsv4_chat_template(chat_history)
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
chat_history,
@@ -673,7 +741,8 @@ run_lm_eval() {
local eval_context_len="${EVAL_MAX_MODEL_LEN:-16384}"
local temperature=0
local top_p=1
local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-64}"
local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-${CONC:-64}}"
local eval_limit="${EVAL_LIMIT:-}"

while [[ $# -gt 0 ]]; do
case $1 in
@@ -683,17 +752,36 @@
--gen-max-tokens) eval_context_len="$2"; shift 2 ;;
--temperature) temperature="$2"; shift 2 ;;
--top-p) top_p="$2"; shift 2 ;;
--limit) eval_limit="$2"; shift 2 ;;
*) echo "Unknown parameter: $1"; return 1 ;;
esac
done

_install_lm_eval_deps
_patch_lm_eval

local openai_server_base="http://0.0.0.0:${port}"
local openai_chat_base="${openai_server_base}/v1/chat/completions"
local openai_completions_base="${openai_server_base}/v1/completions"
export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL
export MODEL_NAME="${MODEL_NAME:-$MODEL}" # Prefer MODEL_NAME, else MODEL

local lm_eval_model="local-chat-completions"
local lm_eval_base_url="$openai_chat_base"
local lm_eval_eos_string="${EVAL_EOS_STRING:-</s>}"
local lm_eval_tokenizer_args="tokenized_requests=False"

if [[ "${MODEL_PREFIX:-}" == "dsv4" || "${MODEL_NAME:-}" == *"DeepSeek-V4"* || "${MODEL:-}" == *"DeepSeek-V4"* ]]; then
export EVAL_DSV4_CHAT_TEMPLATE=1
lm_eval_model="local-completions"
lm_eval_base_url="$openai_completions_base"
lm_eval_eos_string="${EVAL_EOS_STRING:-<|end▁of▁sentence|>}"
lm_eval_tokenizer_args="tokenizer_backend=None,tokenized_requests=False"
eval_limit="${eval_limit:-40}"
echo "Using DeepSeek-V4 eval prompt encoding via utils/bench_serving/encoding_dsv4.py"
else
unset EVAL_DSV4_CHAT_TEMPLATE
fi

_install_lm_eval_deps
_patch_lm_eval

# Cap output tokens: must fit within context window (leave room for input),
# and avoid excessive KV cache reservation per request on TRT.
Expand All @@ -705,12 +793,18 @@ run_lm_eval() {

# Export for append_lm_eval_summary to pick up
export EVAL_RESULT_DIR="$results_dir"
local limit_args=()
if [ -n "$eval_limit" ]; then
limit_args=(--limit "$eval_limit")
echo "Eval sample limit: ${eval_limit}"
fi
set -x
python3 -m lm_eval --model local-chat-completions --apply_chat_template \
python3 -m lm_eval --model "${lm_eval_model}" --apply_chat_template \
--tasks "${tasks_dir}" \
--output_path "${results_dir}" \
--log_samples \
--model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=</s>,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${eval_context_len}" \
"${limit_args[@]}" \
--model_args "model=${MODEL_NAME},base_url=${lm_eval_base_url},api_key=${OPENAI_API_KEY},eos_string=${lm_eval_eos_string},max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,${lm_eval_tokenizer_args},max_length=${eval_context_len}" \
--gen_kwargs "max_tokens=${max_output_tokens},temperature=${temperature},top_p=${top_p}"
local eval_exit=$?
set +x
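A hedged invocation sketch for the DeepSeek-V4 path, using only the flags visible in the case statement above and environment variables the function actually reads:

```
# Illustrative: triggers the local-completions + DSv4 encoding branch.
MODEL=deepseek-ai/DeepSeek-V4-Pro MODEL_PREFIX=dsv4 CONC=16 \
  run_lm_eval --temperature 0 --top-p 1 --limit 40
```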