From c732b993de05c26ad84d0c9476399ff4c3db6842 Mon Sep 17 00:00:00 2001
From: Jun Lin <junlin12@amd.com>
Date: Thu, 23 Apr 2026 21:20:48 +0000
Subject: [PATCH 1/8] feat: add Step-3.5-Flash support and fix MoE weight
 shuffling on gfx950
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add Step3p5ForCausalLM model support for the Step-3.5-Flash architecture,
and fix a critical MoE correctness bug on gfx950 (MI350X).

Core MoE fix (atom/model_ops/moe.py):
  Previously skipped shuffle_weights() for gfx950 BF16 g1u1 based on the
  incorrect assumption that the CK 2-stage preshuffle_off (NSwizzle=0)
  kernel expects un-shuffled weights. Verified: preshuffle_off GEMM is
  wrong on gfx950; preshuffle_on (NSwizzle=1) is correct. Always call
  shuffle_weights() so the correct kernel path is selected.

Step-3.5-Flash model support (atom/models/step3p5.py):
  - Mixed full/sliding window attention (per layer_types config)
  - 288 routed + 1 shared expert MoE with sigmoid routing
  - Per-layer SwigluStep activation: layers with swiglu_limits[i]>0 use
    ActivationType.SwigluStep (CK kernel applies silu(g).clamp(7)*up.clamp(±7));
    other layers use plain Silu. Shared expert at SwigluStep layers is kept
    on the dense MLP path (kernel clamp is routed-expert-only).
  - Fused expert loading (flat [E,I,H] checkpoint format)
  - clamp_limit applied to dense MLP and shared expert via Step3p5MLP

atom/model_engine/model_runner.py:
  - Register Step3p5ForCausalLM architecture
  - Handle num_attention_groups config key (Step-3.5 uses this instead of
    num_key_value_heads) in KV head count calculations

atom/model_loader/loader.py:
  - Fix fused expert detection order: check before packed_modules_mapping
    to prevent moe.gate_proj being matched as gate_up_proj

atom/model_ops/attentions/aiter_attention.py:
  - Handle num_attention_groups config key for KV head count

atom/examples/simple_inference.py:
  - Add --max-tokens arg and trust_remote_code support

Verified: tp=2 Step-3.5-Flash inference, 4 prompts, no NaN/crash,
coherent output (with ATOM_STEP3P5_NO_SLIDING=1 workaround for
pa_decode_gluon bug on gfx950, tracked separately).

Co-Authored-By: Jun Lin <junlin12@amd.com>
---
 atom/examples/simple_inference.py            |  12 +-
 atom/model_engine/model_runner.py            |  25 +-
 atom/model_loader/loader.py                  |  11 +
 atom/model_ops/attentions/aiter_attention.py |   4 +-
 atom/model_ops/moe.py                        |  22 +-
 atom/models/step3p5.py                       | 918 +++++++++++++++++++
 6 files changed, 976 insertions(+), 16 deletions(-)
 create mode 100644 atom/models/step3p5.py

diff --git a/atom/examples/simple_inference.py b/atom/examples/simple_inference.py
index e4b37173c..d611fc9f2 100644
--- a/atom/examples/simple_inference.py
+++ b/atom/examples/simple_inference.py
@@ -19,6 +19,9 @@
 parser.add_argument(
     "--temperature", type=float, default=0.6, help="temperature for sampling"
 )
+parser.add_argument(
+    "--max-tokens", type=int, default=256, help="max tokens to generate"
+)
 
 
 def generate_cuda_graph_sizes(max_size):
@@ -46,9 +49,11 @@ def main():
     engine_args = EngineArgs.from_cli_args(args)
     llm = engine_args.create_engine()
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model, trust_remote_code=getattr(args, "trust_remote_code", False)
+    )
 
-    sampling_params = SamplingParams(temperature=args.temperature, max_tokens=256)
+    sampling_params = SamplingParams(temperature=args.temperature, max_tokens=args.max_tokens)
 
     prompts = [
         tokenizer.apply_chat_template(
@@ -60,9 +65,6 @@ def main():
         for prompt in prompts
     ]
     print("This is prompts:", prompts)
-    # print("Warming up...")
-    # _ = llm.generate(["warmup"], sampling_params)
-    # print("Warm up done")
 
     print("\n" + "=" * 70)
     print("Starting profiling...")
diff --git a/atom/model_engine/model_runner.py b/atom/model_engine/model_runner.py
index bbd2d061c..1869d1b6f 100644
--- a/atom/model_engine/model_runner.py
+++ b/atom/model_engine/model_runner.py
@@ -66,6 +66,7 @@
     "Qwen3_5MoeForConditionalGeneration": "atom.models.qwen3_5.Qwen3_5MoeForConditionalGenerationTextOnly",
     "KimiK25ForConditionalGeneration": "atom.models.kimi_k25.KimiK25ForCausalLM",
     "MiniMaxM2ForCausalLM": "atom.models.minimax_m2.MiniMaxM2ForCausalLM",
+    "Step3p5ForCausalLM": "atom.models.step3p5.Step3p5ForCausalLM",
     "MiMoV2FlashForCausalLM": "atom.models.mimo_v2_flash.MiMoV2FlashForCausalLM",
 }
 # seed = 34567
@@ -1027,11 +1028,15 @@ def allocate_forward_vars(self):
     def _get_num_kv_heads(self):
         """Return the per-rank number of KV heads."""
         hf_config = self.config.hf_config
-        if hf_config.num_key_value_heads >= self.world_size:
-            assert hf_config.num_key_value_heads % self.world_size == 0
-            return hf_config.num_key_value_heads // self.world_size
+        num_kv_heads_cfg = getattr(
+            hf_config, "num_key_value_heads",
+            getattr(hf_config, "num_attention_groups", None)
+        )
+        if num_kv_heads_cfg >= self.world_size:
+            assert num_kv_heads_cfg % self.world_size == 0
+            return num_kv_heads_cfg // self.world_size
         else:
-            assert self.world_size % hf_config.num_key_value_heads == 0
+            assert self.world_size % num_kv_heads_cfg == 0
             return 1
 
     def _get_total_num_layers(self):
@@ -1321,11 +1326,15 @@ def allocate_kv_cache(self, num_kvcache_blocks):
         self.num_physical_kvcache_blocks = (
             num_kvcache_blocks * self.attn_metadata_builder.block_ratio
         )
-        if hf_config.num_key_value_heads >= self.world_size:
-            assert hf_config.num_key_value_heads % self.world_size == 0
-            num_kv_heads = hf_config.num_key_value_heads // self.world_size
+        num_kv_heads_cfg = getattr(
+            hf_config, "num_key_value_heads",
+            getattr(hf_config, "num_attention_groups", None)
+        )
+        if num_kv_heads_cfg >= self.world_size:
+            assert num_kv_heads_cfg % self.world_size == 0
+            num_kv_heads = num_kv_heads_cfg // self.world_size
         else:
-            assert self.world_size % hf_config.num_key_value_heads == 0
+            assert self.world_size % num_kv_heads_cfg == 0
             num_kv_heads = 1
 
         # Calculate total number of layers (target + draft)
diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py
index 219b93f59..7ebbba8a4 100644
--- a/atom/model_loader/loader.py
+++ b/atom/model_loader/loader.py
@@ -337,10 +337,21 @@ def extract_expert_target_and_id(name: str) -> Tuple[str, int] | None:
                     maybe_matching_name,
                     f"mlp.experts.{hf_config.n_routed_experts}.",
                 )
+            # Check fused expert format before packed_modules_mapping to avoid
+            # expert weights (e.g. moe.gate_proj) being incorrectly matched
+            # by packed_modules_mapping entries (e.g. gate_proj -> gate_up_proj).
+            if detect_fused_expert_fn is not None and not is_fused_expert:
+                if detect_fused_expert_fn(name):
+                    is_fused_expert = True
+                    if get_fused_expert_mapping_fn is not None:
+                        fused_expert_params_mapping = get_fused_expert_mapping_fn()
             for k in packed_modules_mapping:
                 # We handle the experts below in expert_params_mapping
                 if "mlp.experts." in name and name not in params_dict:
                     continue
+                # Skip fused expert weights — handled below in expert loading path
+                if is_fused_expert and detect_fused_expert_fn is not None and detect_fused_expert_fn(name):
+                    continue
                 if k in name:
                     packed_value = packed_modules_mapping[k]
                     # Handle both tuple (fuse parameter) and list (shard parameter)
diff --git a/atom/model_ops/attentions/aiter_attention.py b/atom/model_ops/attentions/aiter_attention.py
index a8aa3a658..917987834 100644
--- a/atom/model_ops/attentions/aiter_attention.py
+++ b/atom/model_ops/attentions/aiter_attention.py
@@ -83,7 +83,7 @@ def __init__(
         else:
             max_qlen = 1
 
-        num_head_k = max(1, hf_config.num_key_value_heads // get_tp_group().world_size)
+        num_head_k = max(1, getattr(hf_config, "num_key_value_heads", getattr(hf_config, "num_attention_groups", None)) // get_tp_group().world_size)
         (
             (work_meta_data_size, work_meta_data_type),
             (work_indptr_size, work_indptr_type),
@@ -240,7 +240,7 @@ def set_aiter_persistent_worker_buffers(self, bs: int):
         hf_config = config.hf_config
         num_query_heads = self.num_attention_heads
         num_kv_heads = max(
-            1, hf_config.num_key_value_heads // get_tp_group().world_size
+            1, getattr(hf_config, "num_key_value_heads", getattr(hf_config, "num_attention_groups", None)) // get_tp_group().world_size
         )
         block_size = self.block_size
 
diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index 2315d371c..0c182359c 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -485,7 +485,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         layer.w13_weight = atom_parameter(self._maybe_pad_weight(layer.w13_weight.data))
         layer.w2_weight = atom_parameter(self._maybe_pad_weight(layer.w2_weight.data))
-        # reshaping weights is required for aiter moe kernel.
+        # Shuffle weights for CK/ASM kernels.
+        # Previously skipped for gfx950 bf16 g1u1 on the assumption that the CK
+        # 2-stage preshuffle_off (NSwizzle=0) kernel expected un-shuffled weights.
+        # Verified 2026-04-23: preshuffle_off GEMM is wrong on gfx950; preshuffle_on
+        # (NSwizzle=1) is correct. Always shuffle so the right kernel path is used.
         shuffle_weights(layer.w13_weight, layer.w2_weight)
 
     def get_fused_moe_quant_config(
@@ -998,6 +1002,13 @@ def apply(
                     global_num_experts=global_num_experts,
                     expert_map=expert_map,
                 )
+                _tp_size = getattr(self, 'tp_size', getattr(self.moe, 'tp_size', 1)) if hasattr(self, 'moe') else getattr(self, 'tp_size', 1)
+                _ep_size = getattr(self, 'ep_size', getattr(self.moe, 'ep_size', 1)) if hasattr(self, 'moe') else getattr(self, 'ep_size', 1)
+                if layer.reduce_results and (_tp_size > 1 or _ep_size > 1):
+                    from aiter.dist.parallel_state import get_tp_group
+                    _moe_result = get_tp_group().all_reduce(
+                        _moe_result, ca_fp8_quant=False
+                    )
                 return _moe_result
 
             assert (
@@ -2556,6 +2567,14 @@ def select_experts(
         routed_scaling_factor: float = 1.0,
     ):
 
+        # Model-provided custom routing (e.g. sigmoid + bias + scaling for Step-3.5)
+        if custom_routing_function is not None:
+            return custom_routing_function(
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+            )
+
         # DeekSeekv2 uses grouped_top_k
         if use_grouped_topk:
             assert topk_group is not None
@@ -2697,6 +2716,7 @@ def forward_impl(self, hidden_states: torch.Tensor, router_logits: torch.Tensor)
             hidden_states = naive_multicast(hidden_states, cu_tokens_across_dp_cpu)
             router_logits = naive_multicast(router_logits, cu_tokens_across_dp_cpu)
 
+
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
diff --git a/atom/models/step3p5.py b/atom/models/step3p5.py
new file mode 100644
index 000000000..c5e97c995
--- /dev/null
+++ b/atom/models/step3p5.py
@@ -0,0 +1,918 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Inference-only Step-3.5 (Flash) model.
+
+Step-3.5 is a sparse MoE transformer with:
+  - 45 decoder layers, hidden_size=4096, head_dim=128
+  - GQA with two attention configs: full_attention (64 q heads, 8 kv groups)
+    and sliding_attention (96 q heads, 8 kv groups, window=512)
+  - 3:1 sliding window pattern (1 full + 3 sliding)
+  - Per-layer rope_theta and partial_rotary_factor
+  - QK RMSNorm (zero-centered, i.e. weight * (1 + param))
+  - Head-wise attention gating via g_proj (sigmoid)
+  - MoE on layers 3-44: 288 routed experts + 1 shared expert, top-8,
+    sigmoid routing with learnable router bias
+  - Dense MLP on layers 0-2
+  - Per-layer SwiGLU clamp limits
+  - Multi-token prediction (MTP) with num_nextn_predict_layers=3
+"""
+
+import os
+from typing import Any, Optional, Union
+
+import torch
+from aiter import ActivationType
+from aiter.dist.parallel_state import get_pp_group, get_tensor_model_parallel_world_size
+from aiter.rotary_embedding import get_rope
+from atom.config import Config, QuantizationConfig
+from atom.model_ops.activation import SiluAndMul
+from atom.model_ops.base_attention import Attention
+from atom.model_ops.embed_head import ParallelLMHead, VocabParallelEmbedding
+from atom.model_ops.layernorm import GemmaRMSNorm as Step3p5RMSNorm
+from atom.model_ops.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from atom.model_ops.moe import FusedMoE
+from atom.model_ops.topK import is_rocm_aiter_fusion_shared_expert_enabled
+from atom.models.utils import (
+    IntermediateTensors,
+    PPMissingLayer,
+    extract_layer_index,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+from atom.utils.decorators import support_torch_compile
+from torch import nn
+from transformers import PretrainedConfig
+
+
+def _uses_swiglustep_at_layer(config: PretrainedConfig, layer_idx: Optional[int]) -> bool:
+    """Return True iff the routed FusedMoE at this layer needs the SwigluStep
+    activation (i.e. ``swiglu_limits[layer_idx] > 0``).
+
+    The CK kernel hard-codes the clamp at 7.0; Step-3.5-Flash uses 7.0 at
+    layers 43 and 44, which is why the kernel is only valid at those layers.
+    Other layers must keep the plain Silu path.
+    """
+    if layer_idx is None:
+        return False
+    # Toggle-off bit: ATOM_DISABLE_SWIGLUSTEP=1 forces plain Silu at every
+    # layer (verification helper only).
+    if os.environ.get("ATOM_DISABLE_SWIGLUSTEP"):
+        return False
+    swiglu_limits = getattr(config, "swiglu_limits", None)
+    if not swiglu_limits or layer_idx >= len(swiglu_limits):
+        return False
+    return bool(swiglu_limits[layer_idx])
+
+
+def _fuse_shared_at_layer(config: PretrainedConfig, layer_idx: Optional[int]) -> bool:
+    """Whether to fuse the shared expert into the routed FusedMoE at this layer.
+
+    R5 mitigation: at SwigluStep layers the kernel clamps every expert at 7.0,
+    but the shared expert may use a different clamp (e.g. 16 at layer 44 or 0
+    at layer 43). Therefore the shared expert MUST stay on the dense path at
+    every SwigluStep layer, even when the global aiter fusion is enabled.
+    """
+    # ATOM_FORCE_FUSE_SHARED=1 always fuses the shared expert into the
+    # routed kernel (verification helper: bypass R5 mitigation).
+    if os.environ.get("ATOM_FORCE_FUSE_SHARED"):
+        return is_rocm_aiter_fusion_shared_expert_enabled()
+    return (
+        is_rocm_aiter_fusion_shared_expert_enabled()
+        and not _uses_swiglustep_at_layer(config, layer_idx)
+    )
+
+
+# ---------------------------------------------------------------------------
+# MLP (dense, used for first few layers and shared expert)
+# ---------------------------------------------------------------------------
+
+
+class Step3p5MLP(nn.Module):
+    """Dense SwiGLU MLP with optional activation clamping."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+        clamp_limit: Optional[float] = None,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = SiluAndMul()
+        # 0.0 means no clamping (disabled), only apply if > 0
+        self.clamp_limit = clamp_limit if (clamp_limit is not None and clamp_limit > 0) else None
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.gate_up_proj(x)
+        if self.clamp_limit is not None:
+            # Match HF: clamp AFTER silu activation on gate, symmetric on up
+            # gate_proj output is first half, up_proj output is second half
+            half = x.shape[-1] // 2
+            gate, up = x[..., :half], x[..., half:]
+            gate = torch.nn.functional.silu(gate).clamp(max=self.clamp_limit)
+            up = up.clamp(min=-self.clamp_limit, max=self.clamp_limit)
+            x = self.down_proj(gate * up)
+        else:
+            x = self.act_fn(x)
+            x = self.down_proj(x)
+        return x
+
+
+# ---------------------------------------------------------------------------
+# MoE block (routed experts + shared expert)
+# ---------------------------------------------------------------------------
+
+
+class Step3p5MoE(nn.Module):
+    """Sparse MoE block for Step-3.5.
+
+    Checkpoint weight layout under ``layers.{i}.moe.*``:
+      - gate.weight            (router linear)
+      - router_bias            (learnable additive bias for sigmoid routing)
+      - gate_proj.weight       (per-expert, shape [num_experts, intermediate, hidden])
+      - up_proj.weight         (per-expert)
+      - down_proj.weight       (per-expert)
+
+    The FusedMoE kernel maps these via ``get_expert_mapping``.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        num_experts: int = config.moe_num_experts
+        top_k: int = config.moe_top_k
+        moe_intermediate_size: int = config.moe_intermediate_size
+
+        # Per-layer SwiGLU clamp limit for routed experts.
+        # Step-3.5 applies clamp(x, -limit, limit) after gate_up_proj and
+        # before the SwiGLU activation inside each expert.  The CK kernel
+        # implements this as ``ActivationType.SwigluStep`` with a hard-coded
+        # ±7 clamp; Step-3.5-Flash uses 7 at layers 43-44 only.
+        layer_idx = extract_layer_index(prefix) if prefix else None
+        self._layer_idx = layer_idx
+        swiglu_limits = getattr(config, "swiglu_limits", None)
+        self.clamp_limit = (
+            swiglu_limits[layer_idx]
+            if (swiglu_limits and layer_idx is not None and swiglu_limits[layer_idx] > 0)
+            else None
+        )
+        self._uses_swiglustep = self.clamp_limit is not None
+        self._activation = (
+            ActivationType.SwigluStep if self._uses_swiglustep else ActivationType.Silu
+        )
+
+        # Router ---------------------------------------------------------
+        self.gate = ReplicatedLinear(
+            self.hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        # Learnable router bias (added to sigmoid probs before top-k)
+        self.router_bias = nn.Parameter(
+            torch.zeros(num_experts, dtype=torch.float32),
+            requires_grad=False,
+        )
+
+        self.routed_scaling_factor = getattr(
+            config, "moe_router_scaling_factor", 1.0
+        )
+        self._need_fp32_gate = getattr(config, "need_fp32_gate", False)
+
+        # Routed experts (fused MoE kernel) --------------------------------
+        # R5 mitigation: at SwigluStep layers we MUST NOT fuse the shared
+        # expert into the routed FusedMoE (the kernel hard-codes ±7 clamp,
+        # but the shared expert uses a different limit, e.g. 16 at layer 44
+        # or 0 at layer 43).  Fall back to the dense Step3p5MLP path there.
+        self._fuse_shared = _fuse_shared_at_layer(config, layer_idx)
+        n_shared = 1 if self._fuse_shared else 0
+        self._n_shared_fused = n_shared  # 1 when shared expert is fused as expert num_experts
+        self.experts = FusedMoE(
+            num_experts=num_experts + n_shared,
+            top_k=top_k + n_shared,  # +1 so kernel selects top_k routed + 1 shared
+            hidden_size=self.hidden_size,
+            intermediate_size=moe_intermediate_size,
+            reduce_results=True,
+            renormalize=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            custom_routing_function=self._routing_function,
+            config=config,
+            activation=self._activation,
+        )
+
+    def _routing_function(
+        self,
+        gating_output: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+    ):
+        """Sigmoid routing with additive bias and scaling.
+
+        When the shared expert is fused (self._n_shared_fused == 1), topk is
+        top_k_routed + 1.  We select top_k_routed routed experts and append
+        the shared expert (index num_routed_experts) with weight 1.0.
+        """
+        n_shared = self._n_shared_fused
+        top_k_routed = topk - n_shared  # number of routed experts to pick
+
+        gate_prob = torch.sigmoid(gating_output.float())
+        gate_prob_biased = gate_prob + self.router_bias.unsqueeze(0)
+        _, indices = torch.topk(gate_prob_biased, k=top_k_routed, dim=1)
+        topk_prob = torch.gather(gate_prob, 1, indices)
+        if renormalize:
+            topk_prob = topk_prob / (
+                topk_prob.sum(dim=-1, keepdim=True) + 1e-20
+            )
+        topk_prob = topk_prob * self.routed_scaling_factor
+
+        if n_shared > 0:
+            # Append shared expert (always selected, weight=1.0)
+            T = gating_output.shape[0]
+            num_routed = gating_output.shape[1]  # 288
+            shared_ids = torch.full(
+                (T, n_shared), num_routed, dtype=torch.int32, device=gating_output.device
+            )
+            shared_weights = torch.ones(
+                (T, n_shared), dtype=torch.float32, device=gating_output.device
+            )
+            topk_prob = torch.cat([topk_prob, shared_weights], dim=1)
+            indices = torch.cat([indices.to(torch.int32), shared_ids], dim=1)
+            return topk_prob, indices
+
+        return topk_prob, indices.to(torch.int32)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+
+        # Router logits must be computed in fp32 (need_fp32_gate=True in config)
+        if getattr(self, "_need_fp32_gate", True):
+            router_logits = torch.nn.functional.linear(
+                hidden_states.float(), self.gate.weight.float()
+            )
+        else:
+            router_logits = self.gate(hidden_states)
+
+        # Routed experts.  At SwigluStep layers (43-44) the FusedMoE was
+        # constructed with ``activation=ActivationType.SwigluStep`` so the CK
+        # kernel applies ``silu(g).clamp(max=7) * up.clamp(±7)`` per expert.
+        routed_out = self.experts(hidden_states, router_logits)
+
+        return routed_out.view(orig_shape)
+
+
+# ---------------------------------------------------------------------------
+# Attention
+# ---------------------------------------------------------------------------
+
+
+class Step3p5Attention(nn.Module):
+    """GQA attention for Step-3.5.
+
+    Key differences from vanilla LLaMA attention:
+      - Per-layer rope_theta and partial_rotary_factor (from config lists).
+      - Two attention head configurations depending on full vs sliding.
+      - QK RMSNorm (zero-centered / GemmaRMSNorm style).
+      - Head-wise attention gating via g_proj (sigmoid).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: str = "bf16",
+        prefix: str = "",
+        layer_num: int = 0,
+    ) -> None:
+        super().__init__()
+        layer_idx = extract_layer_index(prefix)
+        self.hidden_size = config.hidden_size
+
+        # Determine layer type and head counts ----------------------------
+        layer_types = getattr(config, "layer_types", [])
+        is_sliding = (
+            layer_types[layer_idx] == "sliding_attention" if layer_types else False
+        )
+        attn_other = getattr(config, "attention_other_setting", None)
+
+        if is_sliding and attn_other is not None:
+            self.total_num_heads = attn_other["num_attention_heads"]
+            self.total_num_kv_heads = attn_other["num_attention_groups"]
+        else:
+            self.total_num_heads = config.num_attention_heads
+            self.total_num_kv_heads = config.num_attention_groups
+
+        self.head_dim = getattr(config, "head_dim", 128)
+
+        tp_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim ** -0.5
+
+        # RoPE configuration -----------------------------------------------
+        rope_theta_cfg = getattr(config, "rope_theta", 10000.0)
+        if isinstance(rope_theta_cfg, list):
+            rope_theta = rope_theta_cfg[layer_idx]
+        else:
+            rope_theta = rope_theta_cfg
+
+        partial_rotary_factors = getattr(config, "partial_rotary_factors", None)
+        if partial_rotary_factors is not None:
+            partial_rotary_factor = partial_rotary_factors[layer_idx]
+        else:
+            partial_rotary_factor = 1.0
+
+        rotary_dim = int(self.head_dim * partial_rotary_factor)
+
+        max_position_embeddings = getattr(
+            config, "max_position_embeddings", 262144
+        )
+
+        # Determine rope_scaling for this layer
+        rope_scaling = getattr(config, "rope_scaling", None)
+        yarn_only_types = getattr(config, "yarn_only_types", None)
+        if yarn_only_types and layer_types:
+            layer_type = layer_types[layer_idx]
+            if layer_type not in yarn_only_types:
+                rope_scaling = None
+
+        # Projections -------------------------------------------------------
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # QK Norm (zero-centered RMSNorm) -----------------------------------
+        rms_norm_eps = getattr(config, "rms_norm_eps", 1e-5)
+        self.q_norm = Step3p5RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = Step3p5RMSNorm(self.head_dim, eps=rms_norm_eps)
+
+        # Head-wise attention gate -------------------------------------------
+        self.use_head_wise_attn_gate = getattr(
+            config, "use_head_wise_attn_gate", False
+        )
+        if self.use_head_wise_attn_gate:
+            self.g_proj = ColumnParallelLinear(
+                input_size=self.hidden_size,
+                output_size=self.total_num_heads,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.g_proj",
+            )
+
+        # Rotary embedding ---------------------------------------------------
+        # Note: rotary_dim is already computed as head_dim * partial_rotary_factor,
+        # so we do NOT pass partial_rotary_factor to get_rope (which would apply it twice).
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=True,
+        )
+
+        # Sliding window and sink tokens per layer ---------------------------
+        # DEBUG: temporarily disable sliding window to test if pa_decode_gluon is the issue
+        sliding_window = None
+        sinks = None
+        if is_sliding and not os.environ.get("ATOM_STEP3P5_NO_SLIDING"):
+            sliding_window = getattr(config, "sliding_window", None)
+            sink_size = getattr(config, "sink", 0)
+            if sink_size > 0:
+                sinks = nn.Parameter(
+                    torch.empty(self.num_heads, requires_grad=False)
+                )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            kv_cache_dtype=cache_config,
+            layer_num=layer_num,
+            per_layer_sliding_window=sliding_window,
+            sinks=sinks,
+            prefix=f"{prefix}.attn",
+            rotary_emb=self.rotary_emb,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        import os
+        debug_nan = os.environ.get("ATOM_DEBUG_NAN2")
+
+        qkv = self.qkv_proj(hidden_states)
+        q, k, v = torch.split(
+            qkv, [self.q_size, self.kv_size, self.kv_size], dim=-1
+        )
+        if debug_nan and (q.isnan().any() or k.isnan().any() or v.isnan().any()):
+            print(f"[NAN-ATTN] qkv has NaN: q={q.isnan().any()} k={k.isnan().any()} v={v.isnan().any()}")
+
+        # QK Norm – apply per-head RMSNorm
+        # Reshape to (..., num_heads, head_dim), apply norm, reshape back
+        q = self.q_norm(q.reshape(*q.shape[:-1], -1, self.head_dim)).flatten(-2)
+        k = self.k_norm(k.reshape(*k.shape[:-1], -1, self.head_dim)).flatten(-2)
+        if debug_nan and (q.isnan().any() or k.isnan().any()):
+            print(f"[NAN-ATTN] after qk_norm NaN: q={q.isnan().any()} k={k.isnan().any()}")
+
+        attn_output = self.attn(q, k, v, positions)
+        if debug_nan and attn_output.isnan().any():
+            print(f"[NAN-ATTN] attn output has NaN, num_heads={self.num_heads}, num_kv_heads={self.num_kv_heads}, q_size={self.q_size}, kv_size={self.kv_size}, attn_output.shape={attn_output.shape}")
+
+        # Head-wise gating
+        if self.use_head_wise_attn_gate:
+            gate = self.g_proj(hidden_states)  # (tokens, num_heads_tp)
+            if debug_nan and gate.isnan().any():
+                print(f"[NAN-ATTN] gate (g_proj) has NaN, gate.shape={gate.shape}")
+            # gate: (tokens, num_heads_tp) -> (tokens, num_heads_tp, 1)
+            gate = torch.sigmoid(gate).unsqueeze(-1)
+            reshaped = attn_output.reshape(*attn_output.shape[:-1], -1, self.head_dim)
+            if debug_nan:
+                print(f"[NAN-ATTN] attn_output.shape={attn_output.shape} reshaped.shape={reshaped.shape} gate.shape={gate.shape}")
+            attn_output = (reshaped * gate).flatten(-2)
+            if debug_nan and attn_output.isnan().any():
+                print(f"[NAN-ATTN] after gate multiply has NaN")
+
+        output = self.o_proj(attn_output)
+        if debug_nan and output.isnan().any():
+            print(f"[NAN-ATTN] o_proj output has NaN")
+        return output
+
+
+# ---------------------------------------------------------------------------
+# Decoder Layer
+# ---------------------------------------------------------------------------
+
+
+class Step3p5DecoderLayer(nn.Module):
+    """Single decoder layer for Step-3.5.
+
+    - Layers 0-2: dense MLP
+    - Layers 3-44: MoE (288 routed + 1 shared)
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: str = "bf16",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_num: int = 0,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        layer_idx = extract_layer_index(prefix)
+
+        # Attention
+        self.self_attn = Step3p5Attention(
+            config=config,
+            quant_config=quant_config,
+            cache_config=cache_config,
+            prefix=f"{prefix}.self_attn",
+            layer_num=layer_num,
+        )
+
+        # FFN: dense MLP or MoE depending on layer index
+        moe_layers_enum = getattr(config, "moe_layers_enum", None)
+        if moe_layers_enum is not None:
+            if isinstance(moe_layers_enum, str):
+                moe_layers_idx = [
+                    int(i) for i in moe_layers_enum.strip().split(",")
+                ]
+            else:
+                moe_layers_idx = list(moe_layers_enum)
+        else:
+            moe_layers_idx = list(range(3, config.num_hidden_layers))
+
+        self.is_moe_layer = layer_idx in moe_layers_idx
+
+        # Per-layer SwiGLU clamp limits
+        swiglu_limits = getattr(config, "swiglu_limits", None)
+        swiglu_limits_shared = getattr(config, "swiglu_limits_shared", None)
+        clamp_limit = swiglu_limits[layer_idx] if swiglu_limits else None
+        clamp_limit_shared = swiglu_limits_shared[layer_idx] if swiglu_limits_shared else None
+
+        if self.is_moe_layer:
+            self.moe = Step3p5MoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.moe",
+            )
+            # Shared expert (always active, sibling of moe in checkpoint).
+            # Per-layer fuse decision: SwigluStep layers must keep the shared
+            # expert on the dense path because the routed CK kernel hard-codes
+            # the clamp at 7 (see _fuse_shared_at_layer).
+            if not _fuse_shared_at_layer(config, layer_idx):
+                self.share_expert = Step3p5MLP(
+                    hidden_size=self.hidden_size,
+                    intermediate_size=config.share_expert_dim,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.share_expert",
+                    clamp_limit=clamp_limit_shared,
+                )
+            else:
+                self.share_expert = None
+        else:
+            self.mlp = Step3p5MLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+                clamp_limit=clamp_limit_shared,  # HF uses swiglu_limits_shared for dense MLP
+            )
+
+        # Layer norms (zero-centered RMSNorm)
+        rms_norm_eps = getattr(config, "rms_norm_eps", 1e-5)
+        self.input_layernorm = Step3p5RMSNorm(
+            config.hidden_size, eps=rms_norm_eps
+        )
+        self.post_attention_layernorm = Step3p5RMSNorm(
+            config.hidden_size, eps=rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual
+            )
+
+        hidden_states = self.self_attn(
+            positions=positions, hidden_states=hidden_states
+        )
+
+        # FFN
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual
+        )
+
+        if self.is_moe_layer:
+            moe_output = self.moe(hidden_states)
+            if self.share_expert is not None:
+                shared_output = self.share_expert(hidden_states)
+                hidden_states = moe_output + shared_output
+            else:
+                hidden_states = moe_output
+        else:
+            hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+# ---------------------------------------------------------------------------
+# Full Model
+# ---------------------------------------------------------------------------
+
+
+@support_torch_compile
+class Step3p5Model(nn.Module):
+    def __init__(
+        self,
+        atom_config: Config,
+        prefix: str = "",
+    ):
+        super().__init__()
+        config = atom_config.hf_config
+        self.config = config
+        cache_config = atom_config.kv_cache_dtype
+        quant_config = atom_config.quant_config
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            getattr(config, "tie_word_embeddings", False)
+            and get_pp_group().is_last_rank
+        ):
+            self.embed_tokens = VocabParallelEmbedding(
+                self.vocab_size,
+                config.hidden_size,
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix, layer_num=None: Step3p5DecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                layer_num=layer_num,
+            ),
+            prefix=f"{prefix}.layers",
+            layer_num_offset=0,
+        )
+
+        rms_norm_eps = getattr(config, "rms_norm_eps", 1e-5)
+        if get_pp_group().is_last_rank:
+            self.norm = Step3p5RMSNorm(config.hidden_size, eps=rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size
+            )
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors],
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        import os as _dbg_os
+        _dbg_layer = _dbg_os.environ.get("ATOM_DEBUG_LAYER")
+        for i, layer in enumerate(self.layers[self.start_layer : self.end_layer]):
+            hidden_states, residual = layer(positions, hidden_states, residual)
+            if _dbg_layer:
+                _hs_nan = torch.isnan(hidden_states).any().item()
+                _res_nan = torch.isnan(residual).any().item() if residual is not None else False
+                if _hs_nan or _res_nan:
+                    print(f"[LAYER NaN] layer={i} hs_nan={_hs_nan} res_nan={_res_nan} hs_norm={hidden_states.float().norm():.3f}", flush=True)
+                    break
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+# ---------------------------------------------------------------------------
+# CausalLM wrapper
+# ---------------------------------------------------------------------------
+
+
+class Step3p5ForCausalLM(nn.Module):
+    """Step-3.5 model with language modelling head."""
+
+    packed_modules_mapping = {
+        "q_proj": ("qkv_proj", "q"),
+        "k_proj": ("qkv_proj", "k"),
+        "v_proj": ("qkv_proj", "v"),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        atom_config: Config,
+        prefix: str = "",
+    ):
+        super().__init__()
+        config = atom_config.hf_config
+        self.config = config
+
+        self.model = Step3p5Model(
+            atom_config=atom_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
+
+        if get_pp_group().is_last_rank:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            if getattr(config, "tie_word_embeddings", False):
+                self.lm_head.weight = self.model.embed_tokens.weight
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        model_output = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return model_output
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> Optional[torch.Tensor]:
+        logits = self.lm_head(hidden_states)
+        return logits
+
+    def detect_fused_expert_format(self, weight_name: str) -> bool:
+        """Step-3.5-Flash expert weights are flat: moe.gate_proj.weight [E, I, H].
+        When shared expert fusion is enabled, share_expert weights are also loaded
+        as expert N in FusedMoE; otherwise they are loaded as a regular MLP.
+
+        Per-layer override: at SwigluStep layers (43-44) the shared expert is
+        not fused, so its weights must take the dense path even when the
+        global aiter fusion flag is on.
+        """
+        is_routed_expert = (
+            ".moe.gate_proj" in weight_name
+            or ".moe.up_proj" in weight_name
+            or ".moe.down_proj" in weight_name
+        )
+        if is_routed_expert:
+            return True
+        is_share_expert = (
+            ".share_expert.gate_proj" in weight_name
+            or ".share_expert.up_proj" in weight_name
+            or ".share_expert.down_proj" in weight_name
+        )
+        if is_share_expert:
+            layer_idx = extract_layer_index(weight_name)
+            return _fuse_shared_at_layer(self.config, layer_idx)
+        return False
+
+    def get_fused_expert_mapping(self) -> list[tuple[str, str, str]]:
+        """Mapping from flat checkpoint names to FusedMoE parameter names.
+
+        Weight names include the '.weight' suffix from the checkpoint so that
+        the replace() in loader.py produces the correct param name without the
+        extra '.weight' tail (e.g. 'moe.gate_proj.weight' -> 'moe.experts.w13_weight').
+        """
+        mapping = [
+            ("moe.experts.w13_weight", "moe.gate_proj.weight", "w1"),
+            ("moe.experts.w13_weight", "moe.up_proj.weight", "w3"),
+            ("moe.experts.w2_weight", "moe.down_proj.weight", "w2"),
+        ]
+        if is_rocm_aiter_fusion_shared_expert_enabled():
+            mapping += [
+                ("moe.experts.w13_weight", "share_expert.gate_proj.weight", "w1"),
+                ("moe.experts.w13_weight", "share_expert.up_proj.weight", "w3"),
+                ("moe.experts.w2_weight", "share_expert.down_proj.weight", "w2"),
+            ]
+        return mapping
+
+    def load_fused_expert_weights(
+        self,
+        original_name: str,
+        name: str,
+        params_dict: dict,
+        loaded_weight: torch.Tensor,
+        shard_id: str,
+        num_experts: int,
+    ) -> bool:
+        """Load flat expert weights [E, I, H] into FusedMoE per-expert params.
+
+        For routed experts: loaded_weight is [num_experts, ...], loaded per-expert.
+        For shared expert: loaded_weight is [I, H] or [H, I], loaded as expert num_experts.
+        """
+        # num_experts from loader may be 0 if hf_config uses non-standard attr name
+        if num_experts == 0:
+            num_experts = self.config.moe_num_experts
+
+        if name not in params_dict:
+            return False
+        param = params_dict[name]
+        weight_loader = param.weight_loader
+        loaded_local_expert = False
+
+        is_share_expert = "share_expert" in original_name
+
+        if is_share_expert:
+            # Defensive: if this layer keeps the shared expert dense (e.g.
+            # SwigluStep layers 43-44), do not route it through FusedMoE.
+            layer_idx = extract_layer_index(original_name)
+            if not _fuse_shared_at_layer(self.config, layer_idx):
+                return False
+            # Shared expert is loaded as expert index num_experts (288)
+            expert_id = num_experts
+            try:
+                success = weight_loader(
+                    param,
+                    loaded_weight,
+                    name,
+                    shard_id,
+                    expert_id,
+                    return_success=True,
+                )
+                if success:
+                    loaded_local_expert = True
+            except TypeError:
+                weight_loader(param, loaded_weight, name, shard_id, expert_id)
+                loaded_local_expert = True
+        else:
+            for expert_id in range(num_experts):
+                try:
+                    success = weight_loader(
+                        param,
+                        loaded_weight[expert_id],
+                        name,
+                        shard_id,
+                        expert_id,
+                        return_success=True,
+                    )
+                    if success:
+                        loaded_local_expert = True
+                except TypeError:
+                    weight_loader(param, loaded_weight[expert_id], name, shard_id, expert_id)
+                    loaded_local_expert = True
+
+        return loaded_local_expert
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """Return the expert parameter mapping for weight loading.
+
+        Note: Step-3.5-Flash uses flat expert weights in the checkpoint
+        (moe.gate_proj.weight etc.), so get_expert_mapping is used only
+        as a sentinel to enable the expert loading path in loader.py.
+        The actual loading is handled by load_fused_expert_weights.
+        """
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.moe_num_experts
+            + (
+                1 if is_rocm_aiter_fusion_shared_expert_enabled() else 0
+            ),
+        )

From 26585d42d8978255936967f25384a0f062f1559a Mon Sep 17 00:00:00 2001
From: Jun Lin <junlin12@amd.com>
Date: Fri, 24 Apr 2026 21:54:28 +0000
Subject: [PATCH 2/8] fix: pad inter_dim in UnquantizedFusedMoEMethod for
 gfx950 tp=4/8

CK 2-stage MoE kernel (gemm_moe_ck2stages.cu L98) computes stage1 N
as w1.size(1)/2 = inter_dim. The stage1 dispatch selects NPerBlock
based on inter_dim range:
  - inter <= 192: NPerBlock = 64  -> need inter % 64 == 0
  - inter >  192: NPerBlock = 128 -> need inter % 128 == 0

Step-3.5-Flash with tp=4 gives inter=320 (320%128=64 != 0, crash)
and with tp=8 gives inter=160 (160%64=32 != 0, crash).

Fix: in process_weights_after_loading, pad inter_dim before
shuffle_weights() using alignment = 64 if inter<=192 else 128:
  - inter=160 -> 192  (tp=8, 192%64=0)
  - inter=320 -> 384  (tp=4, 384%128=0, 384%64=0)

Zero-padding is safe: padded rows carry zero weight so contribute
nothing to fused_moe output.

Verified 2026-04-24 on gfx950 (MI350X):
  - cos_sim >= 0.9999 vs torch reference (M=1..256)
  - tp=4 inference: 4 prompts complete, no crash, output correct

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 atom/model_ops/moe.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index 0c182359c..4bf08123b 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -485,6 +485,38 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         layer.w13_weight = atom_parameter(self._maybe_pad_weight(layer.w13_weight.data))
         layer.w2_weight = atom_parameter(self._maybe_pad_weight(layer.w2_weight.data))
+
+        # gfx950 CK a16w16 stage2 requires inter_dim % 64 == 0.
+        # For tp=4 (inter=320) and tp=8 (inter=160), pad inter_dim up to the
+        # next multiple of 64. Zero padding is safe because fused_moe clips
+        # routed-weight contributions and zero-padded rows contribute nothing.
+        # Verified 2026-04-24: cos_sim >= 0.9999 for inter=160->192 and
+        # inter=320->384 vs torch reference.
+        w13 = layer.w13_weight.data  # [E, 2*inter, hidden]
+        w2 = layer.w2_weight.data    # [E, hidden, inter]
+        inter_dim = w2.shape[2]
+        # Stage1 dispatch: inter<=192 uses NPerBlock=64, inter>192 uses NPerBlock=128.
+        # Stage2 dispatch: inter>192 uses KPerBlock=64.
+        # So required alignment: 64 when inter<=192, 128 when inter>192.
+        # (inter=160->192 satisfies 192%64=0; inter=320->384 satisfies 384%128=0 and 384%64=0)
+        align = 64 if inter_dim <= 192 else 128
+        inter_pad = (inter_dim + align - 1) // align * align
+        if inter_pad != inter_dim:
+            E, _, hidden = w13.shape
+            # pad w13: gate half [E, inter, hidden] and up half [E, inter, hidden]
+            w13_new = torch.zeros(
+                E, 2 * inter_pad, hidden, dtype=w13.dtype, device=w13.device
+            )
+            w13_new[:, :inter_dim, :] = w13[:, :inter_dim, :]       # gate
+            w13_new[:, inter_pad : inter_pad + inter_dim, :] = w13[:, inter_dim:, :]  # up
+            # pad w2: [E, hidden, inter_pad]
+            w2_new = torch.zeros(
+                E, hidden, inter_pad, dtype=w2.dtype, device=w2.device
+            )
+            w2_new[:, :, :inter_dim] = w2
+            layer.w13_weight = atom_parameter(w13_new)
+            layer.w2_weight = atom_parameter(w2_new)
+
         # Shuffle weights for CK/ASM kernels.
         # Previously skipped for gfx950 bf16 g1u1 on the assumption that the CK
         # 2-stage preshuffle_off (NSwizzle=0) kernel expected un-shuffled weights.

From 841dc4ee607f7e6fd07ce7337296fcd2b5b2ddd7 Mon Sep 17 00:00:00 2001
From: Jun Lin <junlin12@amd.com>
Date: Fri, 24 Apr 2026 21:54:28 +0000
Subject: [PATCH 3/8] fix: pass correct block_shape in
 Fp8MoEMethod.get_fused_moe_quant_config

The else branch in get_fused_moe_quant_config was shared between
block_quant (per_1x128/per_1x32) and per_tensor paths, hardcoding
block_shape=None for all. Block-quantized FP8 models should receive
block_shape=[128,128] (per_1x128) or [1,32] (per_1x32) to correctly
configure the quant config, particularly for EP paths.

Split the else branch into explicit per_1x128/per_1x32/fallback cases
and unify the fp8_w8a8_moe_quant_config call.
---
 atom/model_ops/moe.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index 4bf08123b..58da0700f 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -1769,14 +1769,19 @@ def get_fused_moe_quant_config(
                 a2_scale=layer.w2_input_scale,
                 per_act_token_quant=True,
             )
+        elif self.quant_type == QuantType.per_1x128:
+            block_shape = [128, 128]
+        elif self.quant_type == QuantType.per_1x32:
+            block_shape = [1, 32]
         else:
-            return fp8_w8a8_moe_quant_config(
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-                a1_scale=layer.w13_input_scale,
-                a2_scale=layer.w2_input_scale,
-                block_shape=None,
-            )
+            block_shape = None
+        return fp8_w8a8_moe_quant_config(
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a1_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            block_shape=block_shape,
+        )
 
     @mark_trace(prefix="fp8_moe", torch_compile=False)
     def apply(

From ccb646215ba9352b12c240e3bcd9ef496bc91db0 Mon Sep 17 00:00:00 2001
From: Jun Lin <junlin12@amd.com>
Date: Sat, 25 Apr 2026 01:33:48 +0000
Subject: [PATCH 4/8] fix: support FP8 block-quantized inference at tp=4
 (inter_dim=320)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three coordinated fixes in Fp8MoEMethod for per_1x128 block scale:

1. create_weights: make ValueError check padding-aware
   Compute padded_inter = ceil(inter/block_n)*block_n and check against
   padded_inter instead of raw inter, allowing tp=4 (inter=320) to pass
   while preserving the guard for truly unaligned cases.

2. _process_block_quant: zero-pad weights before shuffle_weights
   After normalize and before shuffle, zero-pad w13 from [E,2*320,H] to
   [E,2*384,H] and w2 from [E,H,320] to [E,H,384], mirroring the BF16
   approach in UnquantizedFusedMoEMethod.process_weights_after_loading.
   Padding zeros contribute 0 to GEMM output (dequant(0, scale)=0).
   Scale tensors already use ceil(inter/block_n) and need no change.

3. _load_w13 / _load_w2: fix scale TP sharding floor→ceil (root cause)
   The per_1x128 scale for full inter=1280 has 10 N-blocks. TP=4 sharding
   with floor gives 10//4=2 blocks per rank; the 3rd (partial) block is
   never copied and stays at the torch.ones() init value of 1.0.
   With scale=1.0 instead of ~0.0002, dequant amplifies by ~5000×
   causing complete garbage output despite correct weight loading.
   Fix: use ceil division and add narrow() bounds protection for the
   last rank which may have fewer elements than the ceil size.
   Safe for tp=2 (10/2=5 exact, ceil==floor) and tp=1 (no sharding).

Verification:
  FP8 tp=4: 4 prompts, TTFT=92ms, TPOT=14ms, coherent output ✅
  BF16 tp=4 regression: TTFT=76-77ms, coherent output ✅
  FP8 tp=2 regression: TTFT=86ms, coherent output ✅
---
 atom/model_ops/moe.py | 76 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 64 insertions(+), 12 deletions(-)

diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index 58da0700f..569f6ca59 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -1559,25 +1559,38 @@ def create_weights(
                 block_n = 1
                 block_k = 32
             tp_size = get_tp_group().world_size
+            # Pad intermediate_size_per_partition to the nearest multiple of
+            # block_n so that both the CK blockscale kernel alignment constraints
+            # and the block-quantization scale alignment constraints are satisfied.
+            # For tp=4, inter_dim=320 is not divisible by block_n=128; padding to
+            # 384 (=3×128) satisfies both stage1 NPerBlock=128 and block_n=128.
+            # The weight loader already supports partial loading into a padded
+            # buffer via the MXFP4 alignment path (_load_w13 / _load_w2).
+            padded_inter = (
+                (intermediate_size_per_partition + block_n - 1) // block_n * block_n
+            )
             # NOTE: To ensure proper alignment of the block-wise quantization
             # scales, the output_size of the weights for both the gate and up
             # layers must be divisible by block_n.
             # Required by column parallel or enabling merged weights
-            if intermediate_size_per_partition % block_n != 0:
+            if padded_inter % block_n != 0:
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
-                    f"{intermediate_size_per_partition} is not divisible by "
+                    f"{padded_inter} is not divisible by "
                     f"weight quantization block_n = {block_n}."
                 )
-            if tp_size > 1 and intermediate_size_per_partition % block_k != 0:
+            if tp_size > 1 and padded_inter % block_k != 0:
                 # Required by row parallel
                 raise ValueError(
                     f"The input_size of down's weight = "
-                    f"{intermediate_size_per_partition} is not divisible by "
+                    f"{padded_inter} is not divisible by "
                     f"weight quantization block_k = {block_k}."
                 )
 
         # WEIGHTS
+        # Allocated at original (un-padded) size; inter_dim padding is applied
+        # later in _process_block_quant (after normalize, before shuffle_weights),
+        # mirroring the BF16 approach in UnquantizedFusedMoEMethod.
         w13_weight = atom_parameter(
             torch.empty(
                 num_experts,
@@ -1697,6 +1710,36 @@ def _process_block_quant(self, layer: nn.Module) -> None:
         assert self.quant_config.is_dynamic
         self._normalize_weights_and_scales(layer)
 
+        # Inter-dim padding for block-quantized FP8 (mirrors BF16 approach in
+        # UnquantizedFusedMoEMethod.process_weights_after_loading).
+        # When inter_dim is not a multiple of block_n (e.g. tp=4: 320 % 128 ≠ 0),
+        # zero-pad both weights to the nearest block_n multiple BEFORE shuffling.
+        # Padding area is zero so dequant(0, scale) = 0 is numerically safe.
+        # Scale tensors use ceil(inter/block_n) and are already shape-compatible.
+        inter_dim = layer.w2_weight.shape[-1]
+        block_n = 128 if self.quant_type == QuantType.per_1x128 else 32
+        align = 64 if inter_dim <= 192 else block_n
+        inter_pad = (inter_dim + align - 1) // align * align
+        if inter_pad != inter_dim:
+            E = layer.w13_weight.shape[0]
+            hidden = layer.w13_weight.shape[-1]
+            w13 = layer.w13_weight.data
+            w13_new = torch.zeros(
+                E, 2 * inter_pad, hidden,
+                dtype=w13.dtype, device=w13.device
+            )
+            w13_new[:, :inter_dim, :] = w13[:, :inter_dim, :]        # gate
+            w13_new[:, inter_pad:inter_pad + inter_dim, :] = w13[:, inter_dim:, :]  # up
+            layer.w13_weight = atom_parameter(w13_new)
+
+            w2 = layer.w2_weight.data
+            w2_new = torch.zeros(
+                E, hidden, inter_pad,
+                dtype=w2.dtype, device=w2.device
+            )
+            w2_new[:, :, :inter_dim] = w2
+            layer.w2_weight = atom_parameter(w2_new)
+
         if not self.need_normalize_e4m3fn_to_e4m3fnuz:
             layer.w13_weight = atom_parameter(layer.w13_weight.data)
             layer.w13_weight_scale = atom_parameter(layer.w13_weight_scale.data)
@@ -2255,10 +2298,16 @@ def _load_w13(
         expert_shard_size = expert_data.shape[shard_dim] // 2
         # Derive shard size from loaded_weight (unpadded checkpoint) to avoid
         # out-of-bounds when expert_data is padded (e.g. MXFP4 alignment).
-        load_shard_size = loaded_weight.shape[shard_dim] // self.tp_size
-        loaded_weight = loaded_weight.narrow(
-            shard_dim, load_shard_size * tp_rank, load_shard_size
-        )
+        # Use ceil so that the last partial scale block (e.g. per_1x128 with
+        # inter=1280 and tp=4: 10 blocks / 4 = 2.5 → ceil=3) is included.
+        # Without ceil, the 3rd scale block is never copied and stays at the
+        # torch.ones() initial value of 1.0, causing ~5000× dequant error.
+        load_shard_size = (
+            loaded_weight.shape[shard_dim] + self.tp_size - 1
+        ) // self.tp_size
+        start = load_shard_size * tp_rank
+        size = min(load_shard_size, loaded_weight.shape[shard_dim] - start)
+        loaded_weight = loaded_weight.narrow(shard_dim, start, size)
         # Narrow parameter and load.
         # w1, gate_proj: Load into first logical weight of w13.
         if shard_id == "w1":
@@ -2294,10 +2343,13 @@ def _load_w2(
         if not load_full:
             # Derive shard size from loaded_weight (unpadded checkpoint) to
             # avoid out-of-bounds when expert_data is padded (e.g. MXFP4).
-            load_shard_size = loaded_weight.shape[shard_dim] // self.tp_size
-            loaded_weight = loaded_weight.narrow(
-                shard_dim, load_shard_size * tp_rank, load_shard_size
-            )
+            # Use ceil (same reason as _load_w13: partial last scale block).
+            load_shard_size = (
+                loaded_weight.shape[shard_dim] + self.tp_size - 1
+            ) // self.tp_size
+            start = load_shard_size * tp_rank
+            size = min(load_shard_size, loaded_weight.shape[shard_dim] - start)
+            loaded_weight = loaded_weight.narrow(shard_dim, start, size)
             if load_shard_size != shard_size:
                 expert_data = expert_data.narrow(shard_dim, 0, load_shard_size)
         # w2, down_proj: Load into only logical weight of w2.

From 270fee71e45049e5a30fde1dc7ae0ff91fecd718 Mon Sep 17 00:00:00 2001
From: Jun Lin <junlin12@amd.com>
Date: Mon, 27 Apr 2026 06:31:18 +0000
Subject: [PATCH 5/8] fix(moe): use align=64 for FP8 blockscale to remove
 inter_dim=320 padding

With NPerBlock=64 CK kernel support, inter_dim=320 (tp=4) is 64-aligned
and no longer requires zero-padding to 384. Changed align from
'64 if inter<=192 else block_n' to always 64, so:
- tp=4 (inter=320): 320%64=0 -> no padding (was 320->384, saved 17% compute)
- tp=8 (inter=160): 160%64=32 -> pad to 192 (unchanged)
- tp=2 (inter=640): 640%64=0 -> no padding (unchanged)

Scale tensor shape (ceil(320/128)=3) unchanged; no re-quantization needed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 atom/model_ops/moe.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index 569f6ca59..0f9fa1117 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -1718,7 +1718,9 @@ def _process_block_quant(self, layer: nn.Module) -> None:
         # Scale tensors use ceil(inter/block_n) and are already shape-compatible.
         inter_dim = layer.w2_weight.shape[-1]
         block_n = 128 if self.quant_type == QuantType.per_1x128 else 32
-        align = 64 if inter_dim <= 192 else block_n
+        # align=64: inter_dim=320 (tp=4) is 64-aligned -> no padding needed with NPerBlock=64 kernel
+        # inter_dim=160 (tp=8) -> pads to 192; inter_dim=640 (tp=2) -> no padding
+        align = 64
         inter_pad = (inter_dim + align - 1) // align * align
         if inter_pad != inter_dim:
             E = layer.w13_weight.shape[0]

From 3696345e707eb9a8e51cc4f44b1c78af75c6df8b Mon Sep 17 00:00:00 2001
From: Jun Lin <junlin12@amd.com>
Date: Mon, 27 Apr 2026 06:47:07 +0000
Subject: [PATCH 6/8] revert(moe): restore FP8 blockscale inter_dim padding
 align logic

Stage2 KPerBlock=64 is not compilable on gfx950 (FP8 mfma KPack=32
constraint). Since stage1 output and stage2 weight K must match,
both w13 and w2 require the same inter_dim padding. Restoring:
  align = 64 if inter_dim <= 192 else block_n (=128)

Added comment explaining why full no-padding is currently blocked.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 atom/model_ops/moe.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index 0f9fa1117..4e60046d5 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -1718,9 +1718,10 @@ def _process_block_quant(self, layer: nn.Module) -> None:
         # Scale tensors use ceil(inter/block_n) and are already shape-compatible.
         inter_dim = layer.w2_weight.shape[-1]
         block_n = 128 if self.quant_type == QuantType.per_1x128 else 32
-        # align=64: inter_dim=320 (tp=4) is 64-aligned -> no padding needed with NPerBlock=64 kernel
-        # inter_dim=160 (tp=8) -> pads to 192; inter_dim=640 (tp=2) -> no padding
-        align = 64
+        # NOTE: stage2 KPerBlock=64 is not supported on gfx950 FP8 mfma (KPack=32 constraint).
+        # Both w13 and w2 must use the same inter_dim, so full padding is still required.
+        # align=64 for inter<=192 (tp=8 inter=160->192), align=block_n(=128) for inter>192 (tp=4 inter=320->384)
+        align = 64 if inter_dim <= 192 else block_n
         inter_pad = (inter_dim + align - 1) // align * align
         if inter_pad != inter_dim:
             E = layer.w13_weight.shape[0]

From acff926de8b1699101962116470066d4e3c78b0e Mon Sep 17 00:00:00 2001
From: Jun Lin <junlin12@amd.com>
Date: Mon, 27 Apr 2026 07:36:36 +0000
Subject: [PATCH 7/8] fix(moe): correct FP8 blockscale inter_dim padding align
 for all tp configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_process_block_quant used 'align = 64 if inter_dim <= 192 else block_n',
copied from the BF16 path. For FP8 blockscale this is wrong:
- FP8 stage2 only has KPerBlock=128 (KPack=32 mfma constraint prevents KPerBlock=64)
- align=64 gives inter_pad=192 for tp=8 (inter=160), but 192 % 128 = 64 != 0
- device_moe_gemm_blockscale.hpp L448 rejects K % KPerBlock != 0 → kernel fails

Fix: always use align = block_n (=128 for per_1x128), so inter_pad is always
a multiple of 128 and stage2 KPerBlock=128 dispatch succeeds:
  tp=2: inter=640 → 640 (no padding, unchanged)
  tp=4: inter=320 → 384 (unchanged)
  tp=8: inter=160 → 256 (was 192, now correctly aligned)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 atom/model_ops/moe.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index 4e60046d5..ad708046d 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -1718,10 +1718,12 @@ def _process_block_quant(self, layer: nn.Module) -> None:
         # Scale tensors use ceil(inter/block_n) and are already shape-compatible.
         inter_dim = layer.w2_weight.shape[-1]
         block_n = 128 if self.quant_type == QuantType.per_1x128 else 32
-        # NOTE: stage2 KPerBlock=64 is not supported on gfx950 FP8 mfma (KPack=32 constraint).
-        # Both w13 and w2 must use the same inter_dim, so full padding is still required.
-        # align=64 for inter<=192 (tp=8 inter=160->192), align=block_n(=128) for inter>192 (tp=4 inter=320->384)
-        align = 64 if inter_dim <= 192 else block_n
+        # FP8 blockscale stage2 requires KPerBlock=128 (gfx950 FP8 mfma KPack=32 constraint
+        # prevents KPerBlock=64). align must always be block_n(=128) so that inter_pad%128==0.
+        # Bug fix: previously used align=64 for inter<=192 (copied from BF16 path), but
+        # 192%128=64!=0 → stage2 kernel dispatch fails. Correct: always align to block_n.
+        # tp=8 inter=160 → 256 (3×128→no, ceil(160/128)*128=256); tp=4 inter=320 → 384.
+        align = block_n
         inter_pad = (inter_dim + align - 1) // align * align
         if inter_pad != inter_dim:
             E = layer.w13_weight.shape[0]

From 969d5640feebfaf18a38b427a25fdb9b29c9236e Mon Sep 17 00:00:00 2001
From: junlin12 <junlin12@amd.com>
Date: Wed, 29 Apr 2026 09:05:37 +0000
Subject: [PATCH 8/8] fix(moe): handle D < tp_size in fp8 _load_w13/_load_w2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the per_1x128 scale block count is smaller than tp_size (observed
on Step-3.5-Flash-FP8 at tp=8 with inter_dim=1280 → D=10), the ceil
split leaves trailing ranks with start >= D so narrow(start, size) hits
size<0 and crashes weight load. Skip narrow + copy_ for those ranks.

For fp8 scale tensors (torch.ones() initialised in
Fp8MoEMethod._create_weights), additionally zero the rank's slot before
the early return. Otherwise the downstream fp8 dequant multiplies the
(uninitialised) fp8 weight by stale 1.0 instead of the correct
quantization scale, contaminating the column gather / row reduction and
producing garbled output. Matches MXFP4 scale init (moe.py:776,813).

Verified on stepfun-ai/Step-3.5-Flash-FP8 (gfx942 / MI308X):
- tp=8 A1/A2/A4 PASS — 4/4 prompts coherent (was: weight-load crash
  pre-patch; was: garbled output with early-return-only)
- tp=2/tp=4 A1/A2/A3 PASS — no regression, zero-trigger confirmed
  (D=10, starts=[0,3,6,9] for tp=4, starts=[0,5] for tp=2 — all < D)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 atom/model_ops/moe.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/atom/model_ops/moe.py b/atom/model_ops/moe.py
index ad708046d..28d5d633d 100644
--- a/atom/model_ops/moe.py
+++ b/atom/model_ops/moe.py
@@ -2311,6 +2311,26 @@ def _load_w13(
             loaded_weight.shape[shard_dim] + self.tp_size - 1
         ) // self.tp_size
         start = load_shard_size * tp_rank
+        # When D < tp_size (e.g. per_1x128 scale block count smaller than
+        # tp_size, observed at tp=8 with inter=1280 → D=10), the ceil split
+        # gives some trailing ranks start >= D so they hold no slice of the
+        # loaded tensor. Skip narrow + copy_ for those ranks; the rank's
+        # slice of expert_data stays at its initialised value (0 for weight,
+        # 1.0 for scale) and the rank contributes a no-op to the column
+        # gather / row reduction.
+        if start >= loaded_weight.shape[shard_dim]:
+            # FP8 scale tensors are torch.ones() initialised. If we leave the
+            # trailing rank's slice at 1.0, the downstream FP8 dequant multiplies
+            # the (uninitialised) fp8 weight by 1.0 instead of the correct
+            # quantization scale, contaminating the column gather / row reduction.
+            # Zero the slot so dequant produces 0 and the rank contributes a
+            # true no-op (matches MXFP4 scale init at moe.py:776,813).
+            if expert_data.dtype == torch.float32:
+                if shard_id == "w1":
+                    expert_data.narrow(shard_dim, 0, expert_shard_size).zero_()
+                else:
+                    expert_data.narrow(shard_dim, expert_shard_size, expert_shard_size).zero_()
+            return
         size = min(load_shard_size, loaded_weight.shape[shard_dim] - start)
         loaded_weight = loaded_weight.narrow(shard_dim, start, size)
         # Narrow parameter and load.
@@ -2353,6 +2373,17 @@ def _load_w2(
                 loaded_weight.shape[shard_dim] + self.tp_size - 1
             ) // self.tp_size
             start = load_shard_size * tp_rank
+            # See _load_w13 comment above: when D < tp_size the ceil split
+            # leaves trailing ranks with no slice; skip narrow + copy_.
+            if start >= loaded_weight.shape[shard_dim]:
+                # Zero the scale slice so dequant=0 instead of multiplying by
+                # stale init=1.0; see _load_w13 comment for full rationale.
+                if expert_data.dtype == torch.float32:
+                    if load_shard_size != shard_size:
+                        expert_data.narrow(shard_dim, 0, load_shard_size).zero_()
+                    else:
+                        expert_data.zero_()
+                return
             size = min(load_shard_size, loaded_weight.shape[shard_dim] - start)
             loaded_weight = loaded_weight.narrow(shard_dim, start, size)
             if load_shard_size != shard_size: