🚨 fix + tests dense & MoE TP all reduce (decoder only) #43722
Changes from all commits
ArthurZucker marked this conversation as resolved.

Collaborator
We need to decide now how much work we do for the user:
We broke this when we removed

Member
Author
we can always make

Collaborator
but if it's the first op, the replication comes from just TP; let me think a sec. It's just that it's breaking and can be cumbersome, but explicitness is good TBH.
@@ -460,6 +460,7 @@ def backward(ctx, grad_output):
         device_mesh = ctx.device_mesh
         if device_mesh.size() == 1:
             return grad_output, None
+        grad_output = grad_output.contiguous()
         dist.all_reduce(grad_output, op=dist.ReduceOp.SUM, group=device_mesh.get_group())
         return grad_output, None
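For context on the one-line change above: `dist.all_reduce` reduces its input in place and expects a contiguous buffer, but the incoming gradient can be a non-contiguous view (for example, the result of an upstream transpose). A minimal sketch, assuming an already-initialized process group (e.g. launched with `torchrun`):

```python
import torch
import torch.distributed as dist

grad = torch.randn(8, 4).t()   # transposed view: not contiguous
assert not grad.is_contiguous()

# Materialize a dense copy that the collective can reduce in place;
# this is a no-op when the tensor is already contiguous.
grad = grad.contiguous()
dist.all_reduce(grad, op=dist.ReduceOp.SUM)
```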
@@ -658,7 +659,7 @@ def shard_tensor(
     ) -> torch.Tensor:
         raise NotImplementedError

-    def prepare_module_tp(self, module: nn.Module, device_mesh) -> nn.Module:
+    def prepare_module_tp(self, module: nn.Module, device_mesh, **kwargs) -> nn.Module:
         distribute_module(
             module,
             device_mesh,
@@ -724,6 +725,86 @@ def get_expected_sharded_shape(self, full_shape: tuple[int, ...] | torch.Size) -
         return tuple(shape)


+class ReplicatedWithGradAllReduce(TensorParallelLayer):
+    """
+    Replicated parameter with gradient all-reduce.
+
+    For parameters like q_norm/k_norm that sit between colwise and rowwise
+    layers. The parameter is replicated (not sharded), but its gradient
+    accumulates from local heads only in TP mode. This class registers a
+    backward hook to all-reduce the parameter gradient.
+    """
+
+    @staticmethod
+    def _prepare_input_fn(mod, inputs, device_mesh):
+        return inputs
+
+    @staticmethod
+    def _prepare_output_fn(mod, outputs, device_mesh):
+        return outputs
+
+    def shard_tensor(self, param, tensor_idx=None, device=None, dtype=None):
+        return param[...].to(device=device, dtype=dtype)
+
+    def prepare_module_tp(self, module, device_mesh, **kwargs):
+        # Use a module-level backward hook (not param.register_hook) because parameters are replaced during weight loading after this method runs.
+        # Module hooks survive parameter replacement.
+        def _backward_hook(mod, grad_input, grad_output, mesh=device_mesh):
+            for param in mod.parameters():
+                if param.grad is not None:
+                    all_reduce_forward(param.grad, mesh)
+
+        module.register_full_backward_hook(_backward_hook)
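Editorial note, not part of the diff: the class above boils down to the standard "replicated weight, all-reduce its gradient" pattern. A self-contained sketch of that pattern using plain `torch.distributed` instead of the repo's `all_reduce_forward` helper (the hook wiring mirrors the diff; the function name and usage below are illustrative):

```python
import torch.distributed as dist
from torch import nn

def attach_grad_allreduce(module: nn.Module, group=None) -> None:
    """Sum rank-local gradients of a replicated module's parameters across TP ranks."""
    def _hook(mod, grad_input, grad_output):
        # Full backward hooks fire once gradients flowing through this module have been
        # computed; any parameter gradient already accumulated is still rank-local here.
        for p in mod.parameters():
            if p.grad is not None:
                dist.all_reduce(p.grad, op=dist.ReduceOp.SUM, group=group)
    module.register_full_backward_hook(_hook)

# Illustrative usage on a q_norm-style replicated layer:
# attach_grad_allreduce(model.layers[0].self_attn.q_norm)
```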
+class MlaKvAProjParallel(TensorParallelLayer):
+    """
+    For MLA attention used in DeepSeek-V2 style models (deepseek_v2, longcat_flash, glm_moe_dsa, glm4_moe_lite):
+    the kv_a_proj_with_mqa output is [kv_lora_rank + qk_rope_head_dim] (the names can differ, but the important
+    point is that the output is split).
+    Example below (from modeling_longcat_flash.py):
+
+            kv_a_proj_with_mqa
+                    |
+                  split
+                 /     \
+           k_pass       k_rot   <-- "bypasses kv_b_proj"
+              |            |        (goes straight to attention,
+      kv_a_layernorm       |         never touches kv_b_proj)
+              |            |
+          kv_b_proj        |
+          (colwise)        |
+              |            |
+           k_pass        k_rot
+                 \\      /
+                    cat
+                     |
+                key_states
+
+    k_pass goes through kv_b_proj (colwise), which has a built-in all_reduce_backward, so its gradient is
+    not partial. However, k_rot goes straight to attention and never touches kv_b_proj, so we need to
+    all-reduce its gradient across ranks; otherwise each rank only sees its own partial gradient.
+    """
+    def _prepare_output_fn(self, mod, output, device_mesh):
+        if not hasattr(mod.config, "qk_rope_head_dim"):
+            raise AttributeError(
+                f"Config for {type(mod).__name__} does not have `qk_rope_head_dim`. "
+                "MlaKvAProjParallel requires `qk_rope_head_dim` to be defined in the model config. "
+                "Please add it to the model's config or update the TP plan mapping."
+            )
+        rope_dim = mod.config.qk_rope_head_dim
Collaborator
We should raise an error if the attr does not exist, telling the user to add it to the auto mapping.
+        pass_output, rope_output = output.split([output.shape[-1] - rope_dim, rope_dim], dim=-1)
+        rope_output = all_reduce_backward(rope_output, device_mesh)
+        return torch.cat([pass_output, rope_output], dim=-1)
+
+    def shard_tensor(self, param, tensor_idx=None, device=None, dtype=None):
+        return param[...].to(device=device, dtype=dtype)
+
+    def prepare_module_tp(self, module, device_mesh, config=None, **kwargs):
+        module.config = config
+        distribute_module(module, device_mesh, output_fn=self._prepare_output_fn)
+
+
 class RowwiseParallel(TensorParallelLayer):
     """
     Row-wise parallel: weight is sharded on dim -1 (input features).
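Editorial note: `all_reduce_backward`, used on the rope slice above, follows the usual "identity in forward, all-reduce in backward" autograd pattern. A self-contained sketch of the idea, not the library implementation (class and function names below are illustrative):

```python
import torch
import torch.distributed as dist

class _AllReduceBackward(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, group=None):
        ctx.group = group
        return x  # identity in the forward pass

    @staticmethod
    def backward(ctx, grad_output):
        # Sum rank-local gradients so the rope slice gets a full gradient on every rank.
        grad_output = grad_output.contiguous()
        dist.all_reduce(grad_output, op=dist.ReduceOp.SUM, group=ctx.group)
        return grad_output, None

def split_and_fix_rope_grad(output: torch.Tensor, rope_dim: int, group=None) -> torch.Tensor:
    # Mirrors the _prepare_output_fn above: only the k_rot part needs the backward all-reduce.
    pass_out, rope_out = output.split([output.shape[-1] - rope_dim, rope_dim], dim=-1)
    rope_out = _AllReduceBackward.apply(rope_out, group)
    return torch.cat([pass_out, rope_out], dim=-1)
```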
@@ -1087,6 +1168,29 @@ def shard_tensor(
         return param[...].to(device=device, dtype=dtype)


+class MoeIdentityExpertParallel(TensorParallelLayer):
+    """
+    TP class for zero/identity experts in MoE layers.
+
+    Under TP, the parent MoeTensorParalellExperts does all_reduce_forward (sum)
+    on the expert module output. Identity experts produce the same output on
+    every rank, so the sum gives world_size * output. This class divides the
+    input by world_size to compensate.
+    """
+
+    @staticmethod
+    def _prepare_input_fn(mod, inputs, device_mesh):
+        input_tensor = inputs[0] if inputs else inputs
+        # TODO(fmom): when 2D-device mesh, need to select a //-ism axis to divide the input tensor by.
+        return input_tensor / device_mesh.size()
+
+    def shard_tensor(self, param, tensor_idx=None, device=None, dtype=None):
+        return param[...].to(device=device, dtype=dtype)
+
+    def prepare_module_tp(self, module, device_mesh, **kwargs):
+        distribute_module(module, device_mesh, input_fn=self._prepare_input_fn)
+
+
 class ParallelInterface(GeneralInterface):
     # Class instance object, so that a call to `register` can be reflected into all other files correctly, even if
     # a new instance is created (in order to locally override a given entry)
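Editorial note: the compensation above is easy to check by hand. With a TP world size of 2 and an identity expert, each rank produces the same output and the parent module's forward all-reduce sums them; pre-dividing the input by the world size restores the single-GPU result. A single-process toy check, with the all-reduce simulated by adding the two rank-local outputs:

```python
import torch

world_size = 2
x = torch.tensor([4.0, 6.0])   # hidden states routed to the identity expert

uncompensated = x + x                               # sum over ranks -> world_size * x = [8., 12.]
compensated = (x / world_size) + (x / world_size)   # -> [4., 6.], matching the single-GPU output
print(uncompensated, compensated)
```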
@@ -1103,6 +1207,9 @@ class ParallelInterface(GeneralInterface):
         "grouped_gemm": GroupedGemmParallel(),
         "ep_router": RouterParallel(),
         "moe_tp_experts": MoeTensorParalellExperts(),
+        "moe_identity_expert": MoeIdentityExpertParallel(),
+        "replicated_with_grad_allreduce": ReplicatedWithGradAllReduce(),
+        "mla_kv_a_proj": MlaKvAProjParallel(),
     }
     if is_torch_available() and _torch_distributed_available
     else {}
@@ -1120,6 +1227,8 @@ class ParallelInterface(GeneralInterface):
         "packed_rowwise": -1,
         "embedding_rowwise": 0,
         "sequence_parallel": None,
+        "replicated_with_grad_allreduce": None,
+        "mla_kv_a_proj": None,
     }

     # Bias sharding: colwise shards bias, rowwise doesn't (bias is replicated and all-reduced)
@@ -1132,6 +1241,8 @@ class ParallelInterface(GeneralInterface):
         "packed_rowwise": None,
         "embedding_rowwise": None,
         "sequence_parallel": None,
+        "replicated_with_grad_allreduce": None,
+        "mla_kv_a_proj": None,
     }
@@ -1258,13 +1369,14 @@ def add_tensor_parallel_hooks_to_module(
     if current_module_plan is not None:
         tp_layer = ALL_PARALLEL_STYLES[current_module_plan]
         try:
-            tp_layer.prepare_module_tp(module, device_mesh)
+            tp_layer.prepare_module_tp(module, device_mesh, config=model.config)
         except NotImplementedError as e:
            print(
                 f"Trying to prepare {layer_name}, but it's not supported. Corresponding module: {module} Fix it's TP plan: {e}"
             )

     module._hf_tp_plan = current_module_plan
     module._hf_device_mesh = device_mesh
     module.__repr__ = lambda: f"{module.__repr__()}\nTP Plan: {current_module_plan}"
Collaborator
cc @SunMarc this is valid, but happy if you can have a look.

Member
SG!
@@ -123,12 +123,19 @@ class DeepseekV2Config(PreTrainedConfig):
     base_model_tp_plan = {
         "layers.*.self_attn.q_proj": "colwise",
         "layers.*.self_attn.q_a_proj": "colwise",
         "layers.*.self_attn.q_b_proj": "colwise",
         "layers.*.self_attn.kv_a_proj_with_mqa": "mla_kv_a_proj",
         "layers.*.self_attn.kv_b_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
         "layers.*.mlp.experts.down_proj": "rowwise",
         "layers.*.mlp.experts": "moe_tp_experts",
         "layers.*.mlp.shared_experts.gate_proj": "colwise",
         "layers.*.mlp.shared_experts.up_proj": "colwise",
         "layers.*.mlp.shared_experts.down_proj": "rowwise",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
Comment on lines +132 to +137

Collaborator
IDK what's the most efficient way to avoid having too many comms, but LGTM otherwise.

Member
Author
until we have EP working properly, I think it's okay to leave it this way
         "layers.*.mlp.down_proj": "rowwise",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),