From 14af3be0fc762369187b9e92366000b10891bffc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=87a=C4=9Fr=C4=B1=20Tu=C4=9Frul=20Canbol?=
 <103742287+ctcanbol@users.noreply.github.com>
Date: Thu, 7 Aug 2025 09:28:14 +0200
Subject: [PATCH 1/2] fix qwen3moe gguf architecture

---
 src/transformers/modeling_gguf_pytorch_utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index feb6c6c3914f..7ef2725c10b0 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -246,6 +246,7 @@ def process(self, weights, name, **kwargs):
 TENSOR_PROCESSORS = {
     "llama": LlamaTensorProcessor,
     "qwen2moe": Qwen2MoeTensorProcessor,
+    "qwen3moe": Qwen2MoeTensorProcessor,
     "bloom": BloomTensorProcessor,
     "t5": T5TensorProcessor,
     "t5encoder": T5TensorProcessor,
@@ -295,6 +296,8 @@ def get_gguf_hf_weights_map(
         model_type = "command-r"
     elif model_type == "qwen2_moe":
         model_type = "qwen2moe"
+    elif model_type == "qwen3_moe":
+        model_type = "qwen3moe"
     elif model_type == "gemma3_text":
         model_type = "gemma3"
     arch = None
@@ -316,8 +319,8 @@ def get_gguf_hf_weights_map(
     gguf_to_hf_name_map = {}
     state_dict = hf_model.state_dict()
     for hf_name in state_dict:
-        # An exception for qwen2moe model, where the expert layers are packed
-        if model_type == "qwen2moe" and "mlp.experts." in hf_name:
+        # An exception for qwen2moe/qwen3moe model, where the expert layers are packed
+        if model_type in ("qwen2moe", "qwen3moe") and "mlp.experts." in hf_name:
             hf_name = re.sub(r"mlp.experts.\d+.", "mlp.experts.", hf_name)
         name, suffix = hf_name, ""
@@ -391,6 +394,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo

         if "qwen2moe" in architecture:
             updated_architecture = "qwen2_moe"
+        elif "qwen3moe" in architecture:
+            updated_architecture = "qwen3_moe"

     # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors
     # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors

From f43d297e569998121f5e3fad9b0ce271298574c1 Mon Sep 17 00:00:00 2001
From: Jinuk Kim
Date: Sun, 10 Aug 2025 17:15:22 +0900
Subject: [PATCH 2/2] Fix Qwen3Moe GGUF loading

---
 src/transformers/integrations/ggml.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index ecf34bbf5e19..9d89abfaa24d 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -102,13 +102,14 @@
         "attention.layer_norm_rms_epsilon": "rms_norm_eps",
         "vocab_size": "vocab_size",
     },
-    "qwen3moe": {
+    "qwen3_moe": {
         "context_length": "max_position_embeddings",
         "block_count": "num_hidden_layers",
         "feed_forward_length": "intermediate_size",
         "embedding_length": "hidden_size",
         "rope.dimension_count": None,
         "rope.freq_base": "rope_theta",
+        "attention.key_length": "head_dim",
         "attention.head_count": "num_attention_heads",
         "attention.head_count_kv": "num_key_value_heads",
         "attention.layer_norm_rms_epsilon": "rms_norm_eps",