From 483eeeb0230a79f9f3322fe3a1b5e5ca7ed7af24 Mon Sep 17 00:00:00 2001
From: garybadwal
Date: Tue, 21 Apr 2026 14:13:34 +0530
Subject: [PATCH 1/2] feat: add Llama 4 support with configuration mapping and
 tensor processing

---
 src/transformers/integrations/ggml.py          | 18 +++++
 .../modeling_gguf_pytorch_utils.py             | 65 +++++++++++++++++++
 tests/quantization/ggml/test_ggml.py           | 57 ++++++++++++++++
 3 files changed, 140 insertions(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 29ec365e7ce2..7fd9553b17a1 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -305,6 +305,23 @@
         "vocab_size": "vocab_size",
         "expert_gating_func": "scoring_func",
     },
+    "llama4": {
+        "context_length": "max_position_embeddings",
+        "block_count": "num_hidden_layers",
+        "feed_forward_length": "intermediate_size_mlp",
+        "expert_feed_forward_length": "intermediate_size",
+        "embedding_length": "hidden_size",
+        "rope.dimension_count": None,
+        "rope.freq_base": "rope_theta",
+        "attention.key_length": "head_dim",
+        "attention.head_count": "num_attention_heads",
+        "attention.head_count_kv": "num_key_value_heads",
+        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
+        "vocab_size": "vocab_size",
+        "expert_count": "num_local_experts",
+        "expert_used_count": "num_experts_per_tok",
+        "interleave_moe_layer_step": "interleave_moe_layer_step",
+    },
 }

 GGUF_TOKENIZER_MAPPING = {
@@ -772,6 +789,7 @@ def converted(self) -> Tokenizer:

 GGUF_TO_FAST_CONVERTERS = {
     "llama": GGUFLlamaConverter,
+    "llama4_text": GGUFLlamaConverter,
     "qwen2": GGUFQwen2Converter,
     "qwen2_moe": GGUFQwen2Converter,
     "qwen3": GGUFQwen2Converter,
diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index 66306b6f71f6..2571230c32f2 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -352,8 +352,57 @@ def _set_moe_expert_tensor(self, weights: np.ndarray, parsed_parameters: dict[st
         out.copy_(torch_weights)


+class Llama4TensorProcessor(TensorProcessor):
+    HF_MOE_GATE_UP_PATTERN = re.compile(r"(?:model\.)?layers\.(?P<bid>\d+)\.feed_forward\.experts\.gate_up_proj$")
+    HF_MOE_DOWN_PATTERN = re.compile(r"(?:model\.)?layers\.(?P<bid>\d+)\.feed_forward\.experts\.down_proj$")
+    GGUF_MOE_WEIGHTS_PATTERN = re.compile(r".*\.ffn_(?P<w>gate|up|down)_exps\.weight$")
+
+    def __init__(self, config=None):
+        super().__init__(config=config)
+
+    def perform_fallback_tensor_mapping(
+        self, gguf_to_hf_name_map: dict[str, str], suffix: str, qual_name: str, hf_name: str
+    ):
+        if m := re.fullmatch(self.HF_MOE_GATE_UP_PATTERN, hf_name):
+            full_hf_name = qual_name + hf_name
+            gguf_to_hf_name_map[f"blk.{m['bid']}.ffn_gate_exps.weight"] = full_hf_name
+            gguf_to_hf_name_map[f"blk.{m['bid']}.ffn_up_exps.weight"] = full_hf_name
+        elif m := re.fullmatch(self.HF_MOE_DOWN_PATTERN, hf_name):
+            full_hf_name = qual_name + hf_name
+            gguf_to_hf_name_map[f"blk.{m['bid']}.ffn_down_exps.weight"] = full_hf_name
+
+    def process(self, weights, name: str, **kwargs):
+        if m := re.fullmatch(self.GGUF_MOE_WEIGHTS_PATTERN, name):
+            tensor_key_mapping = kwargs.get("tensor_key_mapping")
+            parsed_parameters = kwargs.get("parsed_parameters")
+            if tensor_key_mapping and name in tensor_key_mapping:
+                self._set_moe_expert_tensor(weights, parsed_parameters, tensor_key_mapping[name], m["w"])
+            return GGUFTensor(weights, None, {})
+        return GGUFTensor(weights, name, {})
+
+    def _set_moe_expert_tensor(self, weights: np.ndarray,
parsed_parameters: dict[str, dict], hf_name: str, w: str): + torch_weights = torch.from_numpy(np.ascontiguousarray(np.swapaxes(weights, -1, -2))) + if w == "down": + parsed_parameters["tensors"][hf_name] = torch_weights + return + # Merge gate and up into gate_up_proj: [E, hidden, 2*expert_dim], gate first then up. + shape = list(torch_weights.shape) + shard_dim = -1 + shard_size = shape[shard_dim] + shape[shard_dim] = shard_size * 2 + if hf_name not in parsed_parameters["tensors"]: + parsed_parameters["tensors"][hf_name] = torch.zeros(shape, dtype=torch_weights.dtype) + out: torch.Tensor = parsed_parameters["tensors"][hf_name] + if w == "gate": + out = out.narrow(shard_dim, 0, shard_size) + else: # w == "up" + out = out.narrow(shard_dim, shard_size, shard_size) + out.copy_(torch_weights) + + TENSOR_PROCESSORS = { "llama": LlamaTensorProcessor, + "llama4": Llama4TensorProcessor, "qwen2moe": Qwen2MoeTensorProcessor, "qwen3moe": Qwen2MoeTensorProcessor, "bloom": BloomTensorProcessor, @@ -416,6 +465,10 @@ def get_gguf_hf_weights_map( model_type = "t5" elif model_type == "minimax_m2": model_type = "minimax-m2" + elif model_type == "llama4_text": + # GGUF Llama 4 files only contain text weights; the text-only config + # uses `llama4_text` in transformers but the GGUF arch key is `llama4`. + model_type = "llama4" arch = None for key, value in MODEL_ARCH_NAMES.items(): if value == model_type: @@ -583,6 +636,18 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo if parsed_parameters["config"]["model_type"] == "gemma3": parsed_parameters["config"]["model_type"] = "gemma3_text" + # Llama 4 GGUF checkpoints only contain the text backbone. Rewrite the model_type to + # the text-only config and nest rope_theta under rope_parameters (Llama4TextConfig is + # @strict and stores rope params in a nested dict rather than a top-level field). + if parsed_parameters["config"]["model_type"] == "llama4": + parsed_parameters["config"]["model_type"] = "llama4_text" + rope_theta = parsed_parameters["config"].pop("rope_theta", None) + if rope_theta is not None: + parsed_parameters["config"]["rope_parameters"] = { + "rope_type": "default", + "rope_theta": float(rope_theta), + } + # MiniMax-M2: convert expert_gating_func integer to scoring_func string if parsed_parameters["config"].get("model_type") == "minimax_m2": _gating_func_map = {0: "none", 1: "softmax", 2: "sigmoid"} diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index 763f8ac40502..cd14baf7587c 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -311,6 +311,7 @@ class GgufModelTests(unittest.TestCase): qwen3moe_model_id = "Qwen/Qwen3-30B-A3B-GGUF" umt5_encoder_model_id = "city96/umt5-xxl-encoder-gguf" lfm2_model_id = "LiquidAI/LFM2-1.2B-GGUF" + llama4_model_id = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF" q4_0_phi3_model_id = "Phi-3-mini-4k-instruct-q4.gguf" q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf" @@ -351,6 +352,7 @@ class GgufModelTests(unittest.TestCase): q4_k_m_qwen3moe_model_id = "Qwen3-30B-A3B-Q4_K_M.gguf" q8_0_umt5_encoder_model_id = "umt5-xxl-encoder-Q8_0.gguf" q4_k_m_lfm2_model_id = "LFM2-1.2B-Q4_K_M.gguf" + q2_k_l_llama4_model_id = "Llama-4-Scout-17B-16E-Instruct-Q2_K_L.gguf" example_text = "Hello" @@ -1129,3 +1131,58 @@ def test_lfm2_q4_k_m(self): EXPECTED_TEXT = "Hello Atari 2600! 
es un videoj" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_llama4_config_mapping(self): + """Test that Llama 4 GGUF config mapping is correctly registered.""" + from transformers.integrations.ggml import GGUF_CONFIG_MAPPING + + self.assertIn("llama4", GGUF_CONFIG_MAPPING) + mapping = GGUF_CONFIG_MAPPING["llama4"] + + expected_mappings = { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size_mlp", + "expert_feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.freq_base": "rope_theta", + "attention.key_length": "head_dim", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_rms_epsilon": "rms_norm_eps", + "vocab_size": "vocab_size", + "expert_count": "num_local_experts", + "expert_used_count": "num_experts_per_tok", + "interleave_moe_layer_step": "interleave_moe_layer_step", + } + for gguf_key, transformers_key in expected_mappings.items(): + self.assertEqual(mapping[gguf_key], transformers_key) + + self.assertIsNone(mapping["rope.dimension_count"]) + + def test_llama4_architecture_mapping(self): + """Test that Llama 4 text-only GGUFs route to GGUFLlamaConverter and Llama4TensorProcessor.""" + from transformers.integrations.ggml import GGUF_TO_FAST_CONVERTERS, GGUFLlamaConverter + from transformers.modeling_gguf_pytorch_utils import TENSOR_PROCESSORS, Llama4TensorProcessor + + self.assertIn("llama4_text", GGUF_TO_FAST_CONVERTERS) + self.assertEqual(GGUF_TO_FAST_CONVERTERS["llama4_text"], GGUFLlamaConverter) + self.assertIn("llama4", TENSOR_PROCESSORS) + self.assertEqual(TENSOR_PROCESSORS["llama4"], Llama4TensorProcessor) + + @unittest.skipUnless(is_gguf_available("0.17.0"), "test requires gguf version >= 0.17.0") + def test_llama4_q2_k_l(self): + tokenizer = AutoTokenizer.from_pretrained(self.llama4_model_id, gguf_file=self.q2_k_l_llama4_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.llama4_model_id, + gguf_file=self.q2_k_l_llama4_model_id, + dtype=torch.float16, + ) + + text = tokenizer(self.example_text, return_tensors="pt")["input_ids"] + out = model.generate(text, max_new_tokens=10) + + # Llama 4 is large and heavily quantised; we only check that the load path works end-to-end + # and produces a non-empty decoded string rather than asserting exact text. + decoded = tokenizer.decode(out[0], skip_special_tokens=True) + self.assertTrue(len(decoded) > len(self.example_text)) From 68cdef013597f0d2eb800e16f4b2254d6328ae3a Mon Sep 17 00:00:00 2001 From: Gary Badwal Date: Sat, 25 Apr 2026 11:59:26 +0530 Subject: [PATCH 2/2] refactor: update Llama 4 tests to improve tokenizer validation and remove deprecated config mappings --- tests/quantization/ggml/test_ggml.py | 52 ++++++---------------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index cd14baf7587c..d9582998dffd 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -1132,43 +1132,15 @@ def test_lfm2_q4_k_m(self): EXPECTED_TEXT = "Hello Atari 2600! 
es un videoj" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) - def test_llama4_config_mapping(self): - """Test that Llama 4 GGUF config mapping is correctly registered.""" - from transformers.integrations.ggml import GGUF_CONFIG_MAPPING - - self.assertIn("llama4", GGUF_CONFIG_MAPPING) - mapping = GGUF_CONFIG_MAPPING["llama4"] - - expected_mappings = { - "context_length": "max_position_embeddings", - "block_count": "num_hidden_layers", - "feed_forward_length": "intermediate_size_mlp", - "expert_feed_forward_length": "intermediate_size", - "embedding_length": "hidden_size", - "rope.freq_base": "rope_theta", - "attention.key_length": "head_dim", - "attention.head_count": "num_attention_heads", - "attention.head_count_kv": "num_key_value_heads", - "attention.layer_norm_rms_epsilon": "rms_norm_eps", - "vocab_size": "vocab_size", - "expert_count": "num_local_experts", - "expert_used_count": "num_experts_per_tok", - "interleave_moe_layer_step": "interleave_moe_layer_step", - } - for gguf_key, transformers_key in expected_mappings.items(): - self.assertEqual(mapping[gguf_key], transformers_key) - - self.assertIsNone(mapping["rope.dimension_count"]) - - def test_llama4_architecture_mapping(self): - """Test that Llama 4 text-only GGUFs route to GGUFLlamaConverter and Llama4TensorProcessor.""" - from transformers.integrations.ggml import GGUF_TO_FAST_CONVERTERS, GGUFLlamaConverter - from transformers.modeling_gguf_pytorch_utils import TENSOR_PROCESSORS, Llama4TensorProcessor - - self.assertIn("llama4_text", GGUF_TO_FAST_CONVERTERS) - self.assertEqual(GGUF_TO_FAST_CONVERTERS["llama4_text"], GGUFLlamaConverter) - self.assertIn("llama4", TENSOR_PROCESSORS) - self.assertEqual(TENSOR_PROCESSORS["llama4"], Llama4TensorProcessor) + @unittest.skipUnless(is_gguf_available("0.17.0"), "test requires gguf version >= 0.17.0") + def test_llama4_q2_k_l_tokenizer(self): + tokenizer = AutoTokenizer.from_pretrained(self.llama4_model_id, gguf_file=self.q2_k_l_llama4_model_id) + with tempfile.TemporaryDirectory() as tmpdirname: + tokenizer.save_pretrained(tmpdirname) + tokenizer = AutoTokenizer.from_pretrained(tmpdirname) + special_sentence = "สวัสดี" + predicted_text = tokenizer.decode(tokenizer.encode(special_sentence, return_tensors="pt")[0]) + self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence) @unittest.skipUnless(is_gguf_available("0.17.0"), "test requires gguf version >= 0.17.0") def test_llama4_q2_k_l(self): @@ -1182,7 +1154,5 @@ def test_llama4_q2_k_l(self): text = tokenizer(self.example_text, return_tensors="pt")["input_ids"] out = model.generate(text, max_new_tokens=10) - # Llama 4 is large and heavily quantised; we only check that the load path works end-to-end - # and produces a non-empty decoded string rather than asserting exact text. - decoded = tokenizer.decode(out[0], skip_special_tokens=True) - self.assertTrue(len(decoded) > len(self.example_text)) + EXPECTED_TEXT = "Hello, I'm here to help. What" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
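
For reference, a minimal sketch of the user-facing loading path these two commits enable, using the same repo id, GGUF filename, and calls as test_llama4_q2_k_l above; the Scout checkpoint is very large, so this is illustrative rather than a quick local check:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF"
    gguf_file = "Llama-4-Scout-17B-16E-Instruct-Q2_K_L.gguf"

    # The GGUF arch key is `llama4`; load_gguf_checkpoint rewrites it to the
    # text-only `llama4_text` model_type and nests rope_theta under rope_parameters.
    tokenizer = AutoTokenizer.from_pretrained(repo_id, gguf_file=gguf_file)
    model = AutoModelForCausalLM.from_pretrained(repo_id, gguf_file=gguf_file, dtype=torch.float16)

    input_ids = tokenizer("Hello", return_tensors="pt")["input_ids"]
    out = model.generate(input_ids, max_new_tokens=10)
    print(tokenizer.decode(out[0], skip_special_tokens=True))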