From b24d366e0a2874dec5e2004f452638ab0f4e3c77 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 17:19:02 +0100
Subject: [PATCH 01/25] convert ok

---
 convert_hf_to_gguf.py          | 14 ++++++++++++++
 gguf-py/gguf/constants.py      | 16 ++++++++++++++++
 gguf-py/gguf/tensor_mapping.py | 24 ++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 151608d56b8..c3d998d5e2c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4407,6 +4407,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Glm4vForConditionalGeneration")
+class Glm4VisionModel(Qwen3VLVisionModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):
+            name = name.replace("model.visual.", "visual.")
+        if name.startswith("visual.merger."):
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration")
 class Qwen3VLTextModel(Qwen3Model):
     model_arch = gguf.MODEL_ARCH.QWEN3VL
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 2b8489c591b..469d0dd4f2a 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -639,6 +639,10 @@ class MODEL_TENSOR(IntEnum):
     V_MMPROJ             = auto()
     V_MMPROJ_FC          = auto()
     V_MMPROJ_MLP         = auto()
+    V_MMPROJ_FFN_UP      = auto()
+    V_MMPROJ_FFN_GATE    = auto()
+    V_MMPROJ_FFN_DOWN    = auto()
+    V_MMPROJ_POST_NORM   = auto()
     V_MMPROJ_PEG         = auto()
     V_ENC_EMBD_CLS       = auto()
     V_ENC_EMBD_PATCH     = auto()
@@ -685,6 +689,7 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE            = auto() # cogvlm
     V_TOK_BOI            = auto() # cogvlm
     V_TOK_EOI            = auto() # cogvlm
+    V_MM_CONV            = auto() # glm4v
     # audio (mtmd)
     A_ENC_EMBD_POS       = auto()
     A_ENC_CONV1D         = auto()
@@ -1011,6 +1016,10 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MMPROJ:                  "mm.{bid}",
     MODEL_TENSOR.V_MMPROJ_FC:               "mm.model.fc",
     MODEL_TENSOR.V_MMPROJ_MLP:              "mm.model.mlp.{bid}",
+    MODEL_TENSOR.V_MMPROJ_FFN_UP:           "mm.model.ffn_up",
+    MODEL_TENSOR.V_MMPROJ_FFN_GATE:         "mm.model.ffn_gate",
+    MODEL_TENSOR.V_MMPROJ_FFN_DOWN:         "mm.model.ffn_down",
+    MODEL_TENSOR.V_MMPROJ_POST_NORM:        "mm.model.post_norm",
     MODEL_TENSOR.V_MMPROJ_PEG:              "mm.model.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS:            "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH:          "v.patch_embd",
@@ -1057,6 +1066,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MM_GATE:                 "mm.gate",
     MODEL_TENSOR.V_TOK_BOI:                 "v.boi",
     MODEL_TENSOR.V_TOK_EOI:                 "v.eoi",
+    MODEL_TENSOR.V_MM_CONV:                 "mm.conv",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS:            "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D:              "a.conv1d.{bid}",
@@ -1089,6 +1099,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MMPROJ,
         MODEL_TENSOR.V_MMPROJ_FC,
         MODEL_TENSOR.V_MMPROJ_MLP,
+        MODEL_TENSOR.V_MMPROJ_FFN_UP,
+        MODEL_TENSOR.V_MMPROJ_FFN_GATE,
+        MODEL_TENSOR.V_MMPROJ_FFN_DOWN,
+        MODEL_TENSOR.V_MMPROJ_POST_NORM,
         MODEL_TENSOR.V_MMPROJ_PEG,
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
@@ -1135,6 +1149,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MM_GATE,
         MODEL_TENSOR.V_TOK_BOI,
         MODEL_TENSOR.V_TOK_EOI,
+        MODEL_TENSOR.V_MM_CONV,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_CONV1D,
@@ -3327,6 +3342,7 @@ class VisionProjectorType:
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
+    GLM4V = "glm4v"
 
 
 # Items here are (block size, type size)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index d9c87da1946..a7dab608d00 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1205,6 +1205,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
             "model.vision.linear_proj.linear_proj", # cogvlm
+            "visual.merger.proj", # glm4v
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1214,6 +1215,22 @@ class TensorNameMap:
             "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
         ),
 
+        MODEL_TENSOR.V_MMPROJ_FFN_UP: (
+            "visual.merger.up_proj", # glm4v
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_FFN_GATE: (
+            "visual.merger.gate_proj", # glm4v
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_FFN_DOWN: (
+            "visual.merger.down_proj", # glm4v
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_POST_NORM: (
+            "visual.merger.post_projection_norm", # glm4v
+        ),
+
         MODEL_TENSOR.V_MMPROJ_PEG: (
             "model.mm_projector.peg.peg.{bid}",
         ),
@@ -1247,6 +1264,7 @@ class TensorNameMap:
             "vision_tower.patch_embed.pos_emb", # kimi-vl
             "visual.pos_embed", # qwen3vl
             "model.vision.patch_embedding.position_embedding", # cogvlm
+            "visual.embeddings.position_embedding", # glm4v
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_QKV: (
@@ -1394,6 +1412,7 @@ class TensorNameMap:
             "vision_tower.ln_pre", # pixtral-hf
             "vision_encoder.ln_pre", # pixtral
             "vision_model.layernorm_pre", # llama4
+            "visual.post_conv_layernorm", # glm4v
         ),
 
         MODEL_TENSOR.V_POST_NORM: (
@@ -1402,6 +1421,7 @@ class TensorNameMap:
             "vision_model.layernorm_post", # llama4
             "visual.merger.ln_q", # qwen2vl
             "vision_tower.encoder.final_layernorm", # kimi-vl
+            "visual.post_layernorm", # glm4v
         ),
 
         MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1420,6 +1440,10 @@ class TensorNameMap:
             "multi_modal_projector.mm_soft_emb_norm",
         ),
 
+        MODEL_TENSOR.V_MM_CONV: (
+            "visual.downsample", # glm4v
+        ),
+
         MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
             "resampler.pos_embed_k",
         ),

From 7b13c8eae8c5fd0f8a6eb39f9277635c93068c2e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 17:21:44 +0100
Subject: [PATCH 02/25] no deepstack

---
 convert_hf_to_gguf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c3d998d5e2c..967cb51e501 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4410,6 +4410,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @ModelBase.register("Glm4vForConditionalGeneration")
 class Glm4VisionModel(Qwen3VLVisionModel):
     def set_gguf_parameters(self):
+        self.is_deepstack_layers = False
         super().set_gguf_parameters()
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
 

From f3f8fb4bfb94b6e2ba3e245f9ce6633ea44dc982 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 17:30:23 +0100
Subject: [PATCH 03/25] fewer new tensors

---
 gguf-py/gguf/constants.py      | 15 +++------------
 gguf-py/gguf/tensor_mapping.py | 23 +++++++----------------
 tools/mtmd/clip-impl.h         |  5 +++++
 tools/mtmd/clip-model.h        |  6 ++++++
 4 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 469d0dd4f2a..0801d73794d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -639,10 +639,6 @@ class MODEL_TENSOR(IntEnum):
     V_MMPROJ             = auto()
     V_MMPROJ_FC          = auto()
     V_MMPROJ_MLP         = auto()
-    V_MMPROJ_FFN_UP      = auto()
-    V_MMPROJ_FFN_GATE    = auto()
-    V_MMPROJ_FFN_DOWN    = auto()
-    V_MMPROJ_POST_NORM   = auto()
     V_MMPROJ_PEG         = auto()
     V_ENC_EMBD_CLS       = auto()
     V_ENC_EMBD_PATCH     = auto()
@@ -664,6 +660,7 @@ class MODEL_TENSOR(IntEnum):
     V_LAYER_SCALE_2      = auto()
     V_PRE_NORM           = auto()
     V_POST_NORM          = auto()
+    V_MM_POST_NORM       = auto()
     V_MM_INP_NORM        = auto()
     V_MM_INP_PROJ        = auto() # gemma3
     V_MM_SOFT_EMB_NORM   = auto() # gemma3
@@ -1016,10 +1013,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MMPROJ:                  "mm.{bid}",
     MODEL_TENSOR.V_MMPROJ_FC:               "mm.model.fc",
     MODEL_TENSOR.V_MMPROJ_MLP:              "mm.model.mlp.{bid}",
-    MODEL_TENSOR.V_MMPROJ_FFN_UP:           "mm.model.ffn_up",
-    MODEL_TENSOR.V_MMPROJ_FFN_GATE:         "mm.model.ffn_gate",
-    MODEL_TENSOR.V_MMPROJ_FFN_DOWN:         "mm.model.ffn_down",
-    MODEL_TENSOR.V_MMPROJ_POST_NORM:        "mm.model.post_norm",
     MODEL_TENSOR.V_MMPROJ_PEG:              "mm.model.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS:            "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH:          "v.patch_embd",
@@ -1041,6 +1034,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_LAYER_SCALE_2:           "v.blk.{bid}.ls2",
     MODEL_TENSOR.V_PRE_NORM:                "v.pre_ln",
     MODEL_TENSOR.V_POST_NORM:               "v.post_ln",
+    MODEL_TENSOR.V_MM_POST_NORM:            "mm.post_norm",
     MODEL_TENSOR.V_MM_INP_PROJ:             "mm.input_projection",
     MODEL_TENSOR.V_MM_INP_NORM:             "mm.input_norm",
     MODEL_TENSOR.V_MM_SOFT_EMB_NORM:        "mm.soft_emb_norm",
@@ -1099,10 +1093,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MMPROJ,
         MODEL_TENSOR.V_MMPROJ_FC,
         MODEL_TENSOR.V_MMPROJ_MLP,
-        MODEL_TENSOR.V_MMPROJ_FFN_UP,
-        MODEL_TENSOR.V_MMPROJ_FFN_GATE,
-        MODEL_TENSOR.V_MMPROJ_FFN_DOWN,
-        MODEL_TENSOR.V_MMPROJ_POST_NORM,
         MODEL_TENSOR.V_MMPROJ_PEG,
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
@@ -1124,6 +1114,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_LAYER_SCALE_2,
         MODEL_TENSOR.V_PRE_NORM,
         MODEL_TENSOR.V_POST_NORM,
+        MODEL_TENSOR.V_MM_POST_NORM,
         MODEL_TENSOR.V_MM_INP_PROJ,
         MODEL_TENSOR.V_MM_INP_NORM,
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index a7dab608d00..676d7466838 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1215,22 +1215,6 @@ class TensorNameMap:
             "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
         ),
 
-        MODEL_TENSOR.V_MMPROJ_FFN_UP: (
-            "visual.merger.up_proj", # glm4v
-        ),
-
-        MODEL_TENSOR.V_MMPROJ_FFN_GATE: (
-            "visual.merger.gate_proj", # glm4v
-        ),
-
-        MODEL_TENSOR.V_MMPROJ_FFN_DOWN: (
-            "visual.merger.down_proj", # glm4v
-        ),
-
-        MODEL_TENSOR.V_MMPROJ_POST_NORM: (
-            "visual.merger.post_projection_norm", # glm4v
-        ),
-
         MODEL_TENSOR.V_MMPROJ_PEG: (
             "model.mm_projector.peg.peg.{bid}",
         ),
@@ -1424,6 +1408,10 @@ class TensorNameMap:
             "visual.post_layernorm", # glm4v
         ),
 
+        MODEL_TENSOR.V_MM_POST_NORM: (
+            "visual.merger.post_projection_norm", # glm4v
+        ),
+
         MODEL_TENSOR.V_MM_INP_PROJ: (
             "multi_modal_projector.mm_input_projection",
         ),
@@ -1515,14 +1503,17 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_UP: (
             "model.vision.linear_proj.dense_h_to_4h", # cogvlm
+            "visual.merger.up_proj", # glm4v
         ),
 
         MODEL_TENSOR.V_MM_DOWN: (
             "model.vision.linear_proj.dense_4h_to_h", # cogvlm
+            "visual.merger.down_proj", # glm4v
         ),
 
         MODEL_TENSOR.V_MM_GATE: (
             "model.vision.linear_proj.gate_proj", # cogvlm
+            "visual.merger.gate_proj", # glm4v
         ),
 
         MODEL_TENSOR.V_TOK_BOI: (
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 1726823ec69..6499943004a 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -86,6 +86,10 @@
 #define TN_LN_PRE          "%s.pre_ln.%s"
 #define TN_LN_POST         "%s.post_ln.%s"
 #define TN_LLAVA_PROJ      "mm.%d.%s"
+#define TN_MM_UP           "mm.up.%s"
+#define TN_MM_GATE         "mm.gate.%s"
+#define TN_MM_DOWN         "mm.down.%s"
+#define TN_MM_POST_NORM    "mm.post_norm.%s"
 #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
@@ -102,6 +106,7 @@
 #define TN_DEEPSTACK_NORM  "v.deepstack.%d.norm.%s"     // qwen3vl deepstack
 #define TN_DEEPSTACK_FC1   "v.deepstack.%d.fc1.%s"      // qwen3vl deepstack
 #define TN_DEEPSTACK_FC2   "v.deepstack.%d.fc2.%s"      // qwen3vl deepstack
+#define TN_MM_CONV         "mm.conv.%s"                 // glm4v
 
 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 51bcce1ebb0..61708901a8b 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -165,6 +165,12 @@ struct clip_model {
     ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
     ggml_tensor * mm_fc_w;
     ggml_tensor * mm_fc_b;
+    ggml_tensor * mm_ffn_up_w;
+    ggml_tensor * mm_ffn_up_b;
+    ggml_tensor * mm_ffn_gate_w;
+    ggml_tensor * mm_ffn_gate_b;
+    ggml_tensor * mm_ffn_down_w;
+    ggml_tensor * mm_ffn_down_b;
 
     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;

From 4e81ab4b0c24ea71ae2f1b2d2e16960413b45965 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 18:06:24 +0100
Subject: [PATCH 04/25] cgraph ok

---
 tools/mtmd/clip-impl.h        |  2 +
 tools/mtmd/clip-model.h       | 18 ++++++---
 tools/mtmd/clip.cpp           | 27 ++++++++++++++
 tools/mtmd/models/qwen3vl.cpp | 70 +++++++++++++++++++++++++++++------
 4 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 6499943004a..b8d018e05b8 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -169,6 +169,7 @@ enum projector_type {
     PROJECTOR_TYPE_LIGHTONOCR,
     PROJECTOR_TYPE_COGVLM,
     PROJECTOR_TYPE_JANUS_PRO,
+    PROJECTOR_TYPE_GLM4V,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -195,6 +196,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
     { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
+    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 61708901a8b..e69af53cabf 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -165,12 +165,14 @@ struct clip_model {
     ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
     ggml_tensor * mm_fc_w;
     ggml_tensor * mm_fc_b;
-    ggml_tensor * mm_ffn_up_w;
-    ggml_tensor * mm_ffn_up_b;
-    ggml_tensor * mm_ffn_gate_w;
-    ggml_tensor * mm_ffn_gate_b;
-    ggml_tensor * mm_ffn_down_w;
-    ggml_tensor * mm_ffn_down_b;
+    ggml_tensor * mm_ffn_up_w = nullptr;
+    ggml_tensor * mm_ffn_up_b = nullptr;
+    ggml_tensor * mm_ffn_gate_w = nullptr;
+    ggml_tensor * mm_ffn_gate_b = nullptr;
+    ggml_tensor * mm_ffn_down_w = nullptr;
+    ggml_tensor * mm_ffn_down_b = nullptr;
+    ggml_tensor * mm_post_norm_w = nullptr;
+    ggml_tensor * mm_post_norm_b = nullptr;
 
     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;
@@ -273,6 +275,10 @@ struct clip_model {
     ggml_tensor * mm_boi = nullptr;
     ggml_tensor * mm_eoi = nullptr;
 
+    // glm4v
+    ggml_tensor * mm_conv_w = nullptr;
+    ggml_tensor * mm_conv_b = nullptr;
+
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index bb922e30b43..9156705af15 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -778,6 +778,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
             } break;
         case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
             } break;
@@ -1128,6 +1129,13 @@ struct clip_model_loader {
                             LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                         }
                     } break;
+                case PROJECTOR_TYPE_GLM4V:
+                    {
+                        hparams.n_merge = 2; // default value for GLM4-V
+                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                        hparams.set_limit_image_tokens(8, 4096);
+                        hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
+                    } break;
                 case PROJECTOR_TYPE_LLAMA4:
                     {
                         hparams.rope_theta = 10000.0f;
@@ -1432,6 +1440,20 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                 } break;
+            case PROJECTOR_TYPE_GLM4V:
+                {
+                    model.projection     = get_tensor(TN_MM_PROJECTOR);
+                    model.mm_ffn_up_w    = get_tensor(string_format(TN_MM_UP,        "weight"));
+                    model.mm_ffn_up_b    = get_tensor(string_format(TN_MM_UP,        "bias"), false);
+                    model.mm_ffn_gate_w  = get_tensor(string_format(TN_MM_GATE,      "weight"));
+                    model.mm_ffn_gate_b  = get_tensor(string_format(TN_MM_GATE,      "bias"), false);
+                    model.mm_ffn_down_w  = get_tensor(string_format(TN_MM_DOWN,      "weight"));
+                    model.mm_ffn_down_b  = get_tensor(string_format(TN_MM_DOWN,      "bias"), false);
+                    model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
+                    model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
+                    model.mm_conv_w      = get_tensor(string_format(TN_MM_CONV,      "weight"));
+                    model.mm_conv_b      = get_tensor(string_format(TN_MM_CONV,      "bias"), false);
+                } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
                     model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
@@ -2525,6 +2547,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                 clip_image_u8 resized;
@@ -2833,6 +2856,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 // dynamic size (2 conv, so double patch size)
                 int x_patch = img->nx / (params.patch_size * 2);
@@ -3070,6 +3094,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 const int merge_ratio = hparams.n_merge;
                 const int pw = image_size_width  / patch_size;
@@ -3341,6 +3366,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
+        case PROJECTOR_TYPE_GLM4V:
+            return ctx->model.mm_ffn_down_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }
diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp
index 35a42cb84d6..7c61fc0b4f6 100644
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@@ -86,7 +86,9 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
         // self-attention
         {
             cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
-            cur = ggml_add(ctx0, cur, layer.qkv_b);
+            if (layer.qkv_b) {
+                cur = ggml_add(ctx0, cur, layer.qkv_b);
+            }
 
             ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                     /* nb1    */ ggml_row_size(cur->type, d_head),
@@ -172,20 +174,66 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
         inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
     }
 
-    // multimodal projection
-    ggml_tensor * embeddings = inpL;
-    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+    ggml_tensor * cur = inpL;
+
+    if (proj_type == PROJECTOR_TYPE_QWEN3VL) {
+        // Qwen3VL projector
+        cur = ggml_reshape_3d(ctx0, cur, n_embd * 4, n_pos / 4, batch_size);
+        cur = build_ffn(cur,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr,
+            model.mm_1_w, model.mm_1_b,
+            ffn_op_type::FFN_GELU, -1);
+
+        if (deepstack_features != nullptr) {
+            // concat along the feature dimension
+            cur = ggml_concat(ctx0, cur, deepstack_features, 0);
+        }
+
+    } else if (proj_type == PROJECTOR_TYPE_GLM4V) {
+        // GLM4V projector
 
-    embeddings = build_ffn(embeddings,
-        model.mm_0_w, model.mm_0_b,
-        nullptr, nullptr,
-        model.mm_1_w, model.mm_1_b,
-        ffn_op_type::FFN_GELU, -1);
+        // patch merger
+        {
+            // reshape image tokens to 2D grid
+            cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
+            cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
+            cur = ggml_cont(ctx0, cur);
+
+            // merge patches
+            cur = ggml_conv_2d(ctx0, model.mm_conv_w, cur, 2, 2, 0, 0, 1, 1);
+            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); // [n_tokens, n_embd]
+            if (model.mm_conv_b) {
+                cur = ggml_add(ctx0, cur, ggml_transpose(ctx0, model.mm_conv_b));
+            }
+            cb(cur, "after_mm_conv", -1);
+        }
 
-    embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
+        // FC projector
+        {
+            cur = ggml_transpose(ctx0, cur); // [n_embd, n_tokens]
+            cur = ggml_mul_mat(ctx0, model.projection, cur);
+            cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
+            cur = ggml_gelu(ctx0, cur);
+            cb(cur, "after_fc_proj", -1);
+        }
+
+        // FFN projector
+        {
+            cur = build_ffn(cur,
+                model.mm_ffn_up_w, model.mm_ffn_up_b,
+                model.mm_ffn_gate_w, model.mm_ffn_gate_b,
+                model.mm_ffn_down_w, model.mm_ffn_down_b,
+                ffn_op_type::FFN_GELU, -1);
+            cb(cur, "after_ffn_proj", -1);
+        }
+
+    } else {
+        GGML_ABORT("Unsupported projector type in Qwen3-VL graph");
+    }
 
     // build the graph
-    ggml_build_forward_expand(gf, embeddings);
+    ggml_build_forward_expand(gf, cur);
 
     return gf;
 }

From 306f342eaaf9fc1a68bf8f76c6e7d6efe5d62c51 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 19:04:48 +0100
Subject: [PATCH 05/25] add mrope for text model

---
 convert_hf_to_gguf.py         | 25 +++++++++++++++++++++----
 src/llama-model.cpp           | 15 ++++++++++-----
 src/models/glm4.cpp           | 26 ++++++++++++++++++++++----
 tools/mtmd/clip.cpp           | 27 +++++++++++++++++++++------
 tools/mtmd/clip.h             |  2 +-
 tools/mtmd/models/qwen3vl.cpp | 10 ++++++----
 tools/mtmd/mtmd.cpp           |  6 +++++-
 7 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 967cb51e501..1b69cac37d4 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4408,12 +4408,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("Glm4vForConditionalGeneration")
-class Glm4VisionModel(Qwen3VLVisionModel):
+class Glm4VVisionModel(Qwen3VLVisionModel):
     def set_gguf_parameters(self):
-        self.is_deepstack_layers = False
-        super().set_gguf_parameters()
+        MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
+        assert self.hparams_vision is not None
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
 
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+        rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5)
+        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("model.visual."):
             name = name.replace("model.visual.", "visual.")
@@ -7913,11 +7922,19 @@ def set_gguf_parameters(self):
         if (rope_dim := self.hparams.get("head_dim")) is None:
             rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
+        rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {}
         if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+        # handle M-RoPE, the same as Qwen-VL
+        if "mrope_section" in rope_scaling:
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("model.visual."): # ignore visual part of Glm4v
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index e4808b1e1eb..60c57407606 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1685,7 +1685,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                 switch (hparams.n_layer) {
                     case 40: type = LLM_TYPE_9B; break;
                     case 61: type = LLM_TYPE_32B; break;
@@ -1694,8 +1695,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4_MOE:
             {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
 
                 // MoE parameters
                 ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
@@ -7758,7 +7760,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
@@ -7820,7 +7821,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_SMALLTHINKER:
-        case LLM_ARCH_GLM4_MOE:
         case LLM_ARCH_SEED_OSS:
         case LLM_ARCH_GROVEMOE:
         case LLM_ARCH_APERTUS:
@@ -7837,6 +7837,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3VLMOE:
             return LLAMA_ROPE_TYPE_IMROPE;
 
+        case LLM_ARCH_GLM4:
+            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+        case LLM_ARCH_GLM4_MOE:
+            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index f789b282488..2c9de2b7d35 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -7,6 +7,9 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 
     ggml_tensor * cur;
     ggml_tensor * inpL;
@@ -63,11 +66,26 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                 Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                                     cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                                 ext_factor, attn_factor, beta_fast, beta_slow);
 
-            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            if (rope_type == LLAMA_ROPE_TYPE_MROPE) {
+                // M-RoPE
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9156705af15..e95495ef24e 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2790,16 +2790,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
 int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;
     const int n_total = clip_n_output_tokens(ctx, img);
-    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->nx / (params.patch_size * 2);
+    const auto & proj = ctx->proj_type();
+    switch (proj) {
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
+            return (img->nx / params.patch_size) / 2;
+        default:
+            break;
     }
     return n_total;
 }
 
 int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;
-    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->ny / (params.patch_size * 2);
+    const auto & proj = ctx->proj_type();
+    switch (proj) {
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
+            return (img->ny / params.patch_size) / 2;
+        default:
+            break;
     }
     return 1;
 }
@@ -3384,10 +3398,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }
 
-bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
+bool clip_is_mrope(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
         || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
-        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
+        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
 }
 
 bool clip_is_llava(const struct clip_ctx * ctx) {
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 5aae27a7004..68a0d6e857e 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -104,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
 
 int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
-bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+bool clip_is_mrope(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
 
diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp
index 7c61fc0b4f6..9305386c753 100644
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@@ -112,10 +112,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
             // apply M-RoPE
             Qcur = ggml_rope_multi(
                 ctx0, Qcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1);
             Kcur = ggml_rope_multi(
                 ctx0, Kcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1);
 
             cb(Qcur, "Qcur_rope", il);
             cb(Kcur, "Kcur_rope", il);
@@ -156,7 +156,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
                 layer.deepstack_fc1_w, layer.deepstack_fc1_b,
                 nullptr, nullptr,
                 layer.deepstack_fc2_w, layer.deepstack_fc2_b,
-                ffn_op_type::FFN_GELU, il);
+                hparams.ffn_op, il);
 
             if(!deepstack_features) {
                 deepstack_features = feat;
@@ -192,6 +192,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
 
     } else if (proj_type == PROJECTOR_TYPE_GLM4V) {
         // GLM4V projector
+        // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
 
         // patch merger
         {
@@ -213,6 +214,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
         {
             cur = ggml_transpose(ctx0, cur); // [n_embd, n_tokens]
             cur = ggml_mul_mat(ctx0, model.projection, cur);
+            // default LayerNorm (post_projection_norm)
             cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
             cur = ggml_gelu(ctx0, cur);
             cb(cur, "after_fc_proj", -1);
@@ -224,7 +226,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
                 model.mm_ffn_up_w, model.mm_ffn_up_b,
                 model.mm_ffn_gate_w, model.mm_ffn_gate_b,
                 model.mm_ffn_down_w, model.mm_ffn_down_b,
-                ffn_op_type::FFN_GELU, -1);
+                hparams.ffn_op, -1);
             cb(cur, "after_ffn_proj", -1);
         }
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index d06fa42e616..69c4d00d575 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -218,7 +218,7 @@ struct mtmd_context {
 
     void init_vision() {
         GGML_ASSERT(ctx_v != nullptr);
-        use_mrope = clip_is_qwen2vl(ctx_v);
+        use_mrope = clip_is_mrope(ctx_v);
 
         projector_type proj = clip_get_projector_type(ctx_v);
         int minicpmv_version = clip_is_minicpmv(ctx_v);
@@ -310,6 +310,10 @@ struct mtmd_context {
             img_beg = "<|image_start|>";
             img_end = "<|image_end|>";
 
+        } else if (proj == PROJECTOR_TYPE_GLM4V) {
+            img_beg = "<|begin_of_image|>";
+            img_end = "<|end_of_image|>";
+
         }
     }
 

From 6a6e30172a7126a3cdcf05c1d47383c1fc731cdb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 19:47:13 +0100
Subject: [PATCH 06/25] faster patch merger

---
 convert_hf_to_gguf.py          |  5 +++++
 gguf-py/gguf/constants.py      |  3 ---
 gguf-py/gguf/tensor_mapping.py |  5 +----
 tools/mtmd/clip-impl.h         |  3 +--
 tools/mtmd/clip-model.h        |  7 ++-----
 tools/mtmd/clip.cpp            | 12 ++++++------
 tools/mtmd/models/qwen3vl.cpp  | 25 ++++++++++++++++---------
 7 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1b69cac37d4..dd2ce29a0d2 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4428,6 +4428,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             name = name.replace("model.visual.", "visual.")
         if name.startswith("visual.merger."):
             return [(self.map_tensor_name(name), data_torch)]
+        if "downsample.weight" in name:
+            # unfold the downsample to mistral-small format
+            c_out, c_in, kh, kw = data_torch.shape
+            data_unfold = data_torch.view(c_out, c_in * kh * kw)
+            return [(self.map_tensor_name(name), data_unfold)]
         return super().modify_tensors(data_torch, name, bid)
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0801d73794d..16721fab55a 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -686,7 +686,6 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE            = auto() # cogvlm
     V_TOK_BOI            = auto() # cogvlm
     V_TOK_EOI            = auto() # cogvlm
-    V_MM_CONV            = auto() # glm4v
     # audio (mtmd)
     A_ENC_EMBD_POS       = auto()
     A_ENC_CONV1D         = auto()
@@ -1060,7 +1059,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MM_GATE:                 "mm.gate",
     MODEL_TENSOR.V_TOK_BOI:                 "v.boi",
     MODEL_TENSOR.V_TOK_EOI:                 "v.eoi",
-    MODEL_TENSOR.V_MM_CONV:                 "mm.conv",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS:            "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D:              "a.conv1d.{bid}",
@@ -1140,7 +1138,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MM_GATE,
         MODEL_TENSOR.V_TOK_BOI,
         MODEL_TENSOR.V_TOK_EOI,
-        MODEL_TENSOR.V_MM_CONV,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_CONV1D,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 676d7466838..0b46d3ef801 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1428,10 +1428,6 @@ class TensorNameMap:
             "multi_modal_projector.mm_soft_emb_norm",
         ),
 
-        MODEL_TENSOR.V_MM_CONV: (
-            "visual.downsample", # glm4v
-        ),
-
         MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
             "resampler.pos_embed_k",
         ),
@@ -1483,6 +1479,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_MM_PATCH_MERGER: (
             "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
             "patch_merger.merging_layer", # mistral
+            "visual.downsample", # glm4v
         ),
 
         MODEL_TENSOR.V_DS_NORM: (
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index b8d018e05b8..260767126f1 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -99,14 +99,13 @@
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
 #define TN_MM_PROJECTOR    "mm.model.fc.weight"         // idefics3
-#define TN_MM_PATCH_MERGER "mm.patch_merger.weight"     // mistral small 3.1
+#define TN_MM_PATCH_MERGER "mm.patch_merger.%s"         // mistral small 3.1, glm4v
 #define TN_TOK_IMG_BREAK   "v.token_embd.img_break"     // pixtral
 #define TN_TOK_GLM_BOI     "adapter.boi"                // glm-edge (these embeddings are not in text model)
 #define TN_TOK_GLM_EOI     "adapter.eoi"                // glm-edge (these embeddings are not in text model)
 #define TN_DEEPSTACK_NORM  "v.deepstack.%d.norm.%s"     // qwen3vl deepstack
 #define TN_DEEPSTACK_FC1   "v.deepstack.%d.fc1.%s"      // qwen3vl deepstack
 #define TN_DEEPSTACK_FC2   "v.deepstack.%d.fc2.%s"      // qwen3vl deepstack
-#define TN_MM_CONV         "mm.conv.%s"                 // glm4v
 
 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index e69af53cabf..a6c3f32ef20 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -254,9 +254,10 @@ struct clip_model {
     ggml_tensor * mm_input_proj_w = nullptr;
     ggml_tensor * mm_soft_emb_norm_w = nullptr;
 
-    // pixtral
+    // pixtral, glm4v
     ggml_tensor * token_embd_img_break = nullptr;
     ggml_tensor * mm_patch_merger_w = nullptr;
+    ggml_tensor * mm_patch_merger_b = nullptr;
 
     // ultravox / whisper encoder
     ggml_tensor * conv1d_1_w = nullptr;
@@ -275,10 +276,6 @@ struct clip_model {
     ggml_tensor * mm_boi = nullptr;
     ggml_tensor * mm_eoi = nullptr;
 
-    // glm4v
-    ggml_tensor * mm_conv_w = nullptr;
-    ggml_tensor * mm_conv_b = nullptr;
-
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index e95495ef24e..3f846670c34 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1451,8 +1451,8 @@ struct clip_model_loader {
                     model.mm_ffn_down_b  = get_tensor(string_format(TN_MM_DOWN,      "bias"), false);
                     model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
                     model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
-                    model.mm_conv_w      = get_tensor(string_format(TN_MM_CONV,      "weight"));
-                    model.mm_conv_b      = get_tensor(string_format(TN_MM_CONV,      "bias"), false);
+                    model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
+                    model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
                 } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
@@ -1482,8 +1482,8 @@ struct clip_model_loader {
                     // [IMG_BREAK] token embedding
                     model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
                     // for mistral small 3.1
-                    model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM,     false);
-                    model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+                    model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM, false);
+                    model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
                 } break;
             case PROJECTOR_TYPE_LIGHTONOCR:
                 {
@@ -1491,8 +1491,8 @@ struct clip_model_loader {
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
                     model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                     model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
-                    model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM,     false);
-                    model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+                    model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM, false);
+                    model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
                 {
diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp
index 9305386c753..660c06ef392 100644
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@@ -194,25 +194,32 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
         // GLM4V projector
         // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
 
-        // patch merger
+        // patch merger (copied from pixtral)
         {
+            int n_merge = hparams.n_merge;
+            GGML_ASSERT(n_merge > 0);
+
             // reshape image tokens to 2D grid
             cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
             cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
             cur = ggml_cont(ctx0, cur);
 
-            // merge patches
-            cur = ggml_conv_2d(ctx0, model.mm_conv_w, cur, 2, 2, 0, 0, 1, 1);
-            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); // [n_tokens, n_embd]
-            if (model.mm_conv_b) {
-                cur = ggml_add(ctx0, cur, ggml_transpose(ctx0, model.mm_conv_b));
-            }
-            cb(cur, "after_mm_conv", -1);
+            // torch.nn.functional.unfold is just an im2col under the hood
+            // we just need a dummy kernel to make it work
+            ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
+            cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
+
+            // project to n_embd
+            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+            cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
+
+            // add bias
+            cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
+            cb(cur, "after_patch_merger", -1);
         }
 
         // FC projector
         {
-            cur = ggml_transpose(ctx0, cur); // [n_embd, n_tokens]
             cur = ggml_mul_mat(ctx0, model.projection, cur);
             // default LayerNorm (post_projection_norm)
             cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);

From c78c2e3f1a1d32fe37cc919f78f79eb43f8d3f8a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 21:08:50 +0100
Subject: [PATCH 07/25] add GGML_ROPE_TYPE_MRNORM

---
 ggml/include/ggml.h       |  7 ++++---
 ggml/src/ggml-cpu/ops.cpp | 25 +++++++++++++------------
 src/models/glm4.cpp       |  7 ++++---
 3 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 686da3dbd10..76b3562626c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -243,10 +243,11 @@
 
 // TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
 #define GGML_ROPE_TYPE_NORMAL 0
-#define GGML_ROPE_TYPE_NEOX   2
-#define GGML_ROPE_TYPE_MROPE  8
-#define GGML_ROPE_TYPE_VISION 24
+#define GGML_ROPE_TYPE_NEOX   2  // binary: 000010
+#define GGML_ROPE_TYPE_MROPE  8  // binary: 001000
+#define GGML_ROPE_TYPE_VISION 24 // binary: 011000
 #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
+#define GGML_ROPE_TYPE_MRNORM 40 // binary: 000001 (MROPE without NEOX - need to combine with one of the above)
 
 #define GGML_MROPE_SECTIONS   4
 
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 3032783971d..54512b1fdc7 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5764,20 +5764,21 @@ static void ggml_compute_forward_rope_flt(
                 T * src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
                 T * dst_data  = (T *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1);
 
-                switch (mode) {
-                    case GGML_ROPE_TYPE_NORMAL:
+                if (mode == GGML_ROPE_TYPE_NORMAL) {
+                    rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
+                } else if (mode == GGML_ROPE_TYPE_NEOX) {
+                    rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
+                } else if (mode & GGML_ROPE_TYPE_MROPE || mode & GGML_ROPE_TYPE_IMROPE) {
+                    if (mode & GGML_ROPE_TYPE_MRNORM) {
                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
-                        break;
-                    case GGML_ROPE_TYPE_NEOX:
-                    case GGML_ROPE_TYPE_MROPE:
-                    case GGML_ROPE_TYPE_IMROPE:
+                    } else {
                         rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
-                        break;
-                    case GGML_ROPE_TYPE_VISION:
-                        rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
-                        break;
-                    default:
-                        GGML_ABORT("rope type not supported");
+                    }
+                } else if (mode & GGML_ROPE_TYPE_VISION) {
+                    GGML_ASSERT(mode & GGML_ROPE_TYPE_NEOX && "non-neox is not yet supported");
+                    rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
+                } else {
+                    GGML_ABORT("rope type not supported");
                 }
 
                 if (!is_vision) {
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index 2c9de2b7d35..58c161167da 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -68,13 +68,14 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
             }
 
             if (rope_type == LLAMA_ROPE_TYPE_MROPE) {
-                // M-RoPE
+                // M-RoPE without using NEOX ordering
+                auto rope_type_adj = rope_type & GGML_ROPE_TYPE_MRNORM;
                 Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
-                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow);
 
                 Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
-                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow);
             } else {
                 // Normal RoPE

From 037e76e98261f0da18cde2d771890d9cb8717664 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 13 Dec 2025 21:34:41 +0100
Subject: [PATCH 08/25] add support for metal

---
 ggml/include/ggml.h                    | 2 +-
 ggml/src/ggml-metal/ggml-metal-impl.h  | 2 ++
 ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 +++++-
 ggml/src/ggml-metal/ggml-metal.metal   | 9 +++++----
 ggml/src/ggml.c                        | 3 ++-
 src/models/glm4.cpp                    | 2 +-
 6 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 76b3562626c..2b9e7ec5d5d 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -247,7 +247,7 @@
 #define GGML_ROPE_TYPE_MROPE  8  // binary: 001000
 #define GGML_ROPE_TYPE_VISION 24 // binary: 011000
 #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
-#define GGML_ROPE_TYPE_MRNORM 40 // binary: 000001 (MROPE without NEOX - need to combine with one of the above)
+#define GGML_ROPE_TYPE_MRNORM 4  // binary: 000100 (MROPE without NEOX - need to combine with one of the above)
 
 #define GGML_MROPE_SECTIONS   4
 
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index 8944b07e907..a2d1235175e 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -258,6 +258,8 @@ typedef struct {
     int32_t  sect_2;
     int32_t  sect_3;
     bool     src2;
+    uint64_t offset; // args.n_dims/2 for NEOX ordering, 1 otherwise
+    uint64_t idx_scale; // 2 for NEOX ordering, 1 otherwise
 } ggml_metal_kargs_rope;
 
 typedef struct {
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index e99c1763f63..74ca6836400 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -3141,7 +3141,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
 
     const int n_past     = ((const int32_t *) op->op_params)[0];
     const int n_dims     = ((const int32_t *) op->op_params)[1];
-  //const int mode       = ((const int32_t *) op->op_params)[2];
+    const int mode       = ((const int32_t *) op->op_params)[2];
     // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
     const int n_ctx_orig = ((const int32_t *) op->op_params)[4];
 
@@ -3165,6 +3165,8 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
     const int sect_2 = ((const int32_t *) op->op_params)[13];
     const int sect_3 = ((const int32_t *) op->op_params)[14];
 
+    bool is_normal_ordering = mode == GGML_ROPE_TYPE_NORMAL || mode & GGML_ROPE_TYPE_MRNORM;
+
     ggml_metal_kargs_rope args = {
         /*.ne00        =*/ ne00,
         /*.ne01        =*/ ne01,
@@ -3196,6 +3198,8 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
         /* sect_2      =*/ sect_2,
         /* sect_3      =*/ sect_3,
         /* src2        =*/ op->src[2] != nullptr,
+        /* offset      =*/ is_normal_ordering ? 1u : ((uint16_t)n_dims / 2),
+        /* idx_scale   =*/ is_normal_ordering ? 1u : 2u,
     };
 
     auto pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 51bcbae309f..c97c2388200 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4152,6 +4152,7 @@ kernel void kernel_rope_norm(
     }
 }
 
+// TODO @ngxson : merge with kernel_rope_norm using offset and idx_scale
 template<typename T>
 kernel void kernel_rope_neox(
         constant ggml_metal_kargs_rope & args,
@@ -4231,7 +4232,7 @@ kernel void kernel_rope_multi(
 
     for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
         if (i0 < args.n_dims) {
-            const int ic = i0/2;
+            const int ic = i0 / args.idx_scale;
 
             // mrope theta calculations
             // note: the rest is the same as kernel_rope_neox
@@ -4274,10 +4275,10 @@ kernel void kernel_rope_multi(
             device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
 
             const float x0 = src[0];
-            const float x1 = src[args.n_dims/2];
+            const float x1 = src[args.offset];
 
-            dst_data[0]             = x0*cos_theta - x1*sin_theta;
-            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
+            dst_data[0]           = x0*cos_theta - x1*sin_theta;
+            dst_data[args.offset] = x0*sin_theta + x1*cos_theta;
         } else {
             device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
             device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f0913cd3596..6f950fcaef4 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4073,7 +4073,8 @@ static struct ggml_tensor * ggml_rope_impl(
 
     bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
     if (mrope_used) {
-        GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+        // mrope expecting multiple positions ids per token
+        GGML_ASSERT(a->ne[2] * GGML_MROPE_SECTIONS == b->ne[0]);
     } else {
         GGML_ASSERT(a->ne[2] == b->ne[0]);
     }
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index 58c161167da..41477eceff4 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -69,7 +69,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
             if (rope_type == LLAMA_ROPE_TYPE_MROPE) {
                 // M-RoPE without using NEOX ordering
-                auto rope_type_adj = rope_type & GGML_ROPE_TYPE_MRNORM;
+                auto rope_type_adj = rope_type | GGML_ROPE_TYPE_MRNORM;
                 Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
                             n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow);

From b4e65dc88971c64b8a3e38160e83f96ad58a960c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 13:20:18 +0100
Subject: [PATCH 09/25] move glm4v to dedicated graph

---
 ggml/src/ggml-cpu/ops.cpp     |   5 +-
 tools/mtmd/CMakeLists.txt     |   1 +
 tools/mtmd/clip.cpp           |   5 +-
 tools/mtmd/models/glm4v.cpp   | 143 ++++++++++++++++++++++++++++++++++
 tools/mtmd/models/models.h    |   5 ++
 tools/mtmd/models/qwen3vl.cpp |  85 ++++----------------
 6 files changed, 169 insertions(+), 75 deletions(-)
 create mode 100644 tools/mtmd/models/glm4v.cpp

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 54512b1fdc7..eef3dba06a2 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5768,15 +5768,14 @@ static void ggml_compute_forward_rope_flt(
                     rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
                 } else if (mode == GGML_ROPE_TYPE_NEOX) {
                     rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
+                } else if (mode & GGML_ROPE_TYPE_VISION) {
+                    rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
                 } else if (mode & GGML_ROPE_TYPE_MROPE || mode & GGML_ROPE_TYPE_IMROPE) {
                     if (mode & GGML_ROPE_TYPE_MRNORM) {
                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
                     } else {
                         rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
                     }
-                } else if (mode & GGML_ROPE_TYPE_VISION) {
-                    GGML_ASSERT(mode & GGML_ROPE_TYPE_NEOX && "non-neox is not yet supported");
-                    rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
                 } else {
                     GGML_ABORT("rope type not supported");
                 }
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 3ee42036fda..e7f3067a163 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -15,6 +15,7 @@ add_library(mtmd
             clip-graph.h
             models/models.h
             models/cogvlm.cpp
+            models/glm4v.cpp
             models/internvl.cpp
             models/kimivl.cpp
             models/llama4.cpp
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3f846670c34..27509b80b7c 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -778,7 +778,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
             } break;
         case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
             {
                 builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
             } break;
@@ -816,6 +815,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_llava>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_GLM4V:
+            {
+                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
+            } break;
         default:
             GGML_ABORT("missing cgraph builder");
     }
diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp
new file mode 100644
index 00000000000..a17b01bad66
--- /dev/null
+++ b/tools/mtmd/models/glm4v.cpp
@@ -0,0 +1,143 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_glm4v::build() {
+    GGML_ASSERT(model.patch_bias != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+    GGML_ASSERT(model.class_embedding == nullptr);
+
+    const int batch_size       = 1;
+    const int n_pos            = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
+
+    norm_type norm_t = NORM_TYPE_RMS;
+
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    ggml_tensor * inp_raw = build_inp_raw();
+    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+    // second conv dimension
+    {
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+
+        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // add patch bias
+    if (model.patch_bias != nullptr) {
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+        cb(inp, "patch_bias", -1);
+    }
+
+    // pre-layernorm (aka post-conv)
+    if (model.pre_ln_w) {
+        inp = build_norm(inp, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+    }
+
+    // calculate absolute position embedding and apply
+    ggml_tensor * learned_pos_embd = resize_position_embeddings();
+    learned_pos_embd = ggml_cont_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+    learned_pos_embd = ggml_reshape_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+    learned_pos_embd = ggml_cont_3d(
+        ctx0, learned_pos_embd,
+        n_embd, n_patches_x * n_patches_y, batch_size);
+    inp = ggml_add(ctx0, inp, learned_pos_embd);
+    cb(inp, "inp_pos_emb", -1);
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // build ViT with 2D position embeddings
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        // first half is X axis and second half is Y axis
+        return ggml_rope_multi(
+                    ctx0, cur, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
+                    32768, hparams.rope_theta, 1, 0, 1, 32, 1);
+    };
+
+    ggml_tensor * cur = build_vit(
+                            inp, n_patches,
+                            NORM_TYPE_NORMAL,
+                            hparams.ffn_op,
+                            learned_pos_embd,
+                            add_pos);
+
+    cb(cur, "vit_out", -1);
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        cur = build_norm(cur, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+    }
+
+    // GLM4V projector
+    // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
+
+    // patch merger (copied from pixtral)
+    {
+        int n_merge = hparams.n_merge;
+        GGML_ASSERT(n_merge > 0);
+
+        // reshape image tokens to 2D grid
+        cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
+        cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
+        cur = ggml_cont(ctx0, cur);
+
+        // torch.nn.functional.unfold is just an im2col under the hood
+        // we just need a dummy kernel to make it work
+        ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
+        cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
+
+        // project to n_embd
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+        cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
+
+        // add bias
+        cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
+        cb(cur, "after_patch_merger", -1);
+    }
+
+    // FC projector
+    {
+        cur = ggml_mul_mat(ctx0, model.projection, cur);
+        // default LayerNorm (post_projection_norm)
+        cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
+        cur = ggml_gelu(ctx0, cur);
+        cb(cur, "after_fc_proj", -1);
+    }
+
+    // FFN projector
+    {
+        cur = build_ffn(cur,
+            model.mm_ffn_up_w, model.mm_ffn_up_b,
+            model.mm_ffn_gate_w, model.mm_ffn_gate_b,
+            model.mm_ffn_down_w, model.mm_ffn_down_b,
+            hparams.ffn_op, -1);
+        cb(cur, "after_ffn_proj", -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 4b35da259ce..0496d6b22f1 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -56,3 +56,8 @@ struct clip_graph_whisper_enc : clip_graph {
     clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
 };
+
+struct clip_graph_glm4v : clip_graph {
+    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp
index 660c06ef392..35a42cb84d6 100644
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@@ -86,9 +86,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
         // self-attention
         {
             cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
-            if (layer.qkv_b) {
-                cur = ggml_add(ctx0, cur, layer.qkv_b);
-            }
+            cur = ggml_add(ctx0, cur, layer.qkv_b);
 
             ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
                     /* nb1    */ ggml_row_size(cur->type, d_head),
@@ -112,10 +110,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
             // apply M-RoPE
             Qcur = ggml_rope_multi(
                 ctx0, Qcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1);
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
             Kcur = ggml_rope_multi(
                 ctx0, Kcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1);
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
 
             cb(Qcur, "Qcur_rope", il);
             cb(Kcur, "Kcur_rope", il);
@@ -156,7 +154,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
                 layer.deepstack_fc1_w, layer.deepstack_fc1_b,
                 nullptr, nullptr,
                 layer.deepstack_fc2_w, layer.deepstack_fc2_b,
-                hparams.ffn_op, il);
+                ffn_op_type::FFN_GELU, il);
 
             if(!deepstack_features) {
                 deepstack_features = feat;
@@ -174,75 +172,20 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
         inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
     }
 
-    ggml_tensor * cur = inpL;
-
-    if (proj_type == PROJECTOR_TYPE_QWEN3VL) {
-        // Qwen3VL projector
-        cur = ggml_reshape_3d(ctx0, cur, n_embd * 4, n_pos / 4, batch_size);
-        cur = build_ffn(cur,
-            model.mm_0_w, model.mm_0_b,
-            nullptr, nullptr,
-            model.mm_1_w, model.mm_1_b,
-            ffn_op_type::FFN_GELU, -1);
-
-        if (deepstack_features != nullptr) {
-            // concat along the feature dimension
-            cur = ggml_concat(ctx0, cur, deepstack_features, 0);
-        }
-
-    } else if (proj_type == PROJECTOR_TYPE_GLM4V) {
-        // GLM4V projector
-        // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
-
-        // patch merger (copied from pixtral)
-        {
-            int n_merge = hparams.n_merge;
-            GGML_ASSERT(n_merge > 0);
-
-            // reshape image tokens to 2D grid
-            cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
-            cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
-            cur = ggml_cont(ctx0, cur);
-
-            // torch.nn.functional.unfold is just an im2col under the hood
-            // we just need a dummy kernel to make it work
-            ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
-            cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
-
-            // project to n_embd
-            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
-            cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
-
-            // add bias
-            cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
-            cb(cur, "after_patch_merger", -1);
-        }
-
-        // FC projector
-        {
-            cur = ggml_mul_mat(ctx0, model.projection, cur);
-            // default LayerNorm (post_projection_norm)
-            cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_gelu(ctx0, cur);
-            cb(cur, "after_fc_proj", -1);
-        }
+    // multimodal projection
+    ggml_tensor * embeddings = inpL;
+    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
 
-        // FFN projector
-        {
-            cur = build_ffn(cur,
-                model.mm_ffn_up_w, model.mm_ffn_up_b,
-                model.mm_ffn_gate_w, model.mm_ffn_gate_b,
-                model.mm_ffn_down_w, model.mm_ffn_down_b,
-                hparams.ffn_op, -1);
-            cb(cur, "after_ffn_proj", -1);
-        }
+    embeddings = build_ffn(embeddings,
+        model.mm_0_w, model.mm_0_b,
+        nullptr, nullptr,
+        model.mm_1_w, model.mm_1_b,
+        ffn_op_type::FFN_GELU, -1);
 
-    } else {
-        GGML_ABORT("Unsupported projector type in Qwen3-VL graph");
-    }
+    embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
 
     // build the graph
-    ggml_build_forward_expand(gf, cur);
+    ggml_build_forward_expand(gf, embeddings);
 
     return gf;
 }

From 7d6a1e07dea76517ff0a11a613f5249e39998ac7 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 16:40:47 +0100
Subject: [PATCH 10/25] convert: add norm_embd

---
 gguf-py/gguf/constants.py      | 3 +++
 gguf-py/gguf/tensor_mapping.py | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 16721fab55a..91f6662b67e 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -642,6 +642,7 @@ class MODEL_TENSOR(IntEnum):
     V_MMPROJ_PEG         = auto()
     V_ENC_EMBD_CLS       = auto()
     V_ENC_EMBD_PATCH     = auto()
+    V_ENC_EMBD_NORM      = auto()
     V_ENC_EMBD_POS       = auto()
     V_ENC_INPUT_NORM     = auto()
     V_ENC_ATTN_QKV       = auto()
@@ -1015,6 +1016,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MMPROJ_PEG:              "mm.model.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS:            "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH:          "v.patch_embd",
+    MODEL_TENSOR.V_ENC_EMBD_NORM:           "v.norm_embd",
     MODEL_TENSOR.V_ENC_EMBD_POS:            "v.position_embd",
     MODEL_TENSOR.V_ENC_ATTN_QKV:            "v.blk.{bid}.attn_qkv",
     MODEL_TENSOR.V_ENC_ATTN_Q:              "v.blk.{bid}.attn_q",
@@ -1094,6 +1096,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MMPROJ_PEG,
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_NORM,
         MODEL_TENSOR.V_ENC_EMBD_POS,
         MODEL_TENSOR.V_ENC_INPUT_NORM,
         MODEL_TENSOR.V_ENC_ATTN_QKV,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 0b46d3ef801..e42e86a86bc 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1239,6 +1239,10 @@ class TensorNameMap:
             "model.vision.patch_embedding.proj", # cogvlm
         ),
 
+        MODEL_TENSOR.V_ENC_EMBD_NORM: (
+            "visual.post_conv_layernorm", # glm4v
+        ),
+
         MODEL_TENSOR.V_ENC_EMBD_POS: (
             "vision_tower.vision_model.embeddings.position_embedding",
             "model.vision_tower.embeddings.position_embeddings", # Intern-S1
@@ -1396,7 +1400,6 @@ class TensorNameMap:
             "vision_tower.ln_pre", # pixtral-hf
             "vision_encoder.ln_pre", # pixtral
             "vision_model.layernorm_pre", # llama4
-            "visual.post_conv_layernorm", # glm4v
         ),
 
         MODEL_TENSOR.V_POST_NORM: (

From 5047d8ede897e045c4c389266ba64ad534562faa Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 16:41:27 +0100
Subject: [PATCH 11/25] clip: add debugging fn

---
 tools/mtmd/clip-impl.h |  2 ++
 tools/mtmd/clip.cpp    | 35 +++++++++++++++++++++++++++--------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 260767126f1..d1cd04df7bc 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -499,6 +499,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
     }
 }
 
+void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
+
 //
 // API used internally with mtmd
 //
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 27509b80b7c..9d704307c2b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm(
         ? ggml_rms_norm(ctx0, cur, norm_eps)
         : ggml_norm(ctx0, cur, norm_eps);
 
-    if (mw || mb) {
-        cb(cur, "norm", il);
-    }
-
     if (mw) {
         cur = ggml_mul(ctx0, cur, mw);
-        if (mb) {
-            cb(cur, "norm_w", il);
-        }
+        cb(cur, "norm_w", il);
     }
 
     if (mb) {
         cur = ggml_add(ctx0, cur, mb);
+        cb(cur, "norm_b", il);
     }
 
     return cur;
@@ -1134,6 +1129,7 @@ struct clip_model_loader {
                     } break;
                 case PROJECTOR_TYPE_GLM4V:
                     {
+                        hparams.rope_theta = 10000.0f;
                         hparams.n_merge = 2; // default value for GLM4-V
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         hparams.set_limit_image_tokens(8, 4096);
@@ -1845,6 +1841,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
             if (ctx_params.warmup) {
                 loader.warmup(*ctx_vision);
             }
+
+            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.1f);
         }
 
         if (loader.has_audio) {
@@ -3337,7 +3335,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // copy the embeddings to the location passed by the user
-    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+    if (vec != nullptr) {
+        ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+    }
 
     return true;
 }
@@ -3461,3 +3461,22 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
     batch->entries.push_back(clip_image_f32_ptr(audio));
     batch->is_audio = true;
 }
+
+//
+// API for debugging
+//
+
+void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
+    clip_image_f32 img;
+    img.nx = w;
+    img.ny = h;
+    img.buf.resize(h * w * 3);
+    for (int i = 0; i < h * w * 3; i++) {
+        img.buf[i] = static_cast<float>(fill_value);
+    }
+    bool cur_debug_graph = ctx->debug_graph;
+    ctx->debug_graph = true;
+    clip_image_encode(ctx, 1, &img, nullptr);
+    ctx->debug_graph = cur_debug_graph;
+    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
+}

From ad85426e515e3c46ca3a2850b726e81f368f184a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 22:16:47 +0100
Subject: [PATCH 12/25] working correctly

---
 convert_hf_to_gguf.py       |  5 ---
 src/models/glm4.cpp         |  2 +-
 tools/mtmd/clip-impl.h      |  1 +
 tools/mtmd/clip-model.h     |  2 ++
 tools/mtmd/clip.cpp         |  8 +++--
 tools/mtmd/models/glm4v.cpp | 65 ++++++++++++-------------------------
 6 files changed, 31 insertions(+), 52 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index dd2ce29a0d2..1b69cac37d4 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4428,11 +4428,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             name = name.replace("model.visual.", "visual.")
         if name.startswith("visual.merger."):
             return [(self.map_tensor_name(name), data_torch)]
-        if "downsample.weight" in name:
-            # unfold the downsample to mistral-small format
-            c_out, c_in, kh, kw = data_torch.shape
-            data_unfold = data_torch.view(c_out, c_in * kh * kw)
-            return [(self.map_tensor_name(name), data_unfold)]
         return super().modify_tensors(data_torch, name, bid)
 
 
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index 41477eceff4..eb8b7a5eb4f 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -67,7 +67,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                                     cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
 
-            if (rope_type == LLAMA_ROPE_TYPE_MROPE) {
+            if (rope_type & LLAMA_ROPE_TYPE_MROPE) {
                 // M-RoPE without using NEOX ordering
                 auto rope_type_adj = rope_type | GGML_ROPE_TYPE_MRNORM;
                 Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index d1cd04df7bc..b18b698ba23 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -68,6 +68,7 @@
 #define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
 #define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
 #define TN_PATCH_BIAS      "v.patch_embd.bias"
+#define TN_NORM_EMBD       "v.norm_embd.%s"
 #define TN_ATTN_QKV        "%s.blk.%d.attn_qkv.%s"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index a6c3f32ef20..9c55c9a1bc9 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -151,6 +151,8 @@ struct clip_model {
     ggml_tensor * patch_embeddings_1 = nullptr;  // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
     ggml_tensor * patch_bias = nullptr;
     ggml_tensor * position_embeddings = nullptr;
+    ggml_tensor * norm_embd_w = nullptr;
+    ggml_tensor * norm_embd_b = nullptr;
 
     ggml_tensor * pre_ln_w = nullptr;
     ggml_tensor * pre_ln_b = nullptr;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9d704307c2b..62fba74f84a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1251,6 +1251,9 @@ struct clip_model_loader {
         model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD,   false);
         model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
 
+        model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
+        model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"),   false);
+
         model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
 
         // layers
@@ -1842,7 +1845,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
                 loader.warmup(*ctx_vision);
             }
 
-            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.1f);
+            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
         }
 
         if (loader.has_audio) {
@@ -2548,7 +2551,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
+        //case PROJECTOR_TYPE_GLM4V:
             {
                 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                 clip_image_u8 resized;
@@ -2618,6 +2621,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_GLM_EDGE:
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
+        case PROJECTOR_TYPE_GLM4V: // for debugging only
             {
                 clip_image_u8 resized_image;
                 int sz = params.image_size;
diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp
index a17b01bad66..60bc1cfe4ad 100644
--- a/tools/mtmd/models/glm4v.cpp
+++ b/tools/mtmd/models/glm4v.cpp
@@ -5,17 +5,18 @@ ggml_cgraph * clip_graph_glm4v::build() {
     GGML_ASSERT(model.position_embeddings != nullptr);
     GGML_ASSERT(model.class_embedding == nullptr);
 
-    const int batch_size       = 1;
-    const int n_pos            = n_patches;
-    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
+    const int batch_size = 1;
 
     norm_type norm_t = NORM_TYPE_RMS;
 
-    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
     ggml_tensor * inp_raw = build_inp_raw();
     ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
     GGML_ASSERT(img.nx % (patch_size * 2) == 0);
     GGML_ASSERT(img.ny % (patch_size * 2) == 0);
 
@@ -38,15 +39,11 @@ ggml_cgraph * clip_graph_glm4v::build() {
     }
 
     // add patch bias
-    if (model.patch_bias != nullptr) {
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-        cb(inp, "patch_bias", -1);
-    }
+    inp = ggml_add(ctx0, inp, model.patch_bias);
+    cb(inp, "patch_bias", -1);
 
-    // pre-layernorm (aka post-conv)
-    if (model.pre_ln_w) {
-        inp = build_norm(inp, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
-    }
+    // pos-conv norm
+    inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
 
     // calculate absolute position embedding and apply
     ggml_tensor * learned_pos_embd = resize_position_embeddings();
@@ -60,16 +57,9 @@ ggml_cgraph * clip_graph_glm4v::build() {
     learned_pos_embd = ggml_cont_3d(
         ctx0, learned_pos_embd,
         n_embd, n_patches_x * n_patches_y, batch_size);
-    inp = ggml_add(ctx0, inp, learned_pos_embd);
-    cb(inp, "inp_pos_emb", -1);
+    cb(learned_pos_embd, "learned_pos_embd", -1);
 
-    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
-    ggml_set_name(positions, "positions");
-    ggml_set_input(positions);
-
-    // build ViT with 2D position embeddings
     auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-        // first half is X axis and second half is Y axis
         return ggml_rope_multi(
                     ctx0, cur, positions, nullptr,
                     d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
@@ -78,43 +68,29 @@ ggml_cgraph * clip_graph_glm4v::build() {
 
     ggml_tensor * cur = build_vit(
                             inp, n_patches,
-                            NORM_TYPE_NORMAL,
+                            norm_t,
                             hparams.ffn_op,
                             learned_pos_embd,
                             add_pos);
 
     cb(cur, "vit_out", -1);
-
-    // post-layernorm
-    if (model.post_ln_w) {
-        cur = build_norm(cur, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
-    }
+    // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);
 
     // GLM4V projector
     // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
 
-    // patch merger (copied from pixtral)
+    // patch merger (downsample)
     {
         int n_merge = hparams.n_merge;
         GGML_ASSERT(n_merge > 0);
 
-        // reshape image tokens to 2D grid
-        cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
-        cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
-        cur = ggml_cont(ctx0, cur);
-
-        // torch.nn.functional.unfold is just an im2col under the hood
-        // we just need a dummy kernel to make it work
-        ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
-        cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
-
-        // project to n_embd
-        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
-        cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
+        int n_token_out = n_patches / n_merge / n_merge;
+        cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
+        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
+        cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]
 
-        // add bias
         cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
-        cb(cur, "after_patch_merger", -1);
     }
 
     // FC projector
@@ -122,7 +98,7 @@ ggml_cgraph * clip_graph_glm4v::build() {
         cur = ggml_mul_mat(ctx0, model.projection, cur);
         // default LayerNorm (post_projection_norm)
         cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
-        cur = ggml_gelu(ctx0, cur);
+        cur = ggml_gelu_erf(ctx0, cur);
         cb(cur, "after_fc_proj", -1);
     }
 
@@ -134,6 +110,7 @@ ggml_cgraph * clip_graph_glm4v::build() {
             model.mm_ffn_down_w, model.mm_ffn_down_b,
             hparams.ffn_op, -1);
         cb(cur, "after_ffn_proj", -1);
+        // cb(ggml_sum(ctx0, cur), "merged_sum", -1);
     }
 
     // build the graph

From f00127e35de44e79e57b07e48cfb42d737e23ef8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 22:17:15 +0100
Subject: [PATCH 13/25] fix style

---
 src/models/glm4.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index eb8b7a5eb4f..aa762dcd666 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -7,7 +7,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    
+
     int sections[4];
     std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 

From 1514734c5efdcc8054b97cc9a4f48654fd837ee9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 22:31:06 +0100
Subject: [PATCH 14/25] use bicubic

---
 tools/mtmd/clip-graph.h     | 4 +++-
 tools/mtmd/clip.cpp         | 7 +++----
 tools/mtmd/models/glm4v.cpp | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
index 6d303b4e48b..fb2af330558 100644
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -9,6 +9,8 @@
 #include <vector>
 #include <functional>
 
+#define DEFAULT_INTERPOLATION_MODE  GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS
+
 struct clip_graph {
     const clip_model & model;
     const clip_hparams & hparams;
@@ -49,7 +51,7 @@ struct clip_graph {
     void cb(ggml_tensor * cur0, const char * name, int il) const;
 
     // siglip2 naflex
-    ggml_tensor * resize_position_embeddings();
+    ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
 
     // build vision transformer (ViT) cgraph
     // this function should cover most of the models
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 62fba74f84a..da8817a58b7 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
 }
 
 // siglip2 naflex
-ggml_tensor * clip_graph::resize_position_embeddings() {
+ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
     ggml_tensor * pos_embd = model.position_embeddings;
     const int height       = img.ny / patch_size;
     const int width        = img.nx / patch_size;
-    const uint32_t mode    = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS;
+    const uint32_t mode    = interpolation_mode;
     const int n_per_side   = (int)std::sqrt(pos_embd->ne[1]);
 
     GGML_ASSERT(pos_embd);
@@ -2551,7 +2551,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
-        //case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                 clip_image_u8 resized;
@@ -2621,7 +2621,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_GLM_EDGE:
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
-        case PROJECTOR_TYPE_GLM4V: // for debugging only
             {
                 clip_image_u8 resized_image;
                 int sz = params.image_size;
diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp
index 60bc1cfe4ad..f39b6922eb5 100644
--- a/tools/mtmd/models/glm4v.cpp
+++ b/tools/mtmd/models/glm4v.cpp
@@ -46,7 +46,7 @@ ggml_cgraph * clip_graph_glm4v::build() {
     inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
 
     // calculate absolute position embedding and apply
-    ggml_tensor * learned_pos_embd = resize_position_embeddings();
+    ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
     learned_pos_embd = ggml_cont_4d(
         ctx0, learned_pos_embd,
         n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);

From cadaedbb17466695dd863f005d5acd4c01063be9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 23:57:09 +0100
Subject: [PATCH 15/25] fix mrope metal

---
 ggml/src/ggml-cpu/ops.cpp            | 4 ++--
 ggml/src/ggml-metal/ggml-metal.metal | 7 ++++---
 tests/test-backend-ops.cpp           | 2 ++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index eef3dba06a2..ca93c0c9d6c 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5768,9 +5768,9 @@ static void ggml_compute_forward_rope_flt(
                     rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
                 } else if (mode == GGML_ROPE_TYPE_NEOX) {
                     rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
-                } else if (mode & GGML_ROPE_TYPE_VISION) {
+                } else if (is_vision) {
                     rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
-                } else if (mode & GGML_ROPE_TYPE_MROPE || mode & GGML_ROPE_TYPE_IMROPE) {
+                } else if (mrope_used) {
                     if (mode & GGML_ROPE_TYPE_MRNORM) {
                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
                     } else {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index c97c2388200..45df1549333 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4232,7 +4232,7 @@ kernel void kernel_rope_multi(
 
     for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
         if (i0 < args.n_dims) {
-            const int ic = i0 / args.idx_scale;
+            const int ic = i0/2;
 
             // mrope theta calculations
             // note: the rest is the same as kernel_rope_neox
@@ -4271,8 +4271,9 @@ kernel void kernel_rope_multi(
 
             rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
 
-            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
-            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+            const int i_base = i0 / args.idx_scale;
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i_base*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i_base*args.nb0);
 
             const float x0 = src[0];
             const float x1 = src[args.offset];
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 416218b5b86..bf0eba37749 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7703,6 +7703,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                     test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B)
                                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1},  20, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw));
                                     test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1},  32, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw));
+                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1},  32, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM, 512, fs, ef, af, ff, v, fw));
                                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
                                     test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 7B)
                                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1},  20, GGML_ROPE_TYPE_IMROPE,  512, fs, ef, af, ff, v, fw));
@@ -8186,6 +8187,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
                     test_cases.emplace_back(new test_rope(type, { 80,  32, 512, 1},  20, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (stablelm)
                     test_cases.emplace_back(new test_rope(type, { 64,   8, 512, 1},  64, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (falcon 40B)
                     test_cases.emplace_back(new test_rope(type, {128,  12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
+                    test_cases.emplace_back(new test_rope(type, {128,  12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope,non-neox (glm4v)
                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
                     test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
                 }

From 4a0b89ab2d2198d0043d051b705fb69a70333147 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sun, 14 Dec 2025 23:59:42 +0100
Subject: [PATCH 16/25] improve cpu

---
 ggml/src/ggml-cpu/ops.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index ca93c0c9d6c..877607c6147 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5764,20 +5764,22 @@ static void ggml_compute_forward_rope_flt(
                 T * src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
                 T * dst_data  = (T *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1);
 
-                if (mode == GGML_ROPE_TYPE_NORMAL) {
-                    rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
-                } else if (mode == GGML_ROPE_TYPE_NEOX) {
-                    rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
-                } else if (is_vision) {
-                    rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
-                } else if (mrope_used) {
-                    if (mode & GGML_ROPE_TYPE_MRNORM) {
+                switch (mode) {
+                    case GGML_ROPE_TYPE_NORMAL:
+                    case GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM:
+                    case GGML_ROPE_TYPE_IMROPE | GGML_ROPE_TYPE_MRNORM:
                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
-                    } else {
+                        break;
+                    case GGML_ROPE_TYPE_NEOX:
+                    case GGML_ROPE_TYPE_MROPE:
+                    case GGML_ROPE_TYPE_IMROPE:
                         rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
-                    }
-                } else {
-                    GGML_ABORT("rope type not supported");
+                        break;
+                    case GGML_ROPE_TYPE_VISION:
+                        rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
+                        break;
+                    default:
+                        GGML_ABORT("rope type not supported");
                 }
 
                 if (!is_vision) {

From d00d11e4505755aa170f53ecf489442a7bda174b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 15 Dec 2025 01:17:19 +0100
Subject: [PATCH 17/25] convert to neox ordering on conversion

---
 convert_hf_to_gguf.py | 44 ++++++++++++++++++++++++++++++++++++++++++-
 src/models/glm4.cpp   |  6 ++----
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1b69cac37d4..a8362faccb3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7900,6 +7900,16 @@ def prepare_tensors(self):
 @ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4
+    use_mrope = False
+    partial_rotary_factor = 0.5
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {}
+        if "mrope_section" in rope_scaling:
+            self.use_mrope = True
+            self.partial_rotary_factor = rope_scaling.get("partial_rotary_factor", 0.5)
+            logger.info("Using M-RoPE")
 
     def set_vocab(self):
         from transformers import AutoTokenizer
@@ -7928,7 +7938,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
         # handle M-RoPE, the same as Qwen-VL
-        if "mrope_section" in rope_scaling:
+        if self.use_mrope:
             mrope_section = rope_scaling["mrope_section"]
             # Pad to 4 dimensions [time, height, width, extra]
             while len(mrope_section) < 4:
@@ -7936,11 +7946,43 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
             logger.info(f"MRoPE sections: {mrope_section[:4]}")
 
+    @staticmethod
+    def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
+        orig_shape = weights.shape  # remembered so a 1D input (e.g. a bias) is returned as 1D
+        if len(orig_shape) == 1:
+            weights = weights.unsqueeze(1)  # [out_dim] -> [out_dim, 1], so the 2D path below handles both cases
+        if len(weights.shape) != 2:
+            raise ValueError("Only 1D and 2D tensors are supported.")
+        n_effective_heads = weights.shape[0] // head_dim  # heads actually present along dim 0
+        if n_head_kv is not None and n_effective_heads != n_head:  # head count must equal n_head or n_head_kv
+            if n_effective_heads != n_head_kv:
+                raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}")
+        rotary_dim = int(head_dim * partial_rotary_factor)  # leading rows of each head treated as the rotary part
+        if rotary_dim % 2 != 0:
+            raise ValueError("rotary_dim must be even.")
+        reshaped = weights.reshape(n_effective_heads, head_dim, -1)  # [heads, head_dim, in_dim]
+        rot_part = reshaped[:, :rotary_dim, :]
+        non_rot_part = reshaped[:, rotary_dim:, :]  # rows past rotary_dim keep their original order
+        permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1)  # even rows first, then odd rows: pairs (0,1),(2,3),... become two halves
+        combined = torch.cat((permuted_rot, non_rot_part), dim=1)
+        result = combined.reshape(weights.shape)
+        return result if len(orig_shape) != 1 else result.squeeze(1)  # drop the temporary axis added for 1D inputs
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("model.visual."): # ignore visual part of Glm4v
             return []
         elif name.startswith("model.language_model."):
             name = name.replace("language_model.", "") # for Glm4v
+        if self.use_mrope:
+            n_head = self.hparams["num_attention_heads"]
+            n_kv_head = self.hparams["num_key_value_heads"]
+            n_embd = self.hparams["hidden_size"]
+            head_dim = n_embd // n_head
+            # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor)
         return super().modify_tensors(data_torch, name, bid)
 
 
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index aa762dcd666..49bbac1cd79 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -68,14 +68,12 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
             }
 
             if (rope_type & LLAMA_ROPE_TYPE_MROPE) {
-                // M-RoPE without using NEOX ordering
-                auto rope_type_adj = rope_type | GGML_ROPE_TYPE_MRNORM;
                 Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
-                            n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow);
 
                 Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
-                            n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow);
             } else {
                 // Normal RoPE

From f8aad31609fee0a854990b1e780c6f6be8660b6b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 15 Dec 2025 01:21:39 +0100
Subject: [PATCH 18/25] revert backend changes

---
 ggml/include/ggml.h                    |  7 +++----
 ggml/src/ggml-cpu/ops.cpp              |  2 --
 ggml/src/ggml-metal/ggml-metal-impl.h  |  2 --
 ggml/src/ggml-metal/ggml-metal-ops.cpp |  6 +-----
 ggml/src/ggml-metal/ggml-metal.metal   | 12 +++++-------
 ggml/src/ggml.c                        |  3 +--
 tests/test-backend-ops.cpp             |  2 --
 7 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 2b9e7ec5d5d..686da3dbd10 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -243,11 +243,10 @@
 
 // TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
 #define GGML_ROPE_TYPE_NORMAL 0
-#define GGML_ROPE_TYPE_NEOX   2  // binary: 000010
-#define GGML_ROPE_TYPE_MROPE  8  // binary: 001000
-#define GGML_ROPE_TYPE_VISION 24 // binary: 011000
+#define GGML_ROPE_TYPE_NEOX   2
+#define GGML_ROPE_TYPE_MROPE  8
+#define GGML_ROPE_TYPE_VISION 24
 #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
-#define GGML_ROPE_TYPE_MRNORM 4  // binary: 000100 (MROPE without NEOX - need to combine with one of the above)
 
 #define GGML_MROPE_SECTIONS   4
 
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 877607c6147..3032783971d 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5766,8 +5766,6 @@ static void ggml_compute_forward_rope_flt(
 
                 switch (mode) {
                     case GGML_ROPE_TYPE_NORMAL:
-                    case GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM:
-                    case GGML_ROPE_TYPE_IMROPE | GGML_ROPE_TYPE_MRNORM:
                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
                         break;
                     case GGML_ROPE_TYPE_NEOX:
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index a2d1235175e..8944b07e907 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -258,8 +258,6 @@ typedef struct {
     int32_t  sect_2;
     int32_t  sect_3;
     bool     src2;
-    uint64_t offset; // args.n_dims/2 for NEOX ordering, 1 otherwise
-    uint64_t idx_scale; // 2 for NEOX ordering, 1 otherwise
 } ggml_metal_kargs_rope;
 
 typedef struct {
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 74ca6836400..e99c1763f63 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -3141,7 +3141,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
 
     const int n_past     = ((const int32_t *) op->op_params)[0];
     const int n_dims     = ((const int32_t *) op->op_params)[1];
-    const int mode       = ((const int32_t *) op->op_params)[2];
+  //const int mode       = ((const int32_t *) op->op_params)[2];
     // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
     const int n_ctx_orig = ((const int32_t *) op->op_params)[4];
 
@@ -3165,8 +3165,6 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
     const int sect_2 = ((const int32_t *) op->op_params)[13];
     const int sect_3 = ((const int32_t *) op->op_params)[14];
 
-    bool is_normal_ordering = mode == GGML_ROPE_TYPE_NORMAL || mode & GGML_ROPE_TYPE_MRNORM;
-
     ggml_metal_kargs_rope args = {
         /*.ne00        =*/ ne00,
         /*.ne01        =*/ ne01,
@@ -3198,8 +3196,6 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
         /* sect_2      =*/ sect_2,
         /* sect_3      =*/ sect_3,
         /* src2        =*/ op->src[2] != nullptr,
-        /* offset      =*/ is_normal_ordering ? 1u : ((uint16_t)n_dims / 2),
-        /* idx_scale   =*/ is_normal_ordering ? 1u : 2u,
     };
 
     auto pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 45df1549333..51bcbae309f 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4152,7 +4152,6 @@ kernel void kernel_rope_norm(
     }
 }
 
-// TODO @ngxson : merge with kernel_rope_norm using offset and idx_scale
 template<typename T>
 kernel void kernel_rope_neox(
         constant ggml_metal_kargs_rope & args,
@@ -4271,15 +4270,14 @@ kernel void kernel_rope_multi(
 
             rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
 
-            const int i_base = i0 / args.idx_scale;
-            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i_base*args.nb00);
-            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i_base*args.nb0);
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
 
             const float x0 = src[0];
-            const float x1 = src[args.offset];
+            const float x1 = src[args.n_dims/2];
 
-            dst_data[0]           = x0*cos_theta - x1*sin_theta;
-            dst_data[args.offset] = x0*sin_theta + x1*cos_theta;
+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
         } else {
             device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
             device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 6f950fcaef4..f0913cd3596 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4073,8 +4073,7 @@ static struct ggml_tensor * ggml_rope_impl(
 
     bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
     if (mrope_used) {
-        // mrope expecting multiple positions ids per token
-        GGML_ASSERT(a->ne[2] * GGML_MROPE_SECTIONS == b->ne[0]);
+        GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
     } else {
         GGML_ASSERT(a->ne[2] == b->ne[0]);
     }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index bf0eba37749..416218b5b86 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7703,7 +7703,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                     test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B)
                                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1},  20, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw));
                                     test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1},  32, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw));
-                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1},  32, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM, 512, fs, ef, af, ff, v, fw));
                                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
                                     test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 7B)
                                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1},  20, GGML_ROPE_TYPE_IMROPE,  512, fs, ef, af, ff, v, fw));
@@ -8187,7 +8186,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
                     test_cases.emplace_back(new test_rope(type, { 80,  32, 512, 1},  20, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (stablelm)
                     test_cases.emplace_back(new test_rope(type, { 64,   8, 512, 1},  64, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (falcon 40B)
                     test_cases.emplace_back(new test_rope(type, {128,  12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
-                    test_cases.emplace_back(new test_rope(type, {128,  12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope,non-neox (glm4v)
                     test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
                     test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
                 }

From 8700158852f8783261186869f793c9a222c52d23 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 15 Dec 2025 01:28:35 +0100
Subject: [PATCH 19/25] force stop if using old weight

---
 src/models/glm4.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index 49bbac1cd79..403464e5e79 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -16,6 +16,12 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = rope_type & LLAMA_ROPE_TYPE_MROPE;
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -67,7 +73,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                                     cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
 
-            if (rope_type & LLAMA_ROPE_TYPE_MROPE) {
+            if (use_mrope) {
                 Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
                             n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow);

From 33fb59ab11d45edb15c0d20e5119d0348e95e1a2 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 15 Dec 2025 14:42:44 +0100
Subject: [PATCH 20/25] support moe variant

---
 convert_hf_to_gguf.py   | 11 +++++++++++
 src/llama-hparams.cpp   |  4 ++++
 src/llama-hparams.h     |  2 ++
 src/llama-model.cpp     |  4 ++--
 src/models/glm4-moe.cpp | 39 ++++++++++++++++++++++++++++-----------
 src/models/glm4.cpp     |  2 +-
 6 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a8362faccb3..267e2609631 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8051,6 +8051,17 @@ def set_gguf_parameters(self):
         if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
             self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
 
+        # handle M-RoPE, the same as Qwen-VL
+        # note: unlike GLM4 non-MoE, we don't need to permute the weights here since GLM4_MOE uses Neox ordering already
+        rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {}
+        if "mrope_section" in rope_scaling:
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 277d0bcfd3c..02fe0f1e0d2 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -241,3 +241,7 @@ float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale
 
     return attn_factor;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0; // M-RoPE is treated as enabled only when the first two rope sections are both positive
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c9960e91697..a8e53885983 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -275,6 +275,8 @@ struct llama_hparams {
     // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
     //      https://github.com/ggml-org/llama.cpp/pull/17945
     static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
+
+    bool use_mrope() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 60c57407606..4f5bfcc1de8 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7838,9 +7838,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
             return LLAMA_ROPE_TYPE_IMROPE;
 
         case LLM_ARCH_GLM4:
-            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
         case LLM_ARCH_GLM4_MOE:
-            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp
index 33ee7070463..003f70f7396 100644
--- a/src/models/glm4-moe.cpp
+++ b/src/models/glm4-moe.cpp
@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index 403464e5e79..204aa3932af 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -16,7 +16,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     inpL = build_inp_embd(model.tok_embd);
 
-    bool use_mrope = rope_type & LLAMA_ROPE_TYPE_MROPE;
+    bool use_mrope = hparams.use_mrope();
     if (ubatch.embd && !use_mrope) {
         // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
         GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");

From c8fd94f3ba382705c1d966bfd269c89bd1f79d8a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 15 Dec 2025 14:52:10 +0100
Subject: [PATCH 21/25] fix conversion

---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 267e2609631..a77a042bf6c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7986,7 +7986,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Glm4MoeForCausalLM")
+@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
 class Glm4MoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4_MOE
 

From 785ccf428054307d2f149e9bd3d9222d2deb4403 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 15 Dec 2025 15:27:09 +0100
Subject: [PATCH 22/25] fix convert (2)

---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a77a042bf6c..dbc87291bbf 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4407,7 +4407,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Glm4vForConditionalGeneration")
+@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
 class Glm4VVisionModel(Qwen3VLVisionModel):
     def set_gguf_parameters(self):
         MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters

From 7d53c0f09f9dd70d0ec1c9f1bebef1fbbc2a7a27 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <thichthat@gmail.com>
Date: Mon, 15 Dec 2025 22:00:18 +0100
Subject: [PATCH 23/25] Update tools/mtmd/clip-graph.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 tools/mtmd/clip-graph.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
index fb2af330558..bebd05c9ef4 100644
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -9,7 +9,7 @@
 #include <vector>
 #include <functional>
 
-#define DEFAULT_INTERPOLATION_MODE  GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS
+#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
 
 struct clip_graph {
     const clip_model & model;

From dd66aba5a1014a9db265075fc87b7507c7c338b7 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 15 Dec 2025 22:11:41 +0100
Subject: [PATCH 24/25] process mrope_section on TextModel base class

---
 convert_hf_to_gguf.py | 63 +++++++------------------------------------
 1 file changed, 10 insertions(+), 53 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1233faf978d..603689738b7 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -861,6 +861,14 @@ def set_gguf_parameters(self):
                 logger.warning(f"Unknown RoPE type: {rope_type}")
             logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
 
+        if "mrope_section" in self.rope_parameters:
+            mrope_section = self.rope_parameters["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+            logger.info(f"gguf: mrope sections: {mrope_section[:4]}")
+
         if (rope_theta := rope_params.get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base(rope_theta)
             logger.info(f"gguf: rope theta = {rope_theta}")
@@ -3738,9 +3746,6 @@ class Qwen2VLModel(TextModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
-        mrope_section += [0] * max(0, 4 - len(mrope_section))
-        self.gguf_writer.add_rope_dimension_sections(mrope_section)
 
     def set_vocab(self):
         try:
@@ -4408,20 +4413,6 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
         # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        text_config = self.hparams.get("text_config", {})
-        # rope_scaling is deprecated in V5, use rope_parameters instead
-        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-
-        if rope_scaling.get("mrope_section"):
-            # mrope_section contains [time, height, width] dimensions
-            mrope_section = rope_scaling["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
-
         vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -4440,22 +4431,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-
-        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        text_config = self.hparams.get("text_config", {})
-        # rope_scaling is deprecated in V5, use rope_parameters instead
-        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-
-        if rope_scaling.get("mrope_section"):
-            # mrope_section contains [time, height, width] dimensions
-            mrope_section = rope_scaling["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
-
         vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -7826,7 +7801,7 @@ def __init__(self, *args, **kwargs):
         self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5)
         if "mrope_section" in self.rope_parameters:
             self.use_mrope = True
-            logger.info("Using M-RoPE")
+            logger.info("Q/K weight will need to be permuted for M-RoPE")
 
     def set_vocab(self):
         from transformers import AutoTokenizer
@@ -7849,14 +7824,6 @@ def set_gguf_parameters(self):
         if (rope_dim := self.hparams.get("head_dim")) is None:
             rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor))
-        # handle M-RoPE, the same as Qwen-VL
-        if self.use_mrope:
-            mrope_section = self.rope_parameters["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
 
     @staticmethod
     def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
@@ -7963,19 +7930,9 @@ def set_gguf_parameters(self):
         if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
             self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
 
-        # handle M-RoPE, the same as Qwen-VL
-        # note: unlike GLM4 non-MoE, we don't need to permute the weights here since GLM4_MOE uses Neox ordering already
-        rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {}
-        if "mrope_section" in rope_scaling:
-            mrope_section = rope_scaling["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
-
     _experts: list[dict[str, Tensor]] | None = None
 
+    # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already
     def modify_tensors(
         self, data_torch: Tensor, name: str, bid: int | None
     ) -> Iterable[tuple[str, Tensor]]:

From f969d4f63b86331037fd821d20a0bd4c3d8bafa2 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 16 Dec 2025 00:29:25 +0100
Subject: [PATCH 25/25] resolve merge conflict: remove llama_hparams::yarn_attn_factor_adjust

---
 src/llama-hparams.cpp | 10 ----------
 src/llama-hparams.h   |  7 -------
 2 files changed, 17 deletions(-)

diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 4e612953da5..0beab1bce2a 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -231,16 +231,6 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
     return false;
 }
 
-float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
-    GGML_ASSERT(ext_factor >= 0.0f);
-
-    if (ext_factor != 0.0f) {
-        attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
-    }
-
-    return attn_factor;
-}
-
 bool llama_hparams::use_mrope() const {
     return rope_sections[0] > 0 && rope_sections[1] > 0;
 }
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index f1adbb0ac6d..f6e95b5d2a6 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -271,13 +271,6 @@ struct llama_hparams {
     // TODO: pack the SWA params in a struct?
     static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
 
-    // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
-    // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
-    //
-    // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
-    //      https://github.com/ggml-org/llama.cpp/pull/17945
-    static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
-
     bool use_mrope() const;
 };