From b24d366e0a2874dec5e2004f452638ab0f4e3c77 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 17:19:02 +0100 Subject: [PATCH 01/25] convert ok --- convert_hf_to_gguf.py | 14 ++++++++++++++ gguf-py/gguf/constants.py | 16 ++++++++++++++++ gguf-py/gguf/tensor_mapping.py | 24 ++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 151608d56b8..c3d998d5e2c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4407,6 +4407,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Glm4vForConditionalGeneration") +class Glm4VisionModel(Qwen3VLVisionModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.") + if name.startswith("visual.merger."): + return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Qwen3VLForConditionalGeneration") class Qwen3VLTextModel(Qwen3Model): model_arch = gguf.MODEL_ARCH.QWEN3VL diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2b8489c591b..469d0dd4f2a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -639,6 +639,10 @@ class MODEL_TENSOR(IntEnum): V_MMPROJ = auto() V_MMPROJ_FC = auto() V_MMPROJ_MLP = auto() + V_MMPROJ_FFN_UP = auto() + V_MMPROJ_FFN_GATE = auto() + V_MMPROJ_FFN_DOWN = auto() + V_MMPROJ_POST_NORM = auto() V_MMPROJ_PEG = auto() V_ENC_EMBD_CLS = auto() V_ENC_EMBD_PATCH = auto() @@ -685,6 +689,7 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm + V_MM_CONV = 
auto() # glm4v # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_CONV1D = auto() @@ -1011,6 +1016,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_FFN_UP: "mm.model.ffn_up", + MODEL_TENSOR.V_MMPROJ_FFN_GATE: "mm.model.ffn_gate", + MODEL_TENSOR.V_MMPROJ_FFN_DOWN: "mm.model.ffn_down", + MODEL_TENSOR.V_MMPROJ_POST_NORM: "mm.model.post_norm", MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", @@ -1057,6 +1066,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_GATE: "mm.gate", MODEL_TENSOR.V_TOK_BOI: "v.boi", MODEL_TENSOR.V_TOK_EOI: "v.eoi", + MODEL_TENSOR.V_MM_CONV: "mm.conv", # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", @@ -1089,6 +1099,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MMPROJ, MODEL_TENSOR.V_MMPROJ_FC, MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_FFN_UP, + MODEL_TENSOR.V_MMPROJ_FFN_GATE, + MODEL_TENSOR.V_MMPROJ_FFN_DOWN, + MODEL_TENSOR.V_MMPROJ_POST_NORM, MODEL_TENSOR.V_MMPROJ_PEG, MODEL_TENSOR.V_ENC_EMBD_CLS, MODEL_TENSOR.V_ENC_EMBD_PATCH, @@ -1135,6 +1149,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_GATE, MODEL_TENSOR.V_TOK_BOI, MODEL_TENSOR.V_TOK_EOI, + MODEL_TENSOR.V_MM_CONV, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_CONV1D, @@ -3327,6 +3342,7 @@ class VisionProjectorType: LIGHTONOCR = "lightonocr" COGVLM = "cogvlm" JANUS_PRO = "janus_pro" + GLM4V = "glm4v" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d9c87da1946..a7dab608d00 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1205,6 +1205,7 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM 
"model.vision.linear_proj.linear_proj", # cogvlm + "visual.merger.proj", # glm4v ), MODEL_TENSOR.V_MMPROJ_MLP: ( @@ -1214,6 +1215,22 @@ class TensorNameMap: "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro ), + MODEL_TENSOR.V_MMPROJ_FFN_UP: ( + "visual.merger.up_proj", # glm4v + ), + + MODEL_TENSOR.V_MMPROJ_FFN_GATE: ( + "visual.merger.gate_proj", # glm4v + ), + + MODEL_TENSOR.V_MMPROJ_FFN_DOWN: ( + "visual.merger.down_proj", # glm4v + ), + + MODEL_TENSOR.V_MMPROJ_POST_NORM: ( + "visual.merger.post_projection_norm", # glm4v + ), + MODEL_TENSOR.V_MMPROJ_PEG: ( "model.mm_projector.peg.peg.{bid}", ), @@ -1247,6 +1264,7 @@ class TensorNameMap: "vision_tower.patch_embed.pos_emb", # kimi-vl "visual.pos_embed", # qwen3vl "model.vision.patch_embedding.position_embedding", # cogvlm + "visual.embeddings.position_embedding", # glm4v ), MODEL_TENSOR.V_ENC_ATTN_QKV: ( @@ -1394,6 +1412,7 @@ class TensorNameMap: "vision_tower.ln_pre", # pixtral-hf "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 + "visual.post_conv_layernorm", # glm4v ), MODEL_TENSOR.V_POST_NORM: ( @@ -1402,6 +1421,7 @@ class TensorNameMap: "vision_model.layernorm_post", # llama4 "visual.merger.ln_q", # qwen2vl "vision_tower.encoder.final_layernorm", # kimi-vl + "visual.post_layernorm", # glm4v ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1420,6 +1440,10 @@ class TensorNameMap: "multi_modal_projector.mm_soft_emb_norm", ), + MODEL_TENSOR.V_MM_CONV: ( + "visual.downsample", # glm4v + ), + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( "resampler.pos_embed_k", ), From 7b13c8eae8c5fd0f8a6eb39f9277635c93068c2e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 17:21:44 +0100 Subject: [PATCH 02/25] no deepstack --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c3d998d5e2c..967cb51e501 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4410,6 +4410,7 @@ def modify_tensors(self, data_torch: Tensor, 
name: str, bid: int | None) -> Iter @ModelBase.register("Glm4vForConditionalGeneration") class Glm4VisionModel(Qwen3VLVisionModel): def set_gguf_parameters(self): + self.is_deepstack_layers = False super().set_gguf_parameters() self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) From f3f8fb4bfb94b6e2ba3e245f9ce6633ea44dc982 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 17:30:23 +0100 Subject: [PATCH 03/25] less new tensors --- gguf-py/gguf/constants.py | 15 +++------------ gguf-py/gguf/tensor_mapping.py | 23 +++++++---------------- tools/mtmd/clip-impl.h | 5 +++++ tools/mtmd/clip-model.h | 6 ++++++ 4 files changed, 21 insertions(+), 28 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 469d0dd4f2a..0801d73794d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -639,10 +639,6 @@ class MODEL_TENSOR(IntEnum): V_MMPROJ = auto() V_MMPROJ_FC = auto() V_MMPROJ_MLP = auto() - V_MMPROJ_FFN_UP = auto() - V_MMPROJ_FFN_GATE = auto() - V_MMPROJ_FFN_DOWN = auto() - V_MMPROJ_POST_NORM = auto() V_MMPROJ_PEG = auto() V_ENC_EMBD_CLS = auto() V_ENC_EMBD_PATCH = auto() @@ -664,6 +660,7 @@ class MODEL_TENSOR(IntEnum): V_LAYER_SCALE_2 = auto() V_PRE_NORM = auto() V_POST_NORM = auto() + V_MM_POST_NORM = auto() V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 V_MM_SOFT_EMB_NORM = auto() # gemma3 @@ -1016,10 +1013,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", - MODEL_TENSOR.V_MMPROJ_FFN_UP: "mm.model.ffn_up", - MODEL_TENSOR.V_MMPROJ_FFN_GATE: "mm.model.ffn_gate", - MODEL_TENSOR.V_MMPROJ_FFN_DOWN: "mm.model.ffn_down", - MODEL_TENSOR.V_MMPROJ_POST_NORM: "mm.model.post_norm", MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", @@ -1041,6 +1034,7 @@ class MODEL_TENSOR(IntEnum): 
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2", MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", @@ -1099,10 +1093,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MMPROJ, MODEL_TENSOR.V_MMPROJ_FC, MODEL_TENSOR.V_MMPROJ_MLP, - MODEL_TENSOR.V_MMPROJ_FFN_UP, - MODEL_TENSOR.V_MMPROJ_FFN_GATE, - MODEL_TENSOR.V_MMPROJ_FFN_DOWN, - MODEL_TENSOR.V_MMPROJ_POST_NORM, MODEL_TENSOR.V_MMPROJ_PEG, MODEL_TENSOR.V_ENC_EMBD_CLS, MODEL_TENSOR.V_ENC_EMBD_PATCH, @@ -1124,6 +1114,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_LAYER_SCALE_2, MODEL_TENSOR.V_PRE_NORM, MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.V_MM_POST_NORM, MODEL_TENSOR.V_MM_INP_PROJ, MODEL_TENSOR.V_MM_INP_NORM, MODEL_TENSOR.V_MM_SOFT_EMB_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a7dab608d00..676d7466838 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1215,22 +1215,6 @@ class TensorNameMap: "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro ), - MODEL_TENSOR.V_MMPROJ_FFN_UP: ( - "visual.merger.up_proj", # glm4v - ), - - MODEL_TENSOR.V_MMPROJ_FFN_GATE: ( - "visual.merger.gate_proj", # glm4v - ), - - MODEL_TENSOR.V_MMPROJ_FFN_DOWN: ( - "visual.merger.down_proj", # glm4v - ), - - MODEL_TENSOR.V_MMPROJ_POST_NORM: ( - "visual.merger.post_projection_norm", # glm4v - ), - MODEL_TENSOR.V_MMPROJ_PEG: ( "model.mm_projector.peg.peg.{bid}", ), @@ -1424,6 +1408,10 @@ class TensorNameMap: "visual.post_layernorm", # glm4v ), + MODEL_TENSOR.V_MM_POST_NORM: ( + "visual.merger.post_projection_norm", # glm4v + ), + MODEL_TENSOR.V_MM_INP_PROJ: ( "multi_modal_projector.mm_input_projection", ), @@ -1515,14 +1503,17 @@ class TensorNameMap: MODEL_TENSOR.V_MM_UP: ( "model.vision.linear_proj.dense_h_to_4h", # cogvlm + 
"visual.merger.up_proj", # glm4v ), MODEL_TENSOR.V_MM_DOWN: ( "model.vision.linear_proj.dense_4h_to_h", # cogvlm + "visual.merger.down_proj", # glm4v ), MODEL_TENSOR.V_MM_GATE: ( "model.vision.linear_proj.gate_proj", # cogvlm + "visual.merger.gate_proj", # glm4v ), MODEL_TENSOR.V_TOK_BOI: ( diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 1726823ec69..6499943004a 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -86,6 +86,10 @@ #define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_POST "%s.post_ln.%s" #define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MM_UP "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_DOWN "mm.down.%s" +#define TN_MM_POST_NORM "mm.post_norm.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" @@ -102,6 +106,7 @@ #define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack #define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack #define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack +#define TN_MM_CONV "mm.conv.%s" // glm4v // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 51bcce1ebb0..61708901a8b 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -165,6 +165,12 @@ struct clip_model { ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) ggml_tensor * mm_fc_w; ggml_tensor * mm_fc_b; + ggml_tensor * mm_ffn_up_w; + ggml_tensor * mm_ffn_up_b; + ggml_tensor * mm_ffn_gate_w; + ggml_tensor * mm_ffn_gate_b; + ggml_tensor * mm_ffn_down_w; + ggml_tensor * mm_ffn_down_b; // LLaVA projection ggml_tensor * mm_input_norm_w = nullptr; From 4e81ab4b0c24ea71ae2f1b2d2e16960413b45965 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 18:06:24 +0100 Subject: [PATCH 04/25] cgraph ok --- tools/mtmd/clip-impl.h | 2 + tools/mtmd/clip-model.h | 18 
++++++--- tools/mtmd/clip.cpp | 27 ++++++++++++++ tools/mtmd/models/qwen3vl.cpp | 70 +++++++++++++++++++++++++++++------ 4 files changed, 100 insertions(+), 17 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 6499943004a..b8d018e05b8 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -169,6 +169,7 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_UNKNOWN, }; @@ -195,6 +196,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_GLM4V, "glm4v"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 61708901a8b..e69af53cabf 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -165,12 +165,14 @@ struct clip_model { ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) ggml_tensor * mm_fc_w; ggml_tensor * mm_fc_b; - ggml_tensor * mm_ffn_up_w; - ggml_tensor * mm_ffn_up_b; - ggml_tensor * mm_ffn_gate_w; - ggml_tensor * mm_ffn_gate_b; - ggml_tensor * mm_ffn_down_w; - ggml_tensor * mm_ffn_down_b; + ggml_tensor * mm_ffn_up_w = nullptr; + ggml_tensor * mm_ffn_up_b = nullptr; + ggml_tensor * mm_ffn_gate_w = nullptr; + ggml_tensor * mm_ffn_gate_b = nullptr; + ggml_tensor * mm_ffn_down_w = nullptr; + ggml_tensor * mm_ffn_down_b = nullptr; + ggml_tensor * mm_post_norm_w = nullptr; + ggml_tensor * mm_post_norm_b = nullptr; // LLaVA projection ggml_tensor * mm_input_norm_w = nullptr; @@ -273,6 +275,10 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; + // glm4v + ggml_tensor * mm_conv_w = nullptr; + ggml_tensor * mm_conv_b = nullptr; + bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == 
PROJECTOR_TYPE_VOXTRAL; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index bb922e30b43..9156705af15 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -778,6 +778,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: { builder = std::make_unique(ctx, img); } break; @@ -1128,6 +1129,13 @@ struct clip_model_loader { LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); } } break; + case PROJECTOR_TYPE_GLM4V: + { + hparams.n_merge = 2; // default value for GLM4-V + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup + } break; case PROJECTOR_TYPE_LLAMA4: { hparams.rope_theta = 10000.0f; @@ -1432,6 +1440,20 @@ struct clip_model_loader { model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; + case PROJECTOR_TYPE_GLM4V: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); + model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false); + model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false); + model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight")); + model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false); + model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); + model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false); + model.mm_conv_w = get_tensor(string_format(TN_MM_CONV, "weight")); + model.mm_conv_b = get_tensor(string_format(TN_MM_CONV, "bias"), false); + } break; case PROJECTOR_TYPE_GEMMA3: { model.mm_input_proj_w = 
get_tensor(TN_MM_INP_PROJ); @@ -2525,6 +2547,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: { GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); clip_image_u8 resized; @@ -2833,6 +2856,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: { // dynamic size (2 conv, so double patch size) int x_patch = img->nx / (params.patch_size * 2); @@ -3070,6 +3094,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: { const int merge_ratio = hparams.n_merge; const int pw = image_size_width / patch_size; @@ -3341,6 +3366,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; + case PROJECTOR_TYPE_GLM4V: + return ctx->model.mm_ffn_down_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 35a42cb84d6..7c61fc0b4f6 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -86,7 +86,9 @@ ggml_cgraph * clip_graph_qwen3vl::build() { // self-attention { cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); + if (layer.qkv_b) { + cur = ggml_add(ctx0, cur, layer.qkv_b); + } ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, /* nb1 */ ggml_row_size(cur->type, d_head), @@ -172,20 +174,66 @@ ggml_cgraph * clip_graph_qwen3vl::build() { inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); } - // multimodal projection - ggml_tensor * embeddings = inpL; - 
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + ggml_tensor * cur = inpL; + + if (proj_type == PROJECTOR_TYPE_QWEN3VL) { + // Qwen3VL projector + cur = ggml_reshape_3d(ctx0, cur, n_embd * 4, n_pos / 4, batch_size); + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + ffn_op_type::FFN_GELU, -1); + + if (deepstack_features != nullptr) { + // concat along the feature dimension + cur = ggml_concat(ctx0, cur, deepstack_features, 0); + } + + } else if (proj_type == PROJECTOR_TYPE_GLM4V) { + // GLM4V projector - embeddings = build_ffn(embeddings, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - ffn_op_type::FFN_GELU, -1); + // patch merger + { + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // merge patches + cur = ggml_conv_2d(ctx0, model.mm_conv_w, cur, 2, 2, 0, 0, 1, 1); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); // [n_tokens, n_embd] + if (model.mm_conv_b) { + cur = ggml_add(ctx0, cur, ggml_transpose(ctx0, model.mm_conv_b)); + } + cb(cur, "after_mm_conv", -1); + } - embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension + // FC projector + { + cur = ggml_transpose(ctx0, cur); // [n_embd, n_tokens] + cur = ggml_mul_mat(ctx0, model.projection, cur); + cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = ggml_gelu(ctx0, cur); + cb(cur, "after_fc_proj", -1); + } + + // FFN projector + { + cur = build_ffn(cur, + model.mm_ffn_up_w, model.mm_ffn_up_b, + model.mm_ffn_gate_w, model.mm_ffn_gate_b, + model.mm_ffn_down_w, model.mm_ffn_down_b, + ffn_op_type::FFN_GELU, -1); + cb(cur, "after_ffn_proj", -1); + } + + } else { + GGML_ABORT("Unsupported projector type in Qwen3-VL 
graph"); + } // build the graph - ggml_build_forward_expand(gf, embeddings); + ggml_build_forward_expand(gf, cur); return gf; } From 306f342eaaf9fc1a68bf8f76c6e7d6efe5d62c51 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 19:04:48 +0100 Subject: [PATCH 05/25] add mrope for text model --- convert_hf_to_gguf.py | 25 +++++++++++++++++++++---- src/llama-model.cpp | 15 ++++++++++----- src/models/glm4.cpp | 26 ++++++++++++++++++++++---- tools/mtmd/clip.cpp | 27 +++++++++++++++++++++------ tools/mtmd/clip.h | 2 +- tools/mtmd/models/qwen3vl.cpp | 10 ++++++---- tools/mtmd/mtmd.cpp | 6 +++++- 7 files changed, 86 insertions(+), 25 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 967cb51e501..1b69cac37d4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4408,12 +4408,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Glm4vForConditionalGeneration") -class Glm4VisionModel(Qwen3VLVisionModel): +class Glm4VVisionModel(Qwen3VLVisionModel): def set_gguf_parameters(self): - self.is_deepstack_layers = False - super().set_gguf_parameters() + MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters + assert self.hparams_vision is not None self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() + if hidden_act == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + + rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): name = name.replace("model.visual.", "visual.") @@ -7913,11 +7922,19 @@ def set_gguf_parameters(self): if (rope_dim := self.hparams.get("head_dim")) is 
None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - rope_scaling = self.hparams.get("rope_scaling") or {} + rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {} if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + # handle M-RoPE, the same as Qwen-VL + if "mrope_section" in rope_scaling: + mrope_section = rope_scaling["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"MRoPE sections: {mrope_section[:4]}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e4808b1e1eb..60c57407606 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1685,7 +1685,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_GLM4: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); switch (hparams.n_layer) { case 40: type = LLM_TYPE_9B; break; case 61: type = LLM_TYPE_32B; break; @@ -1694,8 +1695,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_GLM4_MOE: { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); - 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); // MoE parameters ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); @@ -7758,7 +7760,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: - case LLM_ARCH_GLM4: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_HYBRID: @@ -7820,7 +7821,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_LFM2: case LLM_ARCH_LFM2MOE: case LLM_ARCH_SMALLTHINKER: - case LLM_ARCH_GLM4_MOE: case LLM_ARCH_SEED_OSS: case LLM_ARCH_GROVEMOE: case LLM_ARCH_APERTUS: @@ -7837,6 +7837,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3VLMOE: return LLAMA_ROPE_TYPE_IMROPE; + case LLM_ARCH_GLM4: + return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM; + case LLM_ARCH_GLM4_MOE: + return model->hparams.rope_sections[0] ? 
LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ABORT("unknown architecture"); diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index f789b282488..2c9de2b7d35 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -7,6 +7,9 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); ggml_tensor * cur; ggml_tensor * inpL; @@ -63,11 +66,26 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); } - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); + if (rope_type == LLAMA_ROPE_TYPE_MROPE) { + // M-RoPE + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } else { + // Normal RoPE + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, + rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, + rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); 
cb(Kcur, "Kcur", il); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9156705af15..e95495ef24e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2790,16 +2790,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) { int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->model.hparams; const int n_total = clip_n_output_tokens(ctx, img); - if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - return img->nx / (params.patch_size * 2); + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + return (img->nx / params.patch_size) / 2; + default: + break; } return n_total; } int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->model.hparams; - if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - return img->ny / (params.patch_size * 2); + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + return (img->ny / params.patch_size) / 2; + default: + break; } return 1; } @@ -3384,10 +3398,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; } -bool clip_is_qwen2vl(const struct clip_ctx * ctx) { +bool clip_is_mrope(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL - || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL; + || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL + || ctx->proj_type() == PROJECTOR_TYPE_GLM4V; } bool clip_is_llava(const struct clip_ctx * ctx) { diff 
--git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 5aae27a7004..68a0d6e857e 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -104,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct int clip_is_minicpmv(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx); -bool clip_is_qwen2vl(const struct clip_ctx * ctx); +bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx); diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 7c61fc0b4f6..9305386c753 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -112,10 +112,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() { // apply M-RoPE Qcur = ggml_rope_multi( ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1); Kcur = ggml_rope_multi( ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1); cb(Qcur, "Qcur_rope", il); cb(Kcur, "Kcur_rope", il); @@ -156,7 +156,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() { layer.deepstack_fc1_w, layer.deepstack_fc1_b, nullptr, nullptr, layer.deepstack_fc2_w, layer.deepstack_fc2_b, - ffn_op_type::FFN_GELU, il); + hparams.ffn_op, il); if(!deepstack_features) { deepstack_features = feat; @@ -192,6 +192,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() { } else if (proj_type == PROJECTOR_TYPE_GLM4V) { // GLM4V projector + // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 // patch merger { @@ -213,6 +214,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() { { cur = ggml_transpose(ctx0, 
cur); // [n_embd, n_tokens] cur = ggml_mul_mat(ctx0, model.projection, cur); + // default LayerNorm (post_projection_norm) cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); cur = ggml_gelu(ctx0, cur); cb(cur, "after_fc_proj", -1); @@ -224,7 +226,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() { model.mm_ffn_up_w, model.mm_ffn_up_b, model.mm_ffn_gate_w, model.mm_ffn_gate_b, model.mm_ffn_down_w, model.mm_ffn_down_b, - ffn_op_type::FFN_GELU, -1); + hparams.ffn_op, -1); cb(cur, "after_ffn_proj", -1); } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index d06fa42e616..69c4d00d575 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -218,7 +218,7 @@ struct mtmd_context { void init_vision() { GGML_ASSERT(ctx_v != nullptr); - use_mrope = clip_is_qwen2vl(ctx_v); + use_mrope = clip_is_mrope(ctx_v); projector_type proj = clip_get_projector_type(ctx_v); int minicpmv_version = clip_is_minicpmv(ctx_v); @@ -310,6 +310,10 @@ struct mtmd_context { img_beg = "<|image_start|>"; img_end = "<|image_end|>"; + } else if (proj == PROJECTOR_TYPE_GLM4V) { + img_beg = "<|begin_of_image|>"; + img_end = "<|end_of_image|>"; + } } From 6a6e30172a7126a3cdcf05c1d47383c1fc731cdb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 19:47:13 +0100 Subject: [PATCH 06/25] faster patch merger --- convert_hf_to_gguf.py | 5 +++++ gguf-py/gguf/constants.py | 3 --- gguf-py/gguf/tensor_mapping.py | 5 +---- tools/mtmd/clip-impl.h | 3 +-- tools/mtmd/clip-model.h | 7 ++----- tools/mtmd/clip.cpp | 12 ++++++------ tools/mtmd/models/qwen3vl.cpp | 25 ++++++++++++++++--------- 7 files changed, 31 insertions(+), 29 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1b69cac37d4..dd2ce29a0d2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4428,6 +4428,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("model.visual.", "visual.") if 
name.startswith("visual.merger."): return [(self.map_tensor_name(name), data_torch)] + if "downsample.weight" in name: + # unfold the downsample to mistral-small format + c_out, c_in, kh, kw = data_torch.shape + data_unfold = data_torch.view(c_out, c_in * kh * kw) + return [(self.map_tensor_name(name), data_unfold)] return super().modify_tensors(data_torch, name, bid) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0801d73794d..16721fab55a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -686,7 +686,6 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm - V_MM_CONV = auto() # glm4v # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_CONV1D = auto() @@ -1060,7 +1059,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_GATE: "mm.gate", MODEL_TENSOR.V_TOK_BOI: "v.boi", MODEL_TENSOR.V_TOK_EOI: "v.eoi", - MODEL_TENSOR.V_MM_CONV: "mm.conv", # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", @@ -1140,7 +1138,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_GATE, MODEL_TENSOR.V_TOK_BOI, MODEL_TENSOR.V_TOK_EOI, - MODEL_TENSOR.V_MM_CONV, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_CONV1D, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 676d7466838..0b46d3ef801 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1428,10 +1428,6 @@ class TensorNameMap: "multi_modal_projector.mm_soft_emb_norm", ), - MODEL_TENSOR.V_MM_CONV: ( - "visual.downsample", # glm4v - ), - MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( "resampler.pos_embed_k", ), @@ -1483,6 +1479,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_PATCH_MERGER: ( "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf "patch_merger.merging_layer", # mistral + "visual.downsample", # glm4v ), MODEL_TENSOR.V_DS_NORM: ( diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h 
index b8d018e05b8..260767126f1 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -99,14 +99,13 @@ #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 -#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1 +#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral #define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) #define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model) #define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack #define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack #define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack -#define TN_MM_CONV "mm.conv.%s" // glm4v // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index e69af53cabf..a6c3f32ef20 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -254,9 +254,10 @@ struct clip_model { ggml_tensor * mm_input_proj_w = nullptr; ggml_tensor * mm_soft_emb_norm_w = nullptr; - // pixtral + // pixtral, glm4v ggml_tensor * token_embd_img_break = nullptr; ggml_tensor * mm_patch_merger_w = nullptr; + ggml_tensor * mm_patch_merger_b = nullptr; // ultravox / whisper encoder ggml_tensor * conv1d_1_w = nullptr; @@ -275,10 +276,6 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; - // glm4v - ggml_tensor * mm_conv_w = nullptr; - ggml_tensor * mm_conv_b = nullptr; - bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == PROJECTOR_TYPE_VOXTRAL; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index e95495ef24e..3f846670c34 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1451,8 +1451,8 
@@ struct clip_model_loader { model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false); model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false); - model.mm_conv_w = get_tensor(string_format(TN_MM_CONV, "weight")); - model.mm_conv_b = get_tensor(string_format(TN_MM_CONV, "bias"), false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight")); + model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias")); } break; case PROJECTOR_TYPE_GEMMA3: { @@ -1482,8 +1482,8 @@ struct clip_model_loader { // [IMG_BREAK] token embedding model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); // for mistral small 3.1 - model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); - model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); } break; case PROJECTOR_TYPE_LIGHTONOCR: { @@ -1491,8 +1491,8 @@ struct clip_model_loader { model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); - model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); - model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); } break; case PROJECTOR_TYPE_ULTRAVOX: { diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 9305386c753..660c06ef392 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -194,25 +194,32 @@ ggml_cgraph * clip_graph_qwen3vl::build() { // GLM4V projector // ref: 
https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 - // patch merger + // patch merger (copied from pixtral) { + int n_merge = hparams.n_merge; + GGML_ASSERT(n_merge > 0); + // reshape image tokens to 2D grid cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] cur = ggml_cont(ctx0, cur); - // merge patches - cur = ggml_conv_2d(ctx0, model.mm_conv_w, cur, 2, 2, 0, 0, 1, 1); - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); // [n_tokens, n_embd] - if (model.mm_conv_b) { - cur = ggml_add(ctx0, cur, ggml_transpose(ctx0, model.mm_conv_b)); - } - cb(cur, "after_mm_conv", -1); + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + + // add bias + cur = ggml_add(ctx0, cur, model.mm_patch_merger_b); + cb(cur, "after_patch_merger", -1); } // FC projector { - cur = ggml_transpose(ctx0, cur); // [n_embd, n_tokens] cur = ggml_mul_mat(ctx0, model.projection, cur); // default LayerNorm (post_projection_norm) cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); From c78c2e3f1a1d32fe37cc919f78f79eb43f8d3f8a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 21:08:50 +0100 Subject: [PATCH 07/25] add GGML_ROPE_TYPE_MRNORM --- ggml/include/ggml.h | 7 ++++--- ggml/src/ggml-cpu/ops.cpp | 25 +++++++++++++------------ src/models/glm4.cpp | 7 ++++--- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/ggml/include/ggml.h 
b/ggml/include/ggml.h index 686da3dbd10..76b3562626c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -243,10 +243,11 @@ // TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726 #define GGML_ROPE_TYPE_NORMAL 0 -#define GGML_ROPE_TYPE_NEOX 2 -#define GGML_ROPE_TYPE_MROPE 8 -#define GGML_ROPE_TYPE_VISION 24 +#define GGML_ROPE_TYPE_NEOX 2 // binary: 000010 +#define GGML_ROPE_TYPE_MROPE 8 // binary: 001000 +#define GGML_ROPE_TYPE_VISION 24 // binary: 011000 #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000 +#define GGML_ROPE_TYPE_MRNORM 40 // binary: 000001 (MROPE without NEOX - need to combine with one of the above) #define GGML_MROPE_SECTIONS 4 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 3032783971d..54512b1fdc7 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5764,20 +5764,21 @@ static void ggml_compute_forward_rope_flt( T * src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); T * dst_data = (T *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - switch (mode) { - case GGML_ROPE_TYPE_NORMAL: + if (mode == GGML_ROPE_TYPE_NORMAL) { + rotate_pairs(n_dims, 1, cache, src, dst_data, 1); + } else if (mode == GGML_ROPE_TYPE_NEOX) { + rotate_pairs(n_dims, n_dims/2, cache, src, dst_data); + } else if (mode & GGML_ROPE_TYPE_MROPE || mode & GGML_ROPE_TYPE_IMROPE) { + if (mode & GGML_ROPE_TYPE_MRNORM) { rotate_pairs(n_dims, 1, cache, src, dst_data, 1); - break; - case GGML_ROPE_TYPE_NEOX: - case GGML_ROPE_TYPE_MROPE: - case GGML_ROPE_TYPE_IMROPE: + } else { rotate_pairs(n_dims, n_dims/2, cache, src, dst_data); - break; - case GGML_ROPE_TYPE_VISION: - rotate_pairs(ne0, n_dims, cache, src, dst_data); - break; - default: - GGML_ABORT("rope type not supported"); + } + } else if (mode & GGML_ROPE_TYPE_VISION) { + GGML_ASSERT(mode & GGML_ROPE_TYPE_NEOX && "non-neox is not yet supported"); + rotate_pairs(ne0, n_dims, cache, src, dst_data); + } else { + 
GGML_ABORT("rope type not supported"); } if (!is_vision) { diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 2c9de2b7d35..58c161167da 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -68,13 +68,14 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params } if (rope_type == LLAMA_ROPE_TYPE_MROPE) { - // M-RoPE + // M-RoPE without using NEOX ordering + auto rope_type_adj = rope_type & GGML_ROPE_TYPE_MRNORM; Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else { // Normal RoPE From 037e76e98261f0da18cde2d771890d9cb8717664 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 13 Dec 2025 21:34:41 +0100 Subject: [PATCH 08/25] add support for metal --- ggml/include/ggml.h | 2 +- ggml/src/ggml-metal/ggml-metal-impl.h | 2 ++ ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 +++++- ggml/src/ggml-metal/ggml-metal.metal | 9 +++++---- ggml/src/ggml.c | 3 ++- src/models/glm4.cpp | 2 +- 6 files changed, 16 insertions(+), 8 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 76b3562626c..2b9e7ec5d5d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -247,7 +247,7 @@ #define GGML_ROPE_TYPE_MROPE 8 // binary: 001000 #define GGML_ROPE_TYPE_VISION 24 // binary: 011000 #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000 -#define GGML_ROPE_TYPE_MRNORM 40 // binary: 000001 (MROPE without NEOX - need to combine with one of the above) +#define GGML_ROPE_TYPE_MRNORM 4 // binary: 000100 (MROPE without NEOX - need to combine with one of the above) #define GGML_MROPE_SECTIONS 4 diff --git 
a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 8944b07e907..a2d1235175e 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -258,6 +258,8 @@ typedef struct { int32_t sect_2; int32_t sect_3; bool src2; + uint64_t offset; // args.n_dims/2 for NEOX ordering, 1 otherwise + uint64_t idx_scale; // 2 for NEOX ordering, 1 otherwise } ggml_metal_kargs_rope; typedef struct { diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e99c1763f63..74ca6836400 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -3141,7 +3141,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) { const int n_past = ((const int32_t *) op->op_params)[0]; const int n_dims = ((const int32_t *) op->op_params)[1]; - //const int mode = ((const int32_t *) op->op_params)[2]; + const int mode = ((const int32_t *) op->op_params)[2]; // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal const int n_ctx_orig = ((const int32_t *) op->op_params)[4]; @@ -3165,6 +3165,8 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) { const int sect_2 = ((const int32_t *) op->op_params)[13]; const int sect_3 = ((const int32_t *) op->op_params)[14]; + bool is_normal_ordering = mode == GGML_ROPE_TYPE_NORMAL || mode & GGML_ROPE_TYPE_MRNORM; + ggml_metal_kargs_rope args = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, @@ -3196,6 +3198,8 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) { /* sect_2 =*/ sect_2, /* sect_3 =*/ sect_3, /* src2 =*/ op->src[2] != nullptr, + /* offset =*/ is_normal_ordering ? 1u : ((uint16_t)n_dims / 2), + /* idx_scale =*/ is_normal_ordering ? 
1u : 2u, }; auto pipeline = ggml_metal_library_get_pipeline_rope(lib, op); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 51bcbae309f..c97c2388200 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4152,6 +4152,7 @@ kernel void kernel_rope_norm( } } +// TODO @ngxson : merge with kernel_rope_norm using offset and idx_scale template kernel void kernel_rope_neox( constant ggml_metal_kargs_rope & args, @@ -4231,7 +4232,7 @@ kernel void kernel_rope_multi( for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { if (i0 < args.n_dims) { - const int ic = i0/2; + const int ic = i0 / args.idx_scale; // mrope theta calculations // note: the rest is the same as kernel_rope_neox @@ -4274,10 +4275,10 @@ kernel void kernel_rope_multi( device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); const float x0 = src[0]; - const float x1 = src[args.n_dims/2]; + const float x1 = src[args.offset]; - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.offset] = x0*sin_theta + x1*cos_theta; } else { device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f0913cd3596..6f950fcaef4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4073,7 +4073,8 @@ static struct ggml_tensor * ggml_rope_impl( bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; if (mrope_used) { - GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token + // mrope expecting multiple positions ids per token + GGML_ASSERT(a->ne[2] * GGML_MROPE_SECTIONS == b->ne[0]); } else { GGML_ASSERT(a->ne[2] == b->ne[0]); } diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp 
index 58c161167da..41477eceff4 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -69,7 +69,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params if (rope_type == LLAMA_ROPE_TYPE_MROPE) { // M-RoPE without using NEOX ordering - auto rope_type_adj = rope_type & GGML_ROPE_TYPE_MRNORM; + auto rope_type_adj = rope_type | GGML_ROPE_TYPE_MRNORM; Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); From b4e65dc88971c64b8a3e38160e83f96ad58a960c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 13:20:18 +0100 Subject: [PATCH 09/25] move glm4v to dedicated graph --- ggml/src/ggml-cpu/ops.cpp | 5 +- tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip.cpp | 5 +- tools/mtmd/models/glm4v.cpp | 143 ++++++++++++++++++++++++++++++++++ tools/mtmd/models/models.h | 5 ++ tools/mtmd/models/qwen3vl.cpp | 85 ++++---------------- 6 files changed, 169 insertions(+), 75 deletions(-) create mode 100644 tools/mtmd/models/glm4v.cpp diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 54512b1fdc7..eef3dba06a2 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5768,15 +5768,14 @@ static void ggml_compute_forward_rope_flt( rotate_pairs(n_dims, 1, cache, src, dst_data, 1); } else if (mode == GGML_ROPE_TYPE_NEOX) { rotate_pairs(n_dims, n_dims/2, cache, src, dst_data); + } else if (mode & GGML_ROPE_TYPE_VISION) { + rotate_pairs(ne0, n_dims, cache, src, dst_data); } else if (mode & GGML_ROPE_TYPE_MROPE || mode & GGML_ROPE_TYPE_IMROPE) { if (mode & GGML_ROPE_TYPE_MRNORM) { rotate_pairs(n_dims, 1, cache, src, dst_data, 1); } else { rotate_pairs(n_dims, n_dims/2, cache, src, dst_data); } - } else if (mode & GGML_ROPE_TYPE_VISION) { - GGML_ASSERT(mode & GGML_ROPE_TYPE_NEOX && "non-neox is not yet supported"); - rotate_pairs(ne0, n_dims, cache, src, dst_data); } else { GGML_ABORT("rope type
not supported"); } diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 3ee42036fda..e7f3067a163 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(mtmd clip-graph.h models/models.h models/cogvlm.cpp + models/glm4v.cpp models/internvl.cpp models/kimivl.cpp models/llama4.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3f846670c34..27509b80b7c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -778,7 +778,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_QWEN3VL: - case PROJECTOR_TYPE_GLM4V: { builder = std::make_unique(ctx, img); } break; @@ -816,6 +815,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GLM4V: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp new file mode 100644 index 00000000000..a17b01bad66 --- /dev/null +++ b/tools/mtmd/models/glm4v.cpp @@ -0,0 +1,143 @@ +#include "models.h" + +ggml_cgraph * clip_graph_glm4v::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = NORM_TYPE_RMS; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, 
model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + if (model.patch_bias != nullptr) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + + // pre-layernorm (aka post-conv) + if (model.pre_ln_w) { + inp = build_norm(inp, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "inp_pos_emb", -1); + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + return ggml_rope_multi( + ctx0, cur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, + 32768, hparams.rope_theta, 1, 0, 1, 32, 1); + }; + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + 
learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + + // post-layernorm + if (model.post_ln_w) { + cur = build_norm(cur, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // GLM4V projector + // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 + + // patch merger (copied from pixtral) + { + int n_merge = hparams.n_merge; + GGML_ASSERT(n_merge > 0); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + + // add bias + cur = ggml_add(ctx0, cur, model.mm_patch_merger_b); + cb(cur, "after_patch_merger", -1); + } + + // FC projector + { + cur = ggml_mul_mat(ctx0, model.projection, cur); + // default LayerNorm (post_projection_norm) + cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = ggml_gelu(ctx0, cur); + cb(cur, "after_fc_proj", -1); + } + + // FFN projector + { + cur = build_ffn(cur, + model.mm_ffn_up_w, model.mm_ffn_up_b, + model.mm_ffn_gate_w, model.mm_ffn_gate_b, + model.mm_ffn_down_w, model.mm_ffn_down_b, + hparams.ffn_op, -1); + cb(cur, "after_ffn_proj", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 4b35da259ce..0496d6b22f1 100644 --- a/tools/mtmd/models/models.h +++ 
b/tools/mtmd/models/models.h @@ -56,3 +56,8 @@ struct clip_graph_whisper_enc : clip_graph { clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_glm4v : clip_graph { + clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 660c06ef392..35a42cb84d6 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -86,9 +86,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() { // self-attention { cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - if (layer.qkv_b) { - cur = ggml_add(ctx0, cur, layer.qkv_b); - } + cur = ggml_add(ctx0, cur, layer.qkv_b); ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, /* nb1 */ ggml_row_size(cur->type, d_head), @@ -112,10 +110,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() { // apply M-RoPE Qcur = ggml_rope_multi( ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); Kcur = ggml_rope_multi( ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, hparams.rope_theta, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); cb(Qcur, "Qcur_rope", il); cb(Kcur, "Kcur_rope", il); @@ -156,7 +154,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() { layer.deepstack_fc1_w, layer.deepstack_fc1_b, nullptr, nullptr, layer.deepstack_fc2_w, layer.deepstack_fc2_b, - hparams.ffn_op, il); + ffn_op_type::FFN_GELU, il); if(!deepstack_features) { deepstack_features = feat; @@ -174,75 +172,20 @@ ggml_cgraph * clip_graph_qwen3vl::build() { inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); } - ggml_tensor * cur = inpL; - - if (proj_type 
== PROJECTOR_TYPE_QWEN3VL) { - // Qwen3VL projector - cur = ggml_reshape_3d(ctx0, cur, n_embd * 4, n_pos / 4, batch_size); - cur = build_ffn(cur, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - ffn_op_type::FFN_GELU, -1); - - if (deepstack_features != nullptr) { - // concat along the feature dimension - cur = ggml_concat(ctx0, cur, deepstack_features, 0); - } - - } else if (proj_type == PROJECTOR_TYPE_GLM4V) { - // GLM4V projector - // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 - - // patch merger (copied from pixtral) - { - int n_merge = hparams.n_merge; - GGML_ASSERT(n_merge > 0); - - // reshape image tokens to 2D grid - cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] - cur = ggml_cont(ctx0, cur); - - // torch.nn.functional.unfold is just an im2col under the hood - // we just need a dummy kernel to make it work - ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); - cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); - - // project to n_embd - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); - - // add bias - cur = ggml_add(ctx0, cur, model.mm_patch_merger_b); - cb(cur, "after_patch_merger", -1); - } - - // FC projector - { - cur = ggml_mul_mat(ctx0, model.projection, cur); - // default LayerNorm (post_projection_norm) - cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); - cur = ggml_gelu(ctx0, cur); - cb(cur, "after_fc_proj", -1); - } + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); - // FFN projector - { - cur = build_ffn(cur, - model.mm_ffn_up_w, 
model.mm_ffn_up_b, - model.mm_ffn_gate_w, model.mm_ffn_gate_b, - model.mm_ffn_down_w, model.mm_ffn_down_b, - hparams.ffn_op, -1); - cb(cur, "after_ffn_proj", -1); - } + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + ffn_op_type::FFN_GELU, -1); - } else { - GGML_ABORT("Unsupported projector type in Qwen3-VL graph"); - } + embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension // build the graph - ggml_build_forward_expand(gf, cur); + ggml_build_forward_expand(gf, embeddings); return gf; } From 7d6a1e07dea76517ff0a11a613f5249e39998ac7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 16:40:47 +0100 Subject: [PATCH 10/25] convert: add norm_embd --- gguf-py/gguf/constants.py | 3 +++ gguf-py/gguf/tensor_mapping.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 16721fab55a..91f6662b67e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -642,6 +642,7 @@ class MODEL_TENSOR(IntEnum): V_MMPROJ_PEG = auto() V_ENC_EMBD_CLS = auto() V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_NORM = auto() V_ENC_EMBD_POS = auto() V_ENC_INPUT_NORM = auto() V_ENC_ATTN_QKV = auto() @@ -1015,6 +1016,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd", MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv", MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", @@ -1094,6 +1096,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MMPROJ_PEG, MODEL_TENSOR.V_ENC_EMBD_CLS, MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_NORM, MODEL_TENSOR.V_ENC_EMBD_POS, MODEL_TENSOR.V_ENC_INPUT_NORM, MODEL_TENSOR.V_ENC_ATTN_QKV, diff --git a/gguf-py/gguf/tensor_mapping.py 
b/gguf-py/gguf/tensor_mapping.py index 0b46d3ef801..e42e86a86bc 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1239,6 +1239,10 @@ class TensorNameMap: "model.vision.patch_embedding.proj", # cogvlm ), + MODEL_TENSOR.V_ENC_EMBD_NORM: ( + "visual.post_conv_layernorm", # glm4v + ), + MODEL_TENSOR.V_ENC_EMBD_POS: ( "vision_tower.vision_model.embeddings.position_embedding", "model.vision_tower.embeddings.position_embeddings", # Intern-S1 @@ -1396,7 +1400,6 @@ class TensorNameMap: "vision_tower.ln_pre", # pixtral-hf "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 - "visual.post_conv_layernorm", # glm4v ), MODEL_TENSOR.V_POST_NORM: ( From 5047d8ede897e045c4c389266ba64ad534562faa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 16:41:27 +0100 Subject: [PATCH 11/25] clip: add debugging fn --- tools/mtmd/clip-impl.h | 2 ++ tools/mtmd/clip.cpp | 35 +++++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 260767126f1..d1cd04df7bc 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -499,6 +499,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { } } +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value); + // // API used internally with mtmd // diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 27509b80b7c..9d704307c2b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm( ? 
ggml_rms_norm(ctx0, cur, norm_eps) : ggml_norm(ctx0, cur, norm_eps); - if (mw || mb) { - cb(cur, "norm", il); - } - if (mw) { cur = ggml_mul(ctx0, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } + cb(cur, "norm_w", il); } if (mb) { cur = ggml_add(ctx0, cur, mb); + cb(cur, "norm_b", il); } return cur; @@ -1134,6 +1129,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_GLM4V: { + hparams.rope_theta = 10000.0f; hparams.n_merge = 2; // default value for GLM4-V get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); hparams.set_limit_image_tokens(8, 4096); @@ -1845,6 +1841,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params if (ctx_params.warmup) { loader.warmup(*ctx_vision); } + + // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.1f); } if (loader.has_audio) { @@ -3337,7 +3335,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + if (vec != nullptr) { + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + } return true; } @@ -3461,3 +3461,22 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel batch->entries.push_back(clip_image_f32_ptr(audio)); batch->is_audio = true; } + +// +// API for debugging +// + +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { + clip_image_f32 img; + img.nx = w; + img.ny = h; + img.buf.resize(h * w * 3); + for (int i = 0; i < h * w * 3; i++) { + img.buf[i] = static_cast(fill_value); + } + bool cur_debug_graph = ctx->debug_graph; + ctx->debug_graph = true; + clip_image_encode(ctx, 1, &img, nullptr); + ctx->debug_graph = cur_debug_graph; + GGML_ASSERT(img.buf.empty() && "expected, always stop here"); +} From ad85426e515e3c46ca3a2850b726e81f368f184a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 22:16:47 +0100 Subject: [PATCH 12/25] working 
correctly --- convert_hf_to_gguf.py | 5 --- src/models/glm4.cpp | 2 +- tools/mtmd/clip-impl.h | 1 + tools/mtmd/clip-model.h | 2 ++ tools/mtmd/clip.cpp | 8 +++-- tools/mtmd/models/glm4v.cpp | 65 ++++++++++++------------------------- 6 files changed, 31 insertions(+), 52 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dd2ce29a0d2..1b69cac37d4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4428,11 +4428,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("model.visual.", "visual.") if name.startswith("visual.merger."): return [(self.map_tensor_name(name), data_torch)] - if "downsample.weight" in name: - # unfold the downsample to mistral-small format - c_out, c_in, kh, kw = data_torch.shape - data_unfold = data_torch.view(c_out, c_in * kh * kw) - return [(self.map_tensor_name(name), data_unfold)] return super().modify_tensors(data_torch, name, bid) diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 41477eceff4..eb8b7a5eb4f 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -67,7 +67,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); } - if (rope_type == LLAMA_ROPE_TYPE_MROPE) { + if (rope_type & LLAMA_ROPE_TYPE_MROPE) { // M-RoPE without using NEOX ordering auto rope_type_adj = rope_type | GGML_ROPE_TYPE_MRNORM; Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index d1cd04df7bc..b18b698ba23 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -68,6 +68,7 @@ #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_NORM_EMBD "v.norm_embd.%s" #define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define 
TN_ATTN_Q "%s.blk.%d.attn_q.%s" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index a6c3f32ef20..9c55c9a1bc9 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -151,6 +151,8 @@ struct clip_model { ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) ggml_tensor * patch_bias = nullptr; ggml_tensor * position_embeddings = nullptr; + ggml_tensor * norm_embd_w = nullptr; + ggml_tensor * norm_embd_b = nullptr; ggml_tensor * pre_ln_w = nullptr; ggml_tensor * pre_ln_b = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9d704307c2b..62fba74f84a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1251,6 +1251,9 @@ struct clip_model_loader { model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); + model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false); + model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false); + model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); // layers @@ -1842,7 +1845,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params loader.warmup(*ctx_vision); } - // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.1f); + // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); } if (loader.has_audio) { @@ -2548,7 +2551,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: - case PROJECTOR_TYPE_GLM4V: + //case PROJECTOR_TYPE_GLM4V: { GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); clip_image_u8 resized; @@ -2618,6 +2621,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_GLM_EDGE: case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support 
dynamic resolution + case PROJECTOR_TYPE_GLM4V: // for debugging only { clip_image_u8 resized_image; int sz = params.image_size; diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index a17b01bad66..60bc1cfe4ad 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -5,17 +5,18 @@ ggml_cgraph * clip_graph_glm4v::build() { GGML_ASSERT(model.position_embeddings != nullptr); GGML_ASSERT(model.class_embedding == nullptr); - const int batch_size = 1; - const int n_pos = n_patches; - const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + const int batch_size = 1; norm_type norm_t = NORM_TYPE_RMS; - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + GGML_ASSERT(img.nx % (patch_size * 2) == 0); GGML_ASSERT(img.ny % (patch_size * 2) == 0); @@ -38,15 +39,11 @@ ggml_cgraph * clip_graph_glm4v::build() { } // add patch bias - if (model.patch_bias != nullptr) { - inp = ggml_add(ctx0, inp, model.patch_bias); - cb(inp, "patch_bias", -1); - } + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); - // pre-layernorm (aka post-conv) - if (model.pre_ln_w) { - inp = build_norm(inp, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - } + // pos-conv norm + inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1); // calculate absolute position embedding and apply ggml_tensor * learned_pos_embd = resize_position_embeddings(); @@ -60,16 +57,9 @@ ggml_cgraph * clip_graph_glm4v::build() { learned_pos_embd = ggml_cont_3d( ctx0, learned_pos_embd, n_embd, n_patches_x * n_patches_y, batch_size); - inp = 
ggml_add(ctx0, inp, learned_pos_embd); - cb(inp, "inp_pos_emb", -1); + cb(learned_pos_embd, "learned_pos_embd", -1); - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - // build ViT with 2D position embeddings auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - // first half is X axis and second half is Y axis return ggml_rope_multi( ctx0, cur, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, @@ -78,43 +68,29 @@ ggml_cgraph * clip_graph_glm4v::build() { ggml_tensor * cur = build_vit( inp, n_patches, - NORM_TYPE_NORMAL, + norm_t, hparams.ffn_op, learned_pos_embd, add_pos); cb(cur, "vit_out", -1); - - // post-layernorm - if (model.post_ln_w) { - cur = build_norm(cur, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); - } + // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1); // GLM4V projector // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 - // patch merger (copied from pixtral) + // patch merger (downsample) { int n_merge = hparams.n_merge; GGML_ASSERT(n_merge > 0); - // reshape image tokens to 2D grid - cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] - cur = ggml_cont(ctx0, cur); - - // torch.nn.functional.unfold is just an im2col under the hood - // we just need a dummy kernel to make it work - ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); - cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); - - // project to n_embd - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + int n_token_out = n_patches / n_merge / n_merge; + cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, 
n_token_out); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out] + cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out] - // add bias cur = ggml_add(ctx0, cur, model.mm_patch_merger_b); - cb(cur, "after_patch_merger", -1); } // FC projector @@ -122,7 +98,7 @@ ggml_cgraph * clip_graph_glm4v::build() { cur = ggml_mul_mat(ctx0, model.projection, cur); // default LayerNorm (post_projection_norm) cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); - cur = ggml_gelu(ctx0, cur); + cur = ggml_gelu_erf(ctx0, cur); cb(cur, "after_fc_proj", -1); } @@ -134,6 +110,7 @@ ggml_cgraph * clip_graph_glm4v::build() { model.mm_ffn_down_w, model.mm_ffn_down_b, hparams.ffn_op, -1); cb(cur, "after_ffn_proj", -1); + // cb(ggml_sum(ctx0, cur), "merged_sum", -1); } // build the graph From f00127e35de44e79e57b07e48cfb42d737e23ef8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 22:17:15 +0100 Subject: [PATCH 13/25] fix style --- src/models/glm4.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index eb8b7a5eb4f..aa762dcd666 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -7,7 +7,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - + int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); From 1514734c5efdcc8054b97cc9a4f48654fd837ee9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 22:31:06 +0100 Subject: [PATCH 14/25] use bicubic --- tools/mtmd/clip-graph.h | 4 +++- tools/mtmd/clip.cpp | 7 +++---- tools/mtmd/models/glm4v.cpp | 2 +- 3 files changed, 7 insertions(+), 6 
deletions(-) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 6d303b4e48b..fb2af330558 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -9,6 +9,8 @@ #include #include +#define DEFAULT_INTERPOLATION_MODE GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS + struct clip_graph { const clip_model & model; const clip_hparams & hparams; @@ -49,7 +51,7 @@ struct clip_graph { void cb(ggml_tensor * cur0, const char * name, int il) const; // siglip2 naflex - ggml_tensor * resize_position_embeddings(); + ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE); // build vision transformer (ViT) cgraph // this function should cover most of the models diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 62fba74f84a..da8817a58b7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } // siglip2 naflex -ggml_tensor * clip_graph::resize_position_embeddings() { +ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; const int height = img.ny / patch_size; const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; + const uint32_t mode = interpolation_mode; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); GGML_ASSERT(pos_embd); @@ -2551,7 +2551,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: - //case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_GLM4V: { GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); clip_image_u8 resized; @@ -2621,7 +2621,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_GLM_EDGE: case PROJECTOR_TYPE_GEMMA3: case 
PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution - case PROJECTOR_TYPE_GLM4V: // for debugging only { clip_image_u8 resized_image; int sz = params.image_size; diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 60bc1cfe4ad..f39b6922eb5 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -46,7 +46,7 @@ ggml_cgraph * clip_graph_glm4v::build() { inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1); // calculate absolute position embedding and apply - ggml_tensor * learned_pos_embd = resize_position_embeddings(); + ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC); learned_pos_embd = ggml_cont_4d( ctx0, learned_pos_embd, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); From cadaedbb17466695dd863f005d5acd4c01063be9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 23:57:09 +0100 Subject: [PATCH 15/25] fix mrope metal --- ggml/src/ggml-cpu/ops.cpp | 4 ++-- ggml/src/ggml-metal/ggml-metal.metal | 7 ++++--- tests/test-backend-ops.cpp | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index eef3dba06a2..ca93c0c9d6c 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5768,9 +5768,9 @@ static void ggml_compute_forward_rope_flt( rotate_pairs(n_dims, 1, cache, src, dst_data, 1); } else if (mode == GGML_ROPE_TYPE_NEOX) { rotate_pairs(n_dims, n_dims/2, cache, src, dst_data); - } else if (mode & GGML_ROPE_TYPE_VISION) { + } else if (is_vision) { rotate_pairs(ne0, n_dims, cache, src, dst_data); - } else if (mode & GGML_ROPE_TYPE_MROPE || mode & GGML_ROPE_TYPE_IMROPE) { + } else if (mrope_used) { if (mode & GGML_ROPE_TYPE_MRNORM) { rotate_pairs(n_dims, 1, cache, src, dst_data, 1); } else { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index c97c2388200..45df1549333 100644 --- 
a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4232,7 +4232,7 @@ kernel void kernel_rope_multi( for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { if (i0 < args.n_dims) { - const int ic = i0 / args.idx_scale; + const int ic = i0/2; // mrope theta calculations // note: the rest is the same as kernel_rope_neox @@ -4271,8 +4271,9 @@ kernel void kernel_rope_multi( rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + const int i_base = i0 / args.idx_scale; + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i_base*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i_base*args.nb0); const float x0 = src[0]; const float x1 = src[args.offset]; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 416218b5b86..bf0eba37749 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7703,6 +7703,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B) test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 20, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 32, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); + test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 32, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM, 512, fs, ef, af, ff, v, fw)); test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B) 
test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 7B) test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 20, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); @@ -8186,6 +8187,7 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_rope(type, { 80, 32, 512, 1}, 20, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (stablelm) test_cases.emplace_back(new test_rope(type, { 64, 8, 512, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B) + test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope,non-neox (glm4v) test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B) test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT) } From 4a0b89ab2d2198d0043d051b705fb69a70333147 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 14 Dec 2025 23:59:42 +0100 Subject: [PATCH 16/25] improve cpu --- ggml/src/ggml-cpu/ops.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index ca93c0c9d6c..877607c6147 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5764,20 +5764,22 @@ static void ggml_compute_forward_rope_flt( T * src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); T * dst_data = (T *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - if (mode == GGML_ROPE_TYPE_NORMAL) { - 
rotate_pairs(n_dims, 1, cache, src, dst_data, 1); - } else if (mode == GGML_ROPE_TYPE_NEOX) { - rotate_pairs(n_dims, n_dims/2, cache, src, dst_data); - } else if (is_vision) { - rotate_pairs(ne0, n_dims, cache, src, dst_data); - } else if (mrope_used) { - if (mode & GGML_ROPE_TYPE_MRNORM) { + switch (mode) { + case GGML_ROPE_TYPE_NORMAL: + case GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM: + case GGML_ROPE_TYPE_IMROPE | GGML_ROPE_TYPE_MRNORM: rotate_pairs(n_dims, 1, cache, src, dst_data, 1); - } else { + break; + case GGML_ROPE_TYPE_NEOX: + case GGML_ROPE_TYPE_MROPE: + case GGML_ROPE_TYPE_IMROPE: rotate_pairs(n_dims, n_dims/2, cache, src, dst_data); - } - } else { - GGML_ABORT("rope type not supported"); + break; + case GGML_ROPE_TYPE_VISION: + rotate_pairs(ne0, n_dims, cache, src, dst_data); + break; + default: + GGML_ABORT("rope type not supported"); } if (!is_vision) { From d00d11e4505755aa170f53ecf489442a7bda174b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Dec 2025 01:17:19 +0100 Subject: [PATCH 17/25] convert to neox ordering on conversion --- convert_hf_to_gguf.py | 44 ++++++++++++++++++++++++++++++++++++++++++- src/models/glm4.cpp | 6 ++---- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1b69cac37d4..a8362faccb3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7900,6 +7900,16 @@ def prepare_tensors(self): @ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration") class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {} + if "mrope_section" in rope_scaling: + self.use_mrope = True + self.partial_rotary_factor = rope_scaling.get("partial_rotary_factor", 0.5) + logger.info("Using M-RoPE") def set_vocab(self): from 
transformers import AutoTokenizer @@ -7928,7 +7938,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) # handle M-RoPE, the same as Qwen-VL - if "mrope_section" in rope_scaling: + if self.use_mrope: mrope_section = rope_scaling["mrope_section"] # Pad to 4 dimensions [time, height, width, extra] while len(mrope_section) < 4: @@ -7936,11 +7946,43 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) logger.info(f"MRoPE sections: {mrope_section[:4]}") + @staticmethod + def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: + orig_shape = weights.shape + if len(orig_shape) == 1: + weights = weights.unsqueeze(1) # [out_dim, 1] + if len(weights.shape) != 2: + raise ValueError("Only 1D and 2D tensors are supported.") + n_effective_heads = weights.shape[0] // head_dim + if n_head_kv is not None and n_effective_heads != n_head: + if n_effective_heads != n_head_kv: + raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}") + rotary_dim = int(head_dim * partial_rotary_factor) + if rotary_dim % 2 != 0: + raise ValueError("rotary_dim must be even.") + reshaped = weights.reshape(n_effective_heads, head_dim, -1) + rot_part = reshaped[:, :rotary_dim, :] + non_rot_part = reshaped[:, rotary_dim:, :] + permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1) + combined = torch.cat((permuted_rot, non_rot_part), dim=1) + result = combined.reshape(weights.shape) + return result if len(orig_shape) != 1 else result.squeeze(1) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v return [] elif name.startswith("model.language_model."): name 
= name.replace("language_model.", "") # for Glm4v + if self.use_mrope: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + n_embd = self.hparams["hidden_size"] + head_dim = n_embd // n_head + # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) return super().modify_tensors(data_torch, name, bid) diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index aa762dcd666..49bbac1cd79 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -68,14 +68,12 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params } if (rope_type & LLAMA_ROPE_TYPE_MROPE) { - // M-RoPE without using NEOX ordering - auto rope_type_adj = rope_type | GGML_ROPE_TYPE_MRNORM; Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, - n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, - n_rot, sections, rope_type_adj, n_ctx_orig, freq_base, freq_scale, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else { // Normal RoPE From f8aad31609fee0a854990b1e780c6f6be8660b6b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Dec 2025 01:21:39 +0100 Subject: [PATCH 18/25] revert backend changes --- ggml/include/ggml.h | 7 +++---- ggml/src/ggml-cpu/ops.cpp | 2 -- ggml/src/ggml-metal/ggml-metal-impl.h | 2 -- ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 +----- ggml/src/ggml-metal/ggml-metal.metal | 12 +++++------- 
ggml/src/ggml.c | 3 +-- tests/test-backend-ops.cpp | 2 -- 7 files changed, 10 insertions(+), 24 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2b9e7ec5d5d..686da3dbd10 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -243,11 +243,10 @@ // TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726 #define GGML_ROPE_TYPE_NORMAL 0 -#define GGML_ROPE_TYPE_NEOX 2 // binary: 000010 -#define GGML_ROPE_TYPE_MROPE 8 // binary: 001000 -#define GGML_ROPE_TYPE_VISION 24 // binary: 011000 +#define GGML_ROPE_TYPE_NEOX 2 +#define GGML_ROPE_TYPE_MROPE 8 +#define GGML_ROPE_TYPE_VISION 24 #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000 -#define GGML_ROPE_TYPE_MRNORM 4 // binary: 000100 (MROPE without NEOX - need to combine with one of the above) #define GGML_MROPE_SECTIONS 4 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 877607c6147..3032783971d 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5766,8 +5766,6 @@ static void ggml_compute_forward_rope_flt( switch (mode) { case GGML_ROPE_TYPE_NORMAL: - case GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM: - case GGML_ROPE_TYPE_IMROPE | GGML_ROPE_TYPE_MRNORM: rotate_pairs(n_dims, 1, cache, src, dst_data, 1); break; case GGML_ROPE_TYPE_NEOX: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index a2d1235175e..8944b07e907 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -258,8 +258,6 @@ typedef struct { int32_t sect_2; int32_t sect_3; bool src2; - uint64_t offset; // args.n_dims/2 for NEOX ordering, 1 otherwise - uint64_t idx_scale; // 2 for NEOX ordering, 1 otherwise } ggml_metal_kargs_rope; typedef struct { diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 74ca6836400..e99c1763f63 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp 
@@ -3141,7 +3141,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) { const int n_past = ((const int32_t *) op->op_params)[0]; const int n_dims = ((const int32_t *) op->op_params)[1]; - const int mode = ((const int32_t *) op->op_params)[2]; + //const int mode = ((const int32_t *) op->op_params)[2]; // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal const int n_ctx_orig = ((const int32_t *) op->op_params)[4]; @@ -3165,8 +3165,6 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) { const int sect_2 = ((const int32_t *) op->op_params)[13]; const int sect_3 = ((const int32_t *) op->op_params)[14]; - bool is_normal_ordering = mode == GGML_ROPE_TYPE_NORMAL || mode & GGML_ROPE_TYPE_MRNORM; - ggml_metal_kargs_rope args = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, @@ -3198,8 +3196,6 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) { /* sect_2 =*/ sect_2, /* sect_3 =*/ sect_3, /* src2 =*/ op->src[2] != nullptr, - /* offset =*/ is_normal_ordering ? 1u : ((uint16_t)n_dims / 2), - /* idx_scale =*/ is_normal_ordering ? 
1u : 2u, }; auto pipeline = ggml_metal_library_get_pipeline_rope(lib, op); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 45df1549333..51bcbae309f 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4152,7 +4152,6 @@ kernel void kernel_rope_norm( } } -// TODO @ngxson : merge with kernel_rope_norm using offset and idx_scale template kernel void kernel_rope_neox( constant ggml_metal_kargs_rope & args, @@ -4271,15 +4270,14 @@ kernel void kernel_rope_multi( rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); - const int i_base = i0 / args.idx_scale; - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i_base*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i_base*args.nb0); + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); const float x0 = src[0]; - const float x1 = src[args.offset]; + const float x1 = src[args.n_dims/2]; - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[args.offset] = x0*sin_theta + x1*cos_theta; + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; } else { device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6f950fcaef4..f0913cd3596 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4073,8 +4073,7 @@ static struct ggml_tensor * ggml_rope_impl( bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; if (mrope_used) { - // mrope expecting multiple positions ids per token - GGML_ASSERT(a->ne[2] 
* GGML_MROPE_SECTIONS == b->ne[0]); + GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token } else { GGML_ASSERT(a->ne[2] == b->ne[0]); } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index bf0eba37749..416218b5b86 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7703,7 +7703,6 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B) test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 20, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 32, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); - test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 32, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM, 512, fs, ef, af, ff, v, fw)); test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B) test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 7B) test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 20, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); @@ -8187,7 +8186,6 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_rope(type, { 80, 32, 512, 1}, 20, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (stablelm) test_cases.emplace_back(new test_rope(type, { 64, 8, 512, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B) - test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE | GGML_ROPE_TYPE_MRNORM, 512, 
1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope,non-neox (glm4v) test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B) test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT) } From 8700158852f8783261186869f793c9a222c52d23 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Dec 2025 01:28:35 +0100 Subject: [PATCH 19/25] force stop if using old weight --- src/models/glm4.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 49bbac1cd79..403464e5e79 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -16,6 +16,12 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params inpL = build_inp_embd(model.tok_embd); + bool use_mrope = rope_type & LLAMA_ROPE_TYPE_MROPE; + if (ubatch.embd && !use_mrope) { + // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results + GGML_ABORT("This GGUF does not support multimodal. 
Please reconvert it."); + } + // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -67,7 +73,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); } - if (rope_type & LLAMA_ROPE_TYPE_MROPE) { + if (use_mrope) { Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); From 33fb59ab11d45edb15c0d20e5119d0348e95e1a2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Dec 2025 14:42:44 +0100 Subject: [PATCH 20/25] support moe variant --- convert_hf_to_gguf.py | 11 +++++++++++ src/llama-hparams.cpp | 4 ++++ src/llama-hparams.h | 2 ++ src/llama-model.cpp | 4 ++-- src/models/glm4-moe.cpp | 39 ++++++++++++++++++++++++++++----------- src/models/glm4.cpp | 2 +- 6 files changed, 48 insertions(+), 14 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a8362faccb3..267e2609631 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8051,6 +8051,17 @@ def set_gguf_parameters(self): if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + # handle M-RoPE, the same as Qwen-VL + # note: unlike GLM4 non-MoE, we don't need to permute the weights here since GLM4_MOE uses Neox ordering already + rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {} + if "mrope_section" in rope_scaling: + mrope_section = rope_scaling["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"MRoPE sections: {mrope_section[:4]}") + _experts: list[dict[str, Tensor]] | None = None def modify_tensors( diff --git a/src/llama-hparams.cpp 
b/src/llama-hparams.cpp index 277d0bcfd3c..02fe0f1e0d2 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -241,3 +241,7 @@ float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale return attn_factor; } + +bool llama_hparams::use_mrope() const { + return rope_sections[0] > 0 && rope_sections[1] > 0; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index c9960e91697..a8e53885983 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -275,6 +275,8 @@ struct llama_hparams { // ref: https://github.com/ggml-org/llama.cpp/discussions/7416 // https://github.com/ggml-org/llama.cpp/pull/17945 static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor); + + bool use_mrope() const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 60c57407606..4f5bfcc1de8 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7838,9 +7838,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { return LLAMA_ROPE_TYPE_IMROPE; case LLM_ARCH_GLM4: - return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM; + return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM; case LLM_ARCH_GLM4_MOE: - return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + return model->hparams.use_mrope() ? 
LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 33ee7070463..003f70f7396 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + ggml_tensor * cur; ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); + bool use_mrope = hparams.use_mrope(); + if (ubatch.embd && !use_mrope) { + // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results + GGML_ABORT("This GGUF does not support multimodal. Please reconvert it."); + } + // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); } - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + + if (use_mrope) { + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } else { + // Normal RoPE + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, + rope_type, n_ctx_orig, freq_base, 
freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, + rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 403464e5e79..204aa3932af 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -16,7 +16,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params inpL = build_inp_embd(model.tok_embd); - bool use_mrope = rope_type & LLAMA_ROPE_TYPE_MROPE; + bool use_mrope = hparams.use_mrope(); if (ubatch.embd && !use_mrope) { // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results GGML_ABORT("This GGUF does not support multimodal. Please reconvert it."); From c8fd94f3ba382705c1d966bfd269c89bd1f79d8a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Dec 2025 14:52:10 +0100 Subject: [PATCH 21/25] fix conversion --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 267e2609631..a77a042bf6c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7986,7 +7986,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Glm4MoeForCausalLM") +@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") class Glm4MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.GLM4_MOE From 785ccf428054307d2f149e9bd3d9222d2deb4403 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Dec 2025 15:27:09 +0100 Subject: [PATCH 22/25] fix convert (2) --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a77a042bf6c..dbc87291bbf 100755 --- a/convert_hf_to_gguf.py +++ 
b/convert_hf_to_gguf.py @@ -4407,7 +4407,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Glm4vForConditionalGeneration") +@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration") class Glm4VVisionModel(Qwen3VLVisionModel): def set_gguf_parameters(self): MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters From 7d53c0f09f9dd70d0ec1c9f1bebef1fbbc2a7a27 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 15 Dec 2025 22:00:18 +0100 Subject: [PATCH 23/25] Update tools/mtmd/clip-graph.h Co-authored-by: Georgi Gerganov --- tools/mtmd/clip-graph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index fb2af330558..bebd05c9ef4 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -9,7 +9,7 @@ #include #include -#define DEFAULT_INTERPOLATION_MODE GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS +#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS) struct clip_graph { const clip_model & model; From dd66aba5a1014a9db265075fc87b7507c7c338b7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Dec 2025 22:11:41 +0100 Subject: [PATCH 24/25] process mrope_section on TextModel base class --- convert_hf_to_gguf.py | 63 +++++++------------------------------------ 1 file changed, 10 insertions(+), 53 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1233faf978d..603689738b7 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -861,6 +861,14 @@ def set_gguf_parameters(self): logger.warning(f"Unknown RoPE type: {rope_type}") logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") + if "mrope_section" in self.rope_parameters: + mrope_section = self.rope_parameters["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while 
len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"gguf: mrope sections: {mrope_section[:4]}") + if (rope_theta := rope_params.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") @@ -3738,9 +3746,6 @@ class Qwen2VLModel(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() - mrope_section = self.hparams["rope_scaling"]["mrope_section"] - mrope_section += [0] * max(0, 4 - len(mrope_section)) - self.gguf_writer.add_rope_dimension_sections(mrope_section) def set_vocab(self): try: @@ -4408,20 +4413,6 @@ def set_gguf_parameters(self): super().set_gguf_parameters() # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL - text_config = self.hparams.get("text_config", {}) - # rope_scaling is deprecated in V5, use rope_parameters instead - rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {} - - if rope_scaling.get("mrope_section"): - # mrope_section contains [time, height, width] dimensions - mrope_section = rope_scaling["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - - logger.info(f"MRoPE sections: {mrope_section[:4]}") - vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) @@ -4440,22 +4431,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel): def set_gguf_parameters(self): super().set_gguf_parameters() - - # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL - text_config = self.hparams.get("text_config", {}) - # rope_scaling is deprecated in V5, use rope_parameters instead - rope_scaling = text_config.get("rope_scaling") or 
text_config.get("rope_parameters") or {} - - if rope_scaling.get("mrope_section"): - # mrope_section contains [time, height, width] dimensions - mrope_section = rope_scaling["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - - logger.info(f"MRoPE sections: {mrope_section[:4]}") - vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) @@ -7826,7 +7801,7 @@ def __init__(self, *args, **kwargs): self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5) if "mrope_section" in self.rope_parameters: self.use_mrope = True - logger.info("Using M-RoPE") + logger.info("Q/K weight will need to be permuted for M-RoPE") def set_vocab(self): from transformers import AutoTokenizer @@ -7849,14 +7824,6 @@ def set_gguf_parameters(self): if (rope_dim := self.hparams.get("head_dim")) is None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor)) - # handle M-RoPE, the same as Qwen-VL - if self.use_mrope: - mrope_section = self.rope_parameters["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - logger.info(f"MRoPE sections: {mrope_section[:4]}") @staticmethod def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: @@ -7963,19 +7930,9 @@ def set_gguf_parameters(self): if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) - # handle M-RoPE, the same as Qwen-VL - # note: 
unlike GLM4 non-MoE, we don't need to permute the weights here since GLM4_MOE uses Neox ordering already - rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {} - if "mrope_section" in rope_scaling: - mrope_section = rope_scaling["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - logger.info(f"MRoPE sections: {mrope_section[:4]}") - _experts: list[dict[str, Tensor]] | None = None + # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: From f969d4f63b86331037fd821d20a0bd4c3d8bafa2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 16 Dec 2025 00:29:25 +0100 Subject: [PATCH 25/25] resolve conflict merge --- src/llama-hparams.cpp | 10 ---------- src/llama-hparams.h | 7 ------- 2 files changed, 17 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 4e612953da5..0beab1bce2a 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -231,16 +231,6 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama return false; } -float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) { - GGML_ASSERT(ext_factor >= 0.0f); - - if (ext_factor != 0.0f) { - attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); - } - - return attn_factor; -} - bool llama_hparams::use_mrope() const { return rope_sections[0] > 0 && rope_sections[1] > 0; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index f1adbb0ac6d..f6e95b5d2a6 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -271,13 +271,6 @@ struct llama_hparams { // TODO: pack the SWA params in a struct? 
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1); - // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor: - // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544 - // - // ref: https://github.com/ggml-org/llama.cpp/discussions/7416 - // https://github.com/ggml-org/llama.cpp/pull/17945 - static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor); - bool use_mrope() const; };