From 5471f507f72cd31b3c28bc5bb1f8edd1f57e62d7 Mon Sep 17 00:00:00 2001
From: ravenouse
Date: Fri, 31 Oct 2025 16:07:00 -0700
Subject: [PATCH 01/10] Add support for Janus Pro

---
 convert_hf_to_gguf.py          | 110 +++++++++++++++++++++++++++++++++
 gguf-py/gguf/tensor_mapping.py |   6 +-
 tools/mtmd/clip-impl.h         |   2 +
 tools/mtmd/clip.cpp            |  73 ++++++++++++++++++++++
 4 files changed, 189 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b7593666843..5cd6799e9cb 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9493,6 +9493,116 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return []  # skip other tensors
 
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision, aligner, and generation tensors as they will be handled by `JanusProVisionModel`
+        skip_prefixes = (
+            'model.vision_model.',
+            'model.aligner.',
+            'model.vqmodel.',
+            'model.generation_embeddings.',
+            'model.generation_aligner.',
+            'model.generation_head.',
+        )
+        if name.startswith(skip_prefixes):
+            return []
+
+        if name.startswith('model.language_model.'):
+            name = name.replace('model.language_model.', 'model.')
+        elif name.startswith('language_model.'):
+            name = name.replace('language_model.', '')
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        if "intermediate_size" not in self.hparams_vision:
+            mlp_ratio = self.hparams_vision.get("mlp_ratio")
+            hidden_size = self.hparams_vision.get("hidden_size")
+            if mlp_ratio is not None and hidden_size is not None:
+                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
+
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
+        """Map aligner tensors to projector format"""
+        suffix = ".bias" if name.endswith(".bias") else ".weight"
+
+        if name.startswith("model.aligner."):
+            local_name = name[len("model.aligner."):]
+        elif name.startswith("aligner."):
+            local_name = name[len("aligner."):]
+        else:
+            raise ValueError(f"Unsupported Janus aligner prefix: {name}")
+
+        if local_name.startswith("fc1."):
+            mm_index = 0
+        elif local_name.startswith("hidden_layers."):
+            parts = local_name.split(".", 2)
+            if len(parts) < 3:
+                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
+            mm_index = int(parts[1]) + 1
+        else:
+            raise ValueError(f"Unsupported Janus aligner tensor: {name}")
+
+        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
+        return [(tensor_name, data_torch)]
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Skip language model tensors as they will be handled by `JanusProModel`
+        if name.startswith(('model.language_model.', 'language_model.')):
+            return []
+
+        # Skip generation-related components
+        skip_generation_prefixes = (
+            'model.vqmodel.',
+            'vqmodel.',
+            'model.generation_embeddings.',
+            'generation_embeddings.',
+            'model.generation_aligner.',
+            'generation_aligner.',
+            'model.generation_head.',
+            'generation_head.',
+        )
+        if name.startswith(skip_generation_prefixes):
+            return []
+
+        # Handle aligner tensors
+        if name.startswith(('model.aligner.', 'aligner.')):
+            return list(self._map_aligner_tensor(data_torch, name))
+
+        # Handle vision tensors
+        if name.startswith(('model.vision_model.', 'vision_model.')):
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []
+
+
 ###### CONVERSION LOGIC ######
 
 
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index d7dcd8efb84..059079ccabb 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1154,6 +1154,7 @@ class TensorNameMap:
             "model.mm_projector.mlp.mlp.{bid}",
             "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
             "mlp1.{bid}", # InternVL
+            "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
         ),
 
         MODEL_TENSOR.V_MMPROJ_PEG: (
@@ -1170,7 +1171,7 @@ class TensorNameMap:
             "vision_tower.vision_model.embeddings.patch_embedding",
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
-            "model.vision_model.embeddings.patch_embedding", # SmolVLM
+            "model.vision_model.embeddings.patch_embedding", # SmolVLM, Janus Pro
             "vision_tower.patch_conv", # pixtral-hf
             "vision_encoder.patch_conv", # pixtral
             "vision_model.patch_embedding.linear", # llama 4
@@ -1182,7 +1183,7 @@ class TensorNameMap:
             "vision_tower.vision_model.embeddings.position_embedding",
             "model.vision_tower.embeddings.position_embeddings", # Intern-S1
             "vpm.embeddings.position_embedding",
-            "model.vision_model.embeddings.position_embedding", # SmolVLM
+            "model.vision_model.embeddings.position_embedding", # SmolVLM, Janus Pro
             "vision_model.positional_embedding_vlm", # llama 4
             "vision_tower.patch_embed.pos_emb", # kimi-vl
         ),
@@ -1252,6 +1253,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+            "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
             "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index ad2108d1798..fecf611d4b3 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -140,6 +140,7 @@ enum projector_type {
     PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
     PROJECTOR_TYPE_LIGHTONOCR,
+    PROJECTOR_TYPE_JANUS_PRO,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -163,6 +164,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LFM2,      "lfm2"},
     { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
+    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index b44f0a3a28a..733d3e769fa 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1509,6 +1509,45 @@ struct clip_graph {
         return gf;
     }
 
+    ggml_cgraph * build_janus_pro() {
+        GGML_ASSERT(model.class_embedding == nullptr); // No CLS token
+
+        ggml_tensor * inp = build_inp();
+
+        const int n_pos = n_patches;
+        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+        ggml_tensor * cur = build_vit(
+            inp, n_patches,
+            NORM_TYPE_NORMAL,
+            hparams.ffn_op,
+            learned_pos_embd,
+            nullptr);
+
+        cur = ggml_mul_mat(ctx0, model.mm_0_w, cur);
+        if (model.mm_0_b) {
+            cur = ggml_add(ctx0, cur, model.mm_0_b);
+        }
+        cb(cur, "aligner_0", -1);
+
+        cur = ggml_gelu(ctx0, cur);
+
+        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+        if (model.mm_1_b) {
+            cur = ggml_add(ctx0, cur, model.mm_1_b);
+        }
+        cb(cur, "aligner_1", -1);
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // whisper encoder with custom projector
     ggml_cgraph * build_whisper_enc() {
         const int n_frames = img.nx;
@@ -2126,6 +2165,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 res = graph.build_kimivl();
             } break;
+        case PROJECTOR_TYPE_JANUS_PRO:
+            {
+                res = graph.build_janus_pro();
+            } break;
         default:
             {
                 res = graph.build_llava();
@@ -2442,6 +2485,14 @@ struct clip_model_loader {
                     hparams.ffn_op = FFN_GELU_ERF;
                    log_ffn_op = "gelu_erf"; // temporary solution for logging
                 } break;
+            case PROJECTOR_TYPE_JANUS_PRO:
+                {
+                    // Janus Pro uses mean = std = [0.5, 0.5, 0.5]
+                    // ref: https://huggingface.co/deepseek-community/Janus-Pro-1B/blob/main/preprocessor_config.json
+                    // ref: https://huggingface.co/deepseek-community/Janus-Pro-7B/blob/main/preprocessor_config.json
+                    hparams.image_mean[0] = hparams.image_mean[1] = hparams.image_mean[2] = 0.5f;
+                    hparams.image_std[0] = hparams.image_std[1] = hparams.image_std[2] = 0.5f;
+                } break;
             default:
                 break;
         }
@@ -2777,6 +2828,13 @@ struct clip_model_loader {
                     model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
                     model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
                 } break;
+            case PROJECTOR_TYPE_JANUS_PRO:
+                {
+                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
+                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -3637,6 +3695,17 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
 
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
+        // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
+        const std::array<uint8_t, 3> pad_color = {127, 127, 127};
+        clip_image_u8 resized_image;
+        int sz = params.image_size; // 384
+        image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}, pad_color);
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
+        normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+        res_imgs->entries.push_back(std::move(img_f32));
+        return true;
+
     } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
             || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
             ) {
@@ -3817,6 +3886,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
     switch (proj) {
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_JANUS_PRO:
             {
                 // do nothing
             } break;
@@ -4286,6 +4356,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 set_input_i32("pos_w", pos_data);
             } break;
         case PROJECTOR_TYPE_GLM_EDGE:
+        case PROJECTOR_TYPE_JANUS_PRO:
             {
                 // llava and other models
                 std::vector<int32_t> positions(n_pos);
@@ -4427,6 +4498,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             return ctx->model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_JANUS_PRO:
+            return ctx->model.mm_1_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }

From 01bd163bf50a5f4432f3eb842d8c9a6aa3d6fa9e Mon Sep 17 00:00:00 2001
From: Zhiyong Wang <85110830+ravenouse@users.noreply.github.com>
Date: Sat, 1 Nov 2025 09:05:02 -0700
Subject: [PATCH 02/10] Update gguf-py/gguf/tensor_mapping.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret

---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 059079ccabb..ade6fbf6834 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1171,7 +1171,7 @@ class TensorNameMap:
             "vision_tower.vision_model.embeddings.patch_embedding",
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
-            "model.vision_model.embeddings.patch_embedding", # SmolVLM, Janus Pro
+            "model.vision_model.embeddings.patch_embedding", # SmolVLM
             "vision_tower.patch_conv", # pixtral-hf
             "vision_encoder.patch_conv", # pixtral
             "vision_model.patch_embedding.linear", # llama 4

From d6069df3e8f05b595cbf8ffca7b388439b65790b Mon Sep 17 00:00:00 2001
From: Zhiyong Wang <85110830+ravenouse@users.noreply.github.com>
Date: Sat, 1 Nov 2025 09:06:17 -0700
Subject: [PATCH 03/10] Update gguf-py/gguf/tensor_mapping.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret

---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index ade6fbf6834..54e956f0d81 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1183,7 +1183,7 @@ class TensorNameMap:
             "vision_tower.vision_model.embeddings.position_embedding",
             "model.vision_tower.embeddings.position_embeddings", # Intern-S1
             "vpm.embeddings.position_embedding",
-            "model.vision_model.embeddings.position_embedding", # SmolVLM, Janus Pro
+            "model.vision_model.embeddings.position_embedding", # SmolVLM
             "vision_model.positional_embedding_vlm", # llama 4
             "vision_tower.patch_embed.pos_emb", # kimi-vl
         ),

From d92205eaa65e1d204d8d9451da81bb6f5d81f093 Mon Sep 17 00:00:00 2001
From: ravenouse
Date: Sat, 1 Nov 2025 09:16:39 -0700
Subject: [PATCH 04/10] Address reviewer suggestions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret

---
 convert_hf_to_gguf.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5cd6799e9cb..a6d35cfd60c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9498,11 +9498,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 class JanusProModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch
 
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # Skip vision, aligner, and generation tensors as they will be handled by `JanusProVisionModel`
+        # Skip vision, aligner, and generation tensors
         skip_prefixes = (
             'model.vision_model.',
             'model.aligner.',

From e260b0e789ffd26e0cc10c47e47c70ae70ce2f02 Mon Sep 17 00:00:00 2001
From: ravenouse
Date: Sun, 2 Nov 2025 00:10:45 -0700
Subject: [PATCH 05/10] Add JANUS_PRO constant

---
 gguf-py/gguf/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 94fcfaf69cf..35e5250c098 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -3063,6 +3063,7 @@ class VisionProjectorType:
     LFM2 = "lfm2"
     KIMIVL = "kimivl"
     LIGHTONOCR = "lightonocr"
+    JANUS_PRO = "janus_pro"
 
 
 # Items here are (block size, type size)

From 5794785ba25357b9b720fa9669a653f72b2d0b03 Mon Sep 17 00:00:00 2001
From: ravenouse
Date: Sun, 2 Nov 2025 00:11:33 -0700
Subject: [PATCH 06/10] Update clip model handling

Co-authored-by: Xuan-Son Nguyen

---
 tools/mtmd/clip.cpp | 38 +++++++++----------------------------
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 733d3e769fa..d266ed13b12 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1514,12 +1514,7 @@ struct clip_graph {
 
         ggml_tensor * inp = build_inp();
 
-        const int n_pos = n_patches;
-        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-        ggml_set_name(positions, "positions");
-        ggml_set_input(positions);
-
-        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+        ggml_tensor * learned_pos_embd = model.position_embeddings;
 
         ggml_tensor * cur = build_vit(
             inp, n_patches,
@@ -1528,18 +1523,12 @@ struct clip_graph {
             learned_pos_embd,
             nullptr);
 
-        cur = ggml_mul_mat(ctx0, model.mm_0_w, cur);
-        if (model.mm_0_b) {
-            cur = ggml_add(ctx0, cur, model.mm_0_b);
-        }
-        cb(cur, "aligner_0", -1);
-
-        cur = ggml_gelu(ctx0, cur);
-
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        if (model.mm_1_b) {
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-        }
+        cur = build_ffn(cur,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr,
+            model.mm_1_w, model.mm_1_b,
+            hparams.ffn_op,
+            -1);
         cb(cur, "aligner_1", -1);
 
         // build the graph
@@ -2485,14 +2474,6 @@ struct clip_model_loader {
                     hparams.ffn_op = FFN_GELU_ERF;
                     log_ffn_op = "gelu_erf"; // temporary solution for logging
                 } break;
-            case PROJECTOR_TYPE_JANUS_PRO:
-                {
-                    // Janus Pro uses mean = std = [0.5, 0.5, 0.5]
-                    // ref: https://huggingface.co/deepseek-community/Janus-Pro-1B/blob/main/preprocessor_config.json
-                    // ref: https://huggingface.co/deepseek-community/Janus-Pro-7B/blob/main/preprocessor_config.json
-                    hparams.image_mean[0] = hparams.image_mean[1] = hparams.image_mean[2] = 0.5f;
-                    hparams.image_std[0] = hparams.image_std[1] = hparams.image_std[2] = 0.5f;
-                } break;
             default:
                 break;
         }
@@ -4356,7 +4337,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 set_input_i32("pos_w", pos_data);
             } break;
         case PROJECTOR_TYPE_GLM_EDGE:
-        case PROJECTOR_TYPE_JANUS_PRO:
             {
                 // llava and other models
                 std::vector<int32_t> positions(n_pos);
@@ -4394,6 +4374,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_JANUS_PRO:
             {
                 // do nothing
             } break;
@@ -4481,6 +4462,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_mlp_3_w->ne[1];
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_JANUS_PRO:
             return ctx->model.mm_1_b->ne[0];
         case PROJECTOR_TYPE_GEMMA3:
             return ctx->model.mm_input_proj_w->ne[0];
@@ -4498,8 +4480,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             return ctx->model.mm_2_w->ne[1];
-        case PROJECTOR_TYPE_JANUS_PRO:
-            return ctx->model.mm_1_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }

From 9601dc83a9dbcccf7ea2809d1ce21d25271b325c Mon Sep 17 00:00:00 2001
From: Zhiyong Wang <85110830+ravenouse@users.noreply.github.com>
Date: Sun, 2 Nov 2025 10:05:34 -0800
Subject: [PATCH 07/10] Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen

---
 tools/mtmd/clip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index d266ed13b12..385f991cb6f 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2812,9 +2812,9 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_JANUS_PRO:
                 {
                     model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
-                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
+                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                 } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");

From 38ff44f9f3aa30f3974e266665991f146714030f Mon Sep 17 00:00:00 2001
From: ravenouse
Date: Sun, 2 Nov 2025 10:18:55 -0800
Subject: [PATCH 08/10] Refactor JANUS_PRO handling in clip.cpp

Co-authored-by: Xuan-Son Nguyen

---
 tools/mtmd/clip.cpp | 40 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 30 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 385f991cb6f..416ce525588 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -550,6 +550,15 @@ struct clip_graph {
             cur = ggml_gelu(ctx0, cur);
             cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
             cur = ggml_add(ctx0, cur, model.mm_2_b);
+
+        } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
+            cur = build_ffn(cur,
+                model.mm_0_w, model.mm_0_b,
+                nullptr, nullptr,
+                model.mm_1_w, model.mm_1_b,
+                hparams.ffn_op,
+                -1);
+
         } else {
             GGML_ABORT("SigLIP: Unsupported projector type");
         }
@@ -1508,35 +1517,6 @@ struct clip_graph {
         return gf;
     }
-
-    ggml_cgraph * build_janus_pro() {
-        GGML_ASSERT(model.class_embedding == nullptr); // No CLS token
-
-        ggml_tensor * inp = build_inp();
-
-        ggml_tensor * learned_pos_embd = model.position_embeddings;
-
-        ggml_tensor * cur = build_vit(
-            inp, n_patches,
-            NORM_TYPE_NORMAL,
-            hparams.ffn_op,
-            learned_pos_embd,
-            nullptr);
-
-        cur = build_ffn(cur,
-            model.mm_0_w, model.mm_0_b,
-            nullptr, nullptr,
-            model.mm_1_w, model.mm_1_b,
-            hparams.ffn_op,
-            -1);
-        cb(cur, "aligner_1", -1);
-
-        // build the graph
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
 
     // whisper encoder with custom projector
     ggml_cgraph * build_whisper_enc() {
         const int n_frames = img.nx;
@@ -2156,7 +2136,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             } break;
         case PROJECTOR_TYPE_JANUS_PRO:
             {
-                res = graph.build_janus_pro();
+                res = graph.build_siglip();
             } break;
         default:
             {

From c06440fa7091fbd0e4fbc5f36c47e01cba81ac83 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Sun, 2 Nov 2025 21:14:08 +0100
Subject: [PATCH 09/10] Update tools/mtmd/clip.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret

---
 tools/mtmd/clip.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index dae0a8917f1..a95e24435e9 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4121,7 +4121,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         const std::array<uint8_t, 3> pad_color = {127, 127, 127};
         clip_image_u8 resized_image;
         int sz = params.image_size;
-        img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
+        img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
         clip_image_f32_ptr img_f32(clip_image_f32_init());
         normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
         res_imgs->entries.push_back(std::move(img_f32));

From 63f7cf31546375083815db1e3f53e90bff67d7ad Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sun, 2 Nov 2025 21:17:45 +0100
Subject: [PATCH 10/10] em whitespace

---
 convert_hf_to_gguf.py | 32 ++++++++++++++++----------------
 tools/mtmd/clip.cpp   |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 567f38cd2f7..c6f5ba6a04c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9806,7 +9806,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @ModelBase.register("JanusForConditionalGeneration")
 class JanusProModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch
-    
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Skip vision, aligner, and generation tensors
         skip_prefixes = (
@@ -9819,12 +9819,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         )
         if name.startswith(skip_prefixes):
             return []
-    
+
         if name.startswith('model.language_model.'):
             name = name.replace('model.language_model.', 'model.')
         elif name.startswith('language_model.'):
             name = name.replace('language_model.', '')
-    
+
         return super().modify_tensors(data_torch, name, bid)
 
 
@@ -9838,25 +9838,25 @@ def __init__(self, *args, **kwargs):
             hidden_size = self.hparams_vision.get("hidden_size")
             if mlp_ratio is not None and hidden_size is not None:
                 self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
-    
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         assert self.hparams_vision is not None
-    
+
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
-    
+
         self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
-    
+
         hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
         if hidden_act == "gelu":
             self.gguf_writer.add_vision_use_gelu(True)
        elif hidden_act == "silu":
             self.gguf_writer.add_vision_use_silu(True)
-    
+
     def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
         """Map aligner tensors to projector format"""
         suffix = ".bias" if name.endswith(".bias") else ".weight"
-    
+
         if name.startswith("model.aligner."):
             local_name = name[len("model.aligner."):]
         elif name.startswith("aligner."):
             local_name = name[len("aligner."):]
@@ -9873,17 +9873,17 @@ def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[s
             mm_index = int(parts[1]) + 1
         else:
             raise ValueError(f"Unsupported Janus aligner tensor: {name}")
-    
+
         tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
         return [(tensor_name, data_torch)]
-    
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-    
+
         # Skip language model tensors as they will be handled by `JanusProModel`
         if name.startswith(('model.language_model.', 'language_model.')):
             return []
-    
+
         # Skip generation-related components
         skip_generation_prefixes = (
             'model.vqmodel.',
@@ -9897,15 +9897,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         )
         if name.startswith(skip_generation_prefixes):
             return []
-    
+
         # Handle aligner tensors
         if name.startswith(('model.aligner.', 'aligner.')):
             return list(self._map_aligner_tensor(data_torch, name))
-    
+
         # Handle vision tensors
         if name.startswith(('model.vision_model.', 'vision_model.')):
             return [(self.map_tensor_name(name), data_torch)]
-    
+
         return []
 
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index a95e24435e9..152a1424dc4 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -597,7 +597,7 @@ struct clip_graph {
                 model.mm_1_w, model.mm_1_b,
                 hparams.ffn_op,
                 -1);
-            
+
         } else {
             GGML_ABORT("SigLIP: Unsupported projector type");
         }
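
Note on the aligner mapping used throughout this series: Janus Pro's two-layer MLP aligner is routed onto the generic mm.{i} projector slots, with aligner.fc1.* landing on mm.0 and aligner.hidden_layers.{N}.* on mm.{N+1}. A minimal standalone sketch of that index logic follows; the flat "mm.{i}" output names are a stand-in assumed for illustration rather than the exact string emitted by format_tensor_name().

    # Sketch of the Janus Pro aligner -> projector index mapping implemented by
    # JanusProVisionModel._map_aligner_tensor in the patches above.
    # "mm.{i}" stands in for the GGUF V_MMPROJ tensor name (an assumption for
    # illustration only).
    def map_aligner_name(name: str) -> str:
        suffix = ".bias" if name.endswith(".bias") else ".weight"
        local = name.removeprefix("model.aligner.").removeprefix("aligner.")
        if local.startswith("fc1."):
            idx = 0                              # first linear -> mm.0
        elif local.startswith("hidden_layers."):
            idx = int(local.split(".")[1]) + 1   # hidden_layers.N -> mm.N+1
        else:
            raise ValueError(f"unexpected aligner tensor: {name}")
        return f"mm.{idx}{suffix}"

    assert map_aligner_name("model.aligner.fc1.weight") == "mm.0.weight"
    assert map_aligner_name("aligner.hidden_layers.0.bias") == "mm.1.bias"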