From 283af6414521f3a43c2f0501783d042eb241a208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 9 Jun 2025 21:06:30 +0200 Subject: [PATCH 1/5] support GEGLU for jina-bert-v2 --- convert_hf_to_gguf.py | 27 --------------------------- gguf-py/gguf/tensor_mapping.py | 1 + src/llama-graph.cpp | 9 ++++----- src/llama-model.cpp | 6 +++--- 4 files changed, 8 insertions(+), 35 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7b9893c8a3e..a208c42ba9a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4798,25 +4798,6 @@ def prepare_tensors(self): class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.intermediate_size = self.hparams["intermediate_size"] - - def get_tensors(self): - for name, data in super().get_tensors(): - if 'gated_layer' in name: - d1 = data[:self.intermediate_size, :] - name1 = name.replace('gated_layers', 'gated_layers_w') - name1 = name1.replace('up_gated_layer', 'gated_layers_v') - d2 = data[self.intermediate_size:, :] - name2 = name.replace('gated_layers', 'gated_layers_v') - name2 = name2.replace('up_gated_layer', 'gated_layers_w') - yield name1, d1 - yield name2, d2 - continue - - yield name, data - def set_vocab(self): tokenizer_class = 'BertTokenizer' with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: @@ -4832,14 +4813,6 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "bert.", remove the prefix - # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - if name.startswith("bert."): - name = name[5:] - - return super().modify_tensors(data_torch, name, bid) - @ModelBase.register("OpenELMForCausalLM") class OpenELMModel(TextModel): diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 93dd1d8028f..34e048c34d8 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -334,6 +334,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "transformer.h.{bid}.mlp.c_fc_1", # exaone diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 27c9ab74be1..a35907242b5 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -650,8 +650,8 @@ ggml_tensor * llm_graph_context::build_ffn( { // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf int64_t split_point = cur->ne[0] / 2; - ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); - ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + ggml_tensor * x0 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0); + ggml_tensor * x1 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)); x0 = ggml_silu(ctx0, x0); cb(cur, "ffn_silu", il); @@ -663,9 +663,8 @@ ggml_tensor * llm_graph_context::build_ffn( { // Split into two equal parts int64_t split_point = cur->ne[0] / 2; - // TODO: these conts should not be needed - ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); - ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + ggml_tensor * x0 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0); + ggml_tensor * x1 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)); x0 = ggml_gelu(ctx0, x0); cb(x0, "ffn_gelu", il); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c41ee24507f..f4a66390c79 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2224,8 +2224,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); @@ -6043,7 +6043,7 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, - LLM_FFN_GELU, LLM_FFN_PAR, il); + model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, From 75e7e0ab4471c29d2a3e3b463acaacc690bc79d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 9 Jun 2025 21:58:56 +0200 Subject: [PATCH 2/5] update comment [no ci] ggml-ci --- gguf-py/gguf/tensor_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 34e048c34d8..de6d763e89b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -334,7 +334,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 - "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU) "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "transformer.h.{bid}.mlp.c_fc_1", # exaone From c75069874ed0b544a821fefd7b06cf31a79cb49b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 10 Jun 2025 12:09:14 +0200 Subject: [PATCH 3/5] put back ggml_cont for now --- src/llama-graph.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a35907242b5..56082279119 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -650,8 +650,9 @@ ggml_tensor * llm_graph_context::build_ffn( { // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf int64_t split_point = cur->ne[0] / 2; - ggml_tensor * x0 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0); - ggml_tensor * x1 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)); + // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217 + ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); x0 = ggml_silu(ctx0, x0); cb(cur, "ffn_silu", il); @@ -663,8 +664,9 @@ ggml_tensor * llm_graph_context::build_ffn( { // Split into two equal parts int64_t split_point = cur->ne[0] / 2; - ggml_tensor * x0 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0); - ggml_tensor * x1 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)); + // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217 + ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); x0 = ggml_gelu(ctx0, x0); cb(x0, "ffn_gelu", il); From c09418bb0fd7faeb0d471aa1440635608b918791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 10 Jun 2025 12:32:01 +0200 Subject: [PATCH 4/5] update comments [no ci] ggml-ci --- gguf-py/gguf/tensor_mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index de6d763e89b..c50a5bbce73 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -333,7 +333,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc11", # nomic-bert "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 - "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used) "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU) "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm @@ -371,7 +371,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) "transformer.h.{bid}.mlp.linear_1", # refact "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone From 6dd4d1fa9e5f461577cf47355e8e4248c466abe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 10 Jun 2025 13:01:53 +0200 Subject: [PATCH 5/5] add jina-v2-code up_gated_layer [no ci] --- gguf-py/gguf/tensor_mapping.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index c50a5bbce73..439fc1afeeb 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -335,6 +335,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used) "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU) + "encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU) "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "transformer.h.{bid}.mlp.c_fc_1", # exaone