From 34300a03bc8d8a3b649b17341e5cbce42b95e06a Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 7 Jun 2024 14:00:01 -0700 Subject: [PATCH 1/6] Add `JAIS` model(s) --- convert-hf-to-gguf-update.py | 1 + convert-hf-to-gguf.py | 85 ++++++++++++++-- examples/main/main.cpp | 1 - ggml/src/ggml.c | 2 +- gguf-py/gguf/constants.py | 14 +++ gguf-py/gguf/tensor_mapping.py | 19 ++-- include/llama.h | 1 + src/llama.cpp | 177 ++++++++++++++++++++++++++++++++- 8 files changed, 279 insertions(+), 21 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 2758214fa87..944e9d15abd 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -86,6 +86,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B + {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5bcc849db99..fdf7fcece3b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -427,9 +427,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! 
- if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" @@ -457,18 +454,12 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": # ref: https://huggingface.co/smallcloudai/Refact-1_6-base res = "refact" - if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 - res = "command-r" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": # ref: https://huggingface.co/Qwen/Qwen1.5-7B res = "qwen2" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf res = "olmo" - if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - # ref: https://huggingface.co/databricks/dbrx-base - res = "dbrx" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en res = "jina-v2-en" @@ -490,6 +481,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": # ref: https://huggingface.co/LumiOpen/Viking-7B res = "viking" + if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": + # ref: https://huggingface.co/core42/jais-13b + res = "jais" if res is None: logger.warning("\n") @@ -2817,6 +2811,79 @@ def write_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("JAISLMHeadModel") +class JaisModel(Model): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + 
super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + # note: For some JAIS flavors, output is tied to (same as) wte in original model + self.output_is_wte = False + if 'mup_embeddings_scale' in self.hparams: + self.output_is_wte = True # Hack (?) + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias", "relative_pe.slopes")): + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == 
self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((new_name, data_torch * self.embeddings_scale)) + if self.output_is_wte: + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + assert not self.output_is_wte + tensors.append((new_name, data_torch * self.width_scale)) + else: + tensors.append((new_name, data_torch)) + + return tensors + + @Model.register("T5ForConditionalGeneration") @Model.register("T5WithLMHeadModel") diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cfaf6a6e8ba..7ab9d036977 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -733,7 +733,6 @@ int main(int argc, char ** argv) { // Console/Stream Output fprintf(stdout, "%s", token_str.c_str()); - // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check if (embd.size() > 1) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f5502afbe98..dc64881038c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -13516,13 +13516,13 @@ static void ggml_compute_forward_soft_max_f32( } else { for (int i = 0; i < nc; ++i) { wp[i] += slope*mp_f32[i]; + } } } #ifndef NDEBUG for (int i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); assert(!isnan(wp[i])); } #endif diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index cf3d09e70d3..a9f6314cbd6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -161,6 +161,7 @@ class MODEL_ARCH(IntEnum): DEEPSEEK2 = auto() BITNET = auto() T5 = auto() + JAIS = auto() class MODEL_TENSOR(IntEnum): @@ -285,6 +286,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", + MODEL_ARCH.JAIS: "jais", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -951,6 +953,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.JAIS: [ 
+ MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bed439397b..20e28423b9a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -10,7 +10,7 @@ class TensorNameMap: # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf @@ -49,7 +49,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -58,7 +58,7 @@ class TensorNameMap: # Output norm MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 gpt-j falcon + "transformer.ln_f", # gpt2 gpt-j falcon jais "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth "transformer.norm_f", # mpt dbrx @@ -81,7 +81,7 @@ class TensorNameMap: # Attention norm MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais "transformer.blocks.{bid}.norm_1", # mpt "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom @@ -109,7 +109,7 @@ class TensorNameMap: # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox - "transformer.h.{bid}.attn.c_attn", # gpt2 
qwen + "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais "transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx "transformer.h.{bid}.self_attention.query_key_value", # falcon @@ -160,7 +160,7 @@ class TensorNameMap: # Attention output MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom @@ -202,7 +202,7 @@ class TensorNameMap: # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen + "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf @@ -239,7 +239,7 @@ class TensorNameMap: # Feed-forward up MODEL_TENSOR.FFN_UP: ( "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.c_fc", # gpt2 jais "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom @@ -285,6 +285,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.gate_proj", # llama-hf refact "layers.{bid}.feed_forward.w1", # llama-pth "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 "encoder.layers.{bid}.mlp.fc12", # nomic-bert @@ -308,7 +309,7 @@ class TensorNameMap: # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais 
"transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom diff --git a/include/llama.h b/include/llama.h index cafeafb85db..c5b61829204 100644 --- a/include/llama.h +++ b/include/llama.h @@ -89,6 +89,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_VIKING = 16, + LLAMA_VOCAB_PRE_TYPE_JAIS = 17, }; // note: these values should be synchronized with ggml_rope diff --git a/src/llama.cpp b/src/llama.cpp index 3edaa98e8d0..5d0b7a29f7c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -228,6 +228,7 @@ enum llm_arch { LLM_ARCH_DEEPSEEK2, LLM_ARCH_BITNET, LLM_ARCH_T5, + LLM_ARCH_JAIS, LLM_ARCH_UNKNOWN, }; @@ -269,6 +270,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1230,6 +1232,21 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_JAIS, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2029,6 +2046,7 @@ enum e_model { MODEL_410M, MODEL_0_5B, MODEL_1B, + MODEL_1_3B, MODEL_1_4B, MODEL_2B, MODEL_2_8B, @@ -4880,6 +4898,18 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_JAIS: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + // TODO: become GGUF KV parameter + hparams.f_max_alibi_bias = 8.0f; + switch (hparams.n_layer) { + case 
24: model.type = e_model::MODEL_1_3B; break; + case 40: model.type = e_model::MODEL_13B; break; + /* TODO: add variants */ + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -5111,6 +5141,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "viking") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; + } else if ( + tokenizer_pre == "jais") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -6908,7 +6941,6 @@ static bool llm_load_tensors( case LLM_ARCH_BITNET: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); @@ -6943,6 +6975,43 @@ static bool llm_load_tensors( layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}); } } break; + case LLM_ARCH_JAIS: + { + // Output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, 
"weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -12307,6 +12376,107 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_jais() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + //struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // // inp_pos - contains the positions + // struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + // pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + // cb(pos, "pos_embd", -1); + + // inpL = ggml_add(ctx0, inpL, pos); + // cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + 
model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); 
+ cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -12538,6 +12708,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; + case LLM_ARCH_JAIS: + { + result = llm.build_jais(); + } break; default: GGML_ASSERT(false); } @@ -17760,6 +17934,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MAMBA: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: + case LLM_ARCH_JAIS: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values From a067ed8cdd5ea0f342f5e189666acf3cdd878800 Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 25 Jun 2024 12:04:08 -0700 Subject: [PATCH 2/6] cleanup --- convert-hf-to-gguf.py | 146 ++++++++++++++++++++--------------------- examples/main/main.cpp | 1 + ggml/src/ggml.c | 2 +- src/llama.cpp | 13 +--- 4 files changed, 76 insertions(+), 86 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index fdf7fcece3b..88bfcecec42 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2811,80 +2811,6 @@ def write_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("JAISLMHeadModel") -class JaisModel(Model): - model_arch = gguf.MODEL_ARCH.JAIS - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # ALiBi position embedding - assert self.hparams["position_embedding_type"] == "alibi" - - # Embeddings scale - self.embeddings_scale = 1.0 - # note: For some JAIS flavors, output is tied to (same as) wte in original model - self.output_is_wte = False - if 'mup_embeddings_scale' in self.hparams: - self.output_is_wte = True # Hack (?) 
- self.embeddings_scale = self.hparams['mup_embeddings_scale'] - elif 'embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['embeddings_scale'] - else: - assert False - - self.width_scale = 1.0 - if 'mup_output_alpha' in self.hparams: - assert 'mup_width_scale' in self.hparams - self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] - elif 'width_scale' in self.hparams: - self.width_scale = self.hparams['width_scale'] - else: - assert False - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - - # we don't need these - if name.endswith((".attn.bias", "relative_pe.slopes")): - return tensors - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((new_name, data_torch * self.embeddings_scale)) - if self.output_is_wte: - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) - elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - assert not self.output_is_wte - tensors.append((new_name, data_torch * self.width_scale)) - else: - tensors.append((new_name, data_torch)) - - 
return tensors - - - @Model.register("T5ForConditionalGeneration") @Model.register("T5WithLMHeadModel") class T5Model(Model): @@ -3002,6 +2928,78 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@Model.register("JAISLMHeadModel") +class JaisModel(Model): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + # note: For some JAIS flavors, output is tied to (same as) wte in original model + self.output_is_wte = False + if 'mup_embeddings_scale' in self.hparams: + self.output_is_wte = True # Hack (?) + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, 
bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias", "relative_pe.slopes")): + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((new_name, data_torch * self.embeddings_scale)) + if self.output_is_wte: + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + assert not self.output_is_wte + tensors.append((new_name, data_torch * self.width_scale)) + else: + tensors.append((new_name, data_torch)) + + return tensors + ###### CONVERSION LOGIC ###### diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7ab9d036977..cfaf6a6e8ba 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -733,6 +733,7 @@ int main(int argc, char ** argv) { // Console/Stream Output fprintf(stdout, "%s", token_str.c_str()); + // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check if (embd.size() > 1) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index dc64881038c..f5502afbe98 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -13516,13 +13516,13 @@ static void ggml_compute_forward_soft_max_f32( } else { for (int i = 0; i < nc; ++i) { wp[i] += slope*mp_f32[i]; - } } } #ifndef NDEBUG for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); assert(!isnan(wp[i])); } #endif diff --git a/src/llama.cpp b/src/llama.cpp index 5d0b7a29f7c..00f7f708e7b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6977,6 +6977,8 @@ static bool llm_load_tensors( } break; case LLM_ARCH_JAIS: { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, 
"weight"), {n_embd, n_vocab}); + // Output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); @@ -7009,7 +7011,6 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); - } } break; default: @@ -12384,23 +12385,13 @@ struct llm_build_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; - //struct ggml_tensor * pos; struct ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - // // inp_pos - contains the positions - // struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - // pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - // cb(pos, "pos_embd", -1); - - // inpL = ggml_add(ctx0, inpL, pos); - // cb(inpL, "inpL", -1); - for (int il = 0; il < n_layer; ++il) { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, From a8d4afb97b483cfe7d4c94eea62eaa8cf8171f45 Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 28 Jun 2024 06:51:02 -0700 Subject: [PATCH 3/6] address review comments --- convert-hf-to-gguf.py | 10 ++++++++++ include/llama.h | 5 +++++ src/llama.cpp | 9 +++++++++ 3 files changed, 24 insertions(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 88bfcecec42..4ce0b49e639 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -427,6 +427,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! 
+ if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" @@ -454,12 +457,18 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": # ref: https://huggingface.co/smallcloudai/Refact-1_6-base res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": # ref: https://huggingface.co/Qwen/Qwen1.5-7B res = "qwen2" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf res = "olmo" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": + # ref: https://huggingface.co/databricks/dbrx-base + res = "dbrx" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en res = "jina-v2-en" @@ -2811,6 +2820,7 @@ def write_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") + @Model.register("T5ForConditionalGeneration") @Model.register("T5WithLMHeadModel") class T5Model(Model): diff --git a/include/llama.h b/include/llama.h index c5b61829204..2591edce93e 100644 --- a/include/llama.h +++ b/include/llama.h @@ -652,6 +652,11 @@ extern "C" { // State / sessions // + // hack + void llama_set_logits_all( + struct llama_context * ctx, + bool logits_all); + // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens LLAMA_API size_t 
llama_state_get_size(const struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index 00f7f708e7b..88ca14db946 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4281,6 +4281,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_410M: return "410M"; case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; + case MODEL_1_3B: return "1.3B"; case MODEL_1_4B: return "1.4B"; case MODEL_2B: return "2B"; case MODEL_2_8B: return "2.8B"; @@ -13105,6 +13106,13 @@ static void llama_graph_compute( // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } +void llama_set_logits_all( + struct llama_context * ctx, + bool logits_all +) { + ctx->logits_all = logits_all; +} + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -14052,6 +14060,7 @@ struct llm_tokenizer_bpe { break; case LLAMA_VOCAB_PRE_TYPE_GPT2: case LLAMA_VOCAB_PRE_TYPE_OLMO: + case LLAMA_VOCAB_PRE_TYPE_JAIS: regex_exprs = { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }; From f42285f0e5f2a21a46fa0310a30d71a7b8ebbd47 Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 28 Jun 2024 07:10:59 -0700 Subject: [PATCH 4/6] remove hack --- include/llama.h | 5 ----- src/llama.cpp | 7 ------- 2 files changed, 12 deletions(-) diff --git a/include/llama.h b/include/llama.h index 2591edce93e..c5b61829204 100644 --- a/include/llama.h +++ b/include/llama.h @@ -652,11 +652,6 @@ extern "C" { // State / sessions // - // hack - void llama_set_logits_all( - struct llama_context * ctx, - bool logits_all); - // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index 88ca14db946..307bcef84c3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13106,13 +13106,6 @@ static void llama_graph_compute( // 
fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } -void llama_set_logits_all( - struct llama_context * ctx, - bool logits_all -) { - ctx->logits_all = logits_all; -} - // decode a batch of tokens by evaluating the transformer // // - lctx: llama context From 2d4de517bb9e4a9283f0bce11752a5e2caf0db68 Mon Sep 17 00:00:00 2001 From: fmz Date: Mon, 1 Jul 2024 09:26:56 -0700 Subject: [PATCH 5/6] un-hardcode max-alibi-bias --- convert-hf-to-gguf.py | 21 ++++++++++++++++++--- src/llama.cpp | 4 ++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4ce0b49e639..4a8aec8cd02 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2938,6 +2938,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] + @Model.register("JAISLMHeadModel") class JaisModel(Model): model_arch = gguf.MODEL_ARCH.JAIS @@ -2954,7 +2955,7 @@ def __init__(self, *args, **kwargs): self.embeddings_scale = 1.0 # note: For some JAIS flavors, output is tied to (same as) wte in original model self.output_is_wte = False - if 'mup_embeddings_scale' in self.hparams: + if 'mup_embeddings_scale' in self.hparams: self.output_is_wte = True # Hack (?) 
self.embeddings_scale = self.hparams['mup_embeddings_scale'] elif 'embeddings_scale' in self.hparams: @@ -2963,7 +2964,7 @@ def __init__(self, *args, **kwargs): assert False self.width_scale = 1.0 - if 'mup_output_alpha' in self.hparams: + if 'mup_output_alpha' in self.hparams: assert 'mup_width_scale' in self.hparams self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] elif 'width_scale' in self.hparams: @@ -2984,13 +2985,27 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) + # Hack to populate self.tensor_names + all(self.get_tensors()) + if 'transformer.relative_pe.slopes' not in self.tensor_names: + self.gguf_writer.add_max_alibi_bias(8.0) + # else set later + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused tensors: list[tuple[str, Tensor]] = [] # we don't need these - if name.endswith((".attn.bias", "relative_pe.slopes")): + if name.endswith((".attn.bias")): + return tensors + + if name.endswith(("relative_pe.slopes")): + # calculate ALiBi bias + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) + first_val = float(data_torch._data[0]) + alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + self.gguf_writer.add_max_alibi_bias(alibi_bias) return tensors if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): diff --git a/src/llama.cpp b/src/llama.cpp index 307bcef84c3..8549388f10e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4902,8 +4902,8 @@ static void llm_load_hparams( case LLM_ARCH_JAIS: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - // TODO: become GGUF KV parameter - hparams.f_max_alibi_bias = 8.0f; + ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); + switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1_3B; break; case 40: 
model.type = e_model::MODEL_13B; break; From 8b64c7ae466b3fbb6b826c3a93dfb47eb9f4f87e Mon Sep 17 00:00:00 2001 From: fmz Date: Mon, 1 Jul 2024 15:29:04 -0700 Subject: [PATCH 6/6] minor tweaks --- convert-hf-to-gguf.py | 21 ++++++++++++--------- src/llama.cpp | 1 + 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4a8aec8cd02..44061e2869e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2972,6 +2972,8 @@ def __init__(self, *args, **kwargs): else: assert False + self.max_alibi_bias = 8.0 + def set_vocab(self): self._set_vocab_gpt2() @@ -2985,12 +2987,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - # Hack to populate self.tensor_names - all(self.get_tensors()) - if 'transformer.relative_pe.slopes' not in self.tensor_names: - self.gguf_writer.add_max_alibi_bias(8.0) - # else set later - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -3001,11 +2997,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors if name.endswith(("relative_pe.slopes")): - # calculate ALiBi bias + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models have max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pe.slopes n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) first_val = float(data_torch._data[0]) - alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) - self.gguf_writer.add_max_alibi_bias(alibi_bias) + self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + return tensors if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): @@ -3025,6 +3024,10 @@ def
modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors + def write_tensors(self): + super().write_tensors() + self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) + ###### CONVERSION LOGIC ###### diff --git a/src/llama.cpp b/src/llama.cpp index 8549388f10e..e5907ac50e3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6942,6 +6942,7 @@ static bool llm_load_tensors( case LLM_ARCH_BITNET: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});