From 1a999819a20c49278d4b5c211e9cf42c07c486f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 23 Feb 2024 20:29:40 +0200 Subject: [PATCH 01/23] llama : refactor k-shift implementation ggml-ci --- examples/passkey/passkey.cpp | 2 +- llama.cpp | 328 ++++++++++++++++++----------------- llama.h | 2 + 3 files changed, 173 insertions(+), 159 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e12a1cdf19a..b6ae2288bc3 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -126,7 +126,7 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token diff --git a/llama.cpp b/llama.cpp index 37477e6ef3c..7b0961508e3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1550,8 +1550,9 @@ static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; struct llama_hparams { - bool vocab_only; - bool rope_finetuned; + bool vocab_only; + bool rope_finetuned; + uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -4595,10 +4596,11 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int il)>; -enum llm_rope_type { - LLM_ROPE, - LLM_ROPE_NEOX, - LLM_ROPE_GLM, +enum llm_rope_type : int { + LLM_ROPE_NONE = -1, + LLM_ROPE = 0, + LLM_ROPE_NEOX = 2, + LLM_ROPE_GLM = 4, }; enum llm_ffn_op_type { @@ -4655,7 +4657,7 @@ static void llm_build_k_shift( const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * K_shift, - llm_rope_type type, + llm_rope_type rope_type, int64_t n_ctx, float freq_base, float freq_scale, @@ -4671,14 +4673,6 @@ static void llm_build_k_shift( const float beta_fast = cparams.yarn_beta_fast; const float beta_slow = cparams.yarn_beta_slow; - int rope_type = 0; - - switch (type) { - case LLM_ROPE: rope_type = 0; break; - case LLM_ROPE_NEOX: rope_type = 2; break; - case LLM_ROPE_GLM: rope_type = 4; break; - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions @@ -4988,6 +4982,38 @@ static struct ggml_tensor * llm_build_kv( return cur; } +static llm_rope_type llm_get_rope_type(llm_arch arch) { + switch (arch) { + case LLM_ARCH_LLAMA: return LLM_ROPE; + case LLM_ARCH_FALCON: return LLM_ROPE_NEOX; + case LLM_ARCH_BAICHUAN: return LLM_ROPE; + case LLM_ARCH_GPT2: return LLM_ROPE_NONE; + case LLM_ARCH_GPTJ: return LLM_ROPE_NONE; + case LLM_ARCH_GPTNEOX: return LLM_ROPE_NONE; + case LLM_ARCH_MPT: return LLM_ROPE_NONE; + case LLM_ARCH_STARCODER: return LLM_ROPE; + case LLM_ARCH_PERSIMMON: return LLM_ROPE_NEOX; + case LLM_ARCH_REFACT: return LLM_ROPE_NONE; + case LLM_ARCH_BERT: return LLM_ROPE_NEOX; + case LLM_ARCH_NOMIC_BERT: return LLM_ROPE_NEOX; + case LLM_ARCH_BLOOM: return LLM_ROPE_NONE; + case LLM_ARCH_STABLELM: return LLM_ROPE_NEOX; + case LLM_ARCH_QWEN: return LLM_ROPE_NEOX; + case LLM_ARCH_QWEN2: return LLM_ROPE_NEOX; + case LLM_ARCH_PHI2: return LLM_ROPE_NEOX; + case LLM_ARCH_PLAMO: return LLM_ROPE; + case LLM_ARCH_CODESHELL: return LLM_ROPE; + case LLM_ARCH_ORION: return LLM_ROPE; + case LLM_ARCH_INTERNLM2: return LLM_ROPE; + case 
LLM_ARCH_MINICPM: return LLM_ROPE; + case LLM_ARCH_GEMMA: return LLM_ROPE; + case LLM_ARCH_UNKNOWN: + default: + GGML_ASSERT(false && "unknown architecture"); + return LLM_ROPE_NONE; + } +} + struct llm_build_context { const llama_model & model; const llama_context & lctx; @@ -5022,9 +5048,10 @@ struct llm_build_context { const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; - const bool do_rope_shift; const uint32_t pooling_type; + const llm_rope_type rope_type; + const llm_build_cb & cb; std::vector<uint8_t> & buf_compute_meta; @@ -5066,8 +5093,8 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - do_rope_shift (worst_case || kv_self.has_shift), pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), + rope_type (llm_get_rope_type(model.arch)), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5090,6 +5117,14 @@ struct llm_build_context { } } + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, rope_type, n_ctx, freq_base, freq_scale, cb); + + return gf; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -5111,11 +5146,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5151,14 +5181,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5299,11 +5329,6 @@ struct llm_build_context { struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); cb(KQ_pos, "KQ_pos", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5327,12 +5352,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, 
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -5417,11 +5442,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5460,13 +5480,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5636,10 +5656,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; @@ -5730,13 +5746,13 @@ struct llm_build_context { cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, qrot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, krot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5988,14 +6004,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6284,11 +6300,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6325,14 +6336,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, 
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6407,11 +6418,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6441,13 +6447,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6521,11 +6527,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6561,14 +6562,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6642,11 +6643,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { attn_norm_output = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -6684,7 +6680,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -6695,7 +6691,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, 
inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6764,11 +6760,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -6793,13 +6784,13 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -6969,11 +6960,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -6999,14 +6985,14 @@ struct llm_build_context { struct ggml_tensor * Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7077,11 +7063,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7117,14 +7098,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, 
beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7196,11 +7177,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7236,14 +7212,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7328,11 +7304,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7368,14 +7339,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7464,11 +7435,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -7491,7 +7457,7 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); @@ -7500,7 +7466,7 @@ struct llm_build_context { Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -7553,6 +7519,22 @@ struct llm_build_context { } }; +static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) 
{ + llama_batch dummy; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_k_shift(); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -7672,6 +7654,20 @@ static struct ggml_cgraph * llama_build_graph( return result; } +static void llama_set_k_shift(llama_context & lctx) { + const auto & cparams = lctx.cparams; + + const int64_t n_ctx = cparams.n_ctx; + + assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); + + int32_t * data = (int32_t *) lctx.inp_K_shift->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } +} + static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // // set input data @@ -7739,18 +7735,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (kv_self.has_shift) { - const int64_t n_ctx = cparams.n_ctx; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { const int64_t n_tokens = batch.n_tokens; @@ -7795,6 +7779,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } +static void llama_graph_compute( + llama_context & lctx, + ggml_cgraph * gf, + int n_threads) { +#ifdef GGML_USE_MPI + const int64_t n_layer = hparams.n_layer; + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); +#endif + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(lctx.backend_metal)) { + ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); + } +#endif + + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + } + + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); + +#ifdef GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); +#endif +} + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -7890,14 +7902,19 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + if (kv_self.has_shift) { + llama_kv_cache_apply_k_shift(&lctx); + } + ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + if (strcmp(res->name, "result_output") == 0) { // the embeddings could be the second to last tensor, or the third to last tensor if (strcmp(embeddings->name, "result_norm") != 0) { @@ -7924,40 +7941,12 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } -#ifdef GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif - -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend_metal)) { - ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); - } -#endif - - if 
(lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); - } - llama_set_inputs(lctx, batch); - ggml_backend_sched_graph_compute(lctx.sched, gf); - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - -#ifdef GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif + llama_graph_compute(lctx, gf, n_threads); // update the kv ring buffer { - if (kv_self.has_shift) { - kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; - } - } - kv_self.head += n_tokens; // Ensure kv cache head points to a valid index. @@ -8053,6 +8042,28 @@ static int llama_decode_internal( return 0; } +void llama_kv_cache_apply_k_shift(struct llama_context * ctx) { + struct llama_context & lctx = *ctx; + + llama_set_k_shift(lctx); + + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = ctx->kv_self; + + kv_self.has_shift = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } +} + // // tokenizer // @@ -12054,6 +12065,7 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. diff --git a/llama.h b/llama.h index 84f196b3bb6..618841184ae 100644 --- a/llama.h +++ b/llama.h @@ -533,6 +533,8 @@ extern "C" { llama_pos p1, int d); + LLAMA_API void llama_kv_cache_apply_k_shift(struct llama_context * ctx); + // // State / sessions //
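Usage sketch for the API added above (not part of the patch): llama_kv_cache_apply_k_shift() exposes the previously internal K-shift, so a caller can run the RoPE re-rotation of the cached K tensors eagerly instead of waiting for the next llama_decode() to notice kv_self.has_shift. A minimal caller-side sketch, assuming ctx, n_keep, n_discard, n_ctx and n_past are set up as in examples/passkey/passkey.cpp:

    // drop the oldest tokens of sequence 0, then slide the remaining cells
    // back by n_discard; this only records per-cell position deltas
    llama_kv_cache_seq_rm   (ctx, 0, n_keep            , n_keep + n_discard);
    llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);

    n_past -= n_discard;

    // apply the recorded deltas to the cached K data right away; without this
    // call, llama_decode() applies them automatically before its next graph
    llama_kv_cache_apply_k_shift(ctx);

From dd392191ca9fc2e0244c54b4cf2b888508363490 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 09:42:21 +0200 Subject: [PATCH 02/23] llama : rename llama_kv_cache_seq_shift to llama_kv_cache_seq_add --- examples/infill/infill.cpp | 4 ++-- examples/main/main.cpp | 10 +++++----- examples/passkey/passkey.cpp | 12 ++++++------ examples/server/server.cpp | 8 ++++---- llama.cpp | 6 +++--- llama.h | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 92c67b7cff5..d4b8729dd02 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -447,8 +447,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7555dffe441..34e84d0d42f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -548,8 +548,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - 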
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -576,9 +576,9 @@ int main(int argc, char ** argv) { LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index b6ae2288bc3..f5db05c2d65 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -146,8 +146,8 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); n_past -= bd; } @@ -179,8 +179,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); n_past -= n_discard; @@ -208,8 +208,8 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); n_past -= n_discard; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 369121e885b..1b887b7a2df 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1502,8 +1502,8 @@ struct llama_server_context const int n_discard = n_left / 2; LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1778,9 +1778,9 @@ struct llama_server_context LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib 
* bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); slot.n_past_se -= bd; diff --git a/llama.cpp b/llama.cpp index 7b0961508e3..accf026b194 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2187,7 +2187,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } -static void llama_kv_cache_seq_shift( +static void llama_kv_cache_seq_add( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -12049,12 +12049,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } -void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } - llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); +void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } + llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); } void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { diff --git a/llama.h b/llama.h index 618841184ae..104ca7ead0a 100644 --- a/llama.h +++ b/llama.h @@ -515,7 +515,7 @@ extern "C" { // If the KV cache is RoPEd, the KV data is updated accordingly // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_shift( + LLAMA_API void llama_kv_cache_seq_add( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0,
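Usage sketch after the rename (not part of the patch): the change is mechanical, but the new name states the semantics, i.e. the relative delta is added to the position of every cell of the sequence in [p0, p1), complementing llama_kv_cache_seq_div, which divides positions. The context-shift pattern from examples/main/main.cpp then reads (identifiers params.n_keep, n_discard and n_past as defined there):

    // drop [n_keep, n_keep + n_discard) from sequence 0 ...
    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
    // ... and add a negative delta to the positions that remain
    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

    n_past -= n_discard;

From 89b2a43cac57beeaea9c5ea0af371d3a301d7e3f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 10:28:44 +0200 Subject: [PATCH 03/23] llama : cont k-shift refactoring + normalize type names ggml-ci --- common/common.cpp | 12 +- common/common.h | 4 +- examples/llama-bench/llama-bench.cpp | 14 +- examples/server/server.cpp | 12 +- llama.cpp | 318 +++++++++++++-------------- llama.h | 41 ++-- 6 files changed, 199 insertions(+), 202 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 10ef11829cc..95767ce4b6e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string 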
arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/common/common.h b/common/common.h index 935771d44ca..3e21579b005 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 11410f8ae76..8fec3d43ddf 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_NONE: return "none"; - case LLAMA_SPLIT_LAYER: return "layer"; - case LLAMA_SPLIT_ROW: return "row"; + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_LAYER}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_NONE; + mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_LAYER; + mode = LLAMA_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_ROW; + mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1b887b7a2df..89fdd0f8185 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2082,9 +2082,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = 
LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2208,15 +2208,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; diff --git a/llama.cpp b/llama.cpp index accf026b194..a69c86e6a8d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_NONE, "none" }, - { LLAMA_ROPE_SCALING_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_UNSPECIFIED; + return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1581,7 +1581,8 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - uint32_t pooling_type = LLAMA_POOLING_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -2311,7 +2312,7 @@ namespace GGUFMeta { } }; - struct ArrayInfo{ + struct ArrayInfo { const gguf_type gt; const size_t length; const void * data; @@ -2330,7 +2331,7 @@ namespace GGUFMeta { }; template<typename T> - class GKV: public GKV_Base<T> { + class GKV : public GKV_Base<T> { GKV() = delete; public: @@ -2353,39 +2354,39 @@ namespace GGUFMeta { return "unknown"; } - static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { - if (!override) { return false; } - if (override->tag == expected_type) { + static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { + if (!ovrd) { return false; } + if (ovrd->tag == expected_type) { LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", - __func__, override_type_to_str(override->tag), override->key); - switch (override->tag) { + __func__, override_type_to_str(ovrd->tag), ovrd->key); + switch (ovrd->tag) { case LLAMA_KV_OVERRIDE_BOOL: { - LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? 
"true" : "false"); } break; case LLAMA_KV_OVERRIDE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); } break; case LLAMA_KV_OVERRIDE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", override->float_value); + LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); } break; default: // Shouldn't be possible to end up here, but just in case... throw std::runtime_error( format("Unsupported attempt to override %s type for metadata key %s\n", - override_type_to_str(override->tag), override->key)); + override_type_to_str(ovrd->tag), ovrd->key)); } return true; } LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", - __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); + __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); return false; } template static typename std::enable_if::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { - target = override->bool_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_BOOL, ovrd)) { + target = ovrd->bool_value; return true; } return false; @@ -2393,9 +2394,9 @@ namespace GGUFMeta { template static typename std::enable_if::value && std::is_integral::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { - target = override->int_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_INT, ovrd)) { + target = ovrd->int_value; return true; } return false; @@ -2403,9 +2404,9 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { - target = override->float_value; + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, ovrd)) { + target = ovrd->float_value; return true; } return false; @@ -2413,17 +2414,17 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { + try_override(T & target, const struct llama_model_kv_override * ovrd) { (void)target; - (void)override; - if (!override) { return false; } + (void)ovrd; + if (!ovrd) { return false; } // Currently, we should never end up here so it would be a bug if we do. throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - override ? override->key : "NULL")); + ovrd ? 
ovrd->key : "NULL")); } - static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { - if (try_override(target, override)) { + static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + if (try_override(target, ovrd)) { return true; } if (k < 0) { return false; } @@ -2431,12 +2432,12 @@ namespace GGUFMeta { return true; } - static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, gguf_find_key(ctx, key), target, override); + static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, gguf_find_key(ctx, key), target, ovrd); } - static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, key.c_str(), target, override); + static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, key.c_str(), target, ovrd); } }; } @@ -2846,6 +2847,15 @@ struct llama_model_loader { } }; +template<> +bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) { + uint32_t tmp; + const bool found = get_key(kid, tmp, required); + result = (enum llama_pooling_type) tmp; + return found; +} + + // // load LLaMA models // @@ -2924,16 +2934,16 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } + static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } - static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); if (model.arch == LLM_ARCH_UNKNOWN) { @@ -2997,7 +3007,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3110,10 +3120,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); switch (hparams.n_layer) { case 3: @@ -3131,10 +3141,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_NOMIC_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + 
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); if (hparams.n_layer == 12 && hparams.n_embd == 768) { model.type = e_model::MODEL_137M; @@ -3273,6 +3283,8 @@ static void llm_load_hparams( if (hparams.f_max_alibi_bias > 0.0f) { hparams.need_kq_pos = true; } + + hparams.rope_type = llama_rope_type(&model); } // TODO: This should probably be in llama.h @@ -3575,6 +3587,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); + LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); + LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); @@ -3641,7 +3655,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_LAYER) { + if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3680,10 +3694,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_ROW) { + if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -4596,13 +4610,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam using llm_build_cb = std::function; -enum llm_rope_type : int { - LLM_ROPE_NONE = -1, - LLM_ROPE = 0, - LLM_ROPE_NEOX = 2, - LLM_ROPE_GLM = 4, -}; - enum llm_ffn_op_type { LLM_FFN_SILU, LLM_FFN_GELU, @@ -4648,47 +4655,6 @@ static struct ggml_tensor * llm_build_inp_embd( return inpL; } -// Persimmon: n_rot = n_embd_head_k/2 -// Other: n_rot = n_embd_head_k -static void llm_build_k_shift( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * K_shift, - llm_rope_type rope_type, - int64_t n_ctx, - float freq_base, - float freq_scale, - const llm_build_cb & cb) { - const int64_t n_layer = hparams.n_layer; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int32_t n_rot = hparams.n_rot; - const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; - const float ext_factor = cparams.yarn_ext_factor; - const float attn_factor = cparams.yarn_attn_factor; - const float beta_fast = cparams.yarn_beta_fast; - const float beta_slow = cparams.yarn_beta_slow; - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we 
rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - 0), - K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); - } -} - static void llm_build_kv_store( struct ggml_context * ctx, const llama_hparams & hparams, @@ -4982,38 +4948,6 @@ static struct ggml_tensor * llm_build_kv( return cur; } -static llm_rope_type llm_get_rope_type(llm_arch arch) { - switch (arch) { - case LLM_ARCH_LLAMA: return LLM_ROPE; - case LLM_ARCH_FALCON: return LLM_ROPE_NEOX; - case LLM_ARCH_BAICHUAN: return LLM_ROPE; - case LLM_ARCH_GPT2: return LLM_ROPE_NONE; - case LLM_ARCH_GPTJ: return LLM_ROPE_NONE; - case LLM_ARCH_GPTNEOX: return LLM_ROPE_NONE; - case LLM_ARCH_MPT: return LLM_ROPE_NONE; - case LLM_ARCH_STARCODER: return LLM_ROPE; - case LLM_ARCH_PERSIMMON: return LLM_ROPE_NEOX; - case LLM_ARCH_REFACT: return LLM_ROPE_NONE; - case LLM_ARCH_BERT: return LLM_ROPE_NEOX; - case LLM_ARCH_NOMIC_BERT: return LLM_ROPE_NEOX; - case LLM_ARCH_BLOOM: return LLM_ROPE_NONE; - case LLM_ARCH_STABLELM: return LLM_ROPE_NEOX; - case LLM_ARCH_QWEN: return LLM_ROPE_NEOX; - case LLM_ARCH_QWEN2: return LLM_ROPE_NEOX; - case LLM_ARCH_PHI2: return LLM_ROPE_NEOX; - case LLM_ARCH_PLAMO: return LLM_ROPE; - case LLM_ARCH_CODESHELL: return LLM_ROPE; - case LLM_ARCH_ORION: return LLM_ROPE; - case LLM_ARCH_INTERNLM2: return LLM_ROPE; - case LLM_ARCH_MINICPM: return LLM_ROPE; - case LLM_ARCH_GEMMA: return LLM_ROPE; - case LLM_ARCH_UNKNOWN: - default: - GGML_ASSERT(false && "unknown architecture"); - return LLM_ROPE_NONE; - } -} - struct llm_build_context { const llama_model & model; const llama_context & lctx; @@ -5024,6 +4958,7 @@ struct llm_build_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; const int64_t n_head_kv; @@ -5048,9 +4983,8 @@ struct llm_build_context { const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; - const uint32_t pooling_type; - - const llm_rope_type rope_type; + const enum llama_pooling_type pooling_type; + const enum llama_rope_type rope_type; const llm_build_cb & cb; @@ -5072,6 +5006,7 @@ struct llm_build_context { kv_self (lctx.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), + n_rot (hparams.n_rot), n_ctx (cparams.n_ctx), n_head (hparams.n_head), n_head_kv (hparams.n_head_kv), @@ -5093,8 +5028,8 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), - rope_type (llm_get_rope_type(model.arch)), + pooling_type (cparams.do_pooling ? 
hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5120,7 +5055,20 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, rope_type, n_ctx, freq_base, freq_scale, cb); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0), + lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); + } return gf; } @@ -6063,12 +6011,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_MEAN) { + if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_CLS) { + } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7521,6 +7469,7 @@ struct llm_build_context { static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { llama_batch dummy; + dummy.n_tokens = 0; llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; @@ -7735,7 +7684,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7763,7 +7712,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -7784,7 +7733,7 @@ static void llama_graph_compute( ggml_cgraph * gf, int n_threads) { #ifdef GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; + const int64_t n_layer = lctx.hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif @@ -7902,9 +7851,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - if (kv_self.has_shift) { - llama_kv_cache_apply_k_shift(&lctx); - } + llama_kv_cache_apply(&lctx); ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8042,24 +7989,25 @@ static int llama_decode_internal( return 0; } -void llama_kv_cache_apply_k_shift(struct llama_context * ctx) { - struct llama_context & lctx = *ctx; - - llama_set_k_shift(lctx); +static void llama_kv_cache_apply_internal(struct llama_context & lctx) { + // apply K-shift if 
needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + llama_set_k_shift(lctx); - { - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); - } + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } - { - auto & kv_self = ctx->kv_self; + { + auto & kv_self = lctx.kv_self; - kv_self.has_shift = false; + kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } } } } @@ -11338,7 +11286,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11364,7 +11312,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11552,16 +11500,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11595,8 +11543,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11605,7 +11553,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { @@ -11807,6 +11755,38 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } +enum llama_rope_type llama_rope_type(const struct llama_model * model) { + switch (model->arch) { + case LLM_ARCH_LLAMA: return LLAMA_ROPE_TYPE; + case LLM_ARCH_FALCON: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_BAICHUAN: return LLAMA_ROPE_TYPE; + case LLM_ARCH_GPT2: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_GPTJ: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_MPT: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_STARCODER: return LLAMA_ROPE_TYPE; + case LLM_ARCH_PERSIMMON: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_REFACT: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_BERT: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_NOMIC_BERT: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_BLOOM: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_STABLELM: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_QWEN: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_QWEN2: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_PHI2: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_PLAMO: return LLAMA_ROPE_TYPE; + case LLM_ARCH_CODESHELL: return LLAMA_ROPE_TYPE; + case LLM_ARCH_ORION: return LLAMA_ROPE_TYPE; + case LLM_ARCH_INTERNLM2: return LLAMA_ROPE_TYPE; + case LLM_ARCH_MINICPM: return LLAMA_ROPE_TYPE; + case LLM_ARCH_GEMMA: return LLAMA_ROPE_TYPE; + case LLM_ARCH_UNKNOWN: + default: + GGML_ASSERT(false && "unknown architecture"); + return LLAMA_ROPE_TYPE_NONE; + } +} + int32_t llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } @@ -12065,6 +12045,10 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +void llama_kv_cache_apply(struct llama_context * ctx) { + llama_kv_cache_apply_internal(*ctx); +} + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 104ca7ead0a..479265f6c1b 100644 --- a/llama.h +++ b/llama.h @@ -64,6 +64,13 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; + enum llama_rope_type { + LLAMA_ROPE_TYPE_NONE = -1, + LLAMA_ROPE_TYPE = 0, + LLAMA_ROPE_TYPE_NEOX = 2, + LLAMA_ROPE_TYPE_GLM = 4, + }; + enum llama_token_type { LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, @@ 
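// ---- illustrative sketch (not part of the patch) ----
// Hedged usage of the new accessor, given a loaded llama_model * model:
// callers can skip scheduling a K-shift entirely when the cached K carries
// no rotary position information.
if (llama_rope_type(model) == LLAMA_ROPE_TYPE_NONE) {
    // positions are not encoded in K, so no rotation is needed after a shift
}
// The numeric values (-1, 0, 2, 4) are kept in sync with the ggml_rope mode
// values, so rope_type can be passed straight through to ggml_rope_custom().
// ---- end of sketch ----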
-107,23 +114,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_NONE = 0, - LLAMA_ROPE_SCALING_LINEAR = 1, - LLAMA_ROPE_SCALING_YARN = 2, - LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_TYPE_NONE = 0, + LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, + LLAMA_ROPE_SCALING_TYPE_YARN = 2, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_NONE = 0, - LLAMA_POOLING_MEAN = 1, - LLAMA_POOLING_CLS = 2, + LLAMA_POOLING_TYPE_NONE = 0, + LLAMA_POOLING_TYPE_MEAN = 1, + LLAMA_POOLING_TYPE_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_NONE = 0, // single GPU - LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_NONE = 0, // single GPU + LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { @@ -358,6 +365,7 @@ extern "C" { LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @@ -512,7 +520,9 @@ extern "C" { llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_apply() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_add( @@ -523,7 +533,9 @@ extern "C" { llama_pos delta); // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_apply() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( @@ -533,7 +545,8 @@ extern "C" { llama_pos p1, int d); - LLAMA_API void llama_kv_cache_apply_k_shift(struct llama_context * ctx); + // Apply the KV cache updates (such as K-shifts) to the KV data + LLAMA_API void llama_kv_cache_apply(struct llama_context * ctx); // // State / sessions From 2b9a9bff2b0d70bc8c88cd3c756305129da4cbd5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 10:41:21 +0200 Subject: [PATCH 04/23] minor : fix MPI builds --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a69c86e6a8d..4b257c472da 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7733,7 +7733,7 @@ static void llama_graph_compute( ggml_cgraph * gf, int n_threads) { #ifdef GGML_USE_MPI - const int64_t n_layer = lctx.hparams.n_layer; + const int64_t n_layer = lctx.model.hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif From 5f5b1b57caee36bf9835bdea5514731b5b574322 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 10:44:59 +0200 Subject: [PATCH 05/23] llama : reuse n_rot from the build context ggml-ci --- llama.cpp | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 
insertions(+), 34 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4b257c472da..3a257a8637e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5129,14 +5129,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5300,12 +5300,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -5428,13 +5428,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5661,7 +5661,7 @@ struct llm_build_context { // RoPE the first n_rot of q/k, pass the other half, and concat. 
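// ---- illustrative sketch (not part of the patch) ----
// "Partial rotary": only the first n_rot dims of each head are rotated, the
// remaining dims pass through unchanged. A toy float32 reference for one head
// at position pos; it assumes <math.h>, ignores the YaRN parameters, and uses
// the consecutive-pair ("normal") layout for clarity, whereas the Persimmon
// path itself uses the NEOX layout:
static void partial_rope_ref(float * head, int n_embd_head, int n_rot, int pos, float freq_base) {
    for (int i = 0; i < n_rot; i += 2) {
        const float theta = pos * powf(freq_base, -(float) i / (float) n_rot);
        const float c = cosf(theta);
        const float s = sinf(theta);
        const float x0 = head[i + 0];
        const float x1 = head[i + 1];
        head[i + 0] = x0*c - x1*s; // rotate the pair (x0, x1) by theta
        head[i + 1] = x0*s + x1*c;
    }
    (void) n_embd_head; // dims [n_rot, n_embd_head) are the "pass" half
}
// ---- end of sketch ----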
struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, 0 @@ -5669,7 +5669,7 @@ struct llm_build_context { cb(qrot, "qrot", il); struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 @@ -5678,29 +5678,29 @@ struct llm_build_context { // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * hparams.n_rot + ggml_element_size(tmpq) * n_rot ); cb(qpass, "qpass", il); struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * hparams.n_rot + ggml_element_size(tmpk) * n_rot ); cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5952,14 +5952,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6284,14 +6284,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6395,13 +6395,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, 
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6510,14 +6510,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6628,7 +6628,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -6639,7 +6639,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6731,13 +6731,13 @@ struct llm_build_context { cb(Vcur, "Vcur", il); Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -6933,14 +6933,14 @@ struct llm_build_context { struct ggml_tensor * Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7046,14 +7046,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7160,14 +7160,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, 
n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7287,14 +7287,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); From 42ddf4846c556bad1599654df5cd4ec6b9a792a6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 11:23:37 +0200 Subject: [PATCH 06/23] llama : revert enum name changes from this PR ggml-ci --- common/common.cpp | 12 ++++---- common/common.h | 4 +-- examples/llama-bench/llama-bench.cpp | 14 ++++----- examples/server/server.cpp | 12 ++++---- llama.cpp | 46 ++++++++++++++-------------- llama.h | 22 ++++++------- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 95767ce4b6e..10ef11829cc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + params.split_mode = LLAMA_SPLIT_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + params.split_mode = LLAMA_SPLIT_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; + params.split_mode = LLAMA_SPLIT_ROW; } else { invalid_param = true; break; diff --git a/common/common.h b/common/common.h index 3e21579b005..935771d44ca 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model 
across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8fec3d43ddf..11410f8ae76 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; + case LLAMA_SPLIT_NONE: return "none"; + case LLAMA_SPLIT_LAYER: return "layer"; + case LLAMA_SPLIT_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* split_mode */ {LLAMA_SPLIT_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_MODE_NONE; + mode = LLAMA_SPLIT_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_MODE_LAYER; + mode = LLAMA_SPLIT_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_MODE_ROW; + mode = LLAMA_SPLIT_ROW; } else { invalid_param = true; break; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 89fdd0f8185..1b887b7a2df 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2082,9 +2082,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2208,15 +2208,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + params.split_mode = LLAMA_SPLIT_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + params.split_mode = LLAMA_SPLIT_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; + params.split_mode 
= LLAMA_SPLIT_ROW; } else { invalid_param = true; diff --git a/llama.cpp b/llama.cpp index 3a257a8637e..d950fc02282 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, - { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + return LLAMA_ROPE_SCALING_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1581,7 +1581,7 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; bool operator!=(const llama_hparams & other) const { @@ -3007,7 +3007,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3655,7 +3655,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_MODE_LAYER) { + if (split_mode == LLAMA_SPLIT_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3694,10 +3694,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_MODE_ROW) { + if (split_mode == LLAMA_SPLIT_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -5028,7 +5028,7 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + pooling_type (cparams.do_pooling ? 
hparams.pooling_type : LLAMA_POOLING_NONE), rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { @@ -6011,12 +6011,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { + if (pooling_type == LLAMA_POOLING_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { + } else if (pooling_type == LLAMA_POOLING_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7684,7 +7684,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7712,7 +7712,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -11286,7 +11286,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11312,7 +11312,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11500,16 +11500,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 
1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11543,8 +11543,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11553,7 +11553,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + // LLAMA_SPLIT_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { diff --git a/llama.h b/llama.h index 479265f6c1b..ef87ed5a652 100644 --- a/llama.h +++ b/llama.h @@ -114,23 +114,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_TYPE_NONE = 0, - LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, - LLAMA_ROPE_SCALING_TYPE_YARN = 2, - LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_TYPE_NONE = 0, - LLAMA_POOLING_TYPE_MEAN = 1, - LLAMA_POOLING_TYPE_CLS = 2, + LLAMA_POOLING_NONE = 0, + LLAMA_POOLING_MEAN = 1, + LLAMA_POOLING_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_MODE_NONE = 0, // single GPU - LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_NONE = 0, // single GPU + LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { From 31e1ec928fc11dc793135da211eae3f78a6dc68f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 11:38:00 +0200 Subject: [PATCH 07/23] llama : update llama_rope_type --- llama.cpp | 57 ++++++++++++++++++++++++++++++++----------------------- llama.h | 2 +- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/llama.cpp b/llama.cpp index d950fc02282..cbeb9714ec7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11757,31 +11757,40 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { enum llama_rope_type llama_rope_type(const struct llama_model * model) { switch (model->arch) { - case LLM_ARCH_LLAMA: return LLAMA_ROPE_TYPE; - case LLM_ARCH_FALCON: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_BAICHUAN: return LLAMA_ROPE_TYPE; - case LLM_ARCH_GPT2: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_GPTJ: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_MPT: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_STARCODER: return LLAMA_ROPE_TYPE; - case LLM_ARCH_PERSIMMON: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_REFACT: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_BERT: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_NOMIC_BERT: return 
LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_BLOOM: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_STABLELM: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_QWEN: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_QWEN2: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_PHI2: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_PLAMO: return LLAMA_ROPE_TYPE; - case LLM_ARCH_CODESHELL: return LLAMA_ROPE_TYPE; - case LLM_ARCH_ORION: return LLAMA_ROPE_TYPE; - case LLM_ARCH_INTERNLM2: return LLAMA_ROPE_TYPE; - case LLM_ARCH_MINICPM: return LLAMA_ROPE_TYPE; - case LLM_ARCH_GEMMA: return LLAMA_ROPE_TYPE; + // these models do not use RoPE + case LLM_ARCH_GPT2: + case LLM_ARCH_GPTJ: + case LLM_ARCH_GPTNEOX: + case LLM_ARCH_MPT: + case LLM_ARCH_REFACT: + case LLM_ARCH_BLOOM: + return LLAMA_ROPE_TYPE_NONE; + + // use what we call a normal RoPE, operating on pairs of consecutive head values + case LLM_ARCH_LLAMA: + case LLM_ARCH_BAICHUAN: + case LLM_ARCH_STARCODER: + case LLM_ARCH_PLAMO: + case LLM_ARCH_CODESHELL: + case LLM_ARCH_ORION: + case LLM_ARCH_INTERNLM2: + case LLM_ARCH_MINICPM: + case LLM_ARCH_GEMMA: + return LLAMA_ROPE_TYPE_NORM; + + // the pairs of head values are offset by n_rot/2 + case LLM_ARCH_FALCON: + case LLM_ARCH_PERSIMMON: + case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_STABLELM: + case LLM_ARCH_QWEN: + case LLM_ARCH_QWEN2: + case LLM_ARCH_PHI2: + return LLAMA_ROPE_TYPE_NEOX; + + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: - default: GGML_ASSERT(false && "unknown architecture"); return LLAMA_ROPE_TYPE_NONE; } diff --git a/llama.h b/llama.h index ef87ed5a652..160feeda4fb 100644 --- a/llama.h +++ b/llama.h @@ -66,7 +66,7 @@ extern "C" { enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, - LLAMA_ROPE_TYPE = 0, + LLAMA_ROPE_TYPE_NORM = 0, LLAMA_ROPE_TYPE_NEOX = 2, LLAMA_ROPE_TYPE_GLM = 4, }; From decea312200d183001ff2384c0c1b2fc36f52034 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 11:42:55 +0200 Subject: [PATCH 08/23] llama : add comment about rope values --- llama.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.h b/llama.h index 160feeda4fb..dda6aa39d36 100644 --- a/llama.h +++ b/llama.h @@ -64,6 +64,8 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; + // note: these values should be synchronized with ggml_rope + // TODO: maybe move this enum to ggml.h (ggml_rope_type) enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, LLAMA_ROPE_TYPE_NORM = 0, From 8f9fe6dd7fb7102271ec6e04bbe668c553e3a6d1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:40:44 +0200 Subject: [PATCH 09/23] llama : fix build --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cbeb9714ec7..f074bb2628b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11792,8 +11792,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ASSERT(false && "unknown architecture"); - return LLAMA_ROPE_TYPE_NONE; + break; } + + return LLAMA_ROPE_TYPE_NONE; } int32_t llama_n_vocab(const struct llama_model * model) { From 79e276175e50495aca4460568ad5ee1490bac732 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:44:02 +0200 Subject: [PATCH 10/23] passkey : apply kv cache updates explicitly ggml-ci --- examples/passkey/passkey.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index f5db05c2d65..4e129947cb6 100644 --- 
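// ---- illustrative sketch (not part of the patch) ----
// The regrouped switch above encodes which value indices form a rotated pair.
// A toy index-only helper (assumes <utility>; k is the pair index):
static std::pair<int, int> rope_pair(int k, int n_rot, bool neox) {
    return neox ? std::make_pair(k, k + n_rot/2) // NEOX: pairs offset by n_rot/2
                : std::make_pair(2*k, 2*k + 1);  // NORM: consecutive values
}
// ---- end of sketch ----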
a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -148,6 +148,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_apply (ctx); n_past -= bd; } @@ -181,6 +182,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_apply (ctx); n_past -= n_discard; @@ -210,6 +212,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_apply (ctx); n_past -= n_discard; } From 18da970e1c8bf9489fdf1d0d1cd2c5ff9d60754a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:46:33 +0200 Subject: [PATCH 11/23] llama : change name to llama_kv_cache_update() --- examples/passkey/passkey.cpp | 6 +++--- llama.cpp | 8 ++++---- llama.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 4e129947cb6..574728f89a5 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -148,7 +148,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_apply (ctx); + llama_kv_cache_update (ctx); n_past -= bd; } @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - llama_kv_cache_apply (ctx); + llama_kv_cache_update (ctx); n_past -= n_discard; @@ -212,7 +212,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - llama_kv_cache_apply (ctx); + llama_kv_cache_update (ctx); n_past -= n_discard; } diff --git a/llama.cpp b/llama.cpp index f074bb2628b..263fdf13e4f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7851,7 +7851,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - llama_kv_cache_apply(&lctx); + llama_kv_cache_update(&lctx); ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -7989,7 +7989,7 @@ static int llama_decode_internal( return 0; } -static void llama_kv_cache_apply_internal(struct llama_context & lctx) { +static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { llama_set_k_shift(lctx); @@ -12056,8 +12056,8 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } -void llama_kv_cache_apply(struct llama_context * ctx) { - llama_kv_cache_apply_internal(*ctx); +void llama_kv_cache_update(struct llama_context * ctx) { + llama_kv_cache_update_internal(*ctx); } diff --git a/llama.h b/llama.h index dda6aa39d36..b1621d6a3f1 100644 --- a/llama.h +++ b/llama.h @@ -524,7 +524,7 @@ extern "C" { // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions 
in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_apply() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_add( @@ -537,7 +537,7 @@ extern "C" { // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_apply() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( @@ -548,7 +548,7 @@ extern "C" { int d); // Apply the KV cache updates (such as K-shifts) to the KV data - LLAMA_API void llama_kv_cache_apply(struct llama_context * ctx); + LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); // // State / sessions From b75ec64ed21dc965b35fb35bf597e6f8d2d52e5a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:54:29 +0200 Subject: [PATCH 12/23] llama : add llama_kv_cache_seq_pos_max() --- examples/passkey/passkey.cpp | 6 +++--- llama.cpp | 16 ++++++++++++++++ llama.h | 5 +++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 574728f89a5..1e483edc025 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -150,7 +150,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); llama_kv_cache_update (ctx); - n_past -= bd; + n_past = llama_kv_cache_seq_pos_max(ctx, 0); } llama_batch_clear(batch); @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0); llama_batch_clear(batch); @@ -214,7 +214,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0); } } diff --git a/llama.cpp b/llama.cpp index 263fdf13e4f..46c82b4adea 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2241,6 +2241,18 @@ static void llama_kv_cache_seq_div( } } +static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id)) { + result = std::max(result, cache.cells[i].pos); + } + } + + return result; +} + // // model loading and saving // @@ -12056,6 +12068,10 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); +} + void llama_kv_cache_update(struct llama_context * ctx) { llama_kv_cache_update_internal(*ctx); } diff --git a/llama.h b/llama.h index b1621d6a3f1..faea891e479 100644 --- a/llama.h +++ b/llama.h @@ -547,6 +547,11 @@ extern "C" { llama_pos p1, int d); + // Returns the largest position present in the KV cache for the specified sequence + LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id); + // Apply the KV cache updates (such as K-shifts) to the KV data LLAMA_API void llama_kv_cache_update(struct llama_context * 
ctx); From 032ff857064176fbf8f1028311129b89faf336fe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 10:58:18 +0200 Subject: [PATCH 13/23] passkey : fix llama_kv_cache_seq_pos_max() usage --- examples/passkey/passkey.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 1e483edc025..a3a63977fc2 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -150,7 +150,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); llama_kv_cache_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0); + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } llama_batch_clear(batch); @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0); + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; llama_batch_clear(batch); @@ -214,7 +214,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0); + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } } From 715a3433436cb7a524461c20b89d2fc13589c5cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 10:59:52 +0200 Subject: [PATCH 14/23] llama : some llama_kv_cell simplifications --- llama.cpp | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/llama.cpp b/llama.cpp index 46c82b4adea..0effc6db3f0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1709,6 +1709,14 @@ struct llama_kv_cell { bool has_seq_id(const llama_seq_id & id) const { return seq_id.find(id) != seq_id.end(); } + + bool is_empty() const { + return seq_id.empty(); + } + + bool is_same_seq(const llama_kv_cell & other) const { + return seq_id == other.seq_id; + } }; // ring-buffer of cached KV data @@ -2101,7 +2109,7 @@ static bool llama_kv_cache_find_slot( // find how many cells are currently in use static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { for (uint32_t i = cache.size - 1; i > 0; --i) { - if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { + if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) { return i + 1; } } @@ -2137,7 +2145,7 @@ static void llama_kv_cache_seq_rm( } else { continue; } - if (cache.cells[i].seq_id.empty()) { + if (cache.cells[i].is_empty()) { // keep count of the number of used cells if (cache.cells[i].pos >= 0) cache.used--; @@ -2206,10 +2214,14 @@ static void llama_kv_cache_seq_add( cache.cells[i].delta += delta; if (cache.cells[i].pos < 0) { - if (!cache.cells[i].seq_id.empty()) cache.used--; + if (!cache.cells[i].is_empty()) { + cache.used--; + } cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; + if (new_head == cache.size) { + new_head = i; + } } } } @@ -11618,8 +11630,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, - cparams.n_ctx, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -12203,10 +12214,10 @@ 
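// ---- illustrative note (not part of the patch) ----
// With llama_kv_cache_seq_pos_max() the caller no longer tracks position
// deltas by hand after cache edits:
//   n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
// The "+ 1" matters: the function returns the largest position still present,
// while n_past must be the position the next decoded token will take; hence
// the follow-up "fix llama_kv_cache_seq_pos_max() usage" patch above.
// ---- end of note ----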
static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const auto n_layer = hparams.n_layer; - const auto n_embd_k_gqa = hparams.n_embd_k_gqa(); - const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); - const auto n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; @@ -12221,14 +12232,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + tmp_buf.resize(k_size); ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); @@ -12315,10 +12328,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const int n_layer = hparams.n_layer; - const int n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; size_t kv_buf_size; uint32_t kv_head; @@ -12334,13 +12347,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { GGML_ASSERT(kv_self.total_size() == kv_buf_size); for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; From fdfa5bc76b52b3551343d606069cb3107433f236 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 11:00:19 +0200 Subject: [PATCH 15/23] llama : add llama_kv_cache_compress (EXPERIMENTAL) --- examples/passkey/passkey.cpp | 7 +- llama.cpp | 210 ++++++++++++++++++++++++++++++++++- llama.h | 5 + 3 files changed, 215 insertions(+), 7 deletions(-) diff --git a/examples/passkey/passkey.cpp 
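// ---- illustrative note (not part of the patch) ----
// The new llama_kv_cell helpers read as set operations and are what the
// compression pass in the next patch builds on, roughly:
//   if (!cell0.is_empty() && cell0.is_same_seq(cell1)) { /* merge candidates */ }
// is_same_seq() compares full sequence membership, so two cells only merge
// when they belong to exactly the same set of sequences.
// ---- end of note ----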
b/examples/passkey/passkey.cpp index a3a63977fc2..e2725aaa6e9 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -146,9 +146,10 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_compress(ctx, 0); + llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } diff --git a/llama.cpp b/llama.cpp index 0effc6db3f0..e90609089de 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1733,6 +1733,12 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + // if non-negative, compress data on next update + llama_pos compress_delta = -1; + std::vector<llama_kv_cell> cells; std::vector<struct ggml_tensor *> k_l; // per layer @@ -1968,8 +1974,8 @@ struct llama_context { static bool llama_kv_cache_init( struct llama_kv_cache & cache, const llama_model & model, - ggml_type ktype, - ggml_type vtype, + ggml_type type_k, + ggml_type type_v, uint32_t n_ctx, bool offload) { const struct llama_hparams & hparams = model.hparams; @@ -1984,6 +1990,9 @@ static bool llama_kv_cache_init( cache.size = n_ctx; cache.used = 0; + cache.type_k = type_k; + cache.type_v = type_v; + cache.cells.clear(); cache.cells.resize(n_ctx); @@ -2024,8 +2033,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -2265,6 +2274,10 @@ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama return result; } +static void llama_kv_cache_compress(struct llama_kv_cache & cache, llama_pos delta) { + cache.compress_delta = delta; +} + // // model loading and saving // @@ -8034,6 +8047,191 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } } } + + // compress the KV cache data if needed: + // + // - determine which KV cell pairs (i0, i1) to merge: + // + // abs(cell[i0].pos - cell[i1].pos) <= compress_delta + // + // - move the KV cache to the Host memory for easier manipulation + // - processing is done layer-by-layer + // - convert the KV data to F32 + // - merge the KV data (different ways to merge) + // - convert the KV data back to the original type + // - move the KV cache back to the device memory + // - update the KV cache metadata + // + // as a side effect, the new KV cache is defragmented + // + if (lctx.kv_self.compress_delta >= 0) { + auto & kv_self = lctx.kv_self; + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t kv_size = kv_self.size; + + std::vector<uint8_t> buf_q; + + std::vector<float> buf_src_f32; + std::vector<float>
buf_dst_f32; + + const int64_t t_start = ggml_time_us(); + + struct c_pair { uint32_t i0, i1; }; + struct c_info { bool merged; uint32_t id, cnt;}; + + std::vector infos(kv_size, { false, 0, 0 }); + + // the destination cell in the new KV cache + uint32_t id = 0; + + // number of pairs merged + uint32_t n_merges = 0; + + // determine which KV cells to merge + for (uint32_t i0 = 0; i0 < kv_size; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty() && !infos[i0].merged) { + infos[i0] = { true, id, 0 }; + infos[id].cnt = 1; + + const llama_pos p0 = cell0.pos; + + for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (i0 != i1 && cell0.is_same_seq(cell1)) { + const llama_pos p1 = cell1.pos; + + if (std::abs(p0 - p1) <= kv_self.compress_delta) { + infos[i1] = { true, id, 0 }; + infos[id].cnt++; + n_merges++; + } + } + } + + if (i0 != id) { + kv_self.cells[id] = cell0; + } + + id++; + } + } + + kv_self.head = id; + kv_self.used = id; + + for (uint32_t i = id; i < kv_size; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } + + LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); + + ggml_type_traits_t tt_k; + ggml_type_traits_t tt_v; + + tt_k = ggml_internal_get_type_traits(kv_self.type_k); + tt_v = ggml_internal_get_type_traits(kv_self.type_v); + + for (uint32_t il = 0; il < n_layer; ++il) { + // update keys + { + const int64_t ne = n_embd_k_gqa*kv_size; + + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); + + buf_q.resize(k_size); + + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); + + tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); + + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } + + const uint32_t id = infos[i].id; + + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + + const int64_t os = i*n_embd_k_gqa; + const int64_t od = id*n_embd_k_gqa; + + for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { + buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; + } + } + } + + tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); + + ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); + } + + // update values (note: they are transposed) + { + const int64_t ne = n_embd_v_gqa*kv_size; + + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); + + buf_q.resize(v_size); + + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); + + ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + + tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); + + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } + + const uint32_t id = infos[i].id; + + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); + + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; + } + } + } + + tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); + + ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + } + } + + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); + + kv_self.compress_delta = -1; + } } // @@ -12083,6 +12281,10 @@ llama_pos 
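// ---- illustrative sketch (not part of the patch) ----
// The merge above averages all source rows mapped to one destination cell,
// operating on dequantized f32 copies. A self-contained toy version for a
// contiguous row (the real V path additionally strides by kv_size because V
// is stored transposed):
static void merge_rows_avg(const float * src, float * dst, int n, int cnt) {
    const float scale = 1.0f / (float) cnt; // each of the cnt merged cells contributes equally
    for (int j = 0; j < n; ++j) {
        dst[j] += src[j] * scale;           // dst must start zero-filled
    }
}
// ---- end of sketch ----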
llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id se
     return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
 }
 
+void llama_kv_cache_compress(struct llama_context * ctx, llama_pos delta) {
+    llama_kv_cache_compress(ctx->kv_self, delta);
+}
+
 void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }

diff --git a/llama.h b/llama.h
index faea891e479..3fac7b79c82 100644
--- a/llama.h
+++ b/llama.h
@@ -552,6 +552,11 @@ extern "C" {
             struct llama_context * ctx,
             llama_seq_id seq_id);
 
+    // [EXPERIMENTAL] Compress the data in the KV cache
+    LLAMA_API void llama_kv_cache_compress(
+            struct llama_context * ctx,
+                   llama_pos delta);
+
     // Apply the KV cache updates (such as K-shifts) to the KV data
     LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);

From 9ec749df59982d84f7a5bafb8d08a2f4ca08f00f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 25 Feb 2024 13:57:43 +0200
Subject: [PATCH 16/23] llama : add alternative KV cache merging (EXPERIMENTAL)

---
 llama.cpp | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 1fb53f3db45..2c05921bb86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8072,10 +8072,13 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
         const auto & hparams = lctx.model.hparams;
 
-        const uint32_t n_layer = hparams.n_layer;
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t kv_size = kv_self.size;
+        const uint32_t n_layer        = hparams.n_layer;
+        const uint32_t n_embd_k_gqa   = hparams.n_embd_k_gqa();
+        const uint32_t n_embd_v_gqa   = hparams.n_embd_v_gqa();
+        const uint32_t n_embd_head_k  = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k);
+        const uint32_t n_embd_head_v  = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v);
+        const uint32_t n_head_kv      = hparams.n_head_kv;     GGML_UNUSED(n_head_kv);
+        const uint32_t kv_size        = kv_self.size;
 
         std::vector<uint8_t> buf_q;
 
@@ -8085,9 +8088,9 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         const int64_t t_start = ggml_time_us();
 
         struct c_pair { uint32_t i0, i1; };
-        struct c_info { bool merged; uint32_t id, cnt;};
+        struct c_info { bool merged; uint32_t id, cnt, r; };
 
-        std::vector<c_info> infos(kv_size, { false, 0, 0 });
+        std::vector<c_info> infos(kv_size, { false, 0, 0, 0 });
 
         // the destination cell in the new KV cache
         uint32_t id = 0;
 
@@ -8100,7 +8103,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
             const auto & cell0 = kv_self.cells[i0];
 
             if (!cell0.is_empty() && !infos[i0].merged) {
-                infos[i0] = { true, id, 0 };
+                infos[i0] = { true, id, 0, 0 };
                 infos[id].cnt = 1;
 
                 const llama_pos p0 = cell0.pos;
 
@@ -8112,7 +8115,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
                         const llama_pos p1 = cell1.pos;
 
                         if (std::abs(p0 - p1) <= kv_self.compress_delta) {
-                            infos[i1] = { true, id, 0 };
+                            infos[i1] = { true, id, 0, 0 };
                             infos[id].cnt++;
                             n_merges++;
                         }
 
@@ -8143,6 +8146,10 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         tt_v = ggml_internal_get_type_traits(kv_self.type_v);
 
         for (uint32_t il = 0; il < n_layer; ++il) {
+            for (uint32_t i = 0; i < kv_size; ++i) {
+                infos[i].r = 0;
+            }
+
             // update keys
             {
                 const int64_t ne = n_embd_k_gqa*kv_size;
 
@@ -8167,6 +8174,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
                     const uint32_t id = infos[i].id;
 
+#if 1
                     // merge using averaging
                     {
                         const float
scale = 1.0f/float(infos[id].cnt); @@ -8178,6 +8186,25 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; } } +#else + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } + + const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; + const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; + + for (uint32_t j = 0; j < n_embd_head_k; ++j) { + buf_dst_f32[od + j] = buf_src_f32[os + j]; + } + } + } + + infos[id].r++; +#endif } tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); @@ -8185,6 +8212,10 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); } + for (uint32_t i = 0; i < kv_size; ++i) { + infos[i].r = 0; + } + // update values (note: they are transposed) { const int64_t ne = n_embd_v_gqa*kv_size; @@ -8209,6 +8240,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { const uint32_t id = infos[i].id; +#if 1 // merge using averaging { const float scale = 1.0f/float(infos[id].cnt); @@ -8221,6 +8253,25 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; } } +#else + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } + + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { + buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; + } + } + } + + infos[id].r++; +#endif } tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); From 65f21ec5d3e774978765f4de82231809c2cc3e72 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 15:00:45 +0200 Subject: [PATCH 17/23] llama : add llama_kv_cache_defrag --- examples/passkey/passkey.cpp | 2 + llama.cpp | 489 ++++++++++++++++++++++------------- llama.h | 11 +- 3 files changed, 327 insertions(+), 175 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e2725aaa6e9..4c8a041359f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -183,6 +183,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; @@ -213,6 +214,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; diff --git a/llama.cpp b/llama.cpp index 2c05921bb86..61539b24ae7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1722,6 +1722,7 @@ struct llama_kv_cell { // ring-buffer of cached KV data struct llama_kv_cache { bool has_shift = false; + bool do_defrag = false; // Note: The value of head isn't only used to optimize searching // for a free KV slot. 
llama_decode_internal also uses it, so it @@ -2278,6 +2279,10 @@ static void llama_kv_cache_compress(struct llama_kv_cache & cache, llama_pos del cache.compress_delta = delta; } +static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { + cache.do_defrag = true; +} + // // model loading and saving // @@ -8029,262 +8034,394 @@ static int llama_decode_internal( return 0; } -static void llama_kv_cache_update_internal(struct llama_context & lctx) { - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { - llama_set_k_shift(lctx); +// summary: +// +// - determine which KV cell pairs (i0, i1) to merge: +// +// abs(cell[i0].pos - cell[i1].pos) <= compress_delta +// +// - move the KV cache to the Host memory for easier maniiplation +// - processing is done layer-by-layer +// - convert the KV data to F32 +// - merge the KV data (different ways to merge) +// - convert the KV data back to the original type +// - move the KV cache back to the device memory +// - update the KV cache metadata +// +// as a side effect, the new KV cache is defragmented +// +static void llama_kv_cache_compress_internal(struct llama_context & lctx) { + auto & kv_self = lctx.kv_self; - { - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + const auto & hparams = lctx.model.hparams; - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); - } + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_head_k = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k); + const uint32_t n_embd_head_v = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v); + const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv); + const uint32_t kv_size = kv_self.size; - { - auto & kv_self = lctx.kv_self; + const int64_t t_start = ggml_time_us(); - kv_self.has_shift = false; + std::vector buf_q; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + std::vector buf_src_f32; + std::vector buf_dst_f32; + + struct c_pair { uint32_t i0, i1; }; + struct c_info { bool merged; uint32_t id, cnt, r; }; + + std::vector infos(kv_size, { false, 0, 0, 0 }); + + // the destination cell in the new KV cache + uint32_t id = 0; + + // number of pairs merged + uint32_t n_merges = 0; + + // determine which KV cells to merge + for (uint32_t i0 = 0; i0 < kv_size; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty() && !infos[i0].merged) { + infos[i0] = { true, id, 0, 0 }; + infos[id].cnt = 1; + + const llama_pos p0 = cell0.pos; + + for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (i0 != i1 && cell0.is_same_seq(cell1)) { + const llama_pos p1 = cell1.pos; + + if (std::abs(p0 - p1) <= kv_self.compress_delta) { + infos[i1] = { true, id, 0, 0 }; + infos[id].cnt++; + n_merges++; + } + } + } + + if (i0 != id) { + kv_self.cells[id] = cell0; } + + id++; } } - // compress the KV cache data if needed: - // - // - determine which KV cell pairs (i0, i1) to merge: - // - // abs(cell[i0].pos - cell[i1].pos) <= compress_delta - // - // - move the KV cache to the Host memory for easier maniiplation - // - processing is done layer-by-layer - // - convert the KV data to F32 - // - merge the KV data (different ways to merge) - // - convert the KV data back to the original type - // - move the KV cache back to the device memory - // - update the KV cache metadata - // - // as a side effect, the new 
KV cache is defragmented - // - if (lctx.kv_self.compress_delta >= 0) { - auto & kv_self = lctx.kv_self; + kv_self.head = id; + kv_self.used = id; - const auto & hparams = lctx.model.hparams; + for (uint32_t i = id; i < kv_size; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_embd_head_k = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k); - const uint32_t n_embd_head_v = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v); - const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv); - const uint32_t kv_size = kv_self.size; + LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); - std::vector buf_q; + ggml_type_traits_t tt_k; + ggml_type_traits_t tt_v; - std::vector buf_src_f32; - std::vector buf_dst_f32; + tt_k = ggml_internal_get_type_traits(kv_self.type_k); + tt_v = ggml_internal_get_type_traits(kv_self.type_v); - const int64_t t_start = ggml_time_us(); + for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t i = 0; i < kv_size; ++i) { + infos[i].r = 0; + } - struct c_pair { uint32_t i0, i1; }; - struct c_info { bool merged; uint32_t id, cnt, r; }; + // update keys + { + const int64_t ne = n_embd_k_gqa*kv_size; - std::vector infos(kv_size, { false, 0, 0, 0 }); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); - // the destination cell in the new KV cache - uint32_t id = 0; + buf_q.resize(k_size); - // number of pairs merged - uint32_t n_merges = 0; + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); - // determine which KV cells to merge - for (uint32_t i0 = 0; i0 < kv_size; ++i0) { - const auto & cell0 = kv_self.cells[i0]; + ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - if (!cell0.is_empty() && !infos[i0].merged) { - infos[i0] = { true, id, 0, 0 }; - infos[id].cnt = 1; + tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); - const llama_pos p0 = cell0.pos; + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { - const auto & cell1 = kv_self.cells[i1]; + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } - if (i0 != i1 && cell0.is_same_seq(cell1)) { - const llama_pos p1 = cell1.pos; + const uint32_t id = infos[i].id; - if (std::abs(p0 - p1) <= kv_self.compress_delta) { - infos[i1] = { true, id, 0, 0 }; - infos[id].cnt++; - n_merges++; - } +#if 1 + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + + const int64_t os = i*n_embd_k_gqa; + const int64_t od = id*n_embd_k_gqa; + + for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { + buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; } } +#else + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } + + const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; + const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; - if (i0 != id) { - kv_self.cells[id] = cell0; + for (uint32_t j = 0; j < n_embd_head_k; ++j) { + buf_dst_f32[od + j] = buf_src_f32[os + j]; + } + } } - id++; + infos[id].r++; +#endif } - } - kv_self.head = id; - kv_self.used = id; + tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); - for (uint32_t i = id; i < kv_size; ++i) { - kv_self.cells[i] = llama_kv_cell(); + ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); } - LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); - - 
ggml_type_traits_t tt_k; - ggml_type_traits_t tt_v; - - tt_k = ggml_internal_get_type_traits(kv_self.type_k); - tt_v = ggml_internal_get_type_traits(kv_self.type_v); - - for (uint32_t il = 0; il < n_layer; ++il) { - for (uint32_t i = 0; i < kv_size; ++i) { - infos[i].r = 0; - } + for (uint32_t i = 0; i < kv_size; ++i) { + infos[i].r = 0; + } - // update keys - { - const int64_t ne = n_embd_k_gqa*kv_size; + // update values (note: they are transposed) + { + const int64_t ne = n_embd_v_gqa*kv_size; - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); - buf_q.resize(k_size); + buf_q.resize(v_size); - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); - ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); - tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); + tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } - const uint32_t id = infos[i].id; + const uint32_t id = infos[i].id; #if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); - const int64_t os = i*n_embd_k_gqa; - const int64_t od = id*n_embd_k_gqa; + const int64_t os = i; + const int64_t od = id; - for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { - buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; - } + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; } + } #else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } - const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; - const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; + const int64_t os = i; + const int64_t od = id; - for (uint32_t j = 0; j < n_embd_head_k; ++j) { - buf_dst_f32[od + j] = buf_src_f32[os + j]; - } + for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { + buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; } } + } - infos[id].r++; + infos[id].r++; #endif - } + } - tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); + tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); - ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - } + ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + } + } - for (uint32_t i = 0; i < kv_size; ++i) { - infos[i].r = 0; + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); +} + +// copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache +// removing any empty segments that may have been left by previous KV cache operations +// TODO: optimizations are possible: +// - multiple threads +// - avoid copying to the host memory when already there +// TODO: can we do all this on-device? 
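Before the defragmentation pass that follows, it helps to pin down the merge rule used by the compression path above. The snippet below is a stand-alone illustrative sketch of the "merge using averaging" branch, not part of the patch: merge_rows_avg, dst_id and cnt are hypothetical names standing in for the loop over infos, infos[i].id and infos[id].cnt, and every row is assumed to take part in a merge (the patch skips rows with !infos[i].merged).

    #include <cstdint>
    #include <vector>

    // each source row i is accumulated into its destination row dst_id[i] with
    // weight 1/cnt[dst_id[i]], so a destination row ends up as the arithmetic
    // mean of all rows merged into it; dst must be zero-initialized by the caller
    static void merge_rows_avg(
            const std::vector<float>    & src,     // kv_size rows of row_size floats
            std::vector<float>          & dst,     // same shape, pre-filled with 0.0f
            const std::vector<uint32_t> & dst_id,  // destination cell per source row
            const std::vector<uint32_t> & cnt,     // number of rows merged per destination
            size_t row_size) {
        for (size_t i = 0; i < dst_id.size(); ++i) {
            const uint32_t id    = dst_id[i];
            const float    scale = 1.0f/float(cnt[id]);
            for (size_t j = 0; j < row_size; ++j) {
                dst[id*row_size + j] += src[i*row_size + j]*scale;
            }
        }
    }

The V data is merged with the same rule but a stride of kv_size between consecutive elements of a cell, since the V cache is stored transposed.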
+static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { + auto & kv_self = lctx.kv_self; + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + + const uint32_t kv_size = kv_self.size; + + const int64_t t_start = ggml_time_us(); + + std::vector buf_k; + std::vector buf_v; + + // the destination cell in the new KV cache + uint32_t id = 0; + + // number of cells moved + uint32_t n_moves = 0; + + // determine which KV cells to move where + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_kv; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = id; + + if (i0 != id) { + kv_self.cells[id] = cell0; + n_moves++; } - // update values (note: they are transposed) - { - const int64_t ne = n_embd_v_gqa*kv_size; + id++; + } + } - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); + if (n_moves == 0) { + return; + } - buf_q.resize(v_size); + LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); + kv_self.head = id; + kv_self.used = id; - ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + // zero the rest of the cells + for (uint32_t i = id; i < n_kv; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } - tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } + buf_k.resize(k_size); + buf_v.resize(v_size); - const uint32_t id = infos[i].id; + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); -#if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); - //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; - const int64_t os = i; - const int64_t od = id; + if (i == id || id == n_kv) { + continue; + } - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; - } - } -#else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } + uint32_t nm = 1; - const int64_t os = i; - const int64_t od = id; + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } - for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { - buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; - } - } - } + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; - infos[id].r++; -#endif + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + 
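+                // note: the V cache is transposed - element j of cell c lives at
+                // byte offset (c + j*kv_size)*v_size_el, so a run of nm cells is
+                // moved with n_embd_v_gqa strided copies of nm elements each,
+                // rather than the single contiguous memcpy per row used for K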
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } + + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); +} + +static void llama_kv_cache_update_internal(struct llama_context & lctx) { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + llama_set_k_shift(lctx); + + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = lctx.kv_self; - ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + kv_self.has_shift = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; } } + } - const int64_t t_end = ggml_time_us(); + // compress the KV cache data if needed + if (lctx.kv_self.compress_delta >= 0) { + llama_kv_cache_compress_internal(lctx); + + lctx.kv_self.compress_delta = -1; + lctx.kv_self.do_defrag = false; + } - LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); + // defragment the KV cache if needed + if (lctx.kv_self.do_defrag) { + llama_kv_cache_defrag_internal(lctx); - kv_self.compress_delta = -1; + lctx.kv_self.do_defrag = false; } } @@ -12360,6 +12497,10 @@ void llama_kv_cache_compress(struct llama_context * ctx, llama_pos delta) { llama_kv_cache_compress(ctx->kv_self, delta); } +void llama_kv_cache_defrag(struct llama_context * ctx) { + llama_kv_cache_defrag(ctx->kv_self); +} + void llama_kv_cache_update(struct llama_context * ctx) { llama_kv_cache_update_internal(*ctx); } diff --git a/llama.h b/llama.h index 8f959824fd0..862d555e2b9 100644 --- a/llama.h +++ b/llama.h @@ -555,11 +555,20 @@ extern "C" { llama_seq_id seq_id); // [EXPERIMENTAL] Compress the data in the KV cache + // This will be applied: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() LLAMA_API void llama_kv_cache_compress( struct llama_context * ctx, llama_pos delta); - // Apply the KV cache updates (such as K-shifts) to the KV data + // Defragment the KV cache + // This will be applied: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() + LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); + + // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
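A usage sketch (not part of the patch; n_keep and n_discard are the variables from the passkey example earlier in this patch): after removing a range and shifting the remainder, mark the cache and then apply all pending updates in one go:

    llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
    llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);

    llama_kv_cache_defrag (ctx);   // only marks the cache - no work happens yet
    llama_kv_cache_update (ctx);   // applies the pending K-shift and the defrag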
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); // From 1b6aeb830903926a2187a38ebb7e14b397206c97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 15:30:06 +0200 Subject: [PATCH 18/23] llama : comments --- llama.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 61539b24ae7..dc491f14b62 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8040,7 +8040,7 @@ static int llama_decode_internal( // // abs(cell[i0].pos - cell[i1].pos) <= compress_delta // -// - move the KV cache to the Host memory for easier maniiplation +// - move the KV cache to the host memory for easier manipulation // - processing is done layer-by-layer // - convert the KV data to F32 // - merge the KV data (different ways to merge) @@ -8269,11 +8269,14 @@ static void llama_kv_cache_compress_internal(struct llama_context & lctx) { } // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache -// removing any empty segments that may have been left by previous KV cache operations +// this way we eliminate any empty segments that may have been left by previous KV cache operations +// // TODO: optimizations are possible: // - multiple threads // - avoid copying to the host memory when already there +// // TODO: can we do all this on-device? +// static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { auto & kv_self = lctx.kv_self; From 2d7203b975334fd8e3ccf6e93bca66ebacfef436 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 15:32:02 +0200 Subject: [PATCH 19/23] llama : remove llama_kv_cache_compress will add in a separate PR ggml-ci --- examples/passkey/passkey.cpp | 1 - llama.cpp | 253 ----------------------------------- llama.h | 8 -- 3 files changed, 262 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 4c8a041359f..47de67a9304 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -148,7 +148,6 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_compress(ctx, 0); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; diff --git a/llama.cpp b/llama.cpp index dc491f14b62..75189e719ae 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1737,9 +1737,6 @@ struct llama_kv_cache { ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; - // if non-negative, compress data on next update - llama_pos compress_delta = -1; - std::vector cells; std::vector k_l; // per layer @@ -2275,10 +2272,6 @@ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama return result; } -static void llama_kv_cache_compress(struct llama_kv_cache & cache, llama_pos delta) { - cache.compress_delta = delta; -} - static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { cache.do_defrag = true; } @@ -8034,240 +8027,6 @@ static int llama_decode_internal( return 0; } -// summary: -// -// - determine which KV cell pairs (i0, i1) to merge: -// -// abs(cell[i0].pos - cell[i1].pos) <= compress_delta -// -// - move the KV cache to the host memory for easier manipulation -// - processing is done layer-by-layer -// - convert the KV data to F32 -// - merge the KV data (different ways to merge) -// - convert the KV data back to the original type -// - move the KV cache back to the device memory -// - update the KV cache metadata -// -// as a side 
effect, the new KV cache is defragmented -// -static void llama_kv_cache_compress_internal(struct llama_context & lctx) { - auto & kv_self = lctx.kv_self; - - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_embd_head_k = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k); - const uint32_t n_embd_head_v = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v); - const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv); - const uint32_t kv_size = kv_self.size; - - const int64_t t_start = ggml_time_us(); - - std::vector buf_q; - - std::vector buf_src_f32; - std::vector buf_dst_f32; - - struct c_pair { uint32_t i0, i1; }; - struct c_info { bool merged; uint32_t id, cnt, r; }; - - std::vector infos(kv_size, { false, 0, 0, 0 }); - - // the destination cell in the new KV cache - uint32_t id = 0; - - // number of pairs merged - uint32_t n_merges = 0; - - // determine which KV cells to merge - for (uint32_t i0 = 0; i0 < kv_size; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty() && !infos[i0].merged) { - infos[i0] = { true, id, 0, 0 }; - infos[id].cnt = 1; - - const llama_pos p0 = cell0.pos; - - for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { - const auto & cell1 = kv_self.cells[i1]; - - if (i0 != i1 && cell0.is_same_seq(cell1)) { - const llama_pos p1 = cell1.pos; - - if (std::abs(p0 - p1) <= kv_self.compress_delta) { - infos[i1] = { true, id, 0, 0 }; - infos[id].cnt++; - n_merges++; - } - } - } - - if (i0 != id) { - kv_self.cells[id] = cell0; - } - - id++; - } - } - - kv_self.head = id; - kv_self.used = id; - - for (uint32_t i = id; i < kv_size; ++i) { - kv_self.cells[i] = llama_kv_cell(); - } - - LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); - - ggml_type_traits_t tt_k; - ggml_type_traits_t tt_v; - - tt_k = ggml_internal_get_type_traits(kv_self.type_k); - tt_v = ggml_internal_get_type_traits(kv_self.type_v); - - for (uint32_t il = 0; il < n_layer; ++il) { - for (uint32_t i = 0; i < kv_size; ++i) { - infos[i].r = 0; - } - - // update keys - { - const int64_t ne = n_embd_k_gqa*kv_size; - - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); - - buf_q.resize(k_size); - - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - - tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); - - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } - - const uint32_t id = infos[i].id; - -#if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); - - const int64_t os = i*n_embd_k_gqa; - const int64_t od = id*n_embd_k_gqa; - - for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { - buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; - } - } -#else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } - - const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; - const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; - - for (uint32_t j = 0; j < n_embd_head_k; ++j) { - buf_dst_f32[od + j] = buf_src_f32[os + j]; - } - } - } - - infos[id].r++; -#endif - } - - tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); - - ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - } - - for (uint32_t i = 0; i < kv_size; ++i) 
{ - infos[i].r = 0; - } - - // update values (note: they are transposed) - { - const int64_t ne = n_embd_v_gqa*kv_size; - - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); - - buf_q.resize(v_size); - - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); - - ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); - - tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); - - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } - - const uint32_t id = infos[i].id; - -#if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); - //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); - - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; - } - } -#else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } - - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { - buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; - } - } - } - - infos[id].r++; -#endif - } - - tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); - - ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); - } - } - - const int64_t t_end = ggml_time_us(); - - LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); -} - // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache // this way we eliminate any empty segments that may have been left by previous KV cache operations // @@ -8412,14 +8171,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } } - // compress the KV cache data if needed - if (lctx.kv_self.compress_delta >= 0) { - llama_kv_cache_compress_internal(lctx); - - lctx.kv_self.compress_delta = -1; - lctx.kv_self.do_defrag = false; - } - // defragment the KV cache if needed if (lctx.kv_self.do_defrag) { llama_kv_cache_defrag_internal(lctx); @@ -12496,10 +12247,6 @@ llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id se return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); } -void llama_kv_cache_compress(struct llama_context * ctx, llama_pos delta) { - llama_kv_cache_compress(ctx->kv_self, delta); -} - void llama_kv_cache_defrag(struct llama_context * ctx) { llama_kv_cache_defrag(ctx->kv_self); } diff --git a/llama.h b/llama.h index 862d555e2b9..ff131996d9a 100644 --- a/llama.h +++ b/llama.h @@ -554,14 +554,6 @@ extern "C" { struct llama_context * ctx, llama_seq_id seq_id); - // [EXPERIMENTAL] Compress the data in the KV cache - // This will be applied: - // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_compress( - struct llama_context * ctx, - llama_pos delta); - // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() From 65323bc770667b372730e892c4d56f383558e303 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 17:21:33 +0200 Subject: [PATCH 20/23] llama : defragment via non-overlapping moves --- llama.cpp | 71 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index 75189e719ae..aa7574cc102 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8028,7 +8028,7 @@ static int 
llama_decode_internal( } // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache -// this way we eliminate any empty segments that may have been left by previous KV cache operations +// this way we eliminate any empty holes that may have been left by previous KV cache operations // // TODO: optimizations are possible: // - multiple threads @@ -8045,36 +8045,81 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_used = kv_self.used; const uint32_t kv_size = kv_self.size; + assert(n_used <= n_kv); + const int64_t t_start = ggml_time_us(); std::vector buf_k; std::vector buf_v; - // the destination cell in the new KV cache - uint32_t id = 0; - // number of cells moved uint32_t n_moves = 0; // determine which KV cells to move where std::vector ids(n_kv, n_kv); - for (uint32_t i0 = 0; i0 < n_kv; ++i0) { + for (uint32_t i0 = 0; i0 < n_used; ++i0) { const auto & cell0 = kv_self.cells[i0]; if (!cell0.is_empty()) { - ids[i0] = id; + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + // determine the size of the hole + uint32_t nh = 1; + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + // starting from the end, find nh non-empty cells + uint32_t nf = 0; + uint32_t is = n_kv - 1; + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } - if (i0 != id) { - kv_self.cells[id] = cell0; - n_moves++; + // non-empty cell which is not yet moved + nf++; + if (nf == nh) { + break; + } + } + + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + // go back and move the nf cells to the hole + for (uint32_t i1 = is; i1 < n_kv; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + continue; } - id++; + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + n_moves++; + nf++; } + + LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, n_kv, i0, i0 + nh); + + i0 += nh - 1; } if (n_moves == 0) { @@ -8083,11 +8128,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - kv_self.head = id; - kv_self.used = id; + kv_self.head = n_used; + kv_self.used = n_used; // zero the rest of the cells - for (uint32_t i = id; i < n_kv; ++i) { + for (uint32_t i = n_used; i < n_kv; ++i) { kv_self.cells[i] = llama_kv_cell(); } From 4eaaace394016231858acacbc077a693512388f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 17:36:37 +0200 Subject: [PATCH 21/23] llama : ggml_graph based defrag implementation ggml-ci --- llama.cpp | 112 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index aa7574cc102..e6826a31793 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5111,6 +5111,53 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_defrag(const std::vector & ids) { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + for (int il = 0; il < n_layer; ++il) { + for (int i = 0; i < n_kv; ++i) { + const int id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + int nm = 1; + + while (i + nm < 
n_kv && (int) ids[i + nm] == id + nm) { + nm++; + } + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + + i += nm - 1; + } + } + + return gf; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -7505,6 +7552,23 @@ struct llm_build_context { } }; +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_defrag(ids); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { llama_batch dummy; dummy.n_tokens = 0; @@ -8030,32 +8094,16 @@ static int llama_decode_internal( // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache // this way we eliminate any empty holes that may have been left by previous KV cache operations // -// TODO: optimizations are possible: -// - multiple threads -// - avoid copying to the host memory when already there -// -// TODO: can we do all this on-device? 
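The graph-based variant above effectively answers the TODO being removed here: the moves are executed on-device as a ggml graph of ggml_cpy nodes between views of the cache tensors. This is safe because of the mapping introduced in the previous patch: holes are filled strictly with cells taken from the back of the cache, so every move targets a lower index and no two moves overlap. A stand-alone sketch of that mapping, with a hypothetical Cell type standing in for llama_kv_cell (the patch additionally counts n_moves and asserts nf == nh):

    #include <cstdint>
    #include <vector>

    struct Cell {
        int32_t pos = -1;
        bool is_empty() const { return pos < 0; }
    };

    // compute ids[i], the destination of cell i; ids[i] == n_kv means the cell
    // is empty or not moved; assumes a non-empty cache (n_kv > 0)
    static std::vector<uint32_t> defrag_ids(std::vector<Cell> & cells, uint32_t n_used) {
        const uint32_t n_kv = (uint32_t) cells.size();

        std::vector<uint32_t> ids(n_kv, n_kv);

        for (uint32_t i0 = 0; i0 < n_used; ++i0) {
            if (!cells[i0].is_empty()) {
                ids[i0] = i0; // already in place
                continue;
            }

            // measure the hole [i0, i0 + nh)
            uint32_t nh = 1;
            while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
                nh++;
            }

            // scan back from the end until nh non-empty, not-yet-moved cells
            // have been seen; is ends up at the lowest-indexed one of them
            uint32_t nf = 0;
            uint32_t is = n_kv - 1;
            for (; is > i0; --is) {
                if (cells[is].is_empty() || ids[is] != n_kv) {
                    continue;
                }
                if (++nf == nh) {
                    break;
                }
            }

            // fill the hole front-to-back, preserving the cells' relative order
            // so that consecutive sources map to consecutive destinations (runs)
            nf = 0;
            for (uint32_t i1 = is; i1 < n_kv; ++i1) {
                if (cells[i1].is_empty() || ids[i1] != n_kv) {
                    continue;
                }
                ids[i1]         = i0 + nf;
                cells[i0 + nf]  = cells[i1]; // move the cell metadata
                nf++;
            }

            i0 += nh - 1; // the hole is now filled - skip past it
        }

        return ids;
    }

Because sources come only from the back and destinations only from the front, the source and destination views of a run never alias, which is what allows each run to become an independent pair of ggml_cpy nodes per layer.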
-// static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { auto & kv_self = lctx.kv_self; - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); - const uint32_t n_used = kv_self.used; - - const uint32_t kv_size = kv_self.size; + const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_used = kv_self.used; assert(n_used <= n_kv); const int64_t t_start = ggml_time_us(); - std::vector buf_k; - std::vector buf_v; - // number of cells moved uint32_t n_moves = 0; @@ -8136,6 +8184,27 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { kv_self.cells[i] = llama_kv_cell(); } +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + for (uint32_t il = 0; il < n_layer; ++il) { const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); @@ -8188,6 +8257,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); } +#else + // ggml_graph defrag + + ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); +#endif const int64_t t_end = ggml_time_us(); From 0b72ded501e22501d968583ada5300ed49977621 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 17:51:02 +0200 Subject: [PATCH 22/23] llama : switch the loop order in build_defrag --- llama.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/llama.cpp b/llama.cpp index e6826a31793..f87f44d14d7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5114,20 +5114,20 @@ struct llm_build_context { struct ggml_cgraph * build_defrag(const std::vector & ids) { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - for (int il = 0; il < n_layer; ++il) { - for (int i = 0; i < n_kv; ++i) { - const int id = ids[i]; + for (int i = 0; i < n_kv; ++i) { + const int id = ids[i]; - if (i == id || id == n_kv) { - continue; - } + if (i == id || id == n_kv) { + continue; + } - int nm = 1; + int nm = 1; - while (i + nm < n_kv && (int) ids[i + nm] == id + nm) { - nm++; - } + while (i + nm < n_kv && (int) ids[i + nm] == id + nm) { + nm++; + } + for (int il = 0; il < n_layer; ++il) { ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], n_embd_k_gqa, nm, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), @@ -5150,9 +5150,9 @@ struct llm_build_context { ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - - i += nm - 1; } + + i += nm - 1; } return gf; From 5a122c25a0d8c840f34bce10c0d1565464612405 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 
18:16:45 +0200
Subject: [PATCH 23/23] llama : add comments

---
 llama.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index f87f44d14d7..3424b1999eb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8091,9 +8091,7 @@ static int llama_decode_internal(
     return 0;
 }
 
-// copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache
-// this way we eliminate any empty holes that may have been left by previous KV cache operations
-//
+// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
 static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
 
@@ -8108,6 +8106,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     uint32_t n_moves = 0;
 
     // determine which KV cells to move where
+    //
+    //  cell i moves to ids[i]
+    //
+    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+    //
     std::vector<uint32_t> ids(n_kv, n_kv);
 
     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
@@ -8139,11 +8142,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
             // non-empty cell which is not yet moved
             nf++;
+
             if (nf == nh) {
                 break;
             }
         }
 
+        // this can only happen if `n_used` is not accurate, which would be a bug
         GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
 
         nf = 0;
@@ -8156,6 +8161,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
                 continue;
             }
 
+            // this cell goes to (i0 + nf)
             ids[i1] = i0 + nf;
 
             // move the cell meta data
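To make the comments above concrete, a small worked example with a hypothetical cache state, showing the mapping and how build_defrag consumes it in runs:

    // hypothetical cache state: n_kv = 8, n_used = 6, cells 2 and 3 are empty
    //
    //   index : 0 1 2 3 4 5 6 7
    //   cell  : A B . . C D E F        ('.' = empty)
    //
    // the hole [2, 4) is filled from the back with E and F, order preserved:
    //
    //   ids   : 0 1 8 8 4 5 2 3        (8 == n_kv means "not moved")
    //
    // the consumer then finds a single contiguous run, since ids[7] == ids[6] + 1:
    // cells [6, 8) move to [2, 4) with nm == 2, i.e. one pair of ggml_cpy nodes
    // (one K view, one V view) per layer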