From 1a999819a20c49278d4b5c211e9cf42c07c486f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 23 Feb 2024 20:29:40 +0200 Subject: [PATCH 01/23] llama : refactor k-shift implementation ggml-ci --- examples/passkey/passkey.cpp | 2 +- llama.cpp | 328 ++++++++++++++++++----------------- llama.h | 2 + 3 files changed, 173 insertions(+), 159 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e12a1cdf19a..b6ae2288bc3 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -126,7 +126,7 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token diff --git a/llama.cpp b/llama.cpp index 37477e6ef3c..7b0961508e3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1550,8 +1550,9 @@ static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; struct llama_hparams { - bool vocab_only; - bool rope_finetuned; + bool vocab_only; + bool rope_finetuned; + uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -4595,10 +4596,11 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int il)>; -enum llm_rope_type { - LLM_ROPE, - LLM_ROPE_NEOX, - LLM_ROPE_GLM, +enum llm_rope_type : int { + LLM_ROPE_NONE = -1, + LLM_ROPE = 0, + LLM_ROPE_NEOX = 2, + LLM_ROPE_GLM = 4, }; enum llm_ffn_op_type { @@ -4655,7 +4657,7 @@ static void llm_build_k_shift( const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * K_shift, - llm_rope_type type, + llm_rope_type rope_type, int64_t n_ctx, float freq_base, float freq_scale, @@ -4671,14 +4673,6 @@ static void llm_build_k_shift( const float beta_fast = cparams.yarn_beta_fast; const float beta_slow = cparams.yarn_beta_slow; - int rope_type = 0; - - switch (type) { - case LLM_ROPE: rope_type = 0; break; - case LLM_ROPE_NEOX: rope_type = 2; break; - case LLM_ROPE_GLM: rope_type = 4; break; - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions @@ -4988,6 +4982,38 @@ static struct ggml_tensor * llm_build_kv( return cur; } +static llm_rope_type llm_get_rope_type(llm_arch arch) { + switch (arch) { + case LLM_ARCH_LLAMA: return LLM_ROPE; + case LLM_ARCH_FALCON: return LLM_ROPE_NEOX; + case LLM_ARCH_BAICHUAN: return LLM_ROPE; + case LLM_ARCH_GPT2: return LLM_ROPE_NONE; + case LLM_ARCH_GPTJ: return LLM_ROPE_NONE; + case LLM_ARCH_GPTNEOX: return LLM_ROPE_NONE; + case LLM_ARCH_MPT: return LLM_ROPE_NONE; + case LLM_ARCH_STARCODER: return LLM_ROPE; + case LLM_ARCH_PERSIMMON: return LLM_ROPE_NEOX; + case LLM_ARCH_REFACT: return LLM_ROPE_NONE; + case LLM_ARCH_BERT: return LLM_ROPE_NEOX; + case LLM_ARCH_NOMIC_BERT: return LLM_ROPE_NEOX; + case LLM_ARCH_BLOOM: return LLM_ROPE_NONE; + case LLM_ARCH_STABLELM: return LLM_ROPE_NEOX; + case LLM_ARCH_QWEN: return LLM_ROPE_NEOX; + case LLM_ARCH_QWEN2: return LLM_ROPE_NEOX; + case LLM_ARCH_PHI2: return LLM_ROPE_NEOX; + case LLM_ARCH_PLAMO: return LLM_ROPE; + case LLM_ARCH_CODESHELL: return LLM_ROPE; + case LLM_ARCH_ORION: return LLM_ROPE; + case LLM_ARCH_INTERNLM2: return LLM_ROPE; + case 
LLM_ARCH_MINICPM: return LLM_ROPE; + case LLM_ARCH_GEMMA: return LLM_ROPE; + case LLM_ARCH_UNKNOWN: + default: + GGML_ASSERT(false && "unknown architecture"); + return LLM_ROPE_NONE; + } +} + struct llm_build_context { const llama_model & model; const llama_context & lctx; @@ -5022,9 +5048,10 @@ struct llm_build_context { const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; - const bool do_rope_shift; const uint32_t pooling_type; + const llm_rope_type rope_type; + const llm_build_cb & cb; std::vector<uint8_t> & buf_compute_meta; @@ -5066,8 +5093,8 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - do_rope_shift (worst_case || kv_self.has_shift), pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), + rope_type (llm_get_rope_type(model.arch)), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5090,6 +5117,14 @@ struct llm_build_context { } } + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, rope_type, n_ctx, freq_base, freq_scale, cb); + + return gf; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -5111,11 +5146,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5151,14 +5181,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5299,11 +5329,6 @@ struct llm_build_context { struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); cb(KQ_pos, "KQ_pos", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5327,12 +5352,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, 
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -5417,11 +5442,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5460,13 +5480,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5636,10 +5656,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; @@ -5730,13 +5746,13 @@ struct llm_build_context { cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, qrot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, krot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5988,14 +6004,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6284,11 +6300,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6325,14 +6336,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, 
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6407,11 +6418,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6441,13 +6447,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6521,11 +6527,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6561,14 +6562,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6642,11 +6643,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { attn_norm_output = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -6684,7 +6680,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -6695,7 +6691,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, 
inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6764,11 +6760,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -6793,13 +6784,13 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -6969,11 +6960,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -6999,14 +6985,14 @@ struct llm_build_context { struct ggml_tensor * Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7077,11 +7063,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7117,14 +7098,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, 
beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7196,11 +7177,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7236,14 +7212,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7328,11 +7304,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7368,14 +7339,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7464,11 +7435,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -7491,7 +7457,7 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); @@ -7500,7 +7466,7 @@ struct llm_build_context { Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -7553,6 +7519,22 @@ struct llm_build_context { } }; +static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) 
{ + llama_batch dummy; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_k_shift(); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -7672,6 +7654,20 @@ static struct ggml_cgraph * llama_build_graph( return result; } +static void llama_set_k_shift(llama_context & lctx) { + const auto & cparams = lctx.cparams; + + const int64_t n_ctx = cparams.n_ctx; + + assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); + + int32_t * data = (int32_t *) lctx.inp_K_shift->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } +} + static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // // set input data @@ -7739,18 +7735,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (kv_self.has_shift) { - const int64_t n_ctx = cparams.n_ctx; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { const int64_t n_tokens = batch.n_tokens; @@ -7795,6 +7779,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } +static void llama_graph_compute( + llama_context & lctx, + ggml_cgraph * gf, + int n_threads) { +#ifdef GGML_USE_MPI + const int64_t n_layer = hparams.n_layer; + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); +#endif + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(lctx.backend_metal)) { + ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); + } +#endif + + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + } + + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); + +#ifdef GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); +#endif +} + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -7890,14 +7902,19 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + if (kv_self.has_shift) { + llama_kv_cache_apply_k_shift(&lctx); + } + ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + if (strcmp(res->name, "result_output") == 0) { // the embeddings could be the second to last tensor, or the third to last tensor if (strcmp(embeddings->name, "result_norm") != 0) { @@ -7924,40 +7941,12 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } -#ifdef GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif - -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend_metal)) { - ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); - } -#endif - - if 
(lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); - } - llama_set_inputs(lctx, batch); - ggml_backend_sched_graph_compute(lctx.sched, gf); - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - -#ifdef GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif + llama_graph_compute(lctx, gf, n_threads); // update the kv ring buffer { - if (kv_self.has_shift) { - kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; - } - } - kv_self.head += n_tokens; // Ensure kv cache head points to a valid index. @@ -8053,6 +8042,28 @@ static int llama_decode_internal( return 0; } +void llama_kv_cache_apply_k_shift(struct llama_context * ctx) { + struct llama_context & lctx = *ctx; + + llama_set_k_shift(lctx); + + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = ctx->kv_self; + + kv_self.has_shift = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } +} + // // tokenizer // @@ -12054,6 +12065,7 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. diff --git a/llama.h b/llama.h index 84f196b3bb6..618841184ae 100644 --- a/llama.h +++ b/llama.h @@ -533,6 +533,8 @@ extern "C" { llama_pos p1, int d); + LLAMA_API void llama_kv_cache_apply_k_shift(struct llama_context * ctx); + // // State / sessions //
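Usage sketch for the API added above (not part of the patch): llama_kv_cache_apply_k_shift() exposes the previously internal K-shift, so a caller can run the RoPE re-rotation of the cached K tensors eagerly instead of waiting for the next llama_decode() to notice kv_self.has_shift. A minimal caller-side sketch, assuming ctx, n_keep, n_discard, n_ctx and n_past are set up as in examples/passkey/passkey.cpp:

    // drop the oldest tokens of sequence 0, then slide the remaining cells
    // back by n_discard; this only records per-cell position deltas
    llama_kv_cache_seq_rm   (ctx, 0, n_keep            , n_keep + n_discard);
    llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);

    n_past -= n_discard;

    // apply the recorded deltas to the cached K data right away; without this
    // call, llama_decode() applies them automatically before its next graph
    llama_kv_cache_apply_k_shift(ctx);

From dd392191ca9fc2e0244c54b4cf2b888508363490 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 09:42:21 +0200 Subject: [PATCH 02/23] llama : rename llama_kv_cache_seq_shift to llama_kv_cache_seq_add --- examples/infill/infill.cpp | 4 ++-- examples/main/main.cpp | 10 +++++----- examples/passkey/passkey.cpp | 12 ++++++------ examples/server/server.cpp | 8 ++++---- llama.cpp | 6 +++--- llama.h | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 92c67b7cff5..d4b8729dd02 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -447,8 +447,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7555dffe441..34e84d0d42f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -548,8 +548,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - 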
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -576,9 +576,9 @@ int main(int argc, char ** argv) { LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index b6ae2288bc3..f5db05c2d65 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -146,8 +146,8 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); n_past -= bd; } @@ -179,8 +179,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); n_past -= n_discard; @@ -208,8 +208,8 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); n_past -= n_discard; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 369121e885b..1b887b7a2df 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1502,8 +1502,8 @@ struct llama_server_context const int n_discard = n_left / 2; LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1778,9 +1778,9 @@ struct llama_server_context LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib 
* bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); slot.n_past_se -= bd; diff --git a/llama.cpp b/llama.cpp index 7b0961508e3..accf026b194 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2187,7 +2187,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } -static void llama_kv_cache_seq_shift( +static void llama_kv_cache_seq_add( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -12049,12 +12049,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } -void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } - llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); +void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } + llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); } void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { diff --git a/llama.h b/llama.h index 618841184ae..104ca7ead0a 100644 --- a/llama.h +++ b/llama.h @@ -515,7 +515,7 @@ extern "C" { // If the KV cache is RoPEd, the KV data is updated accordingly // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_shift( + LLAMA_API void llama_kv_cache_seq_add( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0,
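Usage sketch after the rename (not part of the patch): the change is mechanical, but the new name states the semantics, i.e. the relative delta is added to the position of every cell of the sequence in [p0, p1), complementing llama_kv_cache_seq_div, which divides positions. The context-shift pattern from examples/main/main.cpp then reads (identifiers params.n_keep, n_discard and n_past as defined there):

    // drop [n_keep, n_keep + n_discard) from sequence 0 ...
    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
    // ... and add a negative delta to the positions that remain
    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

    n_past -= n_discard;

From 89b2a43cac57beeaea9c5ea0af371d3a301d7e3f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 10:28:44 +0200 Subject: [PATCH 03/23] llama : cont k-shift refactoring + normalize type names ggml-ci --- common/common.cpp | 12 +- common/common.h | 4 +- examples/llama-bench/llama-bench.cpp | 14 +- examples/server/server.cpp | 12 +- llama.cpp | 318 +++++++++++++-------------- llama.h | 41 ++-- 6 files changed, 199 insertions(+), 202 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 10ef11829cc..95767ce4b6e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string 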
arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/common/common.h b/common/common.h index 935771d44ca..3e21579b005 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 11410f8ae76..8fec3d43ddf 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_NONE: return "none"; - case LLAMA_SPLIT_LAYER: return "layer"; - case LLAMA_SPLIT_ROW: return "row"; + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_LAYER}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_NONE; + mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_LAYER; + mode = LLAMA_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_ROW; + mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1b887b7a2df..89fdd0f8185 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2082,9 +2082,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = 
LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2208,15 +2208,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; diff --git a/llama.cpp b/llama.cpp index accf026b194..a69c86e6a8d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_NONE, "none" }, - { LLAMA_ROPE_SCALING_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_UNSPECIFIED; + return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1581,7 +1581,8 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - uint32_t pooling_type = LLAMA_POOLING_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -2311,7 +2312,7 @@ namespace GGUFMeta { } }; - struct ArrayInfo{ + struct ArrayInfo { const gguf_type gt; const size_t length; const void * data; @@ -2330,7 +2331,7 @@ namespace GGUFMeta { }; template<typename T> - class GKV: public GKV_Base<T> { + class GKV : public GKV_Base<T> { GKV() = delete; public: @@ -2353,39 +2354,39 @@ namespace GGUFMeta { return "unknown"; } - static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { - if (!override) { return false; } - if (override->tag == expected_type) { + static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { + if (!ovrd) { return false; } + if (ovrd->tag == expected_type) { LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", - __func__, override_type_to_str(override->tag), override->key); - switch (override->tag) { + __func__, override_type_to_str(ovrd->tag), ovrd->key); + switch (ovrd->tag) { case LLAMA_KV_OVERRIDE_BOOL: { - LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? 
"true" : "false"); } break; case LLAMA_KV_OVERRIDE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); } break; case LLAMA_KV_OVERRIDE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", override->float_value); + LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); } break; default: // Shouldn't be possible to end up here, but just in case... throw std::runtime_error( format("Unsupported attempt to override %s type for metadata key %s\n", - override_type_to_str(override->tag), override->key)); + override_type_to_str(ovrd->tag), ovrd->key)); } return true; } LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", - __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); + __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); return false; } template static typename std::enable_if::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { - target = override->bool_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_BOOL, ovrd)) { + target = ovrd->bool_value; return true; } return false; @@ -2393,9 +2394,9 @@ namespace GGUFMeta { template static typename std::enable_if::value && std::is_integral::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { - target = override->int_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_INT, ovrd)) { + target = ovrd->int_value; return true; } return false; @@ -2403,9 +2404,9 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { - target = override->float_value; + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, ovrd)) { + target = ovrd->float_value; return true; } return false; @@ -2413,17 +2414,17 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { + try_override(T & target, const struct llama_model_kv_override * ovrd) { (void)target; - (void)override; - if (!override) { return false; } + (void)ovrd; + if (!ovrd) { return false; } // Currently, we should never end up here so it would be a bug if we do. throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - override ? override->key : "NULL")); + ovrd ? 
ovrd->key : "NULL")); } - static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { - if (try_override(target, override)) { + static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + if (try_override(target, ovrd)) { return true; } if (k < 0) { return false; } @@ -2431,12 +2432,12 @@ namespace GGUFMeta { return true; } - static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, gguf_find_key(ctx, key), target, override); + static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, gguf_find_key(ctx, key), target, ovrd); } - static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, key.c_str(), target, override); + static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, key.c_str(), target, ovrd); } }; } @@ -2846,6 +2847,15 @@ struct llama_model_loader { } }; +template<> +bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) { + uint32_t tmp; + const bool found = get_key(kid, tmp, required); + result = (enum llama_pooling_type) tmp; + return found; +} + + // // load LLaMA models // @@ -2924,16 +2934,16 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } + static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } - static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); if (model.arch == LLM_ARCH_UNKNOWN) { @@ -2997,7 +3007,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3110,10 +3120,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); switch (hparams.n_layer) { case 3: @@ -3131,10 +3141,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_NOMIC_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + 
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); if (hparams.n_layer == 12 && hparams.n_embd == 768) { model.type = e_model::MODEL_137M; @@ -3273,6 +3283,8 @@ static void llm_load_hparams( if (hparams.f_max_alibi_bias > 0.0f) { hparams.need_kq_pos = true; } + + hparams.rope_type = llama_rope_type(&model); } // TODO: This should probably be in llama.h @@ -3575,6 +3587,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); + LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); + LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); @@ -3641,7 +3655,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_LAYER) { + if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3680,10 +3694,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_ROW) { + if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -4596,13 +4610,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam using llm_build_cb = std::function; -enum llm_rope_type : int { - LLM_ROPE_NONE = -1, - LLM_ROPE = 0, - LLM_ROPE_NEOX = 2, - LLM_ROPE_GLM = 4, -}; - enum llm_ffn_op_type { LLM_FFN_SILU, LLM_FFN_GELU, @@ -4648,47 +4655,6 @@ static struct ggml_tensor * llm_build_inp_embd( return inpL; } -// Persimmon: n_rot = n_embd_head_k/2 -// Other: n_rot = n_embd_head_k -static void llm_build_k_shift( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * K_shift, - llm_rope_type rope_type, - int64_t n_ctx, - float freq_base, - float freq_scale, - const llm_build_cb & cb) { - const int64_t n_layer = hparams.n_layer; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int32_t n_rot = hparams.n_rot; - const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; - const float ext_factor = cparams.yarn_ext_factor; - const float attn_factor = cparams.yarn_attn_factor; - const float beta_fast = cparams.yarn_beta_fast; - const float beta_slow = cparams.yarn_beta_slow; - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we 
rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - 0), - K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); - } -} - static void llm_build_kv_store( struct ggml_context * ctx, const llama_hparams & hparams, @@ -4982,38 +4948,6 @@ static struct ggml_tensor * llm_build_kv( return cur; } -static llm_rope_type llm_get_rope_type(llm_arch arch) { - switch (arch) { - case LLM_ARCH_LLAMA: return LLM_ROPE; - case LLM_ARCH_FALCON: return LLM_ROPE_NEOX; - case LLM_ARCH_BAICHUAN: return LLM_ROPE; - case LLM_ARCH_GPT2: return LLM_ROPE_NONE; - case LLM_ARCH_GPTJ: return LLM_ROPE_NONE; - case LLM_ARCH_GPTNEOX: return LLM_ROPE_NONE; - case LLM_ARCH_MPT: return LLM_ROPE_NONE; - case LLM_ARCH_STARCODER: return LLM_ROPE; - case LLM_ARCH_PERSIMMON: return LLM_ROPE_NEOX; - case LLM_ARCH_REFACT: return LLM_ROPE_NONE; - case LLM_ARCH_BERT: return LLM_ROPE_NEOX; - case LLM_ARCH_NOMIC_BERT: return LLM_ROPE_NEOX; - case LLM_ARCH_BLOOM: return LLM_ROPE_NONE; - case LLM_ARCH_STABLELM: return LLM_ROPE_NEOX; - case LLM_ARCH_QWEN: return LLM_ROPE_NEOX; - case LLM_ARCH_QWEN2: return LLM_ROPE_NEOX; - case LLM_ARCH_PHI2: return LLM_ROPE_NEOX; - case LLM_ARCH_PLAMO: return LLM_ROPE; - case LLM_ARCH_CODESHELL: return LLM_ROPE; - case LLM_ARCH_ORION: return LLM_ROPE; - case LLM_ARCH_INTERNLM2: return LLM_ROPE; - case LLM_ARCH_MINICPM: return LLM_ROPE; - case LLM_ARCH_GEMMA: return LLM_ROPE; - case LLM_ARCH_UNKNOWN: - default: - GGML_ASSERT(false && "unknown architecture"); - return LLM_ROPE_NONE; - } -} - struct llm_build_context { const llama_model & model; const llama_context & lctx; @@ -5024,6 +4958,7 @@ struct llm_build_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; const int64_t n_head_kv; @@ -5048,9 +4983,8 @@ struct llm_build_context { const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; - const uint32_t pooling_type; - - const llm_rope_type rope_type; + const enum llama_pooling_type pooling_type; + const enum llama_rope_type rope_type; const llm_build_cb & cb; @@ -5072,6 +5006,7 @@ struct llm_build_context { kv_self (lctx.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), + n_rot (hparams.n_rot), n_ctx (cparams.n_ctx), n_head (hparams.n_head), n_head_kv (hparams.n_head_kv), @@ -5093,8 +5028,8 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), - rope_type (llm_get_rope_type(model.arch)), + pooling_type (cparams.do_pooling ? 
hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5120,7 +5055,20 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, rope_type, n_ctx, freq_base, freq_scale, cb); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0), + lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); + } return gf; } @@ -6063,12 +6011,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_MEAN) { + if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_CLS) { + } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7521,6 +7469,7 @@ struct llm_build_context { static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { llama_batch dummy; + dummy.n_tokens = 0; llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; @@ -7735,7 +7684,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7763,7 +7712,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -7784,7 +7733,7 @@ static void llama_graph_compute( ggml_cgraph * gf, int n_threads) { #ifdef GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; + const int64_t n_layer = lctx.hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif @@ -7902,9 +7851,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - if (kv_self.has_shift) { - llama_kv_cache_apply_k_shift(&lctx); - } + llama_kv_cache_apply(&lctx); ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8042,24 +7989,25 @@ static int llama_decode_internal( return 0; } -void llama_kv_cache_apply_k_shift(struct llama_context * ctx) { - struct llama_context & lctx = *ctx; - - llama_set_k_shift(lctx); +static void llama_kv_cache_apply_internal(struct llama_context & lctx) { + // apply K-shift if 
needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + llama_set_k_shift(lctx); - { - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); - } + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } - { - auto & kv_self = ctx->kv_self; + { + auto & kv_self = lctx.kv_self; - kv_self.has_shift = false; + kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } } } } @@ -11338,7 +11286,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11364,7 +11312,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11552,16 +11500,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11595,8 +11543,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11605,7 +11553,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { @@ -11807,6 +11755,38 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } +enum llama_rope_type llama_rope_type(const struct llama_model * model) { + switch (model->arch) { + case LLM_ARCH_LLAMA: return LLAMA_ROPE_TYPE; + case LLM_ARCH_FALCON: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_BAICHUAN: return LLAMA_ROPE_TYPE; + case LLM_ARCH_GPT2: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_GPTJ: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_MPT: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_STARCODER: return LLAMA_ROPE_TYPE; + case LLM_ARCH_PERSIMMON: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_REFACT: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_BERT: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_NOMIC_BERT: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_BLOOM: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_STABLELM: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_QWEN: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_QWEN2: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_PHI2: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_PLAMO: return LLAMA_ROPE_TYPE; + case LLM_ARCH_CODESHELL: return LLAMA_ROPE_TYPE; + case LLM_ARCH_ORION: return LLAMA_ROPE_TYPE; + case LLM_ARCH_INTERNLM2: return LLAMA_ROPE_TYPE; + case LLM_ARCH_MINICPM: return LLAMA_ROPE_TYPE; + case LLM_ARCH_GEMMA: return LLAMA_ROPE_TYPE; + case LLM_ARCH_UNKNOWN: + default: + GGML_ASSERT(false && "unknown architecture"); + return LLAMA_ROPE_TYPE_NONE; + } +} + int32_t llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } @@ -12065,6 +12045,10 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +void llama_kv_cache_apply(struct llama_context * ctx) { + llama_kv_cache_apply_internal(*ctx); +} + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 104ca7ead0a..479265f6c1b 100644 --- a/llama.h +++ b/llama.h @@ -64,6 +64,13 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; + enum llama_rope_type { + LLAMA_ROPE_TYPE_NONE = -1, + LLAMA_ROPE_TYPE = 0, + LLAMA_ROPE_TYPE_NEOX = 2, + LLAMA_ROPE_TYPE_GLM = 4, + }; + enum llama_token_type { LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, @@ 
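// ---- illustrative sketch (not part of the patch) ----
// Hedged usage of the new accessor, given a loaded llama_model * model:
// callers can skip scheduling a K-shift entirely when the cached K carries
// no rotary position information.
if (llama_rope_type(model) == LLAMA_ROPE_TYPE_NONE) {
    // positions are not encoded in K, so no rotation is needed after a shift
}
// The numeric values (-1, 0, 2, 4) are kept in sync with the ggml_rope mode
// values, so rope_type can be passed straight through to ggml_rope_custom().
// ---- end of sketch ----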
-107,23 +114,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_NONE = 0, - LLAMA_ROPE_SCALING_LINEAR = 1, - LLAMA_ROPE_SCALING_YARN = 2, - LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_TYPE_NONE = 0, + LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, + LLAMA_ROPE_SCALING_TYPE_YARN = 2, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_NONE = 0, - LLAMA_POOLING_MEAN = 1, - LLAMA_POOLING_CLS = 2, + LLAMA_POOLING_TYPE_NONE = 0, + LLAMA_POOLING_TYPE_MEAN = 1, + LLAMA_POOLING_TYPE_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_NONE = 0, // single GPU - LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_NONE = 0, // single GPU + LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { @@ -358,6 +365,7 @@ extern "C" { LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @@ -512,7 +520,9 @@ extern "C" { llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_apply() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_add( @@ -523,7 +533,9 @@ extern "C" { llama_pos delta); // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_apply() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( @@ -533,7 +545,8 @@ extern "C" { llama_pos p1, int d); - LLAMA_API void llama_kv_cache_apply_k_shift(struct llama_context * ctx); + // Apply the KV cache updates (such as K-shifts) to the KV data + LLAMA_API void llama_kv_cache_apply(struct llama_context * ctx); // // State / sessions From 2b9a9bff2b0d70bc8c88cd3c756305129da4cbd5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 10:41:21 +0200 Subject: [PATCH 04/23] minor : fix MPI builds --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a69c86e6a8d..4b257c472da 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7733,7 +7733,7 @@ static void llama_graph_compute( ggml_cgraph * gf, int n_threads) { #ifdef GGML_USE_MPI - const int64_t n_layer = lctx.hparams.n_layer; + const int64_t n_layer = lctx.model.hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif From 5f5b1b57caee36bf9835bdea5514731b5b574322 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 10:44:59 +0200 Subject: [PATCH 05/23] llama : reuse n_rot from the build context ggml-ci --- llama.cpp | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 
insertions(+), 34 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4b257c472da..3a257a8637e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5129,14 +5129,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5300,12 +5300,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -5428,13 +5428,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5661,7 +5661,7 @@ struct llm_build_context { // RoPE the first n_rot of q/k, pass the other half, and concat. 
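// ---- illustrative sketch (not part of the patch) ----
// "Partial rotary": only the first n_rot dims of each head are rotated, the
// remaining dims pass through unchanged. A toy float32 reference for one head
// at position pos; it assumes <math.h>, ignores the YaRN parameters, and uses
// the consecutive-pair ("normal") layout for clarity, whereas the Persimmon
// path itself uses the NEOX layout:
static void partial_rope_ref(float * head, int n_embd_head, int n_rot, int pos, float freq_base) {
    for (int i = 0; i < n_rot; i += 2) {
        const float theta = pos * powf(freq_base, -(float) i / (float) n_rot);
        const float c = cosf(theta);
        const float s = sinf(theta);
        const float x0 = head[i + 0];
        const float x1 = head[i + 1];
        head[i + 0] = x0*c - x1*s; // rotate the pair (x0, x1) by theta
        head[i + 1] = x0*s + x1*c;
    }
    (void) n_embd_head; // dims [n_rot, n_embd_head) are the "pass" half
}
// ---- end of sketch ----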
struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, 0 @@ -5669,7 +5669,7 @@ struct llm_build_context { cb(qrot, "qrot", il); struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 @@ -5678,29 +5678,29 @@ struct llm_build_context { // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * hparams.n_rot + ggml_element_size(tmpq) * n_rot ); cb(qpass, "qpass", il); struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * hparams.n_rot + ggml_element_size(tmpk) * n_rot ); cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5952,14 +5952,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6284,14 +6284,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6395,13 +6395,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, 
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6510,14 +6510,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6628,7 +6628,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -6639,7 +6639,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6731,13 +6731,13 @@ struct llm_build_context { cb(Vcur, "Vcur", il); Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -6933,14 +6933,14 @@ struct llm_build_context { struct ggml_tensor * Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7046,14 +7046,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7160,14 +7160,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, 
n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7287,14 +7287,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); From 42ddf4846c556bad1599654df5cd4ec6b9a792a6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 11:23:37 +0200 Subject: [PATCH 06/23] llama : revert enum name changes from this PR ggml-ci --- common/common.cpp | 12 ++++---- common/common.h | 4 +-- examples/llama-bench/llama-bench.cpp | 14 ++++----- examples/server/server.cpp | 12 ++++---- llama.cpp | 46 ++++++++++++++-------------- llama.h | 22 ++++++------- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 95767ce4b6e..10ef11829cc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + params.split_mode = LLAMA_SPLIT_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + params.split_mode = LLAMA_SPLIT_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; + params.split_mode = LLAMA_SPLIT_ROW; } else { invalid_param = true; break; diff --git a/common/common.h b/common/common.h index 3e21579b005..935771d44ca 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model 
across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8fec3d43ddf..11410f8ae76 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; + case LLAMA_SPLIT_NONE: return "none"; + case LLAMA_SPLIT_LAYER: return "layer"; + case LLAMA_SPLIT_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* split_mode */ {LLAMA_SPLIT_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_MODE_NONE; + mode = LLAMA_SPLIT_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_MODE_LAYER; + mode = LLAMA_SPLIT_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_MODE_ROW; + mode = LLAMA_SPLIT_ROW; } else { invalid_param = true; break; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 89fdd0f8185..1b887b7a2df 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2082,9 +2082,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2208,15 +2208,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + params.split_mode = LLAMA_SPLIT_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + params.split_mode = LLAMA_SPLIT_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; + params.split_mode 
= LLAMA_SPLIT_ROW; } else { invalid_param = true; diff --git a/llama.cpp b/llama.cpp index 3a257a8637e..d950fc02282 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, - { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + return LLAMA_ROPE_SCALING_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1581,7 +1581,7 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; bool operator!=(const llama_hparams & other) const { @@ -3007,7 +3007,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3655,7 +3655,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_MODE_LAYER) { + if (split_mode == LLAMA_SPLIT_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3694,10 +3694,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_MODE_ROW) { + if (split_mode == LLAMA_SPLIT_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -5028,7 +5028,7 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + pooling_type (cparams.do_pooling ? 
hparams.pooling_type : LLAMA_POOLING_NONE), rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { @@ -6011,12 +6011,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { + if (pooling_type == LLAMA_POOLING_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { + } else if (pooling_type == LLAMA_POOLING_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7684,7 +7684,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7712,7 +7712,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -11286,7 +11286,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11312,7 +11312,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11500,16 +11500,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 
1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11543,8 +11543,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11553,7 +11553,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + // LLAMA_SPLIT_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { diff --git a/llama.h b/llama.h index 479265f6c1b..ef87ed5a652 100644 --- a/llama.h +++ b/llama.h @@ -114,23 +114,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_TYPE_NONE = 0, - LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, - LLAMA_ROPE_SCALING_TYPE_YARN = 2, - LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_TYPE_NONE = 0, - LLAMA_POOLING_TYPE_MEAN = 1, - LLAMA_POOLING_TYPE_CLS = 2, + LLAMA_POOLING_NONE = 0, + LLAMA_POOLING_MEAN = 1, + LLAMA_POOLING_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_MODE_NONE = 0, // single GPU - LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_NONE = 0, // single GPU + LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { From 31e1ec928fc11dc793135da211eae3f78a6dc68f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 11:38:00 +0200 Subject: [PATCH 07/23] llama : update llama_rope_type --- llama.cpp | 57 ++++++++++++++++++++++++++++++++----------------------- llama.h | 2 +- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/llama.cpp b/llama.cpp index d950fc02282..cbeb9714ec7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11757,31 +11757,40 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { enum llama_rope_type llama_rope_type(const struct llama_model * model) { switch (model->arch) { - case LLM_ARCH_LLAMA: return LLAMA_ROPE_TYPE; - case LLM_ARCH_FALCON: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_BAICHUAN: return LLAMA_ROPE_TYPE; - case LLM_ARCH_GPT2: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_GPTJ: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_MPT: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_STARCODER: return LLAMA_ROPE_TYPE; - case LLM_ARCH_PERSIMMON: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_REFACT: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_BERT: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_NOMIC_BERT: return 
LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_BLOOM: return LLAMA_ROPE_TYPE_NONE; - case LLM_ARCH_STABLELM: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_QWEN: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_QWEN2: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_PHI2: return LLAMA_ROPE_TYPE_NEOX; - case LLM_ARCH_PLAMO: return LLAMA_ROPE_TYPE; - case LLM_ARCH_CODESHELL: return LLAMA_ROPE_TYPE; - case LLM_ARCH_ORION: return LLAMA_ROPE_TYPE; - case LLM_ARCH_INTERNLM2: return LLAMA_ROPE_TYPE; - case LLM_ARCH_MINICPM: return LLAMA_ROPE_TYPE; - case LLM_ARCH_GEMMA: return LLAMA_ROPE_TYPE; + // these models do not use RoPE + case LLM_ARCH_GPT2: + case LLM_ARCH_GPTJ: + case LLM_ARCH_GPTNEOX: + case LLM_ARCH_MPT: + case LLM_ARCH_REFACT: + case LLM_ARCH_BLOOM: + return LLAMA_ROPE_TYPE_NONE; + + // use what we call a normal RoPE, operating on pairs of consecutive head values + case LLM_ARCH_LLAMA: + case LLM_ARCH_BAICHUAN: + case LLM_ARCH_STARCODER: + case LLM_ARCH_PLAMO: + case LLM_ARCH_CODESHELL: + case LLM_ARCH_ORION: + case LLM_ARCH_INTERNLM2: + case LLM_ARCH_MINICPM: + case LLM_ARCH_GEMMA: + return LLAMA_ROPE_TYPE_NORM; + + // the pairs of head values are offset by n_rot/2 + case LLM_ARCH_FALCON: + case LLM_ARCH_PERSIMMON: + case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_STABLELM: + case LLM_ARCH_QWEN: + case LLM_ARCH_QWEN2: + case LLM_ARCH_PHI2: + return LLAMA_ROPE_TYPE_NEOX; + + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: - default: GGML_ASSERT(false && "unknown architecture"); return LLAMA_ROPE_TYPE_NONE; } diff --git a/llama.h b/llama.h index ef87ed5a652..160feeda4fb 100644 --- a/llama.h +++ b/llama.h @@ -66,7 +66,7 @@ extern "C" { enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, - LLAMA_ROPE_TYPE = 0, + LLAMA_ROPE_TYPE_NORM = 0, LLAMA_ROPE_TYPE_NEOX = 2, LLAMA_ROPE_TYPE_GLM = 4, }; From decea312200d183001ff2384c0c1b2fc36f52034 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 11:42:55 +0200 Subject: [PATCH 08/23] llama : add comment about rope values --- llama.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.h b/llama.h index 160feeda4fb..dda6aa39d36 100644 --- a/llama.h +++ b/llama.h @@ -64,6 +64,8 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; + // note: these values should be synchronized with ggml_rope + // TODO: maybe move this enum to ggml.h (ggml_rope_type) enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, LLAMA_ROPE_TYPE_NORM = 0, From 8f9fe6dd7fb7102271ec6e04bbe668c553e3a6d1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:40:44 +0200 Subject: [PATCH 09/23] llama : fix build --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cbeb9714ec7..f074bb2628b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11792,8 +11792,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ASSERT(false && "unknown architecture"); - return LLAMA_ROPE_TYPE_NONE; + break; } + + return LLAMA_ROPE_TYPE_NONE; } int32_t llama_n_vocab(const struct llama_model * model) { From 79e276175e50495aca4460568ad5ee1490bac732 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:44:02 +0200 Subject: [PATCH 10/23] passkey : apply kv cache updates explicitly ggml-ci --- examples/passkey/passkey.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index f5db05c2d65..4e129947cb6 100644 --- 
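// ---- illustrative sketch (not part of the patch) ----
// The regrouped switch above encodes which value indices form a rotated pair.
// A toy index-only helper (assumes <utility>; k is the pair index):
static std::pair<int, int> rope_pair(int k, int n_rot, bool neox) {
    return neox ? std::make_pair(k, k + n_rot/2) // NEOX: pairs offset by n_rot/2
                : std::make_pair(2*k, 2*k + 1);  // NORM: consecutive values
}
// ---- end of sketch ----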
a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -148,6 +148,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_apply (ctx); n_past -= bd; } @@ -181,6 +182,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_apply (ctx); n_past -= n_discard; @@ -210,6 +212,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_apply (ctx); n_past -= n_discard; } From 18da970e1c8bf9489fdf1d0d1cd2c5ff9d60754a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:46:33 +0200 Subject: [PATCH 11/23] llama : change name to llama_kv_cache_update() --- examples/passkey/passkey.cpp | 6 +++--- llama.cpp | 8 ++++---- llama.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 4e129947cb6..574728f89a5 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -148,7 +148,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_apply (ctx); + llama_kv_cache_update (ctx); n_past -= bd; } @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - llama_kv_cache_apply (ctx); + llama_kv_cache_update (ctx); n_past -= n_discard; @@ -212,7 +212,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - llama_kv_cache_apply (ctx); + llama_kv_cache_update (ctx); n_past -= n_discard; } diff --git a/llama.cpp b/llama.cpp index f074bb2628b..263fdf13e4f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7851,7 +7851,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - llama_kv_cache_apply(&lctx); + llama_kv_cache_update(&lctx); ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -7989,7 +7989,7 @@ static int llama_decode_internal( return 0; } -static void llama_kv_cache_apply_internal(struct llama_context & lctx) { +static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { llama_set_k_shift(lctx); @@ -12056,8 +12056,8 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } -void llama_kv_cache_apply(struct llama_context * ctx) { - llama_kv_cache_apply_internal(*ctx); +void llama_kv_cache_update(struct llama_context * ctx) { + llama_kv_cache_update_internal(*ctx); } diff --git a/llama.h b/llama.h index dda6aa39d36..b1621d6a3f1 100644 --- a/llama.h +++ b/llama.h @@ -524,7 +524,7 @@ extern "C" { // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions 
in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_apply() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_add( @@ -537,7 +537,7 @@ extern "C" { // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_apply() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( @@ -548,7 +548,7 @@ extern "C" { int d); // Apply the KV cache updates (such as K-shifts) to the KV data - LLAMA_API void llama_kv_cache_apply(struct llama_context * ctx); + LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); // // State / sessions From b75ec64ed21dc965b35fb35bf597e6f8d2d52e5a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 12:54:29 +0200 Subject: [PATCH 12/23] llama : add llama_kv_cache_seq_pos_max() --- examples/passkey/passkey.cpp | 6 +++--- llama.cpp | 16 ++++++++++++++++ llama.h | 5 +++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 574728f89a5..1e483edc025 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -150,7 +150,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); llama_kv_cache_update (ctx); - n_past -= bd; + n_past = llama_kv_cache_seq_pos_max(ctx, 0); } llama_batch_clear(batch); @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0); llama_batch_clear(batch); @@ -214,7 +214,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0); } } diff --git a/llama.cpp b/llama.cpp index 263fdf13e4f..46c82b4adea 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2241,6 +2241,18 @@ static void llama_kv_cache_seq_div( } } +static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id)) { + result = std::max(result, cache.cells[i].pos); + } + } + + return result; +} + // // model loading and saving // @@ -12056,6 +12068,10 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); +} + void llama_kv_cache_update(struct llama_context * ctx) { llama_kv_cache_update_internal(*ctx); } diff --git a/llama.h b/llama.h index b1621d6a3f1..faea891e479 100644 --- a/llama.h +++ b/llama.h @@ -547,6 +547,11 @@ extern "C" { llama_pos p1, int d); + // Returns the largest position present in the KV cache for the specified sequence + LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id); + // Apply the KV cache updates (such as K-shifts) to the KV data LLAMA_API void llama_kv_cache_update(struct llama_context * 
ctx); From 032ff857064176fbf8f1028311129b89faf336fe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 10:58:18 +0200 Subject: [PATCH 13/23] passkey : fix llama_kv_cache_seq_pos_max() usage --- examples/passkey/passkey.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 1e483edc025..a3a63977fc2 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -150,7 +150,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); llama_kv_cache_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0); + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } llama_batch_clear(batch); @@ -184,7 +184,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0); + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; llama_batch_clear(batch); @@ -214,7 +214,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0); + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } } From 715a3433436cb7a524461c20b89d2fc13589c5cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 10:59:52 +0200 Subject: [PATCH 14/23] llama : some llama_kv_cell simplifications --- llama.cpp | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/llama.cpp b/llama.cpp index 46c82b4adea..0effc6db3f0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1709,6 +1709,14 @@ struct llama_kv_cell { bool has_seq_id(const llama_seq_id & id) const { return seq_id.find(id) != seq_id.end(); } + + bool is_empty() const { + return seq_id.empty(); + } + + bool is_same_seq(const llama_kv_cell & other) const { + return seq_id == other.seq_id; + } }; // ring-buffer of cached KV data @@ -2101,7 +2109,7 @@ static bool llama_kv_cache_find_slot( // find how many cells are currently in use static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { for (uint32_t i = cache.size - 1; i > 0; --i) { - if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { + if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) { return i + 1; } } @@ -2137,7 +2145,7 @@ static void llama_kv_cache_seq_rm( } else { continue; } - if (cache.cells[i].seq_id.empty()) { + if (cache.cells[i].is_empty()) { // keep count of the number of used cells if (cache.cells[i].pos >= 0) cache.used--; @@ -2206,10 +2214,14 @@ static void llama_kv_cache_seq_add( cache.cells[i].delta += delta; if (cache.cells[i].pos < 0) { - if (!cache.cells[i].seq_id.empty()) cache.used--; + if (!cache.cells[i].is_empty()) { + cache.used--; + } cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; + if (new_head == cache.size) { + new_head = i; + } } } } @@ -11618,8 +11630,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, - cparams.n_ctx, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -12203,10 +12214,10 @@ 
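// ---- illustrative note (not part of the patch) ----
// With llama_kv_cache_seq_pos_max() the caller no longer tracks position
// deltas by hand after cache edits:
//   n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
// The "+ 1" matters: the function returns the largest position still present,
// while n_past must be the position the next decoded token will take; hence
// the follow-up "fix llama_kv_cache_seq_pos_max() usage" patch above.
// ---- end of note ----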
static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const auto n_layer = hparams.n_layer; - const auto n_embd_k_gqa = hparams.n_embd_k_gqa(); - const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); - const auto n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; @@ -12221,14 +12232,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + tmp_buf.resize(k_size); ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); @@ -12315,10 +12328,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const int n_layer = hparams.n_layer; - const int n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; size_t kv_buf_size; uint32_t kv_head; @@ -12334,13 +12347,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { GGML_ASSERT(kv_self.total_size() == kv_buf_size); for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; From fdfa5bc76b52b3551343d606069cb3107433f236 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 11:00:19 +0200 Subject: [PATCH 15/23] llama : add llama_kv_cache_compress (EXPERIMENTAL) --- examples/passkey/passkey.cpp | 7 +- llama.cpp | 210 ++++++++++++++++++++++++++++++++++- llama.h | 5 + 3 files changed, 215 insertions(+), 7 deletions(-) diff --git a/examples/passkey/passkey.cpp 
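// ---- illustrative note (not part of the patch) ----
// The new llama_kv_cell helpers read as set operations and are what the
// compression pass in the next patch builds on, roughly:
//   if (!cell0.is_empty() && cell0.is_same_seq(cell1)) { /* merge candidates */ }
// is_same_seq() compares full sequence membership, so two cells only merge
// when they belong to exactly the same set of sequences.
// ---- end of note ----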
b/examples/passkey/passkey.cpp index a3a63977fc2..e2725aaa6e9 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -146,9 +146,10 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_compress(ctx, 0); + llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } diff --git a/llama.cpp b/llama.cpp index 0effc6db3f0..e90609089de 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1733,6 +1733,12 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + // if non-negative, compress data on next update + llama_pos compress_delta = -1; + std::vector<llama_kv_cell> cells; std::vector<struct ggml_tensor *> k_l; // per layer @@ -1968,8 +1974,8 @@ struct llama_context { static bool llama_kv_cache_init( struct llama_kv_cache & cache, const llama_model & model, - ggml_type ktype, - ggml_type vtype, + ggml_type type_k, + ggml_type type_v, uint32_t n_ctx, bool offload) { const struct llama_hparams & hparams = model.hparams; @@ -1984,6 +1990,9 @@ static bool llama_kv_cache_init( cache.size = n_ctx; cache.used = 0; + cache.type_k = type_k; + cache.type_v = type_v; + cache.cells.clear(); cache.cells.resize(n_ctx); @@ -2024,8 +2033,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -2265,6 +2274,10 @@ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama return result; } +static void llama_kv_cache_compress(struct llama_kv_cache & cache, llama_pos delta) { + cache.compress_delta = delta; +} + // // model loading and saving // @@ -8034,6 +8047,191 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } } } + + // compress the KV cache data if needed: + // + // - determine which KV cell pairs (i0, i1) to merge: + // + // abs(cell[i0].pos - cell[i1].pos) <= compress_delta + // + // - move the KV cache to the Host memory for easier manipulation + // - processing is done layer-by-layer + // - convert the KV data to F32 + // - merge the KV data (different ways to merge) + // - convert the KV data back to the original type + // - move the KV cache back to the device memory + // - update the KV cache metadata + // + // as a side effect, the new KV cache is defragmented + // + if (lctx.kv_self.compress_delta >= 0) { + auto & kv_self = lctx.kv_self; + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t kv_size = kv_self.size; + + std::vector<uint8_t> buf_q; + + std::vector<float> buf_src_f32; + std::vector<float>
buf_dst_f32; + + const int64_t t_start = ggml_time_us(); + + struct c_pair { uint32_t i0, i1; }; + struct c_info { bool merged; uint32_t id, cnt;}; + + std::vector infos(kv_size, { false, 0, 0 }); + + // the destination cell in the new KV cache + uint32_t id = 0; + + // number of pairs merged + uint32_t n_merges = 0; + + // determine which KV cells to merge + for (uint32_t i0 = 0; i0 < kv_size; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty() && !infos[i0].merged) { + infos[i0] = { true, id, 0 }; + infos[id].cnt = 1; + + const llama_pos p0 = cell0.pos; + + for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (i0 != i1 && cell0.is_same_seq(cell1)) { + const llama_pos p1 = cell1.pos; + + if (std::abs(p0 - p1) <= kv_self.compress_delta) { + infos[i1] = { true, id, 0 }; + infos[id].cnt++; + n_merges++; + } + } + } + + if (i0 != id) { + kv_self.cells[id] = cell0; + } + + id++; + } + } + + kv_self.head = id; + kv_self.used = id; + + for (uint32_t i = id; i < kv_size; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } + + LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); + + ggml_type_traits_t tt_k; + ggml_type_traits_t tt_v; + + tt_k = ggml_internal_get_type_traits(kv_self.type_k); + tt_v = ggml_internal_get_type_traits(kv_self.type_v); + + for (uint32_t il = 0; il < n_layer; ++il) { + // update keys + { + const int64_t ne = n_embd_k_gqa*kv_size; + + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); + + buf_q.resize(k_size); + + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); + + tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); + + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } + + const uint32_t id = infos[i].id; + + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + + const int64_t os = i*n_embd_k_gqa; + const int64_t od = id*n_embd_k_gqa; + + for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { + buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; + } + } + } + + tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); + + ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); + } + + // update values (note: they are transposed) + { + const int64_t ne = n_embd_v_gqa*kv_size; + + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); + + buf_q.resize(v_size); + + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); + + ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + + tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); + + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } + + const uint32_t id = infos[i].id; + + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); + + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; + } + } + } + + tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); + + ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + } + } + + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); + + kv_self.compress_delta = -1; + } } // @@ -12083,6 +12281,10 @@ llama_pos 
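// ---- illustrative sketch (not part of the patch) ----
// The merge above averages all source rows mapped to one destination cell,
// operating on dequantized f32 copies. A self-contained toy version for a
// contiguous row (the real V path additionally strides by kv_size because V
// is stored transposed):
static void merge_rows_avg(const float * src, float * dst, int n, int cnt) {
    const float scale = 1.0f / (float) cnt; // each of the cnt merged cells contributes equally
    for (int j = 0; j < n; ++j) {
        dst[j] += src[j] * scale;           // dst must start zero-filled
    }
}
// ---- end of sketch ----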
llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id se
     return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
 }
 
+void llama_kv_cache_compress(struct llama_context * ctx, llama_pos delta) {
+    llama_kv_cache_compress(ctx->kv_self, delta);
+}
+
 void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }

diff --git a/llama.h b/llama.h
index faea891e479..3fac7b79c82 100644
--- a/llama.h
+++ b/llama.h
@@ -552,6 +552,11 @@ extern "C" {
             struct llama_context * ctx,
             llama_seq_id seq_id);
 
+    // [EXPERIMENTAL] Compress the data in the KV cache
+    LLAMA_API void llama_kv_cache_compress(
+            struct llama_context * ctx,
+                   llama_pos delta);
+
     // Apply the KV cache updates (such as K-shifts) to the KV data
     LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);

From 9ec749df59982d84f7a5bafb8d08a2f4ca08f00f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 25 Feb 2024 13:57:43 +0200
Subject: [PATCH 16/23] llama : add alternative KV cache merging (EXPERIMENTAL)

---
 llama.cpp | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 1fb53f3db45..2c05921bb86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8072,10 +8072,13 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
         const auto & hparams = lctx.model.hparams;
 
-        const uint32_t n_layer = hparams.n_layer;
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t kv_size = kv_self.size;
+        const uint32_t n_layer        = hparams.n_layer;
+        const uint32_t n_embd_k_gqa   = hparams.n_embd_k_gqa();
+        const uint32_t n_embd_v_gqa   = hparams.n_embd_v_gqa();
+        const uint32_t n_embd_head_k  = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k);
+        const uint32_t n_embd_head_v  = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v);
+        const uint32_t n_head_kv      = hparams.n_head_kv;     GGML_UNUSED(n_head_kv);
+        const uint32_t kv_size        = kv_self.size;
 
         std::vector<uint8_t> buf_q;
 
@@ -8085,9 +8088,9 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         const int64_t t_start = ggml_time_us();
 
         struct c_pair { uint32_t i0, i1; };
-        struct c_info { bool merged; uint32_t id, cnt;};
+        struct c_info { bool merged; uint32_t id, cnt, r; };
 
-        std::vector<c_info> infos(kv_size, { false, 0, 0 });
+        std::vector<c_info> infos(kv_size, { false, 0, 0, 0 });
 
         // the destination cell in the new KV cache
         uint32_t id = 0;
 
@@ -8100,7 +8103,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
             const auto & cell0 = kv_self.cells[i0];
 
             if (!cell0.is_empty() && !infos[i0].merged) {
-                infos[i0] = { true, id, 0 };
+                infos[i0] = { true, id, 0, 0 };
                 infos[id].cnt = 1;
 
                 const llama_pos p0 = cell0.pos;
 
@@ -8112,7 +8115,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
                         const llama_pos p1 = cell1.pos;
 
                         if (std::abs(p0 - p1) <= kv_self.compress_delta) {
-                            infos[i1] = { true, id, 0 };
+                            infos[i1] = { true, id, 0, 0 };
                             infos[id].cnt++;
                             n_merges++;
                         }
 
@@ -8143,6 +8146,10 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         tt_v = ggml_internal_get_type_traits(kv_self.type_v);
 
         for (uint32_t il = 0; il < n_layer; ++il) {
+            for (uint32_t i = 0; i < kv_size; ++i) {
+                infos[i].r = 0;
+            }
+
             // update keys
             {
                 const int64_t ne = n_embd_k_gqa*kv_size;
 
@@ -8167,6 +8174,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
                     const uint32_t id = infos[i].id;
 
+#if 1
                     // merge using averaging
                     {
                         const float
scale = 1.0f/float(infos[id].cnt); @@ -8178,6 +8186,25 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; } } +#else + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } + + const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; + const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; + + for (uint32_t j = 0; j < n_embd_head_k; ++j) { + buf_dst_f32[od + j] = buf_src_f32[os + j]; + } + } + } + + infos[id].r++; +#endif } tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); @@ -8185,6 +8212,10 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); } + for (uint32_t i = 0; i < kv_size; ++i) { + infos[i].r = 0; + } + // update values (note: they are transposed) { const int64_t ne = n_embd_v_gqa*kv_size; @@ -8209,6 +8240,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { const uint32_t id = infos[i].id; +#if 1 // merge using averaging { const float scale = 1.0f/float(infos[id].cnt); @@ -8221,6 +8253,25 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; } } +#else + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } + + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { + buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; + } + } + } + + infos[id].r++; +#endif } tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); From 65f21ec5d3e774978765f4de82231809c2cc3e72 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 15:00:45 +0200 Subject: [PATCH 17/23] llama : add llama_kv_cache_defrag --- examples/passkey/passkey.cpp | 2 + llama.cpp | 489 ++++++++++++++++++++++------------- llama.h | 11 +- 3 files changed, 327 insertions(+), 175 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e2725aaa6e9..4c8a041359f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -183,6 +183,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; @@ -213,6 +214,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; diff --git a/llama.cpp b/llama.cpp index 2c05921bb86..61539b24ae7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1722,6 +1722,7 @@ struct llama_kv_cell { // ring-buffer of cached KV data struct llama_kv_cache { bool has_shift = false; + bool do_defrag = false; // Note: The value of head isn't only used to optimize searching // for a free KV slot. 
llama_decode_internal also uses it, so it @@ -2278,6 +2279,10 @@ static void llama_kv_cache_compress(struct llama_kv_cache & cache, llama_pos del cache.compress_delta = delta; } +static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { + cache.do_defrag = true; +} + // // model loading and saving // @@ -8029,262 +8034,394 @@ static int llama_decode_internal( return 0; } -static void llama_kv_cache_update_internal(struct llama_context & lctx) { - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { - llama_set_k_shift(lctx); +// summary: +// +// - determine which KV cell pairs (i0, i1) to merge: +// +// abs(cell[i0].pos - cell[i1].pos) <= compress_delta +// +// - move the KV cache to the Host memory for easier maniiplation +// - processing is done layer-by-layer +// - convert the KV data to F32 +// - merge the KV data (different ways to merge) +// - convert the KV data back to the original type +// - move the KV cache back to the device memory +// - update the KV cache metadata +// +// as a side effect, the new KV cache is defragmented +// +static void llama_kv_cache_compress_internal(struct llama_context & lctx) { + auto & kv_self = lctx.kv_self; - { - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + const auto & hparams = lctx.model.hparams; - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); - } + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_head_k = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k); + const uint32_t n_embd_head_v = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v); + const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv); + const uint32_t kv_size = kv_self.size; - { - auto & kv_self = lctx.kv_self; + const int64_t t_start = ggml_time_us(); - kv_self.has_shift = false; + std::vector buf_q; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + std::vector buf_src_f32; + std::vector buf_dst_f32; + + struct c_pair { uint32_t i0, i1; }; + struct c_info { bool merged; uint32_t id, cnt, r; }; + + std::vector infos(kv_size, { false, 0, 0, 0 }); + + // the destination cell in the new KV cache + uint32_t id = 0; + + // number of pairs merged + uint32_t n_merges = 0; + + // determine which KV cells to merge + for (uint32_t i0 = 0; i0 < kv_size; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty() && !infos[i0].merged) { + infos[i0] = { true, id, 0, 0 }; + infos[id].cnt = 1; + + const llama_pos p0 = cell0.pos; + + for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (i0 != i1 && cell0.is_same_seq(cell1)) { + const llama_pos p1 = cell1.pos; + + if (std::abs(p0 - p1) <= kv_self.compress_delta) { + infos[i1] = { true, id, 0, 0 }; + infos[id].cnt++; + n_merges++; + } + } + } + + if (i0 != id) { + kv_self.cells[id] = cell0; } + + id++; } } - // compress the KV cache data if needed: - // - // - determine which KV cell pairs (i0, i1) to merge: - // - // abs(cell[i0].pos - cell[i1].pos) <= compress_delta - // - // - move the KV cache to the Host memory for easier maniiplation - // - processing is done layer-by-layer - // - convert the KV data to F32 - // - merge the KV data (different ways to merge) - // - convert the KV data back to the original type - // - move the KV cache back to the device memory - // - update the KV cache metadata - // - // as a side effect, the new 
KV cache is defragmented - // - if (lctx.kv_self.compress_delta >= 0) { - auto & kv_self = lctx.kv_self; + kv_self.head = id; + kv_self.used = id; - const auto & hparams = lctx.model.hparams; + for (uint32_t i = id; i < kv_size; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_embd_head_k = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k); - const uint32_t n_embd_head_v = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v); - const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv); - const uint32_t kv_size = kv_self.size; + LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); - std::vector buf_q; + ggml_type_traits_t tt_k; + ggml_type_traits_t tt_v; - std::vector buf_src_f32; - std::vector buf_dst_f32; + tt_k = ggml_internal_get_type_traits(kv_self.type_k); + tt_v = ggml_internal_get_type_traits(kv_self.type_v); - const int64_t t_start = ggml_time_us(); + for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t i = 0; i < kv_size; ++i) { + infos[i].r = 0; + } - struct c_pair { uint32_t i0, i1; }; - struct c_info { bool merged; uint32_t id, cnt, r; }; + // update keys + { + const int64_t ne = n_embd_k_gqa*kv_size; - std::vector infos(kv_size, { false, 0, 0, 0 }); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); - // the destination cell in the new KV cache - uint32_t id = 0; + buf_q.resize(k_size); - // number of pairs merged - uint32_t n_merges = 0; + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); - // determine which KV cells to merge - for (uint32_t i0 = 0; i0 < kv_size; ++i0) { - const auto & cell0 = kv_self.cells[i0]; + ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - if (!cell0.is_empty() && !infos[i0].merged) { - infos[i0] = { true, id, 0, 0 }; - infos[id].cnt = 1; + tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); - const llama_pos p0 = cell0.pos; + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { - const auto & cell1 = kv_self.cells[i1]; + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } - if (i0 != i1 && cell0.is_same_seq(cell1)) { - const llama_pos p1 = cell1.pos; + const uint32_t id = infos[i].id; - if (std::abs(p0 - p1) <= kv_self.compress_delta) { - infos[i1] = { true, id, 0, 0 }; - infos[id].cnt++; - n_merges++; - } +#if 1 + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + + const int64_t os = i*n_embd_k_gqa; + const int64_t od = id*n_embd_k_gqa; + + for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { + buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; } } +#else + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } + + const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; + const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; - if (i0 != id) { - kv_self.cells[id] = cell0; + for (uint32_t j = 0; j < n_embd_head_k; ++j) { + buf_dst_f32[od + j] = buf_src_f32[os + j]; + } + } } - id++; + infos[id].r++; +#endif } - } - kv_self.head = id; - kv_self.used = id; + tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); - for (uint32_t i = id; i < kv_size; ++i) { - kv_self.cells[i] = llama_kv_cell(); + ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); } - LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); - - 
ggml_type_traits_t tt_k; - ggml_type_traits_t tt_v; - - tt_k = ggml_internal_get_type_traits(kv_self.type_k); - tt_v = ggml_internal_get_type_traits(kv_self.type_v); - - for (uint32_t il = 0; il < n_layer; ++il) { - for (uint32_t i = 0; i < kv_size; ++i) { - infos[i].r = 0; - } + for (uint32_t i = 0; i < kv_size; ++i) { + infos[i].r = 0; + } - // update keys - { - const int64_t ne = n_embd_k_gqa*kv_size; + // update values (note: they are transposed) + { + const int64_t ne = n_embd_v_gqa*kv_size; - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); - buf_q.resize(k_size); + buf_q.resize(v_size); - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); + buf_src_f32.resize(ne); + buf_dst_f32.resize(ne); - ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); - tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); + tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } + for (uint32_t i = 0; i < kv_size; ++i) { + if (!infos[i].merged) { + continue; + } - const uint32_t id = infos[i].id; + const uint32_t id = infos[i].id; #if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); + // merge using averaging + { + const float scale = 1.0f/float(infos[id].cnt); + //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); - const int64_t os = i*n_embd_k_gqa; - const int64_t od = id*n_embd_k_gqa; + const int64_t os = i; + const int64_t od = id; - for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { - buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; - } + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; } + } #else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } + // merge separate heads + { + for (uint32_t h = 0; h < n_head_kv; ++h) { + if ((h + il) % infos[id].cnt != infos[id].r) { + continue; + } - const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; - const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; + const int64_t os = i; + const int64_t od = id; - for (uint32_t j = 0; j < n_embd_head_k; ++j) { - buf_dst_f32[od + j] = buf_src_f32[os + j]; - } + for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { + buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; } } + } - infos[id].r++; + infos[id].r++; #endif - } + } - tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); + tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); - ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - } + ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + } + } - for (uint32_t i = 0; i < kv_size; ++i) { - infos[i].r = 0; + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); +} + +// copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache +// removing any empty segments that may have been left by previous KV cache operations +// TODO: optimizations are possible: +// - multiple threads +// - avoid copying to the host memory when already there +// TODO: can we do all this on-device? 
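Before the defragmentation pass that follows, it helps to pin down the merge rule used by the compression path above. The snippet below is a stand-alone illustrative sketch of the "merge using averaging" branch, not part of the patch: merge_rows_avg, dst_id and cnt are hypothetical names standing in for the loop over infos, infos[i].id and infos[id].cnt, and every row is assumed to take part in a merge (the patch skips rows with !infos[i].merged).

    #include <cstdint>
    #include <vector>

    // each source row i is accumulated into its destination row dst_id[i] with
    // weight 1/cnt[dst_id[i]], so a destination row ends up as the arithmetic
    // mean of all rows merged into it; dst must be zero-initialized by the caller
    static void merge_rows_avg(
            const std::vector<float>    & src,     // kv_size rows of row_size floats
            std::vector<float>          & dst,     // same shape, pre-filled with 0.0f
            const std::vector<uint32_t> & dst_id,  // destination cell per source row
            const std::vector<uint32_t> & cnt,     // number of rows merged per destination
            size_t row_size) {
        for (size_t i = 0; i < dst_id.size(); ++i) {
            const uint32_t id    = dst_id[i];
            const float    scale = 1.0f/float(cnt[id]);
            for (size_t j = 0; j < row_size; ++j) {
                dst[id*row_size + j] += src[i*row_size + j]*scale;
            }
        }
    }

The V data is merged with the same rule but a stride of kv_size between consecutive elements of a cell, since the V cache is stored transposed.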
+static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { + auto & kv_self = lctx.kv_self; + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + + const uint32_t kv_size = kv_self.size; + + const int64_t t_start = ggml_time_us(); + + std::vector buf_k; + std::vector buf_v; + + // the destination cell in the new KV cache + uint32_t id = 0; + + // number of cells moved + uint32_t n_moves = 0; + + // determine which KV cells to move where + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_kv; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = id; + + if (i0 != id) { + kv_self.cells[id] = cell0; + n_moves++; } - // update values (note: they are transposed) - { - const int64_t ne = n_embd_v_gqa*kv_size; + id++; + } + } - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); + if (n_moves == 0) { + return; + } - buf_q.resize(v_size); + LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); + kv_self.head = id; + kv_self.used = id; - ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + // zero the rest of the cells + for (uint32_t i = id; i < n_kv; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } - tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } + buf_k.resize(k_size); + buf_v.resize(v_size); - const uint32_t id = infos[i].id; + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); -#if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); - //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; - const int64_t os = i; - const int64_t od = id; + if (i == id || id == n_kv) { + continue; + } - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; - } - } -#else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } + uint32_t nm = 1; - const int64_t os = i; - const int64_t od = id; + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } - for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { - buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; - } - } - } + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; - infos[id].r++; -#endif + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + 
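+                // note: the V cache is transposed - element j of cell c lives at
+                // byte offset (c + j*kv_size)*v_size_el, so a run of nm cells is
+                // moved with n_embd_v_gqa strided copies of nm elements each,
+                // rather than the single contiguous memcpy per row used for K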
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } + + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); +} + +static void llama_kv_cache_update_internal(struct llama_context & lctx) { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + llama_set_k_shift(lctx); + + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = lctx.kv_self; - ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); + kv_self.has_shift = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; } } + } - const int64_t t_end = ggml_time_us(); + // compress the KV cache data if needed + if (lctx.kv_self.compress_delta >= 0) { + llama_kv_cache_compress_internal(lctx); + + lctx.kv_self.compress_delta = -1; + lctx.kv_self.do_defrag = false; + } - LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); + // defragment the KV cache if needed + if (lctx.kv_self.do_defrag) { + llama_kv_cache_defrag_internal(lctx); - kv_self.compress_delta = -1; + lctx.kv_self.do_defrag = false; } } @@ -12360,6 +12497,10 @@ void llama_kv_cache_compress(struct llama_context * ctx, llama_pos delta) { llama_kv_cache_compress(ctx->kv_self, delta); } +void llama_kv_cache_defrag(struct llama_context * ctx) { + llama_kv_cache_defrag(ctx->kv_self); +} + void llama_kv_cache_update(struct llama_context * ctx) { llama_kv_cache_update_internal(*ctx); } diff --git a/llama.h b/llama.h index 8f959824fd0..862d555e2b9 100644 --- a/llama.h +++ b/llama.h @@ -555,11 +555,20 @@ extern "C" { llama_seq_id seq_id); // [EXPERIMENTAL] Compress the data in the KV cache + // This will be applied: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() LLAMA_API void llama_kv_cache_compress( struct llama_context * ctx, llama_pos delta); - // Apply the KV cache updates (such as K-shifts) to the KV data + // Defragment the KV cache + // This will be applied: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() + LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); + + // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
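A usage sketch (not part of the patch; n_keep and n_discard are the variables from the passkey example earlier in this patch): after removing a range and shifting the remainder, mark the cache and then apply all pending updates in one go:

    llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
    llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);

    llama_kv_cache_defrag (ctx);   // only marks the cache - no work happens yet
    llama_kv_cache_update (ctx);   // applies the pending K-shift and the defrag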
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); // From 1b6aeb830903926a2187a38ebb7e14b397206c97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 15:30:06 +0200 Subject: [PATCH 18/23] llama : comments --- llama.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 61539b24ae7..dc491f14b62 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8040,7 +8040,7 @@ static int llama_decode_internal( // // abs(cell[i0].pos - cell[i1].pos) <= compress_delta // -// - move the KV cache to the Host memory for easier maniiplation +// - move the KV cache to the host memory for easier manipulation // - processing is done layer-by-layer // - convert the KV data to F32 // - merge the KV data (different ways to merge) @@ -8269,11 +8269,14 @@ static void llama_kv_cache_compress_internal(struct llama_context & lctx) { } // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache -// removing any empty segments that may have been left by previous KV cache operations +// this way we eliminate any empty segments that may have been left by previous KV cache operations +// // TODO: optimizations are possible: // - multiple threads // - avoid copying to the host memory when already there +// // TODO: can we do all this on-device? +// static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { auto & kv_self = lctx.kv_self; From 2d7203b975334fd8e3ccf6e93bca66ebacfef436 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 15:32:02 +0200 Subject: [PATCH 19/23] llama : remove llama_kv_cache_compress will add in a separate PR ggml-ci --- examples/passkey/passkey.cpp | 1 - llama.cpp | 253 ----------------------------------- llama.h | 8 -- 3 files changed, 262 deletions(-) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 4c8a041359f..47de67a9304 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -148,7 +148,6 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_compress(ctx, 0); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; diff --git a/llama.cpp b/llama.cpp index dc491f14b62..75189e719ae 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1737,9 +1737,6 @@ struct llama_kv_cache { ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; - // if non-negative, compress data on next update - llama_pos compress_delta = -1; - std::vector cells; std::vector k_l; // per layer @@ -2275,10 +2272,6 @@ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama return result; } -static void llama_kv_cache_compress(struct llama_kv_cache & cache, llama_pos delta) { - cache.compress_delta = delta; -} - static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { cache.do_defrag = true; } @@ -8034,240 +8027,6 @@ static int llama_decode_internal( return 0; } -// summary: -// -// - determine which KV cell pairs (i0, i1) to merge: -// -// abs(cell[i0].pos - cell[i1].pos) <= compress_delta -// -// - move the KV cache to the host memory for easier manipulation -// - processing is done layer-by-layer -// - convert the KV data to F32 -// - merge the KV data (different ways to merge) -// - convert the KV data back to the original type -// - move the KV cache back to the device memory -// - update the KV cache metadata -// -// as a side 
effect, the new KV cache is defragmented -// -static void llama_kv_cache_compress_internal(struct llama_context & lctx) { - auto & kv_self = lctx.kv_self; - - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_embd_head_k = hparams.n_embd_head_k; GGML_UNUSED(n_embd_head_k); - const uint32_t n_embd_head_v = hparams.n_embd_head_v; GGML_UNUSED(n_embd_head_v); - const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv); - const uint32_t kv_size = kv_self.size; - - const int64_t t_start = ggml_time_us(); - - std::vector buf_q; - - std::vector buf_src_f32; - std::vector buf_dst_f32; - - struct c_pair { uint32_t i0, i1; }; - struct c_info { bool merged; uint32_t id, cnt, r; }; - - std::vector infos(kv_size, { false, 0, 0, 0 }); - - // the destination cell in the new KV cache - uint32_t id = 0; - - // number of pairs merged - uint32_t n_merges = 0; - - // determine which KV cells to merge - for (uint32_t i0 = 0; i0 < kv_size; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty() && !infos[i0].merged) { - infos[i0] = { true, id, 0, 0 }; - infos[id].cnt = 1; - - const llama_pos p0 = cell0.pos; - - for (uint32_t i1 = i0 + 1; i1 < kv_size; ++i1) { - const auto & cell1 = kv_self.cells[i1]; - - if (i0 != i1 && cell0.is_same_seq(cell1)) { - const llama_pos p1 = cell1.pos; - - if (std::abs(p0 - p1) <= kv_self.compress_delta) { - infos[i1] = { true, id, 0, 0 }; - infos[id].cnt++; - n_merges++; - } - } - } - - if (i0 != id) { - kv_self.cells[id] = cell0; - } - - id++; - } - } - - kv_self.head = id; - kv_self.used = id; - - for (uint32_t i = id; i < kv_size; ++i) { - kv_self.cells[i] = llama_kv_cell(); - } - - LLAMA_LOG_INFO("(tmp log) KV compress pairs: %u\n", n_merges); - - ggml_type_traits_t tt_k; - ggml_type_traits_t tt_v; - - tt_k = ggml_internal_get_type_traits(kv_self.type_k); - tt_v = ggml_internal_get_type_traits(kv_self.type_v); - - for (uint32_t il = 0; il < n_layer; ++il) { - for (uint32_t i = 0; i < kv_size; ++i) { - infos[i].r = 0; - } - - // update keys - { - const int64_t ne = n_embd_k_gqa*kv_size; - - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, ne); - - buf_q.resize(k_size); - - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - - tt_k.to_float(buf_q.data(), buf_src_f32.data(), ne); - - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } - - const uint32_t id = infos[i].id; - -#if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); - - const int64_t os = i*n_embd_k_gqa; - const int64_t od = id*n_embd_k_gqa; - - for (uint32_t j = 0; j < n_embd_k_gqa; ++j) { - buf_dst_f32[od + j] += buf_src_f32[os + j]*scale; - } - } -#else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } - - const int64_t os = i*n_embd_k_gqa + h*n_embd_head_k; - const int64_t od = id*n_embd_k_gqa + h*n_embd_head_k; - - for (uint32_t j = 0; j < n_embd_head_k; ++j) { - buf_dst_f32[od + j] = buf_src_f32[os + j]; - } - } - } - - infos[id].r++; -#endif - } - - tt_k.from_float(buf_dst_f32.data(), buf_q.data(), ne); - - ggml_backend_tensor_set(kv_self.k_l[il], buf_q.data(), 0, buf_q.size()); - } - - for (uint32_t i = 0; i < kv_size; ++i) 
{ - infos[i].r = 0; - } - - // update values (note: they are transposed) - { - const int64_t ne = n_embd_v_gqa*kv_size; - - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, ne); - - buf_q.resize(v_size); - - buf_src_f32.resize(ne); - buf_dst_f32.resize(ne); - - ggml_backend_tensor_get(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); - - tt_v.to_float(buf_q.data(), buf_src_f32.data(), ne); - - std::fill(buf_dst_f32.begin(), buf_dst_f32.end(), 0); - - for (uint32_t i = 0; i < kv_size; ++i) { - if (!infos[i].merged) { - continue; - } - - const uint32_t id = infos[i].id; - -#if 1 - // merge using averaging - { - const float scale = 1.0f/float(infos[id].cnt); - //printf("i: %d -> id: %d, scale: %f\n", i, id, scale); - - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - buf_dst_f32[od + j*kv_size] += buf_src_f32[os + j*kv_size]*scale; - } - } -#else - // merge separate heads - { - for (uint32_t h = 0; h < n_head_kv; ++h) { - if ((h + il) % infos[id].cnt != infos[id].r) { - continue; - } - - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = h*n_embd_head_v; j < (h + 1)*n_embd_head_v; ++j) { - buf_dst_f32[od + j*kv_size] = buf_src_f32[os + j*kv_size]; - } - } - } - - infos[id].r++; -#endif - } - - tt_v.from_float(buf_dst_f32.data(), buf_q.data(), ne); - - ggml_backend_tensor_set(kv_self.v_l[il], buf_q.data(), 0, buf_q.size()); - } - } - - const int64_t t_end = ggml_time_us(); - - LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); -} - // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache // this way we eliminate any empty segments that may have been left by previous KV cache operations // @@ -8412,14 +8171,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } } - // compress the KV cache data if needed - if (lctx.kv_self.compress_delta >= 0) { - llama_kv_cache_compress_internal(lctx); - - lctx.kv_self.compress_delta = -1; - lctx.kv_self.do_defrag = false; - } - // defragment the KV cache if needed if (lctx.kv_self.do_defrag) { llama_kv_cache_defrag_internal(lctx); @@ -12496,10 +12247,6 @@ llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id se return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); } -void llama_kv_cache_compress(struct llama_context * ctx, llama_pos delta) { - llama_kv_cache_compress(ctx->kv_self, delta); -} - void llama_kv_cache_defrag(struct llama_context * ctx) { llama_kv_cache_defrag(ctx->kv_self); } diff --git a/llama.h b/llama.h index 862d555e2b9..ff131996d9a 100644 --- a/llama.h +++ b/llama.h @@ -554,14 +554,6 @@ extern "C" { struct llama_context * ctx, llama_seq_id seq_id); - // [EXPERIMENTAL] Compress the data in the KV cache - // This will be applied: - // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_compress( - struct llama_context * ctx, - llama_pos delta); - // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() From 65323bc770667b372730e892c4d56f383558e303 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 17:21:33 +0200 Subject: [PATCH 20/23] llama : defragment via non-overlapping moves --- llama.cpp | 71 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index 75189e719ae..aa7574cc102 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8028,7 +8028,7 @@ static int 
llama_decode_internal( } // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache -// this way we eliminate any empty segments that may have been left by previous KV cache operations +// this way we eliminate any empty holes that may have been left by previous KV cache operations // // TODO: optimizations are possible: // - multiple threads @@ -8045,36 +8045,81 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_used = kv_self.used; const uint32_t kv_size = kv_self.size; + assert(n_used <= n_kv); + const int64_t t_start = ggml_time_us(); std::vector buf_k; std::vector buf_v; - // the destination cell in the new KV cache - uint32_t id = 0; - // number of cells moved uint32_t n_moves = 0; // determine which KV cells to move where std::vector ids(n_kv, n_kv); - for (uint32_t i0 = 0; i0 < n_kv; ++i0) { + for (uint32_t i0 = 0; i0 < n_used; ++i0) { const auto & cell0 = kv_self.cells[i0]; if (!cell0.is_empty()) { - ids[i0] = id; + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + // determine the size of the hole + uint32_t nh = 1; + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + // starting from the end, find nh non-empty cells + uint32_t nf = 0; + uint32_t is = n_kv - 1; + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } - if (i0 != id) { - kv_self.cells[id] = cell0; - n_moves++; + // non-empty cell which is not yet moved + nf++; + if (nf == nh) { + break; + } + } + + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + // go back and move the nf cells to the hole + for (uint32_t i1 = is; i1 < n_kv; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + continue; } - id++; + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + n_moves++; + nf++; } + + LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, n_kv, i0, i0 + nh); + + i0 += nh - 1; } if (n_moves == 0) { @@ -8083,11 +8128,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - kv_self.head = id; - kv_self.used = id; + kv_self.head = n_used; + kv_self.used = n_used; // zero the rest of the cells - for (uint32_t i = id; i < n_kv; ++i) { + for (uint32_t i = n_used; i < n_kv; ++i) { kv_self.cells[i] = llama_kv_cell(); } From 4eaaace394016231858acacbc077a693512388f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 17:36:37 +0200 Subject: [PATCH 21/23] llama : ggml_graph based defrag implementation ggml-ci --- llama.cpp | 112 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index aa7574cc102..e6826a31793 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5111,6 +5111,53 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_defrag(const std::vector & ids) { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + for (int il = 0; il < n_layer; ++il) { + for (int i = 0; i < n_kv; ++i) { + const int id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + int nm = 1; + + while (i + nm < 
n_kv && (int) ids[i + nm] == id + nm) { + nm++; + } + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + + i += nm - 1; + } + } + + return gf; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -7505,6 +7552,23 @@ struct llm_build_context { } }; +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_defrag(ids); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { llama_batch dummy; dummy.n_tokens = 0; @@ -8030,32 +8094,16 @@ static int llama_decode_internal( // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache // this way we eliminate any empty holes that may have been left by previous KV cache operations // -// TODO: optimizations are possible: -// - multiple threads -// - avoid copying to the host memory when already there -// -// TODO: can we do all this on-device? 
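The graph-based variant above effectively answers the TODO being removed here: the moves are executed on-device as a ggml graph of ggml_cpy nodes between views of the cache tensors. This is safe because of the mapping introduced in the previous patch: holes are filled strictly with cells taken from the back of the cache, so every move targets a lower index and no two moves overlap. A stand-alone sketch of that mapping, with a hypothetical Cell type standing in for llama_kv_cell (the patch additionally counts n_moves and asserts nf == nh):

    #include <cstdint>
    #include <vector>

    struct Cell {
        int32_t pos = -1;
        bool is_empty() const { return pos < 0; }
    };

    // compute ids[i], the destination of cell i; ids[i] == n_kv means the cell
    // is empty or not moved; assumes a non-empty cache (n_kv > 0)
    static std::vector<uint32_t> defrag_ids(std::vector<Cell> & cells, uint32_t n_used) {
        const uint32_t n_kv = (uint32_t) cells.size();

        std::vector<uint32_t> ids(n_kv, n_kv);

        for (uint32_t i0 = 0; i0 < n_used; ++i0) {
            if (!cells[i0].is_empty()) {
                ids[i0] = i0; // already in place
                continue;
            }

            // measure the hole [i0, i0 + nh)
            uint32_t nh = 1;
            while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
                nh++;
            }

            // scan back from the end until nh non-empty, not-yet-moved cells
            // have been seen; is ends up at the lowest-indexed one of them
            uint32_t nf = 0;
            uint32_t is = n_kv - 1;
            for (; is > i0; --is) {
                if (cells[is].is_empty() || ids[is] != n_kv) {
                    continue;
                }
                if (++nf == nh) {
                    break;
                }
            }

            // fill the hole front-to-back, preserving the cells' relative order
            // so that consecutive sources map to consecutive destinations (runs)
            nf = 0;
            for (uint32_t i1 = is; i1 < n_kv; ++i1) {
                if (cells[i1].is_empty() || ids[i1] != n_kv) {
                    continue;
                }
                ids[i1]         = i0 + nf;
                cells[i0 + nf]  = cells[i1]; // move the cell metadata
                nf++;
            }

            i0 += nh - 1; // the hole is now filled - skip past it
        }

        return ids;
    }

Because sources come only from the back and destinations only from the front, the source and destination views of a run never alias, which is what allows each run to become an independent pair of ggml_cpy nodes per layer.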
-// static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { auto & kv_self = lctx.kv_self; - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); - const uint32_t n_used = kv_self.used; - - const uint32_t kv_size = kv_self.size; + const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_used = kv_self.used; assert(n_used <= n_kv); const int64_t t_start = ggml_time_us(); - std::vector buf_k; - std::vector buf_v; - // number of cells moved uint32_t n_moves = 0; @@ -8136,6 +8184,27 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { kv_self.cells[i] = llama_kv_cell(); } +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + for (uint32_t il = 0; il < n_layer; ++il) { const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); @@ -8188,6 +8257,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); } +#else + // ggml_graph defrag + + ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); +#endif const int64_t t_end = ggml_time_us(); From 0b72ded501e22501d968583ada5300ed49977621 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 17:51:02 +0200 Subject: [PATCH 22/23] llama : switch the loop order in build_defrag --- llama.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/llama.cpp b/llama.cpp index e6826a31793..f87f44d14d7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5114,20 +5114,20 @@ struct llm_build_context { struct ggml_cgraph * build_defrag(const std::vector & ids) { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - for (int il = 0; il < n_layer; ++il) { - for (int i = 0; i < n_kv; ++i) { - const int id = ids[i]; + for (int i = 0; i < n_kv; ++i) { + const int id = ids[i]; - if (i == id || id == n_kv) { - continue; - } + if (i == id || id == n_kv) { + continue; + } - int nm = 1; + int nm = 1; - while (i + nm < n_kv && (int) ids[i + nm] == id + nm) { - nm++; - } + while (i + nm < n_kv && (int) ids[i + nm] == id + nm) { + nm++; + } + for (int il = 0; il < n_layer; ++il) { ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], n_embd_k_gqa, nm, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), @@ -5150,9 +5150,9 @@ struct llm_build_context { ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - - i += nm - 1; } + + i += nm - 1; } return gf; From 5a122c25a0d8c840f34bce10c0d1565464612405 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 
18:16:45 +0200
Subject: [PATCH 23/23] llama : add comments

---
 llama.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index f87f44d14d7..3424b1999eb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8091,9 +8091,7 @@ static int llama_decode_internal(
     return 0;
 }
 
-// copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache
-// this way we eliminate any empty holes that may have been left by previous KV cache operations
-//
+// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
 static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
 
@@ -8108,6 +8106,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     uint32_t n_moves = 0;
 
     // determine which KV cells to move where
+    //
+    //  cell i moves to ids[i]
+    //
+    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+    //
     std::vector<uint32_t> ids(n_kv, n_kv);
 
     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
@@ -8139,11 +8142,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
             // non-empty cell which is not yet moved
             nf++;
+
             if (nf == nh) {
                 break;
             }
         }
 
+        // this can only happen if `n_used` is not accurate, which would be a bug
         GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
 
         nf = 0;
@@ -8156,6 +8161,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
                 continue;
             }
 
+            // this cell goes to (i0 + nf)
             ids[i1] = i0 + nf;
 
             // move the cell meta data
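To make the comments above concrete, a small worked example with a hypothetical cache state, showing the mapping and how build_defrag consumes it in runs:

    // hypothetical cache state: n_kv = 8, n_used = 6, cells 2 and 3 are empty
    //
    //   index : 0 1 2 3 4 5 6 7
    //   cell  : A B . . C D E F        ('.' = empty)
    //
    // the hole [2, 4) is filled from the back with E and F, order preserved:
    //
    //   ids   : 0 1 8 8 4 5 2 3        (8 == n_kv means "not moved")
    //
    // the consumer then finds a single contiguous run, since ids[7] == ids[6] + 1:
    // cells [6, 8) move to [2, 4) with nm == 2, i.e. one pair of ggml_cpy nodes
    // (one K view, one V view) per layer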