From f341bd6c8690eb9fcbb53476562b14d9296606e1 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Fri, 5 Jul 2024 14:06:03 +0200
Subject: [PATCH 1/4] llama : add early return for empty range

This commit adds an early return to the llama_kv_cache_seq_add and
llama_kv_cache_seq_div functions.

The motivation for adding this is to avoid looping over the cache when
the range is empty. I ran into this when using the self-extend feature
in main.cpp.

Signed-off-by: Daniel Bevenius
---
 src/llama.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 18956d44140..efd7429d5fe 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3258,6 +3258,11 @@ static void llama_kv_cache_seq_add(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) {
+        cache.head = 0;
+        return;
+    }
 
     if (cache.recurrent) {
         // for Mamba-like models, only the pos needs to be shifted
@@ -3302,6 +3307,8 @@ static void llama_kv_cache_seq_div(
                           int   d) {
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) return;
 
     if (cache.recurrent) {
         // for Mamba-like models, only the pos needs to be changed
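
As a reference point for the change above, here is a minimal, self-contained sketch (not taken from llama.cpp; the cell struct and loop are simplified stand-ins for the KV cache) of why an empty half-open range [p0, p1) makes the shift a no-op, so returning early only skips wasted work. It uses the plain `return` form that the series settles on in the third patch.

```cpp
// Simplified sketch of the position shift done by llama_kv_cache_seq_add.
// Not the real implementation: kv_cell and the loop are stand-ins.
#include <cstdint>
#include <limits>
#include <vector>

using llama_pos = int32_t;

struct kv_cell {
    llama_pos pos = -1;
};

static void seq_add_sketch(std::vector<kv_cell> & cells, llama_pos p0, llama_pos p1, llama_pos delta) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

    // The guard added by the patch: with p0 == p1 the half-open range [p0, p1)
    // is empty, no cell position can match, and the loop below would do nothing.
    if (p0 == p1) return;

    for (auto & cell : cells) {
        if (cell.pos >= p0 && cell.pos < p1) {
            cell.pos += delta;
        }
    }
}

int main() {
    std::vector<kv_cell> cells(8);
    for (int i = 0; i < 8; ++i) cells[i].pos = i;

    seq_add_sketch(cells, 4, 4, 10); // empty range: returns early, positions unchanged
    seq_add_sketch(cells, 0, 4, 10); // shifts positions 0..3 by 10

    return 0;
}
```
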
From 4eb8073c54ffeb3c0d27b8d9cac645f060aad453 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Fri, 5 Jul 2024 15:02:47 +0200
Subject: [PATCH 2/4] llama : add static_cast to fix CI warning/error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit attempts to fix the following warning/error:

```console
src/llama.cpp:7271:31: error: comparison of integer expressions of different signedness: ‘int’ and ‘uint32_t’ {aka ‘unsigned int’} [-Werror=sign-compare]
 7271 |             if (i < hparams.n_layer_dense_lead) {
      |                 ~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~
```

This can be reproduced locally by setting -Wsign-compare in the
Makefile.

Signed-off-by: Daniel Bevenius
---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index efd7429d5fe..b2c302915d5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7268,7 +7268,7 @@ static bool llm_load_tensors(
 
             layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
-            if (i < (int) hparams.n_layer_dense_lead) {
+            if (static_cast<uint32_t>(i) < hparams.n_layer_dense_lead) {
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                 layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});

From eb572f9ac64efa5087c823550127c3c3f3b4e1a8 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Fri, 5 Jul 2024 18:36:55 +0200
Subject: [PATCH 3/4] squash! llama : add early return for empty range

Remove the setting of cache.head to 0 when the range is empty.

Signed-off-by: Daniel Bevenius
---
 src/llama.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index b2c302915d5..24356d2b439 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3259,10 +3259,7 @@ static void llama_kv_cache_seq_add(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
     // If there is no range then return early to avoid looping over the cache.
-    if (p0 == p1) {
-        cache.head = 0;
-        return;
-    }
+    if (p0 == p1) return;
 
     if (cache.recurrent) {
         // for Mamba-like models, only the pos needs to be shifted

From c9d6700adc3aeba8f75cd745fd26c492295868d0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 6 Jul 2024 10:22:09 +0300
Subject: [PATCH 4/4] Update src/llama.cpp

---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 24356d2b439..22f71b02e5a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7265,7 +7265,7 @@ static bool llm_load_tensors(
 
             layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
-            if (static_cast<uint32_t>(i) < hparams.n_layer_dense_lead) {
+            if (i < (int) hparams.n_layer_dense_lead) {
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                 layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
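
For reference on the second and fourth patches, below is a small standalone example (not from llama.cpp; the layer counts are hypothetical) of the -Wsign-compare mismatch: `i` is a signed `int` while `n_layer_dense_lead` is a `uint32_t`, so a bare `i < n_layer_dense_lead` mixes signedness, and either casting the index to `uint32_t` (second patch) or casting the hyperparameter to `int` (fourth patch) silences the warning.

```cpp
// Standalone illustration of the -Wsign-compare issue and the two cast styles
// that appear in the patch series. The values below are made up for the example.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer_dense_lead = 1; // hypothetical: first N layers use a dense FFN
    const int      n_layer            = 4; // hypothetical layer count

    for (int i = 0; i < n_layer; ++i) {
        // Variant from the second patch: cast the signed index to unsigned.
        const bool dense_a = static_cast<uint32_t>(i) < n_layer_dense_lead;
        // Variant from the fourth patch: cast the unsigned hyperparameter to int.
        const bool dense_b = i < (int) n_layer_dense_lead;

        std::printf("layer %d: %s / %s\n", i, dense_a ? "dense" : "moe", dense_b ? "dense" : "moe");
    }
    return 0;
}
```

For non-negative `i` the two comparisons are equivalent; the fourth patch simply returns to the `(int)` cast form that the code used before the second patch.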