From ffc6b144037dd310afe98b17edb1e0018e3d2cb5 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Tue, 28 Nov 2023 14:56:28 -0500 Subject: [PATCH 1/9] turbo --- server/bleep/src/agent/model.rs | 10 +++++----- server/bleep/src/agent/tools/code.rs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/server/bleep/src/agent/model.rs b/server/bleep/src/agent/model.rs index 70ad7177bb..8948d8161e 100644 --- a/server/bleep/src/agent/model.rs +++ b/server/bleep/src/agent/model.rs @@ -32,11 +32,11 @@ pub const GPT_3_5_TURBO_FINETUNED: AnswerModel = AnswerModel { }; pub const GPT_4: AnswerModel = AnswerModel { - tokenizer: "gpt-4-0613", - model_name: "gpt-4-0613", - answer_headroom: 1024, - prompt_headroom: 2500, - history_headroom: 2048, + tokenizer: "gpt-4-1106-preview", + model_name: "gpt-4-1106-preview", + answer_headroom: 1024 + 120000 - 16000, + prompt_headroom: 2500+ 120000 - 16000, + history_headroom: 2048+ 120000 - 16000, system_prompt: prompts::answer_article_prompt, }; diff --git a/server/bleep/src/agent/tools/code.rs b/server/bleep/src/agent/tools/code.rs index 8f2017c831..9116d60ac4 100644 --- a/server/bleep/src/agent/tools/code.rs +++ b/server/bleep/src/agent/tools/code.rs @@ -13,7 +13,7 @@ use crate::{ impl Agent { #[instrument(skip(self))] pub async fn code_search(&mut self, query: &String) -> Result { - const CODE_SEARCH_LIMIT: u64 = 10; + const CODE_SEARCH_LIMIT: u64 = 20; const MINIMUM_RESULTS: usize = CODE_SEARCH_LIMIT as usize / 2; self.update(Update::StartStep(SearchStep::Code { From ab2f2b41947d045f7077403fc2d706468f426bc9 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 29 Nov 2023 16:56:51 -0500 Subject: [PATCH 2/9] gpt4 turbo agent --- server/bleep/src/agent.rs | 3 ++- server/bleep/src/agent/model.rs | 17 +++++++++++++--- server/bleep/src/agent/tools/code.rs | 2 +- server/bleep/src/webserver/answer.rs | 29 +++++++++++++++++++--------- 4 files 
changed, 37 insertions(+), 14 deletions(-) diff --git a/server/bleep/src/agent.rs b/server/bleep/src/agent.rs index 133e365089..f2adfdfa8e 100644 --- a/server/bleep/src/agent.rs +++ b/server/bleep/src/agent.rs @@ -58,6 +58,7 @@ pub struct Agent { pub query_id: uuid::Uuid, pub model: model::AnswerModel, + pub agent_model: model::AnswerModel, /// Indicate whether the request was answered. /// @@ -220,7 +221,7 @@ impl Agent { ))]; history.extend(self.history()?); - let trimmed_history = trim_history(history.clone(), self.model)?; + let trimmed_history = trim_history(history.clone(), self.agent_model)?; let raw_response = self .llm_gateway diff --git a/server/bleep/src/agent/model.rs b/server/bleep/src/agent/model.rs index 8948d8161e..7f94ef9ab2 100644 --- a/server/bleep/src/agent/model.rs +++ b/server/bleep/src/agent/model.rs @@ -31,12 +31,21 @@ pub const GPT_3_5_TURBO_FINETUNED: AnswerModel = AnswerModel { system_prompt: prompts::answer_article_prompt_finetuned, }; -pub const GPT_4: AnswerModel = AnswerModel { +pub const GPT_4_TURBO_24K: AnswerModel = AnswerModel { tokenizer: "gpt-4-1106-preview", model_name: "gpt-4-1106-preview", answer_headroom: 1024 + 120000 - 16000, - prompt_headroom: 2500+ 120000 - 16000, - history_headroom: 2048+ 120000 - 16000, + prompt_headroom: 2500 + 120000 - 16000, + history_headroom: 2048 + 120000 - 16000, + system_prompt: prompts::answer_article_prompt, +}; + +pub const GPT_4: AnswerModel = AnswerModel { + tokenizer: "gpt-4-0613", + model_name: "gpt-4-0613", + answer_headroom: 1024, + prompt_headroom: 2500, + history_headroom: 2048, system_prompt: prompts::answer_article_prompt, }; @@ -46,7 +55,9 @@ impl FromStr for AnswerModel { #[allow(clippy::wildcard_in_or_patterns)] match s { "gpt-4" => Ok(GPT_4), + "gpt-4-turbo-24k" => Ok(GPT_4_TURBO_24K), "gpt-3.5-turbo-finetuned" | _ => Ok(GPT_3_5_TURBO_FINETUNED), + } } } diff --git a/server/bleep/src/agent/tools/code.rs b/server/bleep/src/agent/tools/code.rs index 9116d60ac4..8f2017c831 100644 
--- a/server/bleep/src/agent/tools/code.rs +++ b/server/bleep/src/agent/tools/code.rs @@ -13,7 +13,7 @@ use crate::{ impl Agent { #[instrument(skip(self))] pub async fn code_search(&mut self, query: &String) -> Result { - const CODE_SEARCH_LIMIT: u64 = 20; + const CODE_SEARCH_LIMIT: u64 = 10; const MINIMUM_RESULTS: usize = CODE_SEARCH_LIMIT as usize / 2; self.update(Update::StartStep(SearchStep::Code { diff --git a/server/bleep/src/webserver/answer.rs b/server/bleep/src/webserver/answer.rs index 5b7b343cdf..860f83baa0 100644 --- a/server/bleep/src/webserver/answer.rs +++ b/server/bleep/src/webserver/answer.rs @@ -76,6 +76,8 @@ pub struct Answer { /// Optional id of the parent of the exchange to overwrite /// If this UUID is nil, then overwrite the first exchange in the thread pub parent_exchange_id: Option, + #[serde(default = "default_agent_model")] + pub agent_model: agent::model::AnswerModel, } fn default_thread_id() -> uuid::Uuid { @@ -86,6 +88,10 @@ fn default_model() -> agent::model::AnswerModel { agent::model::GPT_3_5_TURBO_FINETUNED } +fn default_agent_model() -> agent::model::AnswerModel { + agent::model::GPT_4 +} + pub(super) async fn answer( Query(params): Query, Extension(app): Extension, @@ -207,12 +213,20 @@ async fn try_execute_agent( Sse> + Send>>>, > { QueryLog::new(&app.sql).insert(¶ms.q).await?; - + let Answer { + thread_id, + repo_ref, + model, + agent_model, + .. + } = params.clone(); + let llm_gateway = user .llm_gateway(&app) .await? .temperature(0.0) - .session_reference_id(conversation_id.to_string()); + .session_reference_id(conversation_id.to_string()) + .model(agent_model.model_name); // confirm client compatibility with answer-api match llm_gateway @@ -243,12 +257,7 @@ async fn try_execute_agent( } }; - let Answer { - thread_id, - repo_ref, - model, - .. - } = params.clone(); + let stream = async_stream::try_stream! 
{ let (exchange_tx, exchange_rx) = tokio::sync::mpsc::channel(10); @@ -263,6 +272,7 @@ async fn try_execute_agent( query_id, exchange_state: ExchangeState::Pending, model, + agent_model }; let mut exchange_rx = tokio_stream::wrappers::ReceiverStream::new(exchange_rx); @@ -339,7 +349,7 @@ async fn try_execute_agent( Ok(sse::Event::default() .json_data(json!({ "thread_id": params.thread_id.to_string(), - "query_id": query_id + "query_id": query_id, })) // This should never happen, so we force an unwrap. .expect("failed to serialize initialization object")) @@ -392,6 +402,7 @@ pub async fn explain( thread_id: params.thread_id, parent_exchange_id: None, model: agent::model::GPT_4, + agent_model: agent::model::GPT_4, }; let conversation_id = ConversationId { From d76fc9c6305fd680c7266106ef3f011ded65c600 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 30 Nov 2023 13:22:57 -0500 Subject: [PATCH 3/9] add turbo to studio --- server/bleep/src/webserver/studio.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/bleep/src/webserver/studio.rs b/server/bleep/src/webserver/studio.rs index 12edeed4a8..be9c838a98 100644 --- a/server/bleep/src/webserver/studio.rs +++ b/server/bleep/src/webserver/studio.rs @@ -32,7 +32,7 @@ use crate::{ mod diff; -const LLM_GATEWAY_MODEL: &str = "gpt-4-0613"; +const LLM_GATEWAY_MODEL: &str = "gpt-4-1106-preview"; fn no_user_id() -> Error { Error::user("didn't have user ID") @@ -472,7 +472,7 @@ async fn token_counts( }) .collect::>(); - let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-0613").unwrap(); + let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-1106-preview").unwrap(); let per_doc_file = stream::iter(doc_context) .map(|file| async { if file.hidden { @@ -633,14 +633,14 @@ pub async fn get_doc_file_token_count( .map(|sr| sr.text) .collect::(); - let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-0613").unwrap(); + let core_bpe = 
tiktoken_rs::get_bpe_from_model("gpt-4-1106-preview").unwrap(); let token_count = core_bpe.encode_ordinary(&content).len(); Ok(Json(token_count)) } fn count_tokens_for_file(path: &str, body: &str, ranges: &[Range]) -> usize { - let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-0613").unwrap(); + let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-1106-preview").unwrap(); let mut chunks = Vec::new(); From 66d6919ea81abbbcfdc7498db56a66c36dfd5509 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 30 Nov 2023 13:27:28 -0500 Subject: [PATCH 4/9] make turbo 24k default --- client/src/components/Chat/index.tsx | 2 +- server/bleep/src/agent/model.rs | 8 ++++---- server/bleep/src/webserver/answer.rs | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/client/src/components/Chat/index.tsx b/client/src/components/Chat/index.tsx index 814117d0de..9d2c012149 100644 --- a/client/src/components/Chat/index.tsx +++ b/client/src/components/Chat/index.tsx @@ -95,7 +95,7 @@ const Chat = () => { }` : '' }&model=${ - preferredAnswerSpeed === 'normal' ? 'gpt-4' : 'gpt-3.5-turbo-finetuned' + preferredAnswerSpeed === 'normal' ? 
'gpt-4-turbo-24k' : 'gpt-3.5-turbo-finetuned' }`; console.log(url); const eventSource = new EventSource(url); diff --git a/server/bleep/src/agent/model.rs b/server/bleep/src/agent/model.rs index 7f94ef9ab2..c0cabc3035 100644 --- a/server/bleep/src/agent/model.rs +++ b/server/bleep/src/agent/model.rs @@ -30,13 +30,13 @@ pub const GPT_3_5_TURBO_FINETUNED: AnswerModel = AnswerModel { history_headroom: 1024, system_prompt: prompts::answer_article_prompt_finetuned, }; - +// to use 24k out of 128k (add 128-24 to the headrooms) pub const GPT_4_TURBO_24K: AnswerModel = AnswerModel { tokenizer: "gpt-4-1106-preview", model_name: "gpt-4-1106-preview", - answer_headroom: 1024 + 120000 - 16000, - prompt_headroom: 2500 + 120000 - 16000, - history_headroom: 2048 + 120000 - 16000, + answer_headroom: 1024 + 104000, + prompt_headroom: 2500 + 104000, + history_headroom: 2048 + 104000, system_prompt: prompts::answer_article_prompt, }; diff --git a/server/bleep/src/webserver/answer.rs b/server/bleep/src/webserver/answer.rs index 860f83baa0..0cc354097d 100644 --- a/server/bleep/src/webserver/answer.rs +++ b/server/bleep/src/webserver/answer.rs @@ -85,7 +85,7 @@ fn default_thread_id() -> uuid::Uuid { } fn default_model() -> agent::model::AnswerModel { - agent::model::GPT_3_5_TURBO_FINETUNED + agent::model::GPT_4_TURBO_24K } fn default_agent_model() -> agent::model::AnswerModel { From 2c1b59c1b19980f14bdacadaac1b3fc4f0659da9 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 30 Nov 2023 13:35:35 -0500 Subject: [PATCH 5/9] fmt --- server/bleep/src/agent/model.rs | 1 - server/bleep/src/webserver/answer.rs | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/server/bleep/src/agent/model.rs b/server/bleep/src/agent/model.rs index c0cabc3035..8327a36c4e 100644 --- a/server/bleep/src/agent/model.rs +++ b/server/bleep/src/agent/model.rs @@ -57,7 +57,6 @@ impl FromStr for AnswerModel { "gpt-4" => Ok(GPT_4), "gpt-4-turbo-24k" => 
Ok(GPT_4_TURBO_24K), "gpt-3.5-turbo-finetuned" | _ => Ok(GPT_3_5_TURBO_FINETUNED), - } } } diff --git a/server/bleep/src/webserver/answer.rs b/server/bleep/src/webserver/answer.rs index 0cc354097d..615d70a7c4 100644 --- a/server/bleep/src/webserver/answer.rs +++ b/server/bleep/src/webserver/answer.rs @@ -220,7 +220,7 @@ async fn try_execute_agent( agent_model, .. } = params.clone(); - + let llm_gateway = user .llm_gateway(&app) .await? @@ -257,7 +257,6 @@ async fn try_execute_agent( } }; - let stream = async_stream::try_stream! { let (exchange_tx, exchange_rx) = tokio::sync::mpsc::channel(10); From 86588753821d50d7515b943266f1ed1f03c97854 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 30 Nov 2023 13:49:23 -0500 Subject: [PATCH 6/9] fmt --- client/src/components/Chat/index.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/client/src/components/Chat/index.tsx b/client/src/components/Chat/index.tsx index 9d2c012149..2a5c4204d7 100644 --- a/client/src/components/Chat/index.tsx +++ b/client/src/components/Chat/index.tsx @@ -95,7 +95,9 @@ const Chat = () => { }` : '' }&model=${ - preferredAnswerSpeed === 'normal' ? 'gpt-4-turbo-24k' : 'gpt-3.5-turbo-finetuned' + preferredAnswerSpeed === 'normal' + ? 
'gpt-4-turbo-24k' + : 'gpt-3.5-turbo-finetuned' }`; console.log(url); const eventSource = new EventSource(url); From c106a40a68d934596917f986d73ca556457199b3 Mon Sep 17 00:00:00 2001 From: Gabriel Gordon-Hall Date: Fri, 1 Dec 2023 10:03:31 +0000 Subject: [PATCH 7/9] disambiguate agent and answer models --- client/src/consts/codeStudio.ts | 2 +- server/bleep/src/agent.rs | 6 ++--- server/bleep/src/agent/model.rs | 14 +++++----- server/bleep/src/agent/tools/answer.rs | 36 ++++++++++++++------------ server/bleep/src/webserver/answer.rs | 18 ++++++------- 5 files changed, 39 insertions(+), 37 deletions(-) diff --git a/client/src/consts/codeStudio.ts b/client/src/consts/codeStudio.ts index 29de4c9550..a4a742edf8 100644 --- a/client/src/consts/codeStudio.ts +++ b/client/src/consts/codeStudio.ts @@ -1 +1 @@ -export const TOKEN_LIMIT = 7000; +export const TOKEN_LIMIT = 21000; diff --git a/server/bleep/src/agent.rs b/server/bleep/src/agent.rs index f2adfdfa8e..e2cdc40f8a 100644 --- a/server/bleep/src/agent.rs +++ b/server/bleep/src/agent.rs @@ -57,8 +57,8 @@ pub struct Agent { pub thread_id: uuid::Uuid, pub query_id: uuid::Uuid, - pub model: model::AnswerModel, - pub agent_model: model::AnswerModel, + pub answer_model: model::LLMModel, + pub agent_model: model::LLMModel, /// Indicate whether the request was answered. 
/// @@ -485,7 +485,7 @@ impl Agent { fn trim_history( mut history: Vec, - model: model::AnswerModel, + model: model::LLMModel, ) -> Result> { const HIDDEN: &str = "[HIDDEN]"; diff --git a/server/bleep/src/agent/model.rs b/server/bleep/src/agent/model.rs index 8327a36c4e..9b4e5932f8 100644 --- a/server/bleep/src/agent/model.rs +++ b/server/bleep/src/agent/model.rs @@ -2,7 +2,7 @@ use crate::agent::prompts; use std::str::FromStr; #[derive(Debug, Copy, Clone)] -pub struct AnswerModel { +pub struct LLMModel { /// The name of this model according to tiktoken pub tokenizer: &'static str, @@ -22,7 +22,7 @@ pub struct AnswerModel { pub system_prompt: fn(&str) -> String, } -pub const GPT_3_5_TURBO_FINETUNED: AnswerModel = AnswerModel { +pub const GPT_3_5_TURBO_FINETUNED: LLMModel = LLMModel { tokenizer: "gpt-3.5-turbo-0613", model_name: "gpt-3.5-turbo-finetuned", answer_headroom: 512, @@ -31,7 +31,7 @@ pub const GPT_3_5_TURBO_FINETUNED: AnswerModel = AnswerModel { system_prompt: prompts::answer_article_prompt_finetuned, }; // to use 24k out of 128k (add 128-24 to the headrooms) -pub const GPT_4_TURBO_24K: AnswerModel = AnswerModel { +pub const GPT_4_TURBO_24K: LLMModel = LLMModel { tokenizer: "gpt-4-1106-preview", model_name: "gpt-4-1106-preview", answer_headroom: 1024 + 104000, @@ -40,7 +40,7 @@ pub const GPT_4_TURBO_24K: AnswerModel = AnswerModel { system_prompt: prompts::answer_article_prompt, }; -pub const GPT_4: AnswerModel = AnswerModel { +pub const GPT_4: LLMModel = LLMModel { tokenizer: "gpt-4-0613", model_name: "gpt-4-0613", answer_headroom: 1024, @@ -49,7 +49,7 @@ pub const GPT_4: AnswerModel = AnswerModel { system_prompt: prompts::answer_article_prompt, }; -impl FromStr for AnswerModel { +impl FromStr for LLMModel { type Err = (); fn from_str(s: &str) -> Result { #[allow(clippy::wildcard_in_or_patterns)] @@ -61,13 +61,13 @@ impl FromStr for AnswerModel { } } -impl<'de> serde::Deserialize<'de> for AnswerModel { +impl<'de> serde::Deserialize<'de> for LLMModel { fn 
deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { let s = String::deserialize(deserializer)?; - s.parse::() + s.parse::() .map_err(|_| serde::de::Error::custom("failed to deserialize")) } } diff --git a/server/bleep/src/agent/tools/answer.rs b/server/bleep/src/agent/tools/answer.rs index 249228d2af..7d40b13f48 100644 --- a/server/bleep/src/agent/tools/answer.rs +++ b/server/bleep/src/agent/tools/answer.rs @@ -41,16 +41,16 @@ impl Agent { } let context = self.answer_context(aliases).await?; - let system_prompt = (self.model.system_prompt)(&context); + let system_prompt = (self.answer_model.system_prompt)(&context); let system_message = llm_gateway::api::Message::system(&system_prompt); let history = { let h = self.utter_history().collect::>(); let system_headroom = tiktoken_rs::num_tokens_from_messages( - self.model.tokenizer, + self.answer_model.tokenizer, &[(&system_message).into()], )?; - let headroom = self.model.answer_headroom + system_headroom; - trim_utter_history(h, headroom, self.model)? + let headroom = self.answer_model.answer_headroom + system_headroom; + trim_utter_history(h, headroom, self.answer_model)? }; let messages = Some(system_message) .into_iter() @@ -60,12 +60,14 @@ impl Agent { let mut stream = pin!( self.llm_gateway .clone() - .model(self.model.model_name) - .frequency_penalty(if self.model.model_name == "gpt-3.5-turbo-finetuned" { - Some(0.2) - } else { - Some(0.0) - }) + .model(self.answer_model.model_name) + .frequency_penalty( + if self.answer_model.model_name == "gpt-3.5-turbo-finetuned" { + Some(0.2) + } else { + Some(0.0) + } + ) .chat_stream(&messages, None) .await? 
); @@ -108,7 +110,7 @@ impl Agent { .with_payload("query_history", &history) .with_payload("response", &response) .with_payload("raw_prompt", &system_prompt) - .with_payload("model", self.model.model_name), + .with_payload("model", self.answer_model.model_name), ); Ok(()) @@ -145,9 +147,9 @@ impl Agent { // Sometimes, there are just too many code chunks in the context, and deduplication still // doesn't trim enough chunks. So, we enforce a hard limit here that stops adding tokens // early if we reach a heuristic limit. - let bpe = tiktoken_rs::get_bpe_from_model(self.model.tokenizer)?; + let bpe = tiktoken_rs::get_bpe_from_model(self.answer_model.tokenizer)?; let mut remaining_prompt_tokens = - tiktoken_rs::get_completion_max_tokens(self.model.tokenizer, &s)?; + tiktoken_rs::get_completion_max_tokens(self.answer_model.tokenizer, &s)?; // Select as many recent chunks as possible let mut recent_chunks = Vec::new(); @@ -166,7 +168,7 @@ impl Agent { let snippet_tokens = bpe.encode_ordinary(&formatted_snippet).len(); - if snippet_tokens >= remaining_prompt_tokens - self.model.prompt_headroom { + if snippet_tokens >= remaining_prompt_tokens - self.answer_model.prompt_headroom { info!("breaking at {} tokens", remaining_prompt_tokens); break; } @@ -251,8 +253,8 @@ impl Agent { /// Making this closure to 1 means that more of the context is taken up by source code. const CONTEXT_CODE_RATIO: f32 = 0.5; - let bpe = tiktoken_rs::get_bpe_from_model(self.model.tokenizer).unwrap(); - let context_size = tiktoken_rs::model::get_context_size(self.model.tokenizer); + let bpe = tiktoken_rs::get_bpe_from_model(self.answer_model.tokenizer).unwrap(); + let context_size = tiktoken_rs::model::get_context_size(self.answer_model.tokenizer); let max_tokens = (context_size as f32 * CONTEXT_CODE_RATIO) as usize; // Note: The end line number here is *not* inclusive. 
@@ -412,7 +414,7 @@ impl Agent { fn trim_utter_history( mut history: Vec, headroom: usize, - model: model::AnswerModel, + model: model::LLMModel, ) -> Result> { let mut tiktoken_msgs: Vec = history.iter().map(|m| m.into()).collect::>(); diff --git a/server/bleep/src/webserver/answer.rs b/server/bleep/src/webserver/answer.rs index 615d70a7c4..8dce6b5a39 100644 --- a/server/bleep/src/webserver/answer.rs +++ b/server/bleep/src/webserver/answer.rs @@ -69,26 +69,26 @@ pub(super) async fn vote( pub struct Answer { pub q: String, pub repo_ref: RepoRef, - #[serde(default = "default_model")] - pub model: agent::model::AnswerModel, + #[serde(default = "default_answer_model")] + pub answer_model: agent::model::LLMModel, + #[serde(default = "default_agent_model")] + pub agent_model: agent::model::LLMModel, #[serde(default = "default_thread_id")] pub thread_id: uuid::Uuid, /// Optional id of the parent of the exchange to overwrite /// If this UUID is nil, then overwrite the first exchange in the thread pub parent_exchange_id: Option, - #[serde(default = "default_agent_model")] - pub agent_model: agent::model::AnswerModel, } fn default_thread_id() -> uuid::Uuid { uuid::Uuid::new_v4() } -fn default_model() -> agent::model::AnswerModel { +fn default_answer_model() -> agent::model::LLMModel { agent::model::GPT_4_TURBO_24K } -fn default_agent_model() -> agent::model::AnswerModel { +fn default_agent_model() -> agent::model::LLMModel { agent::model::GPT_4 } @@ -216,7 +216,7 @@ async fn try_execute_agent( let Answer { thread_id, repo_ref, - model, + answer_model, agent_model, .. 
} = params.clone(); @@ -270,7 +270,7 @@ async fn try_execute_agent( thread_id, query_id, exchange_state: ExchangeState::Pending, - model, + answer_model, agent_model }; @@ -400,7 +400,7 @@ pub async fn explain( repo_ref: params.repo_ref, thread_id: params.thread_id, parent_exchange_id: None, - model: agent::model::GPT_4, + answer_model: agent::model::GPT_4_TURBO_24K, agent_model: agent::model::GPT_4, }; From 0c9c3f8382750dd3632ec285cba350e294bdd7b4 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:06:46 -0500 Subject: [PATCH 8/9] better explanation of headroom --- server/bleep/src/agent/model.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/server/bleep/src/agent/model.rs b/server/bleep/src/agent/model.rs index 9b4e5932f8..917941e931 100644 --- a/server/bleep/src/agent/model.rs +++ b/server/bleep/src/agent/model.rs @@ -30,13 +30,21 @@ pub const GPT_3_5_TURBO_FINETUNED: LLMModel = LLMModel { history_headroom: 1024, system_prompt: prompts::answer_article_prompt_finetuned, }; -// to use 24k out of 128k (add 128-24 to the headrooms) + +// GPT-4 turbo has a context window of 128k tokens +const GPT_4_TURBO_MAX_TOKENS: usize = 128_000; +// We want to use only 24k tokens +const ACTUAL_MAX_TOKENS: usize = 24_000; +// 104k tokens should be left unused. This is done by adding 104k to the headrooms +// (tokens left unused for other purposes: answer, prompt, ...)
+const HEADROOM_CORRECTION: usize = GPT_4_TURBO_MAX_TOKENS - ACTUAL_MAX_TOKENS; +// PS: when we want to fully utilize the model max context window, the correction is 0 pub const GPT_4_TURBO_24K: LLMModel = LLMModel { tokenizer: "gpt-4-1106-preview", model_name: "gpt-4-1106-preview", - answer_headroom: 1024 + 104000, - prompt_headroom: 2500 + 104000, - history_headroom: 2048 + 104000, + answer_headroom: 1024 + HEADROOM_CORRECTION, + prompt_headroom: 2500 + HEADROOM_CORRECTION, + history_headroom: 2048 + HEADROOM_CORRECTION, system_prompt: prompts::answer_article_prompt, }; From 0297a65e54f429ee9f933acd5b0bd54562cc559e Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:12:46 -0500 Subject: [PATCH 9/9] changing /answer call from frontend to use answer_model --- client/src/components/Chat/index.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/src/components/Chat/index.tsx b/client/src/components/Chat/index.tsx index 2a5c4204d7..df714a58f3 100644 --- a/client/src/components/Chat/index.tsx +++ b/client/src/components/Chat/index.tsx @@ -94,7 +94,7 @@ const Chat = () => { queryIdToEdit ? `&parent_query_id=${queryIdToEdit}` : '' }` : '' - }&model=${ + }&answer_model=${ preferredAnswerSpeed === 'normal' ? 'gpt-4-turbo-24k' : 'gpt-3.5-turbo-finetuned'