Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions client/src/components/Chat/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,10 @@ const Chat = () => {
queryIdToEdit ? `&parent_query_id=${queryIdToEdit}` : ''
}`
: ''
}&model=${
preferredAnswerSpeed === 'normal' ? 'gpt-4' : 'gpt-3.5-turbo-finetuned'
}&answer_model=${
preferredAnswerSpeed === 'normal'
? 'gpt-4-turbo-24k'
: 'gpt-3.5-turbo-finetuned'
}`;
console.log(url);
const eventSource = new EventSource(url);
Expand Down
2 changes: 1 addition & 1 deletion client/src/consts/codeStudio.ts
Original file line number Diff line number Diff line change
@@ -1 +1 @@
export const TOKEN_LIMIT = 7000;
// Maximum number of tokens a code-studio context may contain.
// NOTE(review): presumably raised from 7000 to match the 24k-token GPT-4 Turbo
// configuration introduced server-side — confirm the 3k gap is intentional headroom.
export const TOKEN_LIMIT = 21000;
7 changes: 4 additions & 3 deletions server/bleep/src/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ pub struct Agent {
pub thread_id: uuid::Uuid,
pub query_id: uuid::Uuid,

pub model: model::AnswerModel,
pub answer_model: model::LLMModel,
pub agent_model: model::LLMModel,
Comment on lines +60 to +61
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's confusing that Agent has, within itself, an agent_model field — can we call this something else, such as decision_model or reasoning_model?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think agent_model is fine.
The abstraction is: the user queries to get an answer, and we have an answer_model used in the answer tool; to get the correct context for answering the query, we start an agent which has its own agent_model.

Third opinion @ggordonhall ?


/// Indicate whether the request was answered.
///
Expand Down Expand Up @@ -220,7 +221,7 @@ impl Agent {
))];
history.extend(self.history()?);

let trimmed_history = trim_history(history.clone(), self.model)?;
let trimmed_history = trim_history(history.clone(), self.agent_model)?;

let raw_response = self
.llm_gateway
Expand Down Expand Up @@ -484,7 +485,7 @@ impl Agent {

fn trim_history(
mut history: Vec<llm_gateway::api::Message>,
model: model::AnswerModel,
model: model::LLMModel,
) -> Result<Vec<llm_gateway::api::Message>> {
const HIDDEN: &str = "[HIDDEN]";

Expand Down
30 changes: 24 additions & 6 deletions server/bleep/src/agent/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::agent::prompts;
use std::str::FromStr;

#[derive(Debug, Copy, Clone)]
pub struct AnswerModel {
pub struct LLMModel {
/// The name of this model according to tiktoken
pub tokenizer: &'static str,

Expand All @@ -22,7 +22,7 @@ pub struct AnswerModel {
pub system_prompt: fn(&str) -> String,
}

pub const GPT_3_5_TURBO_FINETUNED: AnswerModel = AnswerModel {
pub const GPT_3_5_TURBO_FINETUNED: LLMModel = LLMModel {
tokenizer: "gpt-3.5-turbo-0613",
model_name: "gpt-3.5-turbo-finetuned",
answer_headroom: 512,
Expand All @@ -31,7 +31,24 @@ pub const GPT_3_5_TURBO_FINETUNED: AnswerModel = AnswerModel {
system_prompt: prompts::answer_article_prompt_finetuned,
};

pub const GPT_4: AnswerModel = AnswerModel {
// GPT-4 Turbo exposes a context window of 128k tokens.
const GPT_4_TURBO_MAX_TOKENS: usize = 128_000;
// We deliberately use only 24k of those tokens.
const ACTUAL_MAX_TOKENS: usize = 24_000;
// The remaining 104k tokens must be left unused. This is enforced by adding the
// difference to every headroom value below (headroom = tokens reserved for the
// answer, the prompt, and the history rather than for context).
const HEADROOM_CORRECTION: usize = GPT_4_TURBO_MAX_TOKENS - ACTUAL_MAX_TOKENS;
// NB: to fully utilize the model's maximum context window, set this correction to 0.
/// GPT-4 Turbo restricted to an effective 24k-token window via headroom padding.
pub const GPT_4_TURBO_24K: LLMModel = LLMModel {
tokenizer: "gpt-4-1106-preview",
model_name: "gpt-4-1106-preview",
answer_headroom: 1024 + HEADROOM_CORRECTION,
prompt_headroom: 2500 + HEADROOM_CORRECTION,
history_headroom: 2048 + HEADROOM_CORRECTION,
system_prompt: prompts::answer_article_prompt,
};

pub const GPT_4: LLMModel = LLMModel {
tokenizer: "gpt-4-0613",
model_name: "gpt-4-0613",
answer_headroom: 1024,
Expand All @@ -40,24 +57,25 @@ pub const GPT_4: AnswerModel = AnswerModel {
system_prompt: prompts::answer_article_prompt,
};

impl FromStr for AnswerModel {
impl FromStr for LLMModel {
    type Err = ();

    /// Map a model identifier string to its `LLMModel` configuration.
    /// Unrecognized identifiers deliberately fall back to the fine-tuned
    /// GPT-3.5 model rather than erroring, so parsing never fails.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        #[allow(clippy::wildcard_in_or_patterns)]
        let model = match s {
            "gpt-4" => GPT_4,
            "gpt-4-turbo-24k" => GPT_4_TURBO_24K,
            "gpt-3.5-turbo-finetuned" | _ => GPT_3_5_TURBO_FINETUNED,
        };
        Ok(model)
    }
}

impl<'de> serde::Deserialize<'de> for AnswerModel {
impl<'de> serde::Deserialize<'de> for LLMModel {
    /// Deserialize a model from its string identifier by delegating to the
    /// `FromStr` implementation (which itself never fails, falling back to
    /// the fine-tuned GPT-3.5 model for unknown names).
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        String::deserialize(deserializer)?
            .parse::<LLMModel>()
            .map_err(|_| serde::de::Error::custom("failed to deserialize"))
    }
}
36 changes: 19 additions & 17 deletions server/bleep/src/agent/tools/answer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,16 @@ impl Agent {
}

let context = self.answer_context(aliases).await?;
let system_prompt = (self.model.system_prompt)(&context);
let system_prompt = (self.answer_model.system_prompt)(&context);
let system_message = llm_gateway::api::Message::system(&system_prompt);
let history = {
let h = self.utter_history().collect::<Vec<_>>();
let system_headroom = tiktoken_rs::num_tokens_from_messages(
self.model.tokenizer,
self.answer_model.tokenizer,
&[(&system_message).into()],
)?;
let headroom = self.model.answer_headroom + system_headroom;
trim_utter_history(h, headroom, self.model)?
let headroom = self.answer_model.answer_headroom + system_headroom;
trim_utter_history(h, headroom, self.answer_model)?
};
let messages = Some(system_message)
.into_iter()
Expand All @@ -60,12 +60,14 @@ impl Agent {
let mut stream = pin!(
self.llm_gateway
.clone()
.model(self.model.model_name)
.frequency_penalty(if self.model.model_name == "gpt-3.5-turbo-finetuned" {
Some(0.2)
} else {
Some(0.0)
})
.model(self.answer_model.model_name)
.frequency_penalty(
if self.answer_model.model_name == "gpt-3.5-turbo-finetuned" {
Some(0.2)
} else {
Some(0.0)
}
)
.chat_stream(&messages, None)
.await?
);
Expand Down Expand Up @@ -108,7 +110,7 @@ impl Agent {
.with_payload("query_history", &history)
.with_payload("response", &response)
.with_payload("raw_prompt", &system_prompt)
.with_payload("model", self.model.model_name),
.with_payload("model", self.answer_model.model_name),
);

Ok(())
Expand Down Expand Up @@ -145,9 +147,9 @@ impl Agent {
// Sometimes, there are just too many code chunks in the context, and deduplication still
// doesn't trim enough chunks. So, we enforce a hard limit here that stops adding tokens
// early if we reach a heuristic limit.
let bpe = tiktoken_rs::get_bpe_from_model(self.model.tokenizer)?;
let bpe = tiktoken_rs::get_bpe_from_model(self.answer_model.tokenizer)?;
let mut remaining_prompt_tokens =
tiktoken_rs::get_completion_max_tokens(self.model.tokenizer, &s)?;
tiktoken_rs::get_completion_max_tokens(self.answer_model.tokenizer, &s)?;

// Select as many recent chunks as possible
let mut recent_chunks = Vec::new();
Expand All @@ -166,7 +168,7 @@ impl Agent {

let snippet_tokens = bpe.encode_ordinary(&formatted_snippet).len();

if snippet_tokens >= remaining_prompt_tokens - self.model.prompt_headroom {
if snippet_tokens >= remaining_prompt_tokens - self.answer_model.prompt_headroom {
info!("breaking at {} tokens", remaining_prompt_tokens);
break;
}
Expand Down Expand Up @@ -251,8 +253,8 @@ impl Agent {
/// Making this closure to 1 means that more of the context is taken up by source code.
const CONTEXT_CODE_RATIO: f32 = 0.5;

let bpe = tiktoken_rs::get_bpe_from_model(self.model.tokenizer).unwrap();
let context_size = tiktoken_rs::model::get_context_size(self.model.tokenizer);
let bpe = tiktoken_rs::get_bpe_from_model(self.answer_model.tokenizer).unwrap();
let context_size = tiktoken_rs::model::get_context_size(self.answer_model.tokenizer);
let max_tokens = (context_size as f32 * CONTEXT_CODE_RATIO) as usize;

// Note: The end line number here is *not* inclusive.
Expand Down Expand Up @@ -412,7 +414,7 @@ impl Agent {
fn trim_utter_history(
mut history: Vec<llm_gateway::api::Message>,
headroom: usize,
model: model::AnswerModel,
model: model::LLMModel,
) -> Result<Vec<llm_gateway::api::Message>> {
let mut tiktoken_msgs: Vec<tiktoken_rs::ChatCompletionRequestMessage> =
history.iter().map(|m| m.into()).collect::<Vec<_>>();
Expand Down
38 changes: 24 additions & 14 deletions server/bleep/src/webserver/answer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,10 @@ pub(super) async fn vote(
pub struct Answer {
pub q: String,
pub repo_ref: RepoRef,
#[serde(default = "default_model")]
pub model: agent::model::AnswerModel,
#[serde(default = "default_answer_model")]
pub answer_model: agent::model::LLMModel,
#[serde(default = "default_agent_model")]
pub agent_model: agent::model::LLMModel,
#[serde(default = "default_thread_id")]
pub thread_id: uuid::Uuid,
/// Optional id of the parent of the exchange to overwrite
Expand All @@ -82,8 +84,12 @@ fn default_thread_id() -> uuid::Uuid {
uuid::Uuid::new_v4()
}

fn default_model() -> agent::model::AnswerModel {
agent::model::GPT_3_5_TURBO_FINETUNED
/// Serde default for `Answer::answer_model`: the model used to generate the
/// final answer when the request does not specify one.
fn default_answer_model() -> agent::model::LLMModel {
agent::model::GPT_4_TURBO_24K
}

/// Serde default for `Answer::agent_model`: the model used to drive the
/// agent's intermediate steps when the request does not specify one.
fn default_agent_model() -> agent::model::LLMModel {
agent::model::GPT_4
}

pub(super) async fn answer(
Expand Down Expand Up @@ -207,12 +213,20 @@ async fn try_execute_agent(
Sse<std::pin::Pin<Box<dyn tokio_stream::Stream<Item = Result<sse::Event>> + Send>>>,
> {
QueryLog::new(&app.sql).insert(&params.q).await?;
let Answer {
thread_id,
repo_ref,
answer_model,
agent_model,
..
} = params.clone();

let llm_gateway = user
.llm_gateway(&app)
.await?
.temperature(0.0)
.session_reference_id(conversation_id.to_string());
.session_reference_id(conversation_id.to_string())
.model(agent_model.model_name);

// confirm client compatibility with answer-api
match llm_gateway
Expand Down Expand Up @@ -243,12 +257,6 @@ async fn try_execute_agent(
}
};

let Answer {
thread_id,
repo_ref,
model,
..
} = params.clone();
let stream = async_stream::try_stream! {
let (exchange_tx, exchange_rx) = tokio::sync::mpsc::channel(10);

Expand All @@ -262,7 +270,8 @@ async fn try_execute_agent(
thread_id,
query_id,
exchange_state: ExchangeState::Pending,
model,
answer_model,
agent_model
};

let mut exchange_rx = tokio_stream::wrappers::ReceiverStream::new(exchange_rx);
Expand Down Expand Up @@ -339,7 +348,7 @@ async fn try_execute_agent(
Ok(sse::Event::default()
.json_data(json!({
"thread_id": params.thread_id.to_string(),
"query_id": query_id
"query_id": query_id,
}))
// This should never happen, so we force an unwrap.
.expect("failed to serialize initialization object"))
Expand Down Expand Up @@ -391,7 +400,8 @@ pub async fn explain(
repo_ref: params.repo_ref,
thread_id: params.thread_id,
parent_exchange_id: None,
model: agent::model::GPT_4,
answer_model: agent::model::GPT_4_TURBO_24K,
agent_model: agent::model::GPT_4,
};

let conversation_id = ConversationId {
Expand Down
8 changes: 4 additions & 4 deletions server/bleep/src/webserver/studio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::{

mod diff;

const LLM_GATEWAY_MODEL: &str = "gpt-4-0613";
/// Model identifier sent to the LLM gateway for code-studio requests.
// NOTE(review): the token-counting call sites below hardcode this same
// "gpt-4-1106-preview" string instead of referencing this constant — consider
// deduplicating so the gateway model and tokenizer cannot silently drift apart.
const LLM_GATEWAY_MODEL: &str = "gpt-4-1106-preview";

fn no_user_id() -> Error {
Error::user("didn't have user ID")
Expand Down Expand Up @@ -472,7 +472,7 @@ async fn token_counts(
})
.collect::<Vec<_>>();

let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-0613").unwrap();
let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-1106-preview").unwrap();
let per_doc_file = stream::iter(doc_context)
.map(|file| async {
if file.hidden {
Expand Down Expand Up @@ -633,14 +633,14 @@ pub async fn get_doc_file_token_count(
.map(|sr| sr.text)
.collect::<String>();

let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-0613").unwrap();
let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-1106-preview").unwrap();
let token_count = core_bpe.encode_ordinary(&content).len();

Ok(Json(token_count))
}

fn count_tokens_for_file(path: &str, body: &str, ranges: &[Range<usize>]) -> usize {
let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-0613").unwrap();
let core_bpe = tiktoken_rs::get_bpe_from_model("gpt-4-1106-preview").unwrap();

let mut chunks = Vec::new();

Expand Down