From 6cc16d2a6f73b177a7f4e04d531c878db1c71e38 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Fri, 1 Dec 2023 09:23:05 -0500 Subject: [PATCH 01/30] saving work symbol tool add metadata to code tool chunks showing metadata add github to repo filter empty symbols fixing some bugs, dedup fmt adding aliases add code to symbol tool new prompt add chunk to exchanges adding symbol ids save work now working adding filter save work saving work remove function call get ref def outside of exchange impl agent refdef to chunks clean chunk generation filter_chunks filter_chunks in new file fix repo_ref add to proc delete symbol tool warnings adding constants relative_path field revert to text user query simplified repo_ref to accept github and local remove filter action, make a function for llm call better naming and adding comments fmt adding some error handling --- Cargo.lock | 4 +- server/bleep/src/agent.rs | 1 + server/bleep/src/agent/exchange.rs | 2 + server/bleep/src/agent/prompts.rs | 2 +- server/bleep/src/agent/symbol.rs | 363 +++++++++++++++++++++ server/bleep/src/agent/tools/answer.rs | 4 + server/bleep/src/agent/tools/code.rs | 10 +- server/bleep/src/agent/tools/proc.rs | 6 + server/bleep/src/webserver.rs | 4 +- server/bleep/src/webserver/answer.rs | 2 + server/bleep/src/webserver/hoverable.rs | 35 +- server/bleep/src/webserver/intelligence.rs | 36 +- 12 files changed, 444 insertions(+), 25 deletions(-) create mode 100644 server/bleep/src/agent/symbol.rs diff --git a/Cargo.lock b/Cargo.lock index 94c1438e7e..b84c44b0c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4866,8 +4866,8 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "ort" -version = "1.16.3" -source = "git+https://github.com/bloopai/ort?branch=env-builder-telemetry#7c37ed9fe5c7c37059a46652b82328623ac4857b" +version = "1.14.8" +source = 
"git+https://github.com/bloopai/ort?branch=env-builder-telemetry#dfd92ae5cf2dff3318c99de1a8b1a801cb5a3c9e" dependencies = [ "flate2", "half 2.3.1", diff --git a/server/bleep/src/agent.rs b/server/bleep/src/agent.rs index fd5a639bf6..a0eea37ca8 100644 --- a/server/bleep/src/agent.rs +++ b/server/bleep/src/agent.rs @@ -27,6 +27,7 @@ const MAX_STEPS: usize = 10; pub mod exchange; pub mod model; pub mod prompts; +pub mod symbol; pub mod transcoder; /// A collection of modules that each add methods to `Agent`. diff --git a/server/bleep/src/agent/exchange.rs b/server/bleep/src/agent/exchange.rs index f7963fa08b..ff7e3f9edc 100644 --- a/server/bleep/src/agent/exchange.rs +++ b/server/bleep/src/agent/exchange.rs @@ -157,6 +157,8 @@ pub struct CodeChunk { pub start_line: usize, #[serde(rename = "end")] pub end_line: usize, + pub start_byte: usize, + pub end_byte: usize, } impl CodeChunk { diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index e6b3dd2e1d..129b8abb98 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -99,7 +99,7 @@ pub fn system<'a>(paths: impl IntoIterator) -> String { - DO NOT call a function that you've used before with the same arguments - DO NOT assume the structure of the codebase, or the existence of files or folders - Your queries to functions.code or functions.path should be significantly different to previous queries -- Call functions.none with paths that you are confident will help answer the user's query +- Call functions.none with paths that you are confident will help answer the user's query, include paths containing the information needed for a complete answer including definitions and references - If the user query is general (e.g. 'What does this do?', 'What is this repo?') look for READMEs, documentation and entry points in the code (main files, index files, api files etc.) 
- If the user is referring to, or asking for, information that is in your history, call functions.none - If after attempting to gather information you are still unsure how to answer the query, call functions.none diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs new file mode 100644 index 0000000000..fec7153071 --- /dev/null +++ b/server/bleep/src/agent/symbol.rs @@ -0,0 +1,363 @@ +use futures::TryStreamExt; + +use crate::agent::exchange::CodeChunk; +use crate::agent::Agent; +use crate::intelligence::code_navigation::FileSymbols; +use crate::llm_gateway; +use crate::llm_gateway::api::{Function, FunctionCall}; +use crate::webserver::hoverable::{inner_handle, HoverableRequest, HoverableResponse}; +use crate::webserver::intelligence::{inner_handle as token_info, TokenInfoRequest}; +use tracing::log::warn; + +pub struct ChunkRefDef { + pub chunk: CodeChunk, + pub metadata: Vec, +} + +impl Agent { + pub async fn add_symbols_to_chunk(&self, chunk: CodeChunk) -> ChunkRefDef { + const MAX_NUMBER_REF_DEF: usize = 5; + + let repo_ref = format!("{}", self.repo_ref); + let indexes = self.app.indexes.clone(); + + // get hoverable elements + let hoverable_request = HoverableRequest { + repo_ref: repo_ref.clone(), + relative_path: chunk.path.clone(), + branch: None, + }; + let hoverable_response = inner_handle(hoverable_request, indexes.clone()) + .await + .unwrap_or_else(|_e| HoverableResponse { ranges: Vec::new() }); + + // for each symbol call token-info + let token_info_vec = hoverable_response + .ranges + .iter() + .filter(|range| { + (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) + }) + .map(|range| { + token_info( + TokenInfoRequest { + relative_path: chunk.path.clone(), + repo_ref: repo_ref.clone(), + branch: None, + start: range.start.byte, + end: range.end.byte, + }, + indexes.clone(), + ) + }); + + let token_info_vec = futures::future::join_all(token_info_vec) + .await + .into_iter() + .map(|response| 
response.unwrap()) + .collect::>(); + + // add metadata and return chunk enriched with metadata (symbols with ref/defs) + + ChunkRefDef { + chunk: chunk.clone(), + metadata: { + let mut metadata = token_info_vec + .into_iter() + .zip(hoverable_response.ranges.into_iter().filter(|range| { + (range.start.byte >= chunk.start_byte) + && (range.start.byte < chunk.end_byte) + })) + .map(|(token_info, range)| { + let filtered_token_info = token_info + .data + .into_iter() + .filter(|file_symbols| file_symbols.file != chunk.path) + .collect::>(); + + RefDefMetadata { + name: chunk.snippet.clone()[(range.start.byte - chunk.start_byte) + ..(range.end.byte - chunk.start_byte)] + .to_string(), + file_symbols: filtered_token_info, + } + }) + .filter(|metadata| { + (metadata.file_symbols.len() < MAX_NUMBER_REF_DEF) + && (metadata.file_symbols.len() > 0) + }) // && + .collect::>(); + metadata.sort_by(|a, b| a.name.cmp(&b.name)); + metadata.dedup_by(|a, b| a.name == b.name); + dbg!("Metadata length: {}", metadata.len()); + metadata + }, + } + } + + pub async fn expand_symbol_into_chunks( + &self, + ref_def_metadata: RefDefMetadata, + ) -> Vec { + const NUMBER_CHUNK_LINES: usize = 10; + + let contents = ref_def_metadata + .file_symbols + .iter() + .map(|f_s| self.get_file_content(&f_s.file)); + + let contents = futures::future::join_all(contents) + .await + .into_iter() + .map(|content_document| content_document.unwrap().unwrap()) + .collect::>(); + + // each symbol may be in multiple files and have multiple occurences in each file + ref_def_metadata + .file_symbols + .iter() + .zip(contents.iter()) + .flat_map(|(file_symbols, content)| { + let filename = file_symbols.file.clone(); + let content = content.content.lines().collect::>(); + + let n_lines = content.len(); + + file_symbols + .data + .iter() + .map(|occurrence| { + let chunk_content = content[occurrence.range.start.line + ..(occurrence.range.end.line + NUMBER_CHUNK_LINES).min(n_lines)] + .to_vec() + .join("\n"); + 
CodeChunk { + path: filename.clone(), + alias: 0, + snippet: chunk_content, + start_line: occurrence.range.start.line as usize, + end_line: (occurrence.range.end.line + NUMBER_CHUNK_LINES).min(n_lines) + as usize, + start_byte: 0 as usize, + end_byte: 0 as usize, + } + }) + .collect::>() + }) + .collect::>() + } + + pub async fn filter_symbols( + &self, + query: &str, + chunks_with_symbols: Vec, + ) -> Result { + let mut i: i32 = -1; + // we have multiples chunks and each chunk may have multiple symbols + // unique alias (i) per symbol + let symbols = chunks_with_symbols + .into_iter() + .map(|chunk_with_symbol| { + ( + chunk_with_symbol.chunk, + chunk_with_symbol + .metadata + .into_iter() + .map(|symbol| { + i = i + 1; + (i, symbol) + }) + .collect::>(), + ) + }) + .collect::>(); + if i == -1 { + return Err(SymbolError::SymbolListEmptyError); + } + + // Classifier + + // context + let chunks_string = symbols + .iter() + .filter(|(_, s)| s.len() > 0) + .map(|(c, s)| { + let symbols_string = s + .iter() + .map(|(i, refdef)| format!("{}: {}", i, refdef.name)) + .collect::>() + .join("\n"); + + format!( + "Path:{}\n\n{}\n\nSymbols:\n\n{}", + c.path.clone(), + c.snippet.clone(), + symbols_string + ) + }) + .collect::>() + .join("\n\n"); + + // instruction + let prompt = format!("Snippets:\n\n{}\n\nInstruction: Above there are some code chunks and some symbols extracted from the chunks. Your job is to select the most relevant symbol to the user query. 
Do not answer with the siymbol name, use the symbol key/alias.\n\nQuery:{}", chunks_string.as_str(), query); + + // function_call + let filter_function = serde_json::from_value::>(serde_json::json!([ + { + "name": "filter", + "description": "Select the symbol most likely to contain information to answer the query", + "parameters": { + "type": "object", + "properties": { + "symbol": { + "type": "integer", + "description": "The chunk alias" + } + }, + "required": ["symbol"] + } + }]) + ) + .unwrap(); + + let llm_response = self + .llm_with_function_call(prompt, filter_function) + .await + .unwrap_or({ + warn!("Symbol classfier llm call failed, picking the first symbol."); + FunctionCall { + name: Some("filter".to_string()), + arguments: "{\"symbol\": 0}".to_string(), + } + }); + + let filter_argument: Filter = + match serde_json::from_str(llm_response.clone().arguments.as_str()) { + Ok(argument) => argument, + Err(_e) => { + warn!("Cannot deserialize: {:?}", llm_response); + return Err(SymbolError::DeserializeFilterError); + } + }; + + let selected_symbol = filter_argument.symbol; + + // finding symbol metadata + let output = match symbols + .into_iter() + .flat_map(|(_, symbol_with_alias)| symbol_with_alias) + .find(|(alias, _)| alias.clone() == selected_symbol as i32) + { + Some((_alias, symbol_metadata)) => Ok(symbol_metadata), + _ => Err(SymbolError::SymbolOutOfBoundsError) + }; + + output + } + + pub async fn get_ref_def_extra_chunks(&mut self, chunks: Vec) -> Vec { + const MAX_CHUNKS: usize = 3; + + // get symbols with ref/defs for each chunk + let chunks_with_symbols = chunks + .iter() + .filter(|c| !c.is_empty()) + .map(|c| self.add_symbols_to_chunk(c.clone())); + + let chunks_with_symbols: Vec = + futures::future::join_all(chunks_with_symbols).await; + + // get original user query + let user_query = self.last_exchange().query.target().unwrap(); + + // select one symbol + let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await 
{ + Ok(selected_symbol) => selected_symbol, + Err(e) => { + warn!("Returning no extra chunks: {}", e); + return Vec::new(); + } + }; + + // get expanded chunks for selected symbol + let extra_chunks = self.expand_symbol_into_chunks(selected_symbol).await; + + // take 3 chunks, update path aliases, update enchange chunks + let extra_chunks = extra_chunks + .iter() + .take(MAX_CHUNKS) + .map(|c| { + let chunk = CodeChunk { + path: c.path.clone(), + alias: self.get_path_alias(c.path.as_str()), + snippet: c.snippet.clone(), + start_line: c.start_line, + end_line: c.end_line, + start_byte: 0 as usize, + end_byte: 0 as usize, + }; + self.exchanges + .last_mut() + .unwrap() + .code_chunks + .push(chunk.clone()); + chunk + }) + .collect::>(); + extra_chunks + } + + async fn llm_with_function_call( + &self, + prompt: String, + functions: Vec, + ) -> Result { + let messages = vec![llm_gateway::api::Message::user(prompt.as_str())]; + + let response = self + .llm_gateway + .clone() + .model("gpt-3.5-turbo-0613") + .temperature(0.0) + .chat_stream(&messages, Some(&functions)) + .await? + .try_fold( + llm_gateway::api::FunctionCall::default(), + |acc: FunctionCall, e: String| async move { + let e: FunctionCall = serde_json::from_str(&e).map_err(|err| { + tracing::error!( + "Failed to deserialize to FunctionCall: {:?}. 
Error: {:?}", + e, + err + ); + err + })?; + Ok(FunctionCall { + name: acc.name.or(e.name), + arguments: acc.arguments + &e.arguments, + }) + }, + ) + .await; + response + } +} + +pub struct RefDefMetadata { + pub name: String, + pub file_symbols: Vec, +} +#[derive(serde::Deserialize)] +struct Filter { + symbol: usize, +} + +#[derive(thiserror::Error, Debug)] +pub enum SymbolError { + #[error("No symbol retrieved in the provided chunks")] + SymbolListEmptyError, + #[error("Cannot deserialize llm function call arguments")] + DeserializeFilterError, + #[error("Selected symbol out of bounds")] + SymbolOutOfBoundsError, +} diff --git a/server/bleep/src/agent/tools/answer.rs b/server/bleep/src/agent/tools/answer.rs index 7c36b475c7..b8990429a9 100644 --- a/server/bleep/src/agent/tools/answer.rs +++ b/server/bleep/src/agent/tools/answer.rs @@ -367,6 +367,8 @@ impl Agent { snippet, start_line: span.start, end_line: span.end, + start_byte: 0, + end_byte: 0, } }) .collect::>(); @@ -383,6 +385,8 @@ impl Agent { snippet: trimmed_snippet.to_string(), start_line: chunk.start_line, end_line: (chunk.start_line + num_trimmed_lines).saturating_sub(1), + start_byte: chunk.start_byte as usize, + end_byte: chunk.end_byte as usize, }] } else { code_chunks diff --git a/server/bleep/src/agent/tools/code.rs b/server/bleep/src/agent/tools/code.rs index 8f2017c831..66fccc39d6 100644 --- a/server/bleep/src/agent/tools/code.rs +++ b/server/bleep/src/agent/tools/code.rs @@ -57,6 +57,8 @@ impl Agent { snippet: chunk.text, start_line: chunk.start_line as usize, end_line: chunk.end_line as usize, + start_byte: chunk.start_byte as usize, + end_byte: chunk.end_byte as usize, } }) .collect::>(); @@ -71,9 +73,13 @@ impl Agent { .push(chunk.clone()) } + let extra_chunks = self.get_ref_def_extra_chunks(chunks.clone()).await; + + chunks.extend(extra_chunks); + let response = chunks - .iter() - .filter(|c| !c.is_empty()) + .clone() + .into_iter() .map(|c| c.to_string()) .collect::>() .join("\n\n"); diff 
--git a/server/bleep/src/agent/tools/proc.rs b/server/bleep/src/agent/tools/proc.rs index faeba8735f..ef46042d81 100644 --- a/server/bleep/src/agent/tools/proc.rs +++ b/server/bleep/src/agent/tools/proc.rs @@ -45,6 +45,8 @@ impl Agent { snippet: chunk.text, start_line: chunk.start_line as usize, end_line: chunk.end_line as usize, + start_byte: chunk.start_byte as usize, + end_byte: chunk.end_byte as usize, } }) .collect::>(); @@ -59,6 +61,10 @@ impl Agent { .push(chunk.clone()) } + let extra_chunks = self.get_ref_def_extra_chunks(chunks.clone()).await; + + chunks.extend(extra_chunks); + let response = chunks .iter() .filter(|c| !c.is_empty()) diff --git a/server/bleep/src/webserver.rs b/server/bleep/src/webserver.rs index 3d331895d6..f0d6c470c5 100644 --- a/server/bleep/src/webserver.rs +++ b/server/bleep/src/webserver.rs @@ -21,9 +21,9 @@ mod config; mod docs; mod file; mod github; -mod hoverable; +pub mod hoverable; mod index; -mod intelligence; +pub mod intelligence; pub mod middleware; mod query; mod quota; diff --git a/server/bleep/src/webserver/answer.rs b/server/bleep/src/webserver/answer.rs index 8dce6b5a39..370e590ffc 100644 --- a/server/bleep/src/webserver/answer.rs +++ b/server/bleep/src/webserver/answer.rs @@ -451,6 +451,8 @@ pub async fn explain( start_line: params.line_start, end_line: params.line_end, snippet, + start_byte: 0, + end_byte: 0, }); let action = Action::Answer { paths: vec![0] }; diff --git a/server/bleep/src/webserver/hoverable.rs b/server/bleep/src/webserver/hoverable.rs index dbb379e4b3..440e7c85eb 100644 --- a/server/bleep/src/webserver/hoverable.rs +++ b/server/bleep/src/webserver/hoverable.rs @@ -8,25 +8,48 @@ use serde::{Deserialize, Serialize}; /// The request made to the `hoverable` endpoint. 
#[derive(Debug, Deserialize)] -pub(super) struct HoverableRequest { +pub struct HoverableRequest { /// The repo_ref of the file of interest - repo_ref: String, + pub repo_ref: String, /// The path to the file of interest, relative to the repo root - relative_path: String, + pub relative_path: String, /// Branch name to use for the lookup, - branch: Option, + pub branch: Option, } /// The response from the `hoverable` endpoint. #[derive(Serialize)] -pub(super) struct HoverableResponse { - ranges: Vec, +pub struct HoverableResponse { + pub ranges: Vec, } impl super::ApiResponse for HoverableResponse {} +pub async fn inner_handle( + payload: HoverableRequest, + indexes: Arc, +) -> Result { + let repo_ref = &payload.repo_ref.parse::().map_err(Error::user)?; + + let document = match indexes + .file + .by_path(repo_ref, &payload.relative_path, payload.branch.as_deref()) + .await + { + Ok(Some(doc)) => doc, + Ok(None) => return Err(Error::user("file not found").with_status(StatusCode::NOT_FOUND)), + Err(e) => return Err(Error::user(e)), + }; + + let ranges = document + .hoverable_ranges() + .ok_or(Error::user("no hoverable ranges for language"))?; + + Ok(HoverableResponse { ranges }) +} + pub(super) async fn handle( Query(payload): Query, Extension(indexes): Extension>, diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index a68ba9b007..b78c53ad64 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -19,25 +19,25 @@ use serde::{Deserialize, Serialize}; /// The request made to the `local-intel` endpoint. 
#[derive(Debug, Deserialize)] -pub(super) struct TokenInfoRequest { +pub struct TokenInfoRequest { /// The repo_ref of the file of interest - repo_ref: String, + pub repo_ref: String, /// The path to the file of interest, relative to the repo root - relative_path: String, + pub relative_path: String, /// Branch name to use for the lookup, - branch: Option, + pub branch: Option, /// The byte range to look for - start: usize, - end: usize, + pub start: usize, + pub end: usize, } /// The response from the `local-intel` endpoint. #[derive(Serialize, Debug)] -pub(super) struct TokenInfoResponse { - data: Vec, +pub struct TokenInfoResponse { + pub data: Vec, } impl TokenInfoResponse { @@ -52,6 +52,18 @@ pub(super) async fn handle( Query(payload): Query, Extension(indexes): Extension>, ) -> Result { + let result = inner_handle(payload, indexes).await; + + match result { + Ok(response) => Ok(json(response)), + Err(err) => Err(err.into()), + } +} + +pub async fn inner_handle( + payload: TokenInfoRequest, + indexes: Arc, +) -> Result { let repo_ref = payload.repo_ref.parse::().map_err(Error::user)?; let token = Token { @@ -95,7 +107,7 @@ pub(super) async fn handle( let data = ctx.token_info(); if data.is_empty() { - search_nav( + let response = search_nav( Arc::clone(&indexes), &repo_ref, ctx.active_token_text(), @@ -104,10 +116,10 @@ pub(super) async fn handle( &source_document, ) .await - .map(TokenInfoResponse::new) - .map(json) + .map(TokenInfoResponse::new)?; + Ok(response) } else { - Ok(json(TokenInfoResponse { data })) + Ok(TokenInfoResponse { data }) } } From f8207525e2d69d1c34569729346abcf9097b0723 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Mon, 18 Dec 2023 17:39:38 -0500 Subject: [PATCH 02/30] fix error handling --- server/bleep/src/agent/symbol.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 
fec7153071..40230e3906 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -220,16 +220,19 @@ impl Agent { ) .unwrap(); - let llm_response = self - .llm_with_function_call(prompt, filter_function) - .await - .unwrap_or({ - warn!("Symbol classfier llm call failed, picking the first symbol."); + let llm_response = match self.llm_with_function_call(prompt, filter_function).await { + Ok(llm_response) => llm_response, + Err(e) => { + warn!( + "Symbol classifier llm call failed, picking the first symbol: {}", + e + ); FunctionCall { name: Some("filter".to_string()), arguments: "{\"symbol\": 0}".to_string(), } - }); + } + }; let filter_argument: Filter = match serde_json::from_str(llm_response.clone().arguments.as_str()) { @@ -247,10 +250,10 @@ impl Agent { .into_iter() .flat_map(|(_, symbol_with_alias)| symbol_with_alias) .find(|(alias, _)| alias.clone() == selected_symbol as i32) - { - Some((_alias, symbol_metadata)) => Ok(symbol_metadata), - _ => Err(SymbolError::SymbolOutOfBoundsError) - }; + { + Some((_alias, symbol_metadata)) => Ok(symbol_metadata), + _ => Err(SymbolError::SymbolOutOfBoundsError), + }; output } From b8d5a77132c8f65b0be64f8a803f2983c4071d25 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Mon, 18 Dec 2023 18:03:48 -0500 Subject: [PATCH 03/30] clippy --- server/bleep/src/agent/symbol.rs | 43 ++++++++++------------ server/bleep/src/agent/tools/answer.rs | 4 +- server/bleep/src/webserver/intelligence.rs | 2 +- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 40230e3906..7b288fc77c 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -84,7 +84,7 @@ impl Agent { }) .filter(|metadata| { (metadata.file_symbols.len() < MAX_NUMBER_REF_DEF) - && (metadata.file_symbols.len() > 0) + && (!metadata.file_symbols.is_empty()) }) // && .collect::>(); 
metadata.sort_by(|a, b| a.name.cmp(&b.name)); @@ -135,11 +135,10 @@ impl Agent { path: filename.clone(), alias: 0, snippet: chunk_content, - start_line: occurrence.range.start.line as usize, - end_line: (occurrence.range.end.line + NUMBER_CHUNK_LINES).min(n_lines) - as usize, - start_byte: 0 as usize, - end_byte: 0 as usize, + start_line: occurrence.range.start.line, + end_line: (occurrence.range.end.line + NUMBER_CHUNK_LINES).min(n_lines), + start_byte: 0, + end_byte: 0, } }) .collect::>() @@ -164,7 +163,7 @@ impl Agent { .metadata .into_iter() .map(|symbol| { - i = i + 1; + i += 1; (i, symbol) }) .collect::>(), @@ -172,7 +171,7 @@ impl Agent { }) .collect::>(); if i == -1 { - return Err(SymbolError::SymbolListEmptyError); + return Err(SymbolError::ListEmpty); } // Classifier @@ -180,7 +179,7 @@ impl Agent { // context let chunks_string = symbols .iter() - .filter(|(_, s)| s.len() > 0) + .filter(|(_, s)| !s.is_empty()) .map(|(c, s)| { let symbols_string = s .iter() @@ -239,23 +238,22 @@ impl Agent { Ok(argument) => argument, Err(_e) => { warn!("Cannot deserialize: {:?}", llm_response); - return Err(SymbolError::DeserializeFilterError); + return Err(SymbolError::DeserializeFilter); } }; let selected_symbol = filter_argument.symbol; // finding symbol metadata - let output = match symbols + match symbols .into_iter() .flat_map(|(_, symbol_with_alias)| symbol_with_alias) - .find(|(alias, _)| alias.clone() == selected_symbol as i32) + .find(|(alias, _)| *alias == selected_symbol as i32) { Some((_alias, symbol_metadata)) => Ok(symbol_metadata), - _ => Err(SymbolError::SymbolOutOfBoundsError), - }; + _ => Err(SymbolError::OutOfBounds), + } - output } pub async fn get_ref_def_extra_chunks(&mut self, chunks: Vec) -> Vec { @@ -296,8 +294,8 @@ impl Agent { snippet: c.snippet.clone(), start_line: c.start_line, end_line: c.end_line, - start_byte: 0 as usize, - end_byte: 0 as usize, + start_byte: 0, + end_byte: 0, }; self.exchanges .last_mut() @@ -317,7 +315,7 @@ impl Agent { ) 
-> Result { let messages = vec![llm_gateway::api::Message::user(prompt.as_str())]; - let response = self + self .llm_gateway .clone() .model("gpt-3.5-turbo-0613") @@ -341,8 +339,7 @@ impl Agent { }) }, ) - .await; - response + .await } } @@ -358,9 +355,9 @@ struct Filter { #[derive(thiserror::Error, Debug)] pub enum SymbolError { #[error("No symbol retrieved in the provided chunks")] - SymbolListEmptyError, + ListEmpty, #[error("Cannot deserialize llm function call arguments")] - DeserializeFilterError, + DeserializeFilter, #[error("Selected symbol out of bounds")] - SymbolOutOfBoundsError, + OutOfBounds, } diff --git a/server/bleep/src/agent/tools/answer.rs b/server/bleep/src/agent/tools/answer.rs index b8990429a9..40ad7fc126 100644 --- a/server/bleep/src/agent/tools/answer.rs +++ b/server/bleep/src/agent/tools/answer.rs @@ -385,8 +385,8 @@ impl Agent { snippet: trimmed_snippet.to_string(), start_line: chunk.start_line, end_line: (chunk.start_line + num_trimmed_lines).saturating_sub(1), - start_byte: chunk.start_byte as usize, - end_byte: chunk.end_byte as usize, + start_byte: chunk.start_byte, + end_byte: chunk.end_byte, }] } else { code_chunks diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index b78c53ad64..e1b4fff381 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -56,7 +56,7 @@ pub(super) async fn handle( match result { Ok(response) => Ok(json(response)), - Err(err) => Err(err.into()), + Err(err) => Err(err), } } From 164eff6673c228565d3d3701cee0fd06bca38828 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Mon, 18 Dec 2023 18:04:59 -0500 Subject: [PATCH 04/30] fmt --- server/bleep/src/agent/symbol.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 7b288fc77c..c506b9cb2b 100644 --- 
a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -253,7 +253,6 @@ impl Agent { Some((_alias, symbol_metadata)) => Ok(symbol_metadata), _ => Err(SymbolError::OutOfBounds), } - } pub async fn get_ref_def_extra_chunks(&mut self, chunks: Vec) -> Vec { @@ -315,8 +314,7 @@ impl Agent { ) -> Result { let messages = vec![llm_gateway::api::Message::user(prompt.as_str())]; - self - .llm_gateway + self.llm_gateway .clone() .model("gpt-3.5-turbo-0613") .temperature(0.0) From d12b13d7ce6665c223b03994cf73b486e1e172ed Mon Sep 17 00:00:00 2001 From: Gabriel Gordon-Hall Date: Tue, 19 Dec 2023 23:18:17 +0000 Subject: [PATCH 05/30] wip: refactor changes --- server/bleep/src/agent/symbol.rs | 229 +++++++++++---------- server/bleep/src/agent/tools/code.rs | 2 +- server/bleep/src/agent/tools/proc.rs | 2 +- server/bleep/src/webserver/hoverable.rs | 29 +-- server/bleep/src/webserver/intelligence.rs | 92 ++++----- 5 files changed, 169 insertions(+), 185 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index c506b9cb2b..fa83ddff2e 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -1,108 +1,113 @@ use futures::TryStreamExt; -use crate::agent::exchange::CodeChunk; -use crate::agent::Agent; -use crate::intelligence::code_navigation::FileSymbols; +use crate::agent::{exchange::CodeChunk, Agent}; +use crate::intelligence::{code_navigation::FileSymbols, Language, TSLanguage}; use crate::llm_gateway; -use crate::llm_gateway::api::{Function, FunctionCall}; -use crate::webserver::hoverable::{inner_handle, HoverableRequest, HoverableResponse}; -use crate::webserver::intelligence::{inner_handle as token_info, TokenInfoRequest}; -use tracing::log::warn; +use crate::webserver::intelligence::{get_token_info, TokenInfoRequest}; +use anyhow::{Context, Result}; +use tracing::log::{debug, info, warn}; -pub struct ChunkRefDef { +pub struct ChunkWithSymbols { pub chunk: CodeChunk, - pub metadata: Vec, 
+ pub symbols: Vec, } impl Agent { - pub async fn add_symbols_to_chunk(&self, chunk: CodeChunk) -> ChunkRefDef { - const MAX_NUMBER_REF_DEF: usize = 5; - - let repo_ref = format!("{}", self.repo_ref); - let indexes = self.app.indexes.clone(); + pub async fn extract_symbols(&self, chunk: CodeChunk) -> Result { + const MAX_REF_DEFS: usize = 5; // Ignore symbols with more than this many cross-file refs/defs // get hoverable elements - let hoverable_request = HoverableRequest { - repo_ref: repo_ref.clone(), - relative_path: chunk.path.clone(), - branch: None, + let document = self + .app + .indexes + .file + .by_path(&self.repo_ref, &chunk.path, None) + .await? + .with_context(|| format!("failed to read path: {}", &chunk.path))?; + + let hoverable_ranges = document + .hoverable_ranges() + .ok_or_else(|| anyhow::anyhow!("no hoverable ranges"))?; + + let all_docs = { + let associated_langs = match document.lang.as_deref().map(TSLanguage::from_id) { + Some(Language::Supported(config)) => config.language_ids, + _ => &[], + }; + self.app + .indexes + .file + .by_repo(&self.repo_ref, associated_langs.iter(), None) + .await }; - let hoverable_response = inner_handle(hoverable_request, indexes.clone()) - .await - .unwrap_or_else(|_e| HoverableResponse { ranges: Vec::new() }); - // for each symbol call token-info - let token_info_vec = hoverable_response - .ranges - .iter() - .filter(|range| { - (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) - }) - .map(|range| { - token_info( - TokenInfoRequest { - relative_path: chunk.path.clone(), - repo_ref: repo_ref.clone(), - branch: None, - start: range.start.byte, - end: range.end.byte, - }, - indexes.clone(), - ) - }); + // get references and definitions for each symbol + let related_symbols = futures::future::join_all( + hoverable_ranges + .iter() + .filter(|range| { + (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) + }) + .map(|range| { + get_token_info( + TokenInfoRequest 
{ + relative_path: chunk.path.clone(), + repo_ref: String::new(), // FIXME + branch: None, + start: range.start.byte, + end: range.end.byte, + }, + &self.repo_ref, + self.app.indexes.clone(), + &document, + &all_docs, + ) + }), + ) + .await; - let token_info_vec = futures::future::join_all(token_info_vec) - .await + // filter references and definitions + let mut symbols = related_symbols .into_iter() - .map(|response| response.unwrap()) + .filter_map(Result::ok) + .zip(hoverable_ranges.into_iter().filter(|range| { + (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) + })) + .map(|(token_info, range)| { + let filtered_token_info = token_info + .into_iter() + .filter(|file_symbols| file_symbols.file != chunk.path) + .collect::>(); + + Symbol { + name: chunk.snippet.clone()[(range.start.byte - chunk.start_byte) + ..(range.end.byte - chunk.start_byte)] + .to_string(), + related_symbols: filtered_token_info, + } + }) + .filter(|metadata| { + (metadata.related_symbols.len() < MAX_REF_DEFS) + && (!metadata.related_symbols.is_empty()) + }) .collect::>(); - // add metadata and return chunk enriched with metadata (symbols with ref/defs) + symbols.sort_by(|a, b| a.name.cmp(&b.name)); + symbols.dedup_by(|a, b| a.name == b.name); + + debug!("Attatched {} symbols", symbols.len()); - ChunkRefDef { + Ok(ChunkWithSymbols { chunk: chunk.clone(), - metadata: { - let mut metadata = token_info_vec - .into_iter() - .zip(hoverable_response.ranges.into_iter().filter(|range| { - (range.start.byte >= chunk.start_byte) - && (range.start.byte < chunk.end_byte) - })) - .map(|(token_info, range)| { - let filtered_token_info = token_info - .data - .into_iter() - .filter(|file_symbols| file_symbols.file != chunk.path) - .collect::>(); - - RefDefMetadata { - name: chunk.snippet.clone()[(range.start.byte - chunk.start_byte) - ..(range.end.byte - chunk.start_byte)] - .to_string(), - file_symbols: filtered_token_info, - } - }) - .filter(|metadata| { - 
(metadata.file_symbols.len() < MAX_NUMBER_REF_DEF) - && (!metadata.file_symbols.is_empty()) - }) // && - .collect::>(); - metadata.sort_by(|a, b| a.name.cmp(&b.name)); - metadata.dedup_by(|a, b| a.name == b.name); - dbg!("Metadata length: {}", metadata.len()); - metadata - }, - } + symbols, + }) } - pub async fn expand_symbol_into_chunks( - &self, - ref_def_metadata: RefDefMetadata, - ) -> Vec { + pub async fn expand_symbol_into_chunks(&self, ref_def_metadata: Symbol) -> Vec { const NUMBER_CHUNK_LINES: usize = 10; let contents = ref_def_metadata - .file_symbols + .related_symbols .iter() .map(|f_s| self.get_file_content(&f_s.file)); @@ -114,7 +119,7 @@ impl Agent { // each symbol may be in multiple files and have multiple occurences in each file ref_def_metadata - .file_symbols + .related_symbols .iter() .zip(contents.iter()) .flat_map(|(file_symbols, content)| { @@ -149,8 +154,8 @@ impl Agent { pub async fn filter_symbols( &self, query: &str, - chunks_with_symbols: Vec, - ) -> Result { + chunks_with_symbols: Vec, + ) -> Result { let mut i: i32 = -1; // we have multiples chunks and each chunk may have multiple symbols // unique alias (i) per symbol @@ -160,7 +165,7 @@ impl Agent { ( chunk_with_symbol.chunk, chunk_with_symbol - .metadata + .symbols .into_iter() .map(|symbol| { i += 1; @@ -201,7 +206,7 @@ impl Agent { let prompt = format!("Snippets:\n\n{}\n\nInstruction: Above there are some code chunks and some symbols extracted from the chunks. Your job is to select the most relevant symbol to the user query. 
Do not answer with the siymbol name, use the symbol key/alias.\n\nQuery:{}", chunks_string.as_str(), query); // function_call - let filter_function = serde_json::from_value::>(serde_json::json!([ + let filter_function = serde_json::from_value::>(serde_json::json!([ { "name": "filter", "description": "Select the symbol most likely to contain information to answer the query", @@ -226,7 +231,7 @@ impl Agent { "Symbol classifier llm call failed, picking the first symbol: {}", e ); - FunctionCall { + llm_gateway::api::FunctionCall { name: Some("filter".to_string()), arguments: "{\"symbol\": 0}".to_string(), } @@ -255,17 +260,20 @@ impl Agent { } } - pub async fn get_ref_def_extra_chunks(&mut self, chunks: Vec) -> Vec { + pub async fn get_related_chunks(&mut self, chunks: Vec) -> Vec { const MAX_CHUNKS: usize = 3; // get symbols with ref/defs for each chunk - let chunks_with_symbols = chunks - .iter() - .filter(|c| !c.is_empty()) - .map(|c| self.add_symbols_to_chunk(c.clone())); - - let chunks_with_symbols: Vec = - futures::future::join_all(chunks_with_symbols).await; + let chunks_with_symbols = futures::future::join_all( + chunks + .iter() + .filter(|c| !c.is_empty()) + .map(|c| self.extract_symbols(c.clone())), // TODO: Log failure + ) + .await + .into_iter() + .filter_map(Result::ok) + .collect(); // get original user query let user_query = self.last_exchange().query.target().unwrap(); @@ -274,7 +282,7 @@ impl Agent { let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await { Ok(selected_symbol) => selected_symbol, Err(e) => { - warn!("Returning no extra chunks: {}", e); + info!("Returning no extra chunks: {}", e); return Vec::new(); } }; @@ -310,8 +318,8 @@ impl Agent { async fn llm_with_function_call( &self, prompt: String, - functions: Vec, - ) -> Result { + functions: Vec, + ) -> Result { let messages = vec![llm_gateway::api::Message::user(prompt.as_str())]; self.llm_gateway @@ -322,16 +330,17 @@ impl Agent { .await? 
.try_fold( llm_gateway::api::FunctionCall::default(), - |acc: FunctionCall, e: String| async move { - let e: FunctionCall = serde_json::from_str(&e).map_err(|err| { - tracing::error!( - "Failed to deserialize to FunctionCall: {:?}. Error: {:?}", - e, + |acc: llm_gateway::api::FunctionCall, e: String| async move { + let e: llm_gateway::api::FunctionCall = + serde_json::from_str(&e).map_err(|err| { + tracing::error!( + "Failed to deserialize to FunctionCall: {:?}. Error: {:?}", + e, + err + ); err - ); - err - })?; - Ok(FunctionCall { + })?; + Ok(llm_gateway::api::FunctionCall { name: acc.name.or(e.name), arguments: acc.arguments + &e.arguments, }) @@ -341,9 +350,9 @@ impl Agent { } } -pub struct RefDefMetadata { +pub struct Symbol { pub name: String, - pub file_symbols: Vec, + pub related_symbols: Vec, } #[derive(serde::Deserialize)] struct Filter { diff --git a/server/bleep/src/agent/tools/code.rs b/server/bleep/src/agent/tools/code.rs index 66fccc39d6..965ef896f2 100644 --- a/server/bleep/src/agent/tools/code.rs +++ b/server/bleep/src/agent/tools/code.rs @@ -73,7 +73,7 @@ impl Agent { .push(chunk.clone()) } - let extra_chunks = self.get_ref_def_extra_chunks(chunks.clone()).await; + let extra_chunks = self.get_related_chunks(chunks.clone()).await; chunks.extend(extra_chunks); diff --git a/server/bleep/src/agent/tools/proc.rs b/server/bleep/src/agent/tools/proc.rs index ef46042d81..0dbbc554b4 100644 --- a/server/bleep/src/agent/tools/proc.rs +++ b/server/bleep/src/agent/tools/proc.rs @@ -61,7 +61,7 @@ impl Agent { .push(chunk.clone()) } - let extra_chunks = self.get_ref_def_extra_chunks(chunks.clone()).await; + let extra_chunks = self.get_related_chunks(chunks.clone()).await; chunks.extend(extra_chunks); diff --git a/server/bleep/src/webserver/hoverable.rs b/server/bleep/src/webserver/hoverable.rs index 440e7c85eb..78fcd288db 100644 --- a/server/bleep/src/webserver/hoverable.rs +++ b/server/bleep/src/webserver/hoverable.rs @@ -10,13 +10,13 @@ use 
serde::{Deserialize, Serialize}; #[derive(Debug, Deserialize)] pub struct HoverableRequest { /// The repo_ref of the file of interest - pub repo_ref: String, + repo_ref: String, /// The path to the file of interest, relative to the repo root - pub relative_path: String, + relative_path: String, /// Branch name to use for the lookup, - pub branch: Option, + branch: Option, } /// The response from the `hoverable` endpoint. @@ -27,29 +27,6 @@ pub struct HoverableResponse { impl super::ApiResponse for HoverableResponse {} -pub async fn inner_handle( - payload: HoverableRequest, - indexes: Arc, -) -> Result { - let repo_ref = &payload.repo_ref.parse::().map_err(Error::user)?; - - let document = match indexes - .file - .by_path(repo_ref, &payload.relative_path, payload.branch.as_deref()) - .await - { - Ok(Some(doc)) => doc, - Ok(None) => return Err(Error::user("file not found").with_status(StatusCode::NOT_FOUND)), - Err(e) => return Err(Error::user(e)), - }; - - let ranges = document - .hoverable_ranges() - .ok_or(Error::user("no hoverable ranges for language"))?; - - Ok(HoverableResponse { ranges }) -} - pub(super) async fn handle( Query(payload): Query, Extension(indexes): Extension>, diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index e1b4fff381..242c75f5b5 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -52,33 +52,15 @@ pub(super) async fn handle( Query(payload): Query, Extension(indexes): Extension>, ) -> Result { - let result = inner_handle(payload, indexes).await; - - match result { - Ok(response) => Ok(json(response)), - Err(err) => Err(err), - } -} - -pub async fn inner_handle( - payload: TokenInfoRequest, - indexes: Arc, -) -> Result { let repo_ref = payload.repo_ref.parse::().map_err(Error::user)?; - let token = Token { - relative_path: payload.relative_path.as_str(), - start_byte: payload.start, - end_byte: payload.end, - }; - - let 
source_document = indexes + let source_doc = indexes .file .by_path(&repo_ref, &payload.relative_path, payload.branch.as_deref()) .await .map_err(Error::user)? .ok_or_else(|| Error::user("path not found").with_status(StatusCode::NOT_FOUND))?; - let lang = source_document.lang.as_deref(); + let lang = source_doc.lang.as_deref(); let all_docs = { let associated_langs = match lang.map(TSLanguage::from_id) { Some(Language::Supported(config)) => config.language_ids, @@ -94,33 +76,11 @@ pub async fn inner_handle( .await }; - let source_document_idx = all_docs - .iter() - .position(|doc| doc.relative_path == payload.relative_path) - .ok_or(Error::internal("invalid language"))?; - - let ctx = CodeNavigationContext { - token, - all_docs: &all_docs, - source_document_idx, - }; - - let data = ctx.token_info(); - if data.is_empty() { - let response = search_nav( - Arc::clone(&indexes), - &repo_ref, - ctx.active_token_text(), - ctx.active_token_range(), - payload.branch.as_deref(), - &source_document, - ) + let symbols = get_token_info(payload, &repo_ref, indexes, &source_doc, &all_docs) .await - .map(TokenInfoResponse::new)?; - Ok(response) - } else { - Ok(TokenInfoResponse { data }) - } + .map_err(Error::internal)?; + + Ok(json(TokenInfoResponse::new(symbols))) } /// The request made to the `related-files` endpoint. 
@@ -347,6 +307,44 @@ pub(super) async fn token_value( Ok(json(TokenValueResponse { range, content })) } +pub async fn get_token_info( + params: TokenInfoRequest, + repo_ref: &RepoRef, + indexes: Arc, + source_doc: &ContentDocument, + all_docs: &Vec, +) -> anyhow::Result> { + let source_document_idx = all_docs + .iter() + .position(|doc| doc.relative_path == source_doc.relative_path) + .unwrap(); // FIXME: handle this + + let ctx: CodeNavigationContext<'_, '_> = CodeNavigationContext { + token: Token { + relative_path: params.relative_path.as_str(), + start_byte: params.start, + end_byte: params.end, + }, + all_docs, + source_document_idx, + }; + + let data = ctx.token_info(); + if data.is_empty() { + search_nav( + Arc::clone(&indexes), + repo_ref, + ctx.active_token_text(), + ctx.active_token_range(), + params.branch.as_deref(), + source_doc, + ) + .await + } else { + Ok(data) + } +} + async fn search_nav( indexes: Arc, repo_ref: &RepoRef, @@ -354,7 +352,7 @@ async fn search_nav( payload_range: std::ops::Range, branch: Option<&str>, source_document: &ContentDocument, -) -> Result> { +) -> anyhow::Result> { use crate::{ indexes::{reader::ContentReader, DocumentRead}, query::compiler::trigrams, From fb67ade9192662ab05bf3910e808fce93d4f9951 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 15:38:13 -0500 Subject: [PATCH 06/30] using snippet from file_symbol --- server/bleep/src/agent/symbol.rs | 46 ++++++------------- .../bleep/src/intelligence/code_navigation.rs | 30 +++++++++--- server/bleep/src/webserver/intelligence.rs | 25 ++++++++-- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index fa83ddff2e..50285dd9e1 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -15,6 +15,7 @@ pub struct ChunkWithSymbols { impl Agent { pub async fn extract_symbols(&self, chunk: CodeChunk) -> Result { const 
MAX_REF_DEFS: usize = 5; // Ignore symbols with more than this many cross-file refs/defs + const NUMBER_CHUNK_LINES: usize = 10; // get hoverable elements let document = self @@ -61,6 +62,8 @@ impl Agent { self.app.indexes.clone(), &document, &all_docs, + Some(0), + Some(NUMBER_CHUNK_LINES), ) }), ) @@ -95,7 +98,7 @@ impl Agent { symbols.sort_by(|a, b| a.name.cmp(&b.name)); symbols.dedup_by(|a, b| a.name == b.name); - debug!("Attatched {} symbols", symbols.len()); + debug!("Attached {} symbols", symbols.len()); Ok(ChunkWithSymbols { chunk: chunk.clone(), @@ -104,47 +107,24 @@ impl Agent { } pub async fn expand_symbol_into_chunks(&self, ref_def_metadata: Symbol) -> Vec { - const NUMBER_CHUNK_LINES: usize = 10; - - let contents = ref_def_metadata - .related_symbols - .iter() - .map(|f_s| self.get_file_content(&f_s.file)); - - let contents = futures::future::join_all(contents) - .await - .into_iter() - .map(|content_document| content_document.unwrap().unwrap()) - .collect::>(); - // each symbol may be in multiple files and have multiple occurences in each file ref_def_metadata .related_symbols .iter() - .zip(contents.iter()) - .flat_map(|(file_symbols, content)| { + .flat_map(|file_symbols| { let filename = file_symbols.file.clone(); - let content = content.content.lines().collect::>(); - - let n_lines = content.len(); file_symbols .data .iter() - .map(|occurrence| { - let chunk_content = content[occurrence.range.start.line - ..(occurrence.range.end.line + NUMBER_CHUNK_LINES).min(n_lines)] - .to_vec() - .join("\n"); - CodeChunk { - path: filename.clone(), - alias: 0, - snippet: chunk_content, - start_line: occurrence.range.start.line, - end_line: (occurrence.range.end.line + NUMBER_CHUNK_LINES).min(n_lines), - start_byte: 0, - end_byte: 0, - } + .map(|occurrence| CodeChunk { + path: filename.clone(), + alias: 0, + snippet: occurrence.snippet.data.clone(), + start_line: occurrence.snippet.line_range.start, + end_line: occurrence.snippet.line_range.end, + start_byte: 0, 
+ end_byte: 0, }) .collect::>() }) diff --git a/server/bleep/src/intelligence/code_navigation.rs b/server/bleep/src/intelligence/code_navigation.rs index b3e8a878b5..d275c319f2 100644 --- a/server/bleep/src/intelligence/code_navigation.rs +++ b/server/bleep/src/intelligence/code_navigation.rs @@ -50,6 +50,7 @@ pub struct CodeNavigationContext<'a, 'b> { pub token: Token<'a>, pub all_docs: &'b [ContentDocument], pub source_document_idx: usize, + pub snipper: Option, } impl<'a, 'b> CodeNavigationContext<'a, 'b> { @@ -134,6 +135,7 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { start_byte: source_sg.graph[idx].range().start.byte, end_byte: source_sg.graph[idx].range().end.byte, }, + snipper: None, } .local_definitions() .is_none() @@ -179,6 +181,7 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { all_docs: std::slice::from_ref(source_document), source_document_idx: 0, token, + snipper: None, } } @@ -303,7 +306,11 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .map(|idx| Occurrence { kind: OccurrenceKind::Definition, range: scope_graph.graph[idx].range(), - snippet: to_occurrence(self.source_document(), scope_graph.graph[idx].range()), + snippet: to_occurrence( + self.source_document(), + scope_graph.graph[idx].range(), + self.snipper, + ), }) .collect::>(); @@ -335,7 +342,7 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .map(|idx| Occurrence { kind: OccurrenceKind::Definition, range: scope_graph.graph[idx].range(), - snippet: to_occurrence(doc, scope_graph.graph[idx].range()), + snippet: to_occurrence(doc, scope_graph.graph[idx].range(), self.snipper), }) .collect::>(); @@ -360,7 +367,11 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .map(|idx| Occurrence { kind: OccurrenceKind::Reference, range: scope_graph.graph[idx].range(), - snippet: to_occurrence(self.source_document(), scope_graph.graph[idx].range()), + snippet: to_occurrence( + self.source_document(), + scope_graph.graph[idx].range(), + self.snipper, + ), }) .collect::>(); @@ -396,7 +407,7 @@ impl<'a, 'b> 
CodeNavigationContext<'a, 'b> { .map(|idx| Occurrence { kind: OccurrenceKind::Reference, range: scope_graph.graph[idx].range(), - snippet: to_occurrence(doc, scope_graph.graph[idx].range()), + snippet: to_occurrence(doc, scope_graph.graph[idx].range(), self.snipper), }) .collect::>(); @@ -418,7 +429,11 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .map(|idx| Occurrence { kind: OccurrenceKind::Definition, range: scope_graph.graph[idx].range(), - snippet: to_occurrence(self.source_document(), scope_graph.graph[idx].range()), + snippet: to_occurrence( + self.source_document(), + scope_graph.graph[idx].range(), + self.snipper, + ), }) .collect::>(); @@ -437,11 +452,12 @@ pub struct Token<'a> { pub end_byte: usize, } -fn to_occurrence(doc: &ContentDocument, range: TextRange) -> Snippet { +fn to_occurrence(doc: &ContentDocument, range: TextRange, snipper: Option) -> Snippet { let src = &doc.content; let line_end_indices = &doc.line_end_indices; let highlight = range.start.byte..range.end.byte; - Snipper::default() + snipper + .unwrap_or(Snipper::default()) .expand(highlight, src, line_end_indices) .reify(src, &[]) } diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index 242c75f5b5..3c7a452321 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -76,9 +76,17 @@ pub(super) async fn handle( .await }; - let symbols = get_token_info(payload, &repo_ref, indexes, &source_doc, &all_docs) - .await - .map_err(Error::internal)?; + let symbols = get_token_info( + payload, + &repo_ref, + indexes, + &source_doc, + &all_docs, + None, + None, + ) + .await + .map_err(Error::internal)?; Ok(json(TokenInfoResponse::new(symbols))) } @@ -313,12 +321,17 @@ pub async fn get_token_info( indexes: Arc, source_doc: &ContentDocument, all_docs: &Vec, + context_before: Option, + context_after: Option, ) -> anyhow::Result> { let source_document_idx = all_docs .iter() .position(|doc| 
doc.relative_path == source_doc.relative_path) .unwrap(); // FIXME: handle this + let snipper = + Some(Snipper::default().context(context_before.unwrap_or(0), context_after.unwrap_or(0))); + let ctx: CodeNavigationContext<'_, '_> = CodeNavigationContext { token: Token { relative_path: params.relative_path.as_str(), @@ -327,6 +340,7 @@ pub async fn get_token_info( }, all_docs, source_document_idx, + snipper, }; let data = ctx.token_info(); @@ -338,6 +352,7 @@ pub async fn get_token_info( ctx.active_token_range(), params.branch.as_deref(), source_doc, + snipper, ) .await } else { @@ -352,6 +367,7 @@ async fn search_nav( payload_range: std::ops::Range, branch: Option<&str>, source_document: &ContentDocument, + snipper: Option, ) -> anyhow::Result> { use crate::{ indexes::{reader::ContentReader, DocumentRead}, @@ -478,7 +494,8 @@ async fn search_nav( }) .unwrap_or_default(); let highlight = start_byte..end_byte; - let snippet = Snipper::default() + let snippet = snipper + .unwrap_or(Snipper::default()) .expand(highlight, &doc.content, &doc.line_end_indices) .reify(&doc.content, &[]); From 3ac57fcf95f66da2dee1adf20318bca0ee05657d Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 15:53:58 -0500 Subject: [PATCH 07/30] adding repo_ref, reverting empty filter, adding filter comments --- server/bleep/src/agent/symbol.rs | 9 +++++---- server/bleep/src/agent/tools/code.rs | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 50285dd9e1..5a11099612 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -53,7 +53,7 @@ impl Agent { get_token_info( TokenInfoRequest { relative_path: chunk.path.clone(), - repo_ref: String::new(), // FIXME + repo_ref: self.repo_ref.display_name(), branch: None, start: range.start.byte, end: range.end.byte, @@ -70,12 +70,13 @@ impl Agent { .await; // filter references and 
definitions + // 1: symbol shouldn't be in the same file + // 2: number of refs/defs should be less than 5 to avoid very common symbols (iter, unwrap...) + // 3: also filter out symbols without refs/defs let mut symbols = related_symbols .into_iter() .filter_map(Result::ok) - .zip(hoverable_ranges.into_iter().filter(|range| { - (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) - })) + .zip(hoverable_ranges.into_iter()) .map(|(token_info, range)| { let filtered_token_info = token_info .into_iter() diff --git a/server/bleep/src/agent/tools/code.rs b/server/bleep/src/agent/tools/code.rs index 965ef896f2..577647882f 100644 --- a/server/bleep/src/agent/tools/code.rs +++ b/server/bleep/src/agent/tools/code.rs @@ -80,6 +80,7 @@ impl Agent { let response = chunks .clone() .into_iter() + .filter(|c| !c.is_empty()) .map(|c| c.to_string()) .collect::>() .join("\n\n"); From c8e7a789ef3d3528bd1a221ee34f94ea443b0f96 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 15:59:03 -0500 Subject: [PATCH 08/30] tokeninfo response not pub anymore --- server/bleep/src/webserver/intelligence.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index 3c7a452321..049734c45e 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -36,8 +36,8 @@ pub struct TokenInfoRequest { /// The response from the `local-intel` endpoint. 
#[derive(Serialize, Debug)] -pub struct TokenInfoResponse { - pub data: Vec, +pub(super) struct TokenInfoResponse { + data: Vec, } impl TokenInfoResponse { From 16ca1265ad22fb5d789676cd3ebdfc9b3e7eb1d8 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:02:16 -0500 Subject: [PATCH 09/30] simplifying how we update path in a chunk --- server/bleep/src/agent/symbol.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 5a11099612..693469a36e 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -277,13 +277,8 @@ impl Agent { .take(MAX_CHUNKS) .map(|c| { let chunk = CodeChunk { - path: c.path.clone(), alias: self.get_path_alias(c.path.as_str()), - snippet: c.snippet.clone(), - start_line: c.start_line, - end_line: c.end_line, - start_byte: 0, - end_byte: 0, + ..c.clone() }; self.exchanges .last_mut() From 6e12d94517004655112e9a5870f8b3588dac21a5 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:03:25 -0500 Subject: [PATCH 10/30] symbol alias instead of chunk alias --- server/bleep/src/agent/symbol.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 693469a36e..4de2fc11e8 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -196,7 +196,7 @@ impl Agent { "properties": { "symbol": { "type": "integer", - "description": "The chunk alias" + "description": "The symbol alias" } }, "required": ["symbol"] From d93353461aa251cbdc0f39fb7f749a6e8593eed6 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:11:37 -0500 Subject: [PATCH 11/30] moving prompt to prompts.rs --- server/bleep/src/agent/prompts.rs | 13 +++++++++++++ server/bleep/src/agent/symbol.rs | 
4 +++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index 129b8abb98..82d0a94446 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -386,6 +386,19 @@ Here is the full context for reference: ) } +pub fn symbol_classification_prompt(snippets: &str, query: &str) -> String { + format!( + r#"Snippets: + +{snippets} + +Instruction: Above there are some code chunks and some symbols extracted from the chunks. Your job is to select the most relevant symbol to the user query. Do not answer with the symbol name, use the symbol key/alias. + +Query:{query}"# + ) +} + + pub fn hypothetical_document_prompt(query: &str) -> String { format!( r#"Write a code snippet that could hypothetically be returned by a code search engine as the answer to the query: {query} diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 4de2fc11e8..a0b0ea9e78 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -7,6 +7,8 @@ use crate::webserver::intelligence::{get_token_info, TokenInfoRequest}; use anyhow::{Context, Result}; use tracing::log::{debug, info, warn}; +use super::prompts::symbol_classification_prompt; + pub struct ChunkWithSymbols { pub chunk: CodeChunk, pub symbols: Vec, @@ -184,7 +186,7 @@ impl Agent { .join("\n\n"); // instruction - let prompt = format!("Snippets:\n\n{}\n\nInstruction: Above there are some code chunks and some symbols extracted from the chunks. Your job is to select the most relevant symbol to the user query.
Do not answer with the siymbol name, use the symbol key/alias.\n\nQuery:{}", chunks_string.as_str(), query); + let prompt = symbol_classification_prompt(chunks_string.as_str(), query); // function_call let filter_function = serde_json::from_value::>(serde_json::json!([ From dd3de64bd8ad91e1e90ce557d1953a3aca049572 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:22:11 -0500 Subject: [PATCH 12/30] reverting filter --- server/bleep/src/agent/prompts.rs | 1 - server/bleep/src/agent/symbol.rs | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index 82d0a94446..22a45d7477 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -398,7 +398,6 @@ Query:{query}"# ) } - pub fn hypothetical_document_prompt(query: &str) -> String { format!( r#"Write a code snippet that could hypothetically be returned by a code search engine as the answer to the query: {query} diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index a0b0ea9e78..4e7482eb25 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -78,7 +78,9 @@ impl Agent { let mut symbols = related_symbols .into_iter() .filter_map(Result::ok) - .zip(hoverable_ranges.into_iter()) + .zip(hoverable_ranges.into_iter().filter(|range| { + (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) + })) .map(|(token_info, range)| { let filtered_token_info = token_info .into_iter() From 1c2f1a6d2382cd8af7ec7aeb358482b339216344 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:40:33 -0500 Subject: [PATCH 13/30] fix unwrap in get_token_info --- server/bleep/src/webserver/intelligence.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/bleep/src/webserver/intelligence.rs 
b/server/bleep/src/webserver/intelligence.rs index 049734c45e..d2046f503f 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -327,7 +327,7 @@ pub async fn get_token_info( let source_document_idx = all_docs .iter() .position(|doc| doc.relative_path == source_doc.relative_path) - .unwrap(); // FIXME: handle this + .ok_or(anyhow::anyhow!("invalid language"))?; let snipper = Some(Snipper::default().context(context_before.unwrap_or(0), context_after.unwrap_or(0))); From b132a75336bcb94ebd00217cf49621f649039bbf Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:50:02 -0500 Subject: [PATCH 14/30] adding some comments --- server/bleep/src/agent/symbol.rs | 6 ++++++ server/bleep/src/webserver/intelligence.rs | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 4e7482eb25..670e9c67fd 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -14,6 +14,12 @@ pub struct ChunkWithSymbols { pub symbols: Vec, } +/// This helps the code and proc tool return related chunks based on references and definitions. +/// `get_related_chunks` receives a list of chunks from code or proc search and returns `MAX_CHUNKS` related chunks +/// For each input chunk, we extract the symbols. We then pick ONE symbol using a classifier. +/// This symbol (reference and/ord definition) may be present in many files one or more times. +/// We extract the surrounding code for each occurence and pick `MAX_CHUNKS` occurrences/chunks. 
+ impl Agent { pub async fn extract_symbols(&self, chunk: CodeChunk) -> Result { const MAX_REF_DEFS: usize = 5; // Ignore symbols with more than this many cross-file refs/defs diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index d2046f503f..cf1ca235ff 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -327,7 +327,7 @@ pub async fn get_token_info( let source_document_idx = all_docs .iter() .position(|doc| doc.relative_path == source_doc.relative_path) - .ok_or(anyhow::anyhow!("invalid language"))?; + .ok_or(anyhow::anyhow!("invalid language"))?; let snipper = Some(Snipper::default().context(context_before.unwrap_or(0), context_after.unwrap_or(0))); From 229ea001c0e8154427fddd5c18131cc9b1b4dfc4 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:50:21 -0500 Subject: [PATCH 15/30] typo --- server/bleep/src/agent/symbol.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 670e9c67fd..14afe4c9fb 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -17,7 +17,7 @@ pub struct ChunkWithSymbols { /// This helps the code and proc tool return related chunks based on references and definitions. /// `get_related_chunks` receives a list of chunks from code or proc search and returns `MAX_CHUNKS` related chunks /// For each input chunk, we extract the symbols. We then pick ONE symbol using a classifier. -/// This symbol (reference and/ord definition) may be present in many files one or more times. +/// This symbol (reference and/or definition) may be present in many files one or more times. /// We extract the surrounding code for each occurence and pick `MAX_CHUNKS` occurrences/chunks. 
impl Agent { From 60dfd352ed66702e0f718d9622fa61db8542152b Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:56:16 -0500 Subject: [PATCH 16/30] better doc --- server/bleep/src/agent/symbol.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 14afe4c9fb..e3548f9f16 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -16,9 +16,12 @@ pub struct ChunkWithSymbols { /// This helps the code and proc tool return related chunks based on references and definitions. /// `get_related_chunks` receives a list of chunks from code or proc search and returns `MAX_CHUNKS` related chunks -/// For each input chunk, we extract the symbols. We then pick ONE symbol using a classifier. -/// This symbol (reference and/or definition) may be present in many files one or more times. -/// We extract the surrounding code for each occurence and pick `MAX_CHUNKS` occurrences/chunks. +/// For each input chunk, we extract all symbols (variables, function names, structs...). +/// Then we search for symbol occurrences OUTSIDE the file of the current chunk. +/// We disregard symbols with too many occurrences (> `MAX_REF_DEFS`) as they are typically language related. +/// We then pick ONE symbol using a classifier (`filter_symbols`), where the classifier has access to user query, original chunks and filtered list of symbols. +/// This selected symbol may be present in many files one or more times. +/// We extract the surrounding code (up to `NUMBER_CHUNK_LINES` lines) for each occurrence and pick `MAX_CHUNKS` occurrences/chunks.
impl Agent { pub async fn extract_symbols(&self, chunk: CodeChunk) -> Result { From c5f3d82ef8ae7c0ad59e3f77a44d8e410a1e8505 Mon Sep 17 00:00:00 2001 From: Gabriel Gordon-Hall Date: Thu, 21 Dec 2023 10:00:26 +0000 Subject: [PATCH 17/30] make start_byte/end_byte usize and move function call def into prompts.rs --- server/bleep/src/agent/exchange.rs | 6 +-- server/bleep/src/agent/prompts.rs | 18 +++++++++ server/bleep/src/agent/symbol.rs | 53 ++++++++++---------------- server/bleep/src/agent/tools/answer.rs | 4 +- server/bleep/src/agent/tools/code.rs | 4 +- server/bleep/src/agent/tools/proc.rs | 4 +- server/bleep/src/webserver/answer.rs | 4 +- 7 files changed, 48 insertions(+), 45 deletions(-) diff --git a/server/bleep/src/agent/exchange.rs b/server/bleep/src/agent/exchange.rs index ff7e3f9edc..50b079b8ba 100644 --- a/server/bleep/src/agent/exchange.rs +++ b/server/bleep/src/agent/exchange.rs @@ -149,16 +149,14 @@ impl SearchStep { #[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct CodeChunk { pub path: String, - #[serde(rename = "alias")] pub alias: usize, - #[serde(rename = "snippet")] pub snippet: String, #[serde(rename = "start")] pub start_line: usize, #[serde(rename = "end")] pub end_line: usize, - pub start_byte: usize, - pub end_byte: usize, + pub start_byte: Option, + pub end_byte: Option, } impl CodeChunk { diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index 22a45d7477..b5ea297669 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -398,6 +398,24 @@ Query:{query}"# ) } +pub fn filter_function() -> serde_json::Value { + serde_json::json!([ + { + "name": "filter", + "description": "Select the symbol most likely to contain information to answer the query", + "parameters": { + "type": "object", + "properties": { + "symbol": { + "type": "integer", + "description": "The symbol alias" + } + }, + "required": ["symbol"] + } + }]) +} + pub fn 
hypothetical_document_prompt(query: &str) -> String { format!( r#"Write a code snippet that could hypothetically be returned by a code search engine as the answer to the query: {query} diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index e3548f9f16..c0212f5222 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -7,7 +7,7 @@ use crate::webserver::intelligence::{get_token_info, TokenInfoRequest}; use anyhow::{Context, Result}; use tracing::log::{debug, info, warn}; -use super::prompts::symbol_classification_prompt; +use super::prompts::{filter_function, symbol_classification_prompt}; pub struct ChunkWithSymbols { pub chunk: CodeChunk, @@ -58,7 +58,8 @@ impl Agent { hoverable_ranges .iter() .filter(|range| { - (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) + (range.start.byte >= chunk.start_byte.unwrap_or_default()) + && (range.start.byte < chunk.end_byte.unwrap_or_default()) }) .map(|range| { get_token_info( @@ -88,7 +89,8 @@ impl Agent { .into_iter() .filter_map(Result::ok) .zip(hoverable_ranges.into_iter().filter(|range| { - (range.start.byte >= chunk.start_byte) && (range.start.byte < chunk.end_byte) + (range.start.byte >= chunk.start_byte.unwrap_or_default()) + && (range.start.byte < chunk.end_byte.unwrap_or_default()) })) .map(|(token_info, range)| { let filtered_token_info = token_info @@ -97,8 +99,8 @@ impl Agent { .collect::>(); Symbol { - name: chunk.snippet.clone()[(range.start.byte - chunk.start_byte) - ..(range.end.byte - chunk.start_byte)] + name: chunk.snippet[(range.start.byte - chunk.start_byte.unwrap_or_default()) + ..(range.end.byte - chunk.start_byte.unwrap_or_default())] .to_string(), related_symbols: filtered_token_info, } @@ -120,9 +122,9 @@ impl Agent { }) } - pub async fn expand_symbol_into_chunks(&self, ref_def_metadata: Symbol) -> Vec { + pub async fn expand_symbol_into_chunks(&self, symbol: Symbol) -> Vec { // each symbol may be in multiple 
files and have multiple occurences in each file - ref_def_metadata + symbol .related_symbols .iter() .flat_map(|file_symbols| { @@ -137,8 +139,8 @@ impl Agent { snippet: occurrence.snippet.data.clone(), start_line: occurrence.snippet.line_range.start, end_line: occurrence.snippet.line_range.end, - start_byte: 0, - end_byte: 0, + start_byte: None, + end_byte: None, }) .collect::>() }) @@ -150,9 +152,13 @@ impl Agent { query: &str, chunks_with_symbols: Vec, ) -> Result { - let mut i: i32 = -1; + if chunks_with_symbols.is_empty() { + return Err(SymbolError::ListEmpty); + } + // we have multiples chunks and each chunk may have multiple symbols // unique alias (i) per symbol + let mut i: i32 = -1; let symbols = chunks_with_symbols .into_iter() .map(|chunk_with_symbol| { @@ -169,9 +175,6 @@ impl Agent { ) }) .collect::>(); - if i == -1 { - return Err(SymbolError::ListEmpty); - } // Classifier @@ -199,26 +202,10 @@ impl Agent { // instruction let prompt = symbol_classification_prompt(chunks_string.as_str(), query); - // function_call - let filter_function = serde_json::from_value::>(serde_json::json!([ - { - "name": "filter", - "description": "Select the symbol most likely to contain information to answer the query", - "parameters": { - "type": "object", - "properties": { - "symbol": { - "type": "integer", - "description": "The symbol alias" - } - }, - "required": ["symbol"] - } - }]) - ) - .unwrap(); - - let llm_response = match self.llm_with_function_call(prompt, filter_function).await { + let llm_response = match self + .llm_with_function_call(prompt, serde_json::from_value(filter_function()).unwrap()) + .await + { Ok(llm_response) => llm_response, Err(e) => { warn!( diff --git a/server/bleep/src/agent/tools/answer.rs b/server/bleep/src/agent/tools/answer.rs index 40ad7fc126..0f0eb4b46b 100644 --- a/server/bleep/src/agent/tools/answer.rs +++ b/server/bleep/src/agent/tools/answer.rs @@ -367,8 +367,8 @@ impl Agent { snippet, start_line: span.start, end_line: span.end, - 
start_byte: 0, - end_byte: 0, + start_byte: None, + end_byte: None, } }) .collect::>(); diff --git a/server/bleep/src/agent/tools/code.rs b/server/bleep/src/agent/tools/code.rs index 577647882f..de05047e26 100644 --- a/server/bleep/src/agent/tools/code.rs +++ b/server/bleep/src/agent/tools/code.rs @@ -57,8 +57,8 @@ impl Agent { snippet: chunk.text, start_line: chunk.start_line as usize, end_line: chunk.end_line as usize, - start_byte: chunk.start_byte as usize, - end_byte: chunk.end_byte as usize, + start_byte: Some(chunk.start_byte as usize), + end_byte: Some(chunk.end_byte as usize), } }) .collect::>(); diff --git a/server/bleep/src/agent/tools/proc.rs b/server/bleep/src/agent/tools/proc.rs index 0dbbc554b4..d6e81e2a6e 100644 --- a/server/bleep/src/agent/tools/proc.rs +++ b/server/bleep/src/agent/tools/proc.rs @@ -45,8 +45,8 @@ impl Agent { snippet: chunk.text, start_line: chunk.start_line as usize, end_line: chunk.end_line as usize, - start_byte: chunk.start_byte as usize, - end_byte: chunk.end_byte as usize, + start_byte: Some(chunk.start_byte as usize), + end_byte: Some(chunk.end_byte as usize), } }) .collect::>(); diff --git a/server/bleep/src/webserver/answer.rs b/server/bleep/src/webserver/answer.rs index 370e590ffc..b632b0c74f 100644 --- a/server/bleep/src/webserver/answer.rs +++ b/server/bleep/src/webserver/answer.rs @@ -451,8 +451,8 @@ pub async fn explain( start_line: params.line_start, end_line: params.line_end, snippet, - start_byte: 0, - end_byte: 0, + start_byte: None, + end_byte: None, }); let action = Action::Answer { paths: vec![0] }; From 12c4bb8ee85fe600a0e2062474e3569b4b2e4d7a Mon Sep 17 00:00:00 2001 From: Gabriel Gordon-Hall Date: Thu, 21 Dec 2023 12:14:49 +0000 Subject: [PATCH 18/30] log selected symbol --- server/bleep/src/agent/symbol.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index c0212f5222..7b60f056a9 100644 --- 
a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -261,18 +261,20 @@ impl Agent { // select one symbol let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await { - Ok(selected_symbol) => selected_symbol, + Ok(selected_symbol) => { + info!("Selected symbol: {}", selected_symbol.name); + selected_symbol + } Err(e) => { info!("Returning no extra chunks: {}", e); return Vec::new(); } }; - // get expanded chunks for selected symbol - let extra_chunks = self.expand_symbol_into_chunks(selected_symbol).await; - // take 3 chunks, update path aliases, update enchange chunks - let extra_chunks = extra_chunks + let extra_chunks = self + .expand_symbol_into_chunks(selected_symbol) + .await .iter() .take(MAX_CHUNKS) .map(|c| { From ad7f0bebf154e5816114e4b17e12d0cbdb3c11fe Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 21 Dec 2023 09:16:54 -0500 Subject: [PATCH 19/30] unwrap_or_default --- server/bleep/src/intelligence/code_navigation.rs | 2 +- server/bleep/src/webserver/intelligence.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/bleep/src/intelligence/code_navigation.rs b/server/bleep/src/intelligence/code_navigation.rs index d275c319f2..2284d1ea5c 100644 --- a/server/bleep/src/intelligence/code_navigation.rs +++ b/server/bleep/src/intelligence/code_navigation.rs @@ -457,7 +457,7 @@ fn to_occurrence(doc: &ContentDocument, range: TextRange, snipper: Option Date: Thu, 21 Dec 2023 09:41:32 -0500 Subject: [PATCH 20/30] revert Cargo.lock --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b84c44b0c4..94c1438e7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4866,8 +4866,8 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "ort" -version = "1.14.8" -source = 
"git+https://github.com/bloopai/ort?branch=env-builder-telemetry#dfd92ae5cf2dff3318c99de1a8b1a801cb5a3c9e" +version = "1.16.3" +source = "git+https://github.com/bloopai/ort?branch=env-builder-telemetry#7c37ed9fe5c7c37059a46652b82328623ac4857b" dependencies = [ "flate2", "half 2.3.1", From 5bf128e4ea9b05594b9318d0d59e266fa3dadfa9 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Wed, 3 Jan 2024 22:12:06 -0500 Subject: [PATCH 21/30] latency --- server/bleep/src/agent/symbol.rs | 21 +++++++++++++++++++ .../bleep/src/intelligence/code_navigation.rs | 9 ++++++++ server/bleep/src/webserver/intelligence.rs | 1 + 3 files changed, 31 insertions(+) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 7b60f056a9..6cdd5b7e77 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -14,6 +14,9 @@ pub struct ChunkWithSymbols { pub symbols: Vec, } +use std::time::{Instant}; + + /// This helps the code and proc tool return related chunks based on references and definitions. /// `get_related_chunks` receives a list of chunks from code or proc search and returns `MAX_CHUNKS` related chunks /// For each input chunk, we extract all symbols (variables, function names, structs...). 
@@ -244,6 +247,12 @@ impl Agent { pub async fn get_related_chunks(&mut self, chunks: Vec) -> Vec { const MAX_CHUNKS: usize = 3; + let start_time = Instant::now(); + + + + + // get symbols with ref/defs for each chunk let chunks_with_symbols = futures::future::join_all( chunks @@ -256,8 +265,13 @@ impl Agent { .filter_map(Result::ok) .collect(); + let elapsed = start_time.elapsed(); + println!("Time taken extract symbols from chunks: {:?}", elapsed); + // get original user query let user_query = self.last_exchange().query.target().unwrap(); + let start_time2 = Instant::now(); + // select one symbol let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await { @@ -271,6 +285,10 @@ impl Agent { } }; + let elapsed = start_time2.elapsed(); + println!("Time taken to classify chunks: {:?}", elapsed); + let start_time3 = Instant::now(); + // take 3 chunks, update path aliases, update enchange chunks let extra_chunks = self .expand_symbol_into_chunks(selected_symbol) @@ -290,6 +308,9 @@ impl Agent { chunk }) .collect::>(); + let elapsed = start_time3.elapsed(); + println!("Time taken to expand symbol: {:?}", elapsed); + extra_chunks } diff --git a/server/bleep/src/intelligence/code_navigation.rs b/server/bleep/src/intelligence/code_navigation.rs index 2284d1ea5c..f2a51383fe 100644 --- a/server/bleep/src/intelligence/code_navigation.rs +++ b/server/bleep/src/intelligence/code_navigation.rs @@ -190,7 +190,10 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { } pub fn token_info(&self) -> Vec { + dbg!("tokeninfo"); + if self.is_definition() { + dbg!("def"); let local_references = self.local_references(); let repo_wide_references = self .is_top_level() @@ -202,6 +205,8 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .chain(repo_wide_references) .collect() } else if self.is_reference() { + dbg!("ref"); + let local_definitions = self.local_definitions(); let repo_wide_definitions = local_definitions .is_none() @@ -224,6 +229,8 @@ impl<'a, 'b> 
CodeNavigationContext<'a, 'b> { .chain(repo_wide_references) .collect() } else if self.is_import() { + dbg!("import"); + let local_references = self.local_references(); let repo_wide_definitions = self.repo_wide_definitions(); @@ -232,6 +239,8 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .chain(local_references) .collect() } else { + dbg!("empty"); + Vec::new() } } diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index c46747a5bf..edb6efb057 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -345,6 +345,7 @@ pub async fn get_token_info( let data = ctx.token_info(); if data.is_empty() { + dbg!("empty, search_nav"); search_nav( Arc::clone(&indexes), repo_ref, From d591371363f968a040fd348d137c2448cfe7ca47 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Thu, 4 Jan 2024 15:36:14 -0500 Subject: [PATCH 22/30] use all hoverable symbols, tokeninfo only for the selected one --- server/bleep/src/agent/symbol.rs | 113 ++++++++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 10 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 6cdd5b7e77..8165abada2 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -14,8 +14,12 @@ pub struct ChunkWithSymbols { pub symbols: Vec, } -use std::time::{Instant}; +pub struct ChunkWithHoverableSymbols { + pub chunk: CodeChunk, + pub symbols: Vec, +} +use std::time::Instant; /// This helps the code and proc tool return related chunks based on references and definitions. /// `get_related_chunks` receives a list of chunks from code or proc search and returns `MAX_CHUNKS` related chunks @@ -27,6 +31,55 @@ use std::time::{Instant}; /// We extract the surrounding code (up to `NUMBER_CHUNK_LINES` lines) for each occurence and pick `MAX_CHUNKS` occurrences/chunks. 
impl Agent { + pub async fn extract_hoverable_symbols( + &self, + chunk: CodeChunk, + ) -> Result { + // get hoverable elements + let document = self + .app + .indexes + .file + .by_path(&self.repo_ref, &chunk.path, None) + .await? + .with_context(|| format!("failed to read path: {}", &chunk.path))?; + + let hoverable_ranges = document + .hoverable_ranges() + .ok_or_else(|| anyhow::anyhow!("no hoverable ranges"))?; + + let mut symbols = hoverable_ranges + .into_iter() + .filter(|range| { + (range.start.byte >= chunk.start_byte.unwrap_or_default()) + && (range.start.byte < chunk.end_byte.unwrap_or_default()) + }) + .map(|range| HoverableSymbol { + name: chunk.snippet[(range.start.byte - chunk.start_byte.unwrap_or_default()) + ..(range.end.byte - chunk.start_byte.unwrap_or_default())] + .to_string(), + token_info_request: TokenInfoRequest { + relative_path: chunk.path.clone(), + repo_ref: self.repo_ref.display_name(), + branch: None, + start: range.start.byte, + end: range.end.byte, + }, + path: chunk.path.clone(), + }) + .collect::>(); + + symbols.sort_by(|a, b| a.name.cmp(&b.name)); + symbols.dedup_by(|a, b| a.name == b.name); + + debug!("Attached {} symbols", symbols.len()); + + Ok(ChunkWithHoverableSymbols { + chunk: chunk.clone(), + symbols, + }) + } + pub async fn extract_symbols(&self, chunk: CodeChunk) -> Result { const MAX_REF_DEFS: usize = 5; // Ignore symbols with more than this many cross-file refs/defs const NUMBER_CHUNK_LINES: usize = 10; @@ -153,7 +206,7 @@ impl Agent { pub async fn filter_symbols( &self, query: &str, - chunks_with_symbols: Vec, + chunks_with_symbols: Vec, ) -> Result { if chunks_with_symbols.is_empty() { return Err(SymbolError::ListEmpty); @@ -239,7 +292,47 @@ impl Agent { .flat_map(|(_, symbol_with_alias)| symbol_with_alias) .find(|(alias, _)| *alias == selected_symbol as i32) { - Some((_alias, symbol_metadata)) => Ok(symbol_metadata), + Some((_alias, symbol_metadata)) => Ok(Symbol { + name: symbol_metadata.name, + related_symbols: { 
+ let document = self + .app + .indexes + .file + .by_path(&self.repo_ref, &symbol_metadata.path, None) + .await + .unwrap() + .unwrap(); + + let all_docs = { + let associated_langs = + match document.lang.as_deref().map(TSLanguage::from_id) { + Some(Language::Supported(config)) => config.language_ids, + _ => &[], + }; + self.app + .indexes + .file + .by_repo(&self.repo_ref, associated_langs.iter(), None) + .await + }; + + get_token_info( + symbol_metadata.token_info_request, + &self.repo_ref, + self.app.indexes.clone(), + &document, + &all_docs, + Some(0), + Some(10), + ) + .await + .unwrap() + .into_iter() + .filter(|file_symbol| file_symbol.file != symbol_metadata.path) + .collect::>() + }, + }), _ => Err(SymbolError::OutOfBounds), } } @@ -249,16 +342,12 @@ impl Agent { let start_time = Instant::now(); - - - - // get symbols with ref/defs for each chunk let chunks_with_symbols = futures::future::join_all( chunks .iter() .filter(|c| !c.is_empty()) - .map(|c| self.extract_symbols(c.clone())), // TODO: Log failure + .map(|c| self.extract_hoverable_symbols(c.clone())), // TODO: Log failure ) .await .into_iter() @@ -272,7 +361,6 @@ impl Agent { let user_query = self.last_exchange().query.target().unwrap(); let start_time2 = Instant::now(); - // select one symbol let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await { Ok(selected_symbol) => { @@ -323,7 +411,7 @@ impl Agent { self.llm_gateway .clone() - .model("gpt-3.5-turbo-0613") + .model("gpt-4-0613") .temperature(0.0) .chat_stream(&messages, Some(&functions)) .await? 
@@ -349,6 +437,11 @@ impl Agent { } } +pub struct HoverableSymbol { + pub name: String, + pub token_info_request: TokenInfoRequest, + pub path: String, +} pub struct Symbol { pub name: String, pub related_symbols: Vec, From cc6516995d36ab1290c28135d2cbdd197591b407 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Fri, 5 Jan 2024 10:07:35 -0500 Subject: [PATCH 23/30] no function_calling --- server/bleep/src/agent/prompts.rs | 22 +++++++++++---- server/bleep/src/agent/symbol.rs | 45 ++++++++++++++++++------------- 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index b5ea297669..4daf793cfc 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -386,15 +386,27 @@ Here is the full context for reference: ) } -pub fn symbol_classification_prompt(snippets: &str, query: &str) -> String { - format!( - r#""Snippets: +pub fn symbol_classification_prompt(snippets: &str, query: &str) -> (String, String) { + ( + format!( + r#"Snippets: {snippets} -Instruction: Above there are some code chunks and some symbols extracted from the chunks. Your job is to select the most relevant symbol to the user query. Do not answer with the symbol name, use the symbol key/alias. +You are a coding assistant. Above there are some code chunks and some symbols extracted from the chunks and their integer alias. Your job is to select the most relevant symbol to the user query. Use the symbol alias to answer. + +Example: +Query: how does ranking work? 
+23 + +Query: which function makes an api call +3"# + ), + format!( + r#"Query: {query} -Query:{query}"# +Answer with only the symbol alias."# + ), ) } diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 8165abada2..575aad8e53 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -258,33 +258,24 @@ impl Agent { // instruction let prompt = symbol_classification_prompt(chunks_string.as_str(), query); - let llm_response = match self - .llm_with_function_call(prompt, serde_json::from_value(filter_function()).unwrap()) - .await - { + let llm_response = match self.llm_without_function_call(prompt.0, prompt.1).await { Ok(llm_response) => llm_response, Err(e) => { warn!( "Symbol classifier llm call failed, picking the first symbol: {}", e ); - llm_gateway::api::FunctionCall { - name: Some("filter".to_string()), - arguments: "{\"symbol\": 0}".to_string(), - } + "0".to_string() } }; - let filter_argument: Filter = - match serde_json::from_str(llm_response.clone().arguments.as_str()) { - Ok(argument) => argument, - Err(_e) => { - warn!("Cannot deserialize: {:?}", llm_response); - return Err(SymbolError::DeserializeFilter); - } - }; - - let selected_symbol = filter_argument.symbol; + let selected_symbol = match llm_response.as_str().parse::() { + Ok(symbol) => symbol, + Err(e) => { + warn!("Parsing to integer failed, picking the first symbol: {}", e); + 0 + } + }; // finding symbol metadata match symbols @@ -402,6 +393,24 @@ impl Agent { extra_chunks } + async fn llm_without_function_call( + &self, + system_message: String, + user_message: String, + ) -> Result { + let messages = vec![ + llm_gateway::api::Message::system(system_message.as_str()), + llm_gateway::api::Message::user(user_message.as_str()), + ]; + + self.llm_gateway + .clone() + .model("gpt-4-0613") + .temperature(0.0) + .chat(&messages, None) + .await + } + async fn llm_with_function_call( &self, prompt: String, From 
3c7801862e655426eff74a5808ae39e2b22b151e Mon Sep 17 00:00:00 2001 From: Akshay Date: Fri, 5 Jan 2024 15:34:04 +0000 Subject: [PATCH 24/30] filter out local symbols --- server/bleep/src/agent/symbol.rs | 34 ++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 8165abada2..05b328ad53 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -19,8 +19,6 @@ pub struct ChunkWithHoverableSymbols { pub symbols: Vec, } -use std::time::Instant; - /// This helps the code and proc tool return related chunks based on references and definitions. /// `get_related_chunks` receives a list of chunks from code or proc search and returns `MAX_CHUNKS` related chunks /// For each input chunk, we extract all symbols (variables, function names, structs...). @@ -44,6 +42,11 @@ impl Agent { .await? .with_context(|| format!("failed to read path: {}", &chunk.path))?; + let graph = document + .symbol_locations + .scope_graph() + .with_context(|| format!("no scope graph for file: {}", &chunk.path))?; + let hoverable_ranges = document .hoverable_ranges() .ok_or_else(|| anyhow::anyhow!("no hoverable ranges"))?; @@ -54,6 +57,15 @@ impl Agent { (range.start.byte >= chunk.start_byte.unwrap_or_default()) && (range.start.byte < chunk.end_byte.unwrap_or_default()) }) + .filter_map(|range| { + // if this node can be resolved locally in the scope-graph, omit it + if let Some(node_by_range) = graph.node_by_range(range.start.byte, range.end.byte) { + if graph.is_reference(node_by_range) || graph.is_definition(node_by_range) { + return None; + } + } + Some(range) + }) .map(|range| HoverableSymbol { name: chunk.snippet[(range.start.byte - chunk.start_byte.unwrap_or_default()) ..(range.end.byte - chunk.start_byte.unwrap_or_default())] @@ -72,7 +84,11 @@ impl Agent { symbols.sort_by(|a, b| a.name.cmp(&b.name)); symbols.dedup_by(|a, b| a.name == b.name); - 
debug!("Attached {} symbols", symbols.len()); + debug!( + "Attached {} symbols: {:?}", + symbols.len(), + symbols.iter().map(|s| s.name.as_str()).collect::>() + ); Ok(ChunkWithHoverableSymbols { chunk: chunk.clone(), @@ -340,8 +356,6 @@ impl Agent { pub async fn get_related_chunks(&mut self, chunks: Vec) -> Vec { const MAX_CHUNKS: usize = 3; - let start_time = Instant::now(); - // get symbols with ref/defs for each chunk let chunks_with_symbols = futures::future::join_all( chunks @@ -354,12 +368,8 @@ impl Agent { .filter_map(Result::ok) .collect(); - let elapsed = start_time.elapsed(); - println!("Time taken extract symbols from chunks: {:?}", elapsed); - // get original user query let user_query = self.last_exchange().query.target().unwrap(); - let start_time2 = Instant::now(); // select one symbol let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await { @@ -373,10 +383,6 @@ impl Agent { } }; - let elapsed = start_time2.elapsed(); - println!("Time taken to classify chunks: {:?}", elapsed); - let start_time3 = Instant::now(); - // take 3 chunks, update path aliases, update enchange chunks let extra_chunks = self .expand_symbol_into_chunks(selected_symbol) @@ -396,8 +402,6 @@ impl Agent { chunk }) .collect::>(); - let elapsed = start_time3.elapsed(); - println!("Time taken to expand symbol: {:?}", elapsed); extra_chunks } From d8036c6c55982e6669b03adcc96180ee05616d45 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Fri, 5 Jan 2024 11:27:27 -0500 Subject: [PATCH 25/30] removing unused functions --- server/bleep/src/agent/prompts.rs | 18 ---- server/bleep/src/agent/symbol.rs | 147 +----------------------------- 2 files changed, 1 insertion(+), 164 deletions(-) diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index 4daf793cfc..d03c0620b7 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -410,24 +410,6 @@ Answer with only 
the symbol alias."# ) } -pub fn filter_function() -> serde_json::Value { - serde_json::json!([ - { - "name": "filter", - "description": "Select the symbol most likely to contain information to answer the query", - "parameters": { - "type": "object", - "properties": { - "symbol": { - "type": "integer", - "description": "The symbol alias" - } - }, - "required": ["symbol"] - } - }]) -} - pub fn hypothetical_document_prompt(query: &str) -> String { format!( r#"Write a code snippet that could hypothetically be returned by a code search engine as the answer to the query: {query} diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index dcdee6d266..340b7d315b 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -1,5 +1,3 @@ -use futures::TryStreamExt; - use crate::agent::{exchange::CodeChunk, Agent}; use crate::intelligence::{code_navigation::FileSymbols, Language, TSLanguage}; use crate::llm_gateway; @@ -7,12 +5,7 @@ use crate::webserver::intelligence::{get_token_info, TokenInfoRequest}; use anyhow::{Context, Result}; use tracing::log::{debug, info, warn}; -use super::prompts::{filter_function, symbol_classification_prompt}; - -pub struct ChunkWithSymbols { - pub chunk: CodeChunk, - pub symbols: Vec, -} +use super::prompts::symbol_classification_prompt; pub struct ChunkWithHoverableSymbols { pub chunk: CodeChunk, @@ -96,104 +89,6 @@ impl Agent { }) } - pub async fn extract_symbols(&self, chunk: CodeChunk) -> Result { - const MAX_REF_DEFS: usize = 5; // Ignore symbols with more than this many cross-file refs/defs - const NUMBER_CHUNK_LINES: usize = 10; - - // get hoverable elements - let document = self - .app - .indexes - .file - .by_path(&self.repo_ref, &chunk.path, None) - .await? 
- .with_context(|| format!("failed to read path: {}", &chunk.path))?; - - let hoverable_ranges = document - .hoverable_ranges() - .ok_or_else(|| anyhow::anyhow!("no hoverable ranges"))?; - - let all_docs = { - let associated_langs = match document.lang.as_deref().map(TSLanguage::from_id) { - Some(Language::Supported(config)) => config.language_ids, - _ => &[], - }; - self.app - .indexes - .file - .by_repo(&self.repo_ref, associated_langs.iter(), None) - .await - }; - - // get references and definitions for each symbol - let related_symbols = futures::future::join_all( - hoverable_ranges - .iter() - .filter(|range| { - (range.start.byte >= chunk.start_byte.unwrap_or_default()) - && (range.start.byte < chunk.end_byte.unwrap_or_default()) - }) - .map(|range| { - get_token_info( - TokenInfoRequest { - relative_path: chunk.path.clone(), - repo_ref: self.repo_ref.display_name(), - branch: None, - start: range.start.byte, - end: range.end.byte, - }, - &self.repo_ref, - self.app.indexes.clone(), - &document, - &all_docs, - Some(0), - Some(NUMBER_CHUNK_LINES), - ) - }), - ) - .await; - - // filter references and definitions - // 1: symbol shouldn't be in the same file - // 2: number of refs/defs should be less than 5 to avoid very common symbols (iter, unwrap...) 
- // 3: also filter out symbols without refs/defs - let mut symbols = related_symbols - .into_iter() - .filter_map(Result::ok) - .zip(hoverable_ranges.into_iter().filter(|range| { - (range.start.byte >= chunk.start_byte.unwrap_or_default()) - && (range.start.byte < chunk.end_byte.unwrap_or_default()) - })) - .map(|(token_info, range)| { - let filtered_token_info = token_info - .into_iter() - .filter(|file_symbols| file_symbols.file != chunk.path) - .collect::>(); - - Symbol { - name: chunk.snippet[(range.start.byte - chunk.start_byte.unwrap_or_default()) - ..(range.end.byte - chunk.start_byte.unwrap_or_default())] - .to_string(), - related_symbols: filtered_token_info, - } - }) - .filter(|metadata| { - (metadata.related_symbols.len() < MAX_REF_DEFS) - && (!metadata.related_symbols.is_empty()) - }) - .collect::>(); - - symbols.sort_by(|a, b| a.name.cmp(&b.name)); - symbols.dedup_by(|a, b| a.name == b.name); - - debug!("Attached {} symbols", symbols.len()); - - Ok(ChunkWithSymbols { - chunk: chunk.clone(), - symbols, - }) - } - pub async fn expand_symbol_into_chunks(&self, symbol: Symbol) -> Vec { // each symbol may be in multiple files and have multiple occurences in each file symbol @@ -414,40 +309,6 @@ impl Agent { .chat(&messages, None) .await } - - async fn llm_with_function_call( - &self, - prompt: String, - functions: Vec, - ) -> Result { - let messages = vec![llm_gateway::api::Message::user(prompt.as_str())]; - - self.llm_gateway - .clone() - .model("gpt-4-0613") - .temperature(0.0) - .chat_stream(&messages, Some(&functions)) - .await? - .try_fold( - llm_gateway::api::FunctionCall::default(), - |acc: llm_gateway::api::FunctionCall, e: String| async move { - let e: llm_gateway::api::FunctionCall = - serde_json::from_str(&e).map_err(|err| { - tracing::error!( - "Failed to deserialize to FunctionCall: {:?}. 
Error: {:?}", - e, - err - ); - err - })?; - Ok(llm_gateway::api::FunctionCall { - name: acc.name.or(e.name), - arguments: acc.arguments + &e.arguments, - }) - }, - ) - .await - } } pub struct HoverableSymbol { @@ -459,17 +320,11 @@ pub struct Symbol { pub name: String, pub related_symbols: Vec, } -#[derive(serde::Deserialize)] -struct Filter { - symbol: usize, -} #[derive(thiserror::Error, Debug)] pub enum SymbolError { #[error("No symbol retrieved in the provided chunks")] ListEmpty, - #[error("Cannot deserialize llm function call arguments")] - DeserializeFilter, #[error("Selected symbol out of bounds")] OutOfBounds, } From 037a3bf454edd7c1b555d62f13f6d503dc2fe3c0 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Fri, 5 Jan 2024 13:42:58 -0500 Subject: [PATCH 26/30] clippy filter_map --- server/bleep/src/agent/symbol.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 340b7d315b..46b16998fc 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -50,14 +50,14 @@ impl Agent { (range.start.byte >= chunk.start_byte.unwrap_or_default()) && (range.start.byte < chunk.end_byte.unwrap_or_default()) }) - .filter_map(|range| { + .filter(|range| { // if this node can be resolved locally in the scope-graph, omit it if let Some(node_by_range) = graph.node_by_range(range.start.byte, range.end.byte) { if graph.is_reference(node_by_range) || graph.is_definition(node_by_range) { - return None; + return false; } } - Some(range) + true }) .map(|range| HoverableSymbol { name: chunk.snippet[(range.start.byte - chunk.start_byte.unwrap_or_default()) From 8be5318b1cdb763ed719c42d977943514952d9a8 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Fri, 5 Jan 2024 13:51:29 -0500 Subject: [PATCH 27/30] prompt improved --- server/bleep/src/agent/prompts.rs | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index d03c0620b7..a8e3d2b21e 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -393,8 +393,7 @@ pub fn symbol_classification_prompt(snippets: &str, query: &str) -> (String, Str {snippets} -You are a coding assistant. Above there are some code chunks and some symbols extracted from the chunks and their integer alias. Your job is to select the most relevant symbol to the user query. Use the symbol alias to answer. - +Above are code chunks and non-local symbols that have been extracted from the chunks. Each chunk is followed by an enumerated list of symbols that it contains. Given a user query, select the symbol which is most relevant to it, e.g. the references or definition of this symbol would help somebody answer the query. Symbols which are language builtins or which come from third party libraries are unlikely to be helpful. Example: Query: how does ranking work? 23 @@ -405,7 +404,7 @@ Query: which function makes an api call format!( r#"Query: {query} -Answer with only the symbol alias."# +Do not answer with the symbol name, use the symbol index."# ), ) } From ed8bd146bdb17af83ac0ca43b04bb095738dc431 Mon Sep 17 00:00:00 2001 From: rafael <22560219+rmuller-ml@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:07:42 -0500 Subject: [PATCH 28/30] fix explanation and add a const --- server/bleep/src/agent/symbol.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 46b16998fc..25110aad2b 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -15,8 +15,7 @@ pub struct ChunkWithHoverableSymbols { /// This helps the code and proc tool return related chunks based on references and definitions. 
/// `get_related_chunks` receives a list of chunks from code or proc search and returns `MAX_CHUNKS` related chunks /// For each input chunk, we extract all symbols (variables, function names, structs...). -/// Then we search for symbol occurrences OUTSIDE the file of the current chunk. -/// We disconsider symbols with too many occurences (> `MAX_REF_DEFS`) as they are typically language related. +/// Then we disconsider symbols that are defined in the same file using the scope graph. /// We then pick ONE symbol using a classifier (`filter_symbols`), where the classifier has access to user query, original chunks and filtered list of symbols. /// This selected symbol may be present in many files one or more times. /// We extract the surrounding code (up to `NUMBER_CHUNK_LINES` lines) for each occurence and pick `MAX_CHUNKS` occurrences/chunks. @@ -123,6 +122,8 @@ impl Agent { return Err(SymbolError::ListEmpty); } + const NUMBER_CHUNK_LINES: usize = 10; + // we have multiples chunks and each chunk may have multiple symbols // unique alias (i) per symbol let mut i: i32 = -1; @@ -226,7 +227,7 @@ impl Agent { &document, &all_docs, Some(0), - Some(10), + Some(NUMBER_CHUNK_LINES), ) .await .unwrap() From e8b97824c5a7279bfe02abcd33e524c91da8812f Mon Sep 17 00:00:00 2001 From: Gabriel Gordon-Hall Date: Mon, 8 Jan 2024 12:21:25 +0000 Subject: [PATCH 29/30] reformat prompt and inline llm call --- server/bleep/src/agent/prompts.rs | 24 +++++++----------- server/bleep/src/agent/symbol.rs | 42 +++++++++++++------------------ 2 files changed, 26 insertions(+), 40 deletions(-) diff --git a/server/bleep/src/agent/prompts.rs b/server/bleep/src/agent/prompts.rs index a8e3d2b21e..0659931284 100644 --- a/server/bleep/src/agent/prompts.rs +++ b/server/bleep/src/agent/prompts.rs @@ -386,26 +386,20 @@ Here is the full context for reference: ) } -pub fn symbol_classification_prompt(snippets: &str, query: &str) -> (String, String) { - ( - format!( - r#"Snippets: - -{snippets} +pub fn 
symbol_classification_prompt(snippets: &str) -> String { + format!( + r#"{snippets} Above are code chunks and non-local symbols that have been extracted from the chunks. Each chunk is followed by an enumerated list of symbols that it contains. Given a user query, select the symbol which is most relevant to it, e.g. the references or definition of this symbol would help somebody answer the query. Symbols which are language builtins or which come from third party libraries are unlikely to be helpful. -Example: -Query: how does ranking work? + +Do not answer with the symbol name, use the symbol index. + +### Examples ### +Q: how does ranking work? 23 -Query: which function makes an api call +Q: which function makes an api call 3"# - ), - format!( - r#"Query: {query} - -Do not answer with the symbol name, use the symbol index."# - ), ) } diff --git a/server/bleep/src/agent/symbol.rs b/server/bleep/src/agent/symbol.rs index 25110aad2b..f279724e82 100644 --- a/server/bleep/src/agent/symbol.rs +++ b/server/bleep/src/agent/symbol.rs @@ -158,7 +158,7 @@ impl Agent { .join("\n"); format!( - "Path:{}\n\n{}\n\nSymbols:\n\n{}", + "```{}\n{}```\n\n{}", c.path.clone(), c.snippet.clone(), symbols_string @@ -168,20 +168,30 @@ impl Agent { .join("\n\n"); // instruction - let prompt = symbol_classification_prompt(chunks_string.as_str(), query); + let messages = vec![ + llm_gateway::api::Message::system(&symbol_classification_prompt(&chunks_string)), + llm_gateway::api::Message::user(query), + ]; - let llm_response = match self.llm_without_function_call(prompt.0, prompt.1).await { - Ok(llm_response) => llm_response, + let response = match self + .llm_gateway + .clone() + .model("gpt-4-0613") + .temperature(0.0) + .chat(&messages, None) + .await + { + Ok(response) => response, Err(e) => { warn!( "Symbol classifier llm call failed, picking the first symbol: {}", e ); - "0".to_string() + "0".into() } }; - let selected_symbol = match llm_response.as_str().parse::() { + let selected_symbol 
= match response.as_str().parse::() { Ok(symbol) => symbol, Err(e) => { warn!("Parsing to integer failed, picking the first symbol: {}", e); @@ -193,7 +203,7 @@ impl Agent { match symbols .into_iter() .flat_map(|(_, symbol_with_alias)| symbol_with_alias) - .find(|(alias, _)| *alias == selected_symbol as i32) + .find(|(alias, _)| *alias == selected_symbol) { Some((_alias, symbol_metadata)) => Ok(Symbol { name: symbol_metadata.name, @@ -292,24 +302,6 @@ impl Agent { extra_chunks } - - async fn llm_without_function_call( - &self, - system_message: String, - user_message: String, - ) -> Result { - let messages = vec![ - llm_gateway::api::Message::system(system_message.as_str()), - llm_gateway::api::Message::user(user_message.as_str()), - ]; - - self.llm_gateway - .clone() - .model("gpt-4-0613") - .temperature(0.0) - .chat(&messages, None) - .await - } } pub struct HoverableSymbol { From 8b2f65f36b0518524a6657dba2f94f83ecf54bb4 Mon Sep 17 00:00:00 2001 From: Gabriel Gordon-Hall Date: Mon, 8 Jan 2024 12:53:12 +0000 Subject: [PATCH 30/30] remove dbg! 
calls --- server/bleep/src/intelligence/code_navigation.rs | 9 --------- server/bleep/src/webserver/intelligence.rs | 1 - 2 files changed, 10 deletions(-) diff --git a/server/bleep/src/intelligence/code_navigation.rs b/server/bleep/src/intelligence/code_navigation.rs index f2a51383fe..2284d1ea5c 100644 --- a/server/bleep/src/intelligence/code_navigation.rs +++ b/server/bleep/src/intelligence/code_navigation.rs @@ -190,10 +190,7 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { } pub fn token_info(&self) -> Vec { - dbg!("tokeninfo"); - if self.is_definition() { - dbg!("def"); let local_references = self.local_references(); let repo_wide_references = self .is_top_level() @@ -205,8 +202,6 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .chain(repo_wide_references) .collect() } else if self.is_reference() { - dbg!("ref"); - let local_definitions = self.local_definitions(); let repo_wide_definitions = local_definitions .is_none() @@ -229,8 +224,6 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .chain(repo_wide_references) .collect() } else if self.is_import() { - dbg!("import"); - let local_references = self.local_references(); let repo_wide_definitions = self.repo_wide_definitions(); @@ -239,8 +232,6 @@ impl<'a, 'b> CodeNavigationContext<'a, 'b> { .chain(local_references) .collect() } else { - dbg!("empty"); - Vec::new() } } diff --git a/server/bleep/src/webserver/intelligence.rs b/server/bleep/src/webserver/intelligence.rs index edb6efb057..c46747a5bf 100644 --- a/server/bleep/src/webserver/intelligence.rs +++ b/server/bleep/src/webserver/intelligence.rs @@ -345,7 +345,6 @@ pub async fn get_token_info( let data = ctx.token_info(); if data.is_empty() { - dbg!("empty, search_nav"); search_nav( Arc::clone(&indexes), repo_ref,