Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
6cc16d2
saving work
rmuller-ml Dec 1, 2023
f820752
fix error handling
rmuller-ml Dec 18, 2023
b8d5a77
clippy
rmuller-ml Dec 18, 2023
164eff6
fmt
rmuller-ml Dec 18, 2023
d12b13d
wip: refactor changes
ggordonhall Dec 19, 2023
fb67ade
using snippet from file_symbol
rmuller-ml Dec 20, 2023
3ac57fc
adding repo_ref, reverting empty filter, adding filter comments
rmuller-ml Dec 20, 2023
c8e7a78
tokeninfo response not pub anymore
rmuller-ml Dec 20, 2023
16ca126
simplifying how we update path in a chunk
rmuller-ml Dec 20, 2023
6e12d94
symbol alias instead of chunk alias
rmuller-ml Dec 20, 2023
d933534
moving prompt to prompts.rs
rmuller-ml Dec 20, 2023
dd3de64
reverting filter
rmuller-ml Dec 20, 2023
1c2f1a6
fix unwrap in get_token_info
rmuller-ml Dec 20, 2023
b132a75
adding some comments
rmuller-ml Dec 20, 2023
229ea00
typo
rmuller-ml Dec 20, 2023
60dfd35
better doc
rmuller-ml Dec 20, 2023
c5f3d82
make start_byte/end_byte usize and move function call def into prompt…
ggordonhall Dec 21, 2023
12c4bb8
log selected symbol
ggordonhall Dec 21, 2023
ad7f0be
unwrap_or_default
rmuller-ml Dec 21, 2023
affab47
revert Cargo.lock
rmuller-ml Dec 21, 2023
5bf128e
latency
rmuller-ml Jan 4, 2024
d591371
use all hoverable symbols, tokeninfo only for the selected one
rmuller-ml Jan 4, 2024
cc65169
no function_calling
rmuller-ml Jan 5, 2024
3c78018
filter out local symbols
oppiliappan Jan 5, 2024
36b7cf4
Merge branch 'ref_def_proc_fast_no_function_call' into ref_def_proc_fast
rmuller-ml Jan 5, 2024
d8036c6
removing unused functions
rmuller-ml Jan 5, 2024
037a3bf
clippy filter_map
rmuller-ml Jan 5, 2024
8be5318
prompt improved
rmuller-ml Jan 5, 2024
ed8bd14
fix explanation and add a const
rmuller-ml Jan 5, 2024
e8b9782
reformat prompt and inline llm call
ggordonhall Jan 8, 2024
8b2f65f
remove dbg! calls
ggordonhall Jan 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions server/bleep/src/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const MAX_STEPS: usize = 10;
pub mod exchange;
pub mod model;
pub mod prompts;
pub mod symbol;
pub mod transcoder;

/// A collection of modules that each add methods to `Agent`.
Expand Down
4 changes: 2 additions & 2 deletions server/bleep/src/agent/exchange.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,14 @@ impl SearchStep {
/// A snippet of code taken from a file, identified by its path and line span.
///
/// The line span is serialized under the shorter keys `start` and `end`; the
/// optional byte offsets keep their field names.
#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct CodeChunk {
/// Path of the file this chunk was taken from.
pub path: String,
/// Numeric alias assigned to `path`.
#[serde(rename = "alias")]
pub alias: usize,
/// Text of the chunk.
#[serde(rename = "snippet")]
pub snippet: String,
/// First line of the chunk (serialized as `start`).
#[serde(rename = "start")]
pub start_line: usize,
/// Last line of the chunk (serialized as `end`).
#[serde(rename = "end")]
pub end_line: usize,
/// Byte offset in the file at which the chunk starts, when known.
/// Symbol extraction uses it to map hoverable ranges onto `snippet`.
pub start_byte: Option<usize>,
/// Byte offset in the file at which the chunk ends, when known.
pub end_byte: Option<usize>,
}

impl CodeChunk {
Expand Down
19 changes: 18 additions & 1 deletion server/bleep/src/agent/prompts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ pub fn system<'a>(paths: impl IntoIterator<Item = &'a str>) -> String {
- DO NOT call a function that you've used before with the same arguments
- DO NOT assume the structure of the codebase, or the existence of files or folders
- Your queries to functions.code or functions.path should be significantly different to previous queries
- Call functions.none with paths that you are confident will help answer the user's query
- Call functions.none with paths that you are confident will help answer the user's query, include paths containing the information needed for a complete answer including definitions and references
- If the user query is general (e.g. 'What does this do?', 'What is this repo?') look for READMEs, documentation and entry points in the code (main files, index files, api files etc.)
- If the user is referring to, or asking for, information that is in your history, call functions.none
- If after attempting to gather information you are still unsure how to answer the query, call functions.none
Expand Down Expand Up @@ -386,6 +386,23 @@ Here is the full context for reference:
)
}

/// Builds the system prompt for the symbol classifier.
///
/// `snippets` (the rendered code chunks with their enumerated symbols) is placed
/// first, followed by fixed instructions asking the model to reply with the
/// index of the single most relevant symbol.
pub fn symbol_classification_prompt(snippets: &str) -> String {
    // Fixed instruction text appended verbatim after the caller-provided snippets.
    const INSTRUCTIONS: &str = r#"

Above are code chunks and non-local symbols that have been extracted from the chunks. Each chunk is followed by an enumerated list of symbols that it contains. Given a user query, select the symbol which is most relevant to it, e.g. the references or definition of this symbol would help somebody answer the query. Symbols which are language builtins or which come from third party libraries are unlikely to be helpful.

Do not answer with the symbol name, use the symbol index.

### Examples ###
Q: how does ranking work?
23

Q: which function makes an api call
3"#;

    [snippets, INSTRUCTIONS].concat()
}

pub fn hypothetical_document_prompt(query: &str) -> String {
format!(
r#"Write a code snippet that could hypothetically be returned by a code search engine as the answer to the query: {query}
Expand Down
323 changes: 323 additions & 0 deletions server/bleep/src/agent/symbol.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
use crate::agent::{exchange::CodeChunk, Agent};
use crate::intelligence::{code_navigation::FileSymbols, Language, TSLanguage};
use crate::llm_gateway;
use crate::webserver::intelligence::{get_token_info, TokenInfoRequest};
use anyhow::{Context, Result};
use tracing::log::{debug, info, warn};

use super::prompts::symbol_classification_prompt;

/// A code chunk paired with the hoverable symbols that were extracted from it.
pub struct ChunkWithHoverableSymbols {
/// The chunk the symbols were extracted from.
pub chunk: CodeChunk,
/// Symbols found inside `chunk`.
pub symbols: Vec<HoverableSymbol>,
}

/// This helps the code and proc tools return related chunks based on references and definitions.
/// `get_related_chunks` receives a list of chunks from code or proc search and returns up to `MAX_CHUNKS` related chunks.
/// For each input chunk, we extract all symbols (variables, function names, structs...).
/// Then we discard symbols that are defined in the same file, using the scope graph.
/// We then pick ONE symbol using a classifier (`filter_symbols`), where the classifier has access to the user query, the original chunks and the filtered list of symbols.
/// This selected symbol may be present in many files, one or more times in each.
/// We extract the surrounding code (up to `NUMBER_CHUNK_LINES` lines) for each occurrence and pick `MAX_CHUNKS` occurrences/chunks.

impl Agent {
/// Collects the hoverable symbols that start inside `chunk` and are not
/// resolved by the file's own scope graph.
///
/// The returned symbols are sorted by name and de-duplicated by name. A chunk
/// without byte offsets (`start_byte`/`end_byte` are `None`) yields no symbols.
///
/// # Errors
///
/// Returns an error when the file cannot be read from the index, when it has no
/// scope graph, or when it has no hoverable ranges.
pub async fn extract_hoverable_symbols(
    &self,
    chunk: CodeChunk,
) -> Result<ChunkWithHoverableSymbols> {
    // get hoverable elements
    let document = self
        .app
        .indexes
        .file
        .by_path(&self.repo_ref, &chunk.path, None)
        .await?
        .with_context(|| format!("failed to read path: {}", &chunk.path))?;

    let graph = document
        .symbol_locations
        .scope_graph()
        .with_context(|| format!("no scope graph for file: {}", &chunk.path))?;

    let hoverable_ranges = document
        .hoverable_ranges()
        .ok_or_else(|| anyhow::anyhow!("no hoverable ranges"))?;

    // With missing offsets both bounds are 0, so the range filter below rejects everything.
    let chunk_start = chunk.start_byte.unwrap_or_default();
    let chunk_end = chunk.end_byte.unwrap_or_default();

    let mut symbols = hoverable_ranges
        .into_iter()
        .filter(|range| range.start.byte >= chunk_start && range.start.byte < chunk_end)
        .filter(|range| {
            // if this node can be resolved locally in the scope-graph, omit it
            if let Some(node_by_range) = graph.node_by_range(range.start.byte, range.end.byte) {
                if graph.is_reference(node_by_range) || graph.is_definition(node_by_range) {
                    return false;
                }
            }
            true
        })
        .filter_map(|range| {
            // Translate file offsets into offsets relative to the snippet. The snippet may be
            // shorter than the advertised byte range (e.g. after trimming), or the range may not
            // fall on a char boundary, so use checked slicing and skip such symbols rather than
            // panicking.
            let local_start = range.start.byte - chunk_start;
            let local_end = range.end.byte.checked_sub(chunk_start)?;
            let name = chunk.snippet.get(local_start..local_end)?.to_string();

            Some(HoverableSymbol {
                name,
                token_info_request: TokenInfoRequest {
                    relative_path: chunk.path.clone(),
                    repo_ref: self.repo_ref.display_name(),
                    branch: None,
                    start: range.start.byte,
                    end: range.end.byte,
                },
                path: chunk.path.clone(),
            })
        })
        .collect::<Vec<_>>();

    // Keep a single entry per symbol name.
    symbols.sort_by(|a, b| a.name.cmp(&b.name));
    symbols.dedup_by(|a, b| a.name == b.name);

    debug!(
        "Attached {} symbols: {:?}",
        symbols.len(),
        symbols.iter().map(|s| s.name.as_str()).collect::<Vec<_>>()
    );

    Ok(ChunkWithHoverableSymbols { chunk, symbols })
}

/// Flattens every occurrence of `symbol` into its own `CodeChunk`.
///
/// A symbol may appear in several files and several times per file; each
/// occurrence becomes one chunk carrying the occurrence's snippet and line
/// range. The alias is left at `0` and the byte offsets are unset.
pub async fn expand_symbol_into_chunks(&self, symbol: Symbol) -> Vec<CodeChunk> {
    let mut expanded = Vec::new();

    for per_file in symbol.related_symbols.iter() {
        for hit in per_file.data.iter() {
            expanded.push(CodeChunk {
                path: per_file.file.clone(),
                alias: 0,
                snippet: hit.snippet.data.clone(),
                start_line: hit.snippet.line_range.start,
                end_line: hit.snippet.line_range.end,
                start_byte: None,
                end_byte: None,
            });
        }
    }

    expanded
}

pub async fn filter_symbols(
&self,
query: &str,
chunks_with_symbols: Vec<ChunkWithHoverableSymbols>,
) -> Result<Symbol, SymbolError> {
if chunks_with_symbols.is_empty() {
return Err(SymbolError::ListEmpty);
}

const NUMBER_CHUNK_LINES: usize = 10;

// we have multiples chunks and each chunk may have multiple symbols
// unique alias (i) per symbol
let mut i: i32 = -1;
let symbols = chunks_with_symbols
.into_iter()
.map(|chunk_with_symbol| {
(
chunk_with_symbol.chunk,
chunk_with_symbol
.symbols
.into_iter()
.map(|symbol| {
i += 1;
(i, symbol)
})
.collect::<Vec<_>>(),
)
})
.collect::<Vec<_>>();

// Classifier

// context
let chunks_string = symbols
.iter()
.filter(|(_, s)| !s.is_empty())
.map(|(c, s)| {
let symbols_string = s
.iter()
.map(|(i, refdef)| format!("{}: {}", i, refdef.name))
.collect::<Vec<_>>()
.join("\n");

format!(
"```{}\n{}```\n\n{}",
c.path.clone(),
c.snippet.clone(),
symbols_string
)
})
.collect::<Vec<_>>()
.join("\n\n");

// instruction
let messages = vec![
llm_gateway::api::Message::system(&symbol_classification_prompt(&chunks_string)),
llm_gateway::api::Message::user(query),
];

let response = match self
.llm_gateway
.clone()
.model("gpt-4-0613")
.temperature(0.0)
.chat(&messages, None)
.await
{
Ok(response) => response,
Err(e) => {
warn!(
"Symbol classifier llm call failed, picking the first symbol: {}",
e
);
"0".into()
}
};

let selected_symbol = match response.as_str().parse::<i32>() {
Ok(symbol) => symbol,
Err(e) => {
warn!("Parsing to integer failed, picking the first symbol: {}", e);
0
}
};

// finding symbol metadata
match symbols
.into_iter()
.flat_map(|(_, symbol_with_alias)| symbol_with_alias)
.find(|(alias, _)| *alias == selected_symbol)
{
Some((_alias, symbol_metadata)) => Ok(Symbol {
name: symbol_metadata.name,
related_symbols: {
let document = self
.app
.indexes
.file
.by_path(&self.repo_ref, &symbol_metadata.path, None)
.await
.unwrap()
.unwrap();

let all_docs = {
let associated_langs =
match document.lang.as_deref().map(TSLanguage::from_id) {
Some(Language::Supported(config)) => config.language_ids,
_ => &[],
};
self.app
.indexes
.file
.by_repo(&self.repo_ref, associated_langs.iter(), None)
.await
};

get_token_info(
symbol_metadata.token_info_request,
&self.repo_ref,
self.app.indexes.clone(),
&document,
&all_docs,
Some(0),
Some(NUMBER_CHUNK_LINES),
)
.await
.unwrap()
.into_iter()
.filter(|file_symbol| file_symbol.file != symbol_metadata.path)
.collect::<Vec<_>>()
},
}),
_ => Err(SymbolError::OutOfBounds),
}
}

pub async fn get_related_chunks(&mut self, chunks: Vec<CodeChunk>) -> Vec<CodeChunk> {
const MAX_CHUNKS: usize = 3;

// get symbols with ref/defs for each chunk
let chunks_with_symbols = futures::future::join_all(
chunks
.iter()
.filter(|c| !c.is_empty())
.map(|c| self.extract_hoverable_symbols(c.clone())), // TODO: Log failure
)
.await
.into_iter()
.filter_map(Result::ok)
.collect();

// get original user query
let user_query = self.last_exchange().query.target().unwrap();

// select one symbol
let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await {
Ok(selected_symbol) => {
info!("Selected symbol: {}", selected_symbol.name);
selected_symbol
}
Err(e) => {
info!("Returning no extra chunks: {}", e);
return Vec::new();
}
};

// take 3 chunks, update path aliases, update enchange chunks
let extra_chunks = self
.expand_symbol_into_chunks(selected_symbol)
.await
.iter()
.take(MAX_CHUNKS)
.map(|c| {
let chunk = CodeChunk {
alias: self.get_path_alias(c.path.as_str()),
..c.clone()
};
self.exchanges
.last_mut()
.unwrap()
.code_chunks
.push(chunk.clone());
chunk
})
.collect::<Vec<_>>();

extra_chunks
}
}

/// A hoverable symbol extracted from a code chunk.
pub struct HoverableSymbol {
/// Text of the symbol as it appears in the chunk's snippet.
pub name: String,
/// Request (path, repo and byte range of the symbol) passed to `get_token_info`.
pub token_info_request: TokenInfoRequest,
/// Path of the file the symbol was extracted from.
pub path: String,
}
/// The symbol chosen by the classifier, together with its occurrences in other files.
pub struct Symbol {
/// Text of the selected symbol.
pub name: String,
/// Occurrences of the symbol grouped per file, as returned by `get_token_info`,
/// excluding the file the symbol was selected from.
pub related_symbols: Vec<FileSymbols>,
}

/// Reasons why symbol selection (`filter_symbols`) can fail.
#[derive(thiserror::Error, Debug)]
pub enum SymbolError {
/// There was nothing for the classifier to choose from
/// (e.g. `filter_symbols` received an empty chunk list).
#[error("No symbol retrieved in the provided chunks")]
ListEmpty,
/// The index selected by the classifier does not match any known symbol.
#[error("Selected symbol out of bounds")]
OutOfBounds,
}
4 changes: 4 additions & 0 deletions server/bleep/src/agent/tools/answer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,8 @@ impl Agent {
snippet,
start_line: span.start,
end_line: span.end,
start_byte: None,
end_byte: None,
}
})
.collect::<Vec<CodeChunk>>();
Expand All @@ -383,6 +385,8 @@ impl Agent {
snippet: trimmed_snippet.to_string(),
start_line: chunk.start_line,
end_line: (chunk.start_line + num_trimmed_lines).saturating_sub(1),
start_byte: chunk.start_byte,
end_byte: chunk.end_byte,
}]
} else {
code_chunks
Expand Down
Loading