diff --git a/server/bleep/src/agent.rs b/server/bleep/src/agent.rs index 9d1abbe5c9..9c8886aea6 100644 --- a/server/bleep/src/agent.rs +++ b/server/bleep/src/agent.rs @@ -463,7 +463,6 @@ impl Agent { query: &str, ) -> impl Iterator + 'a { let langs = self.last_exchange().query.langs.iter().map(Deref::deref); - let user_id = self.user.username().expect("didn't have user ID"); let (repos, branches): (Vec<_>, Vec<_>) = sqlx::query! { @@ -480,6 +479,7 @@ impl Agent { let repo_ref = RepoRef::from_str(&row.repo_ref).ok()?; Some((repo_ref, row.branch)) }) + .filter(|(repo_ref, _)| self.repo_refs.contains(repo_ref)) .unzip(); let branch = branches.first().cloned().flatten(); @@ -488,7 +488,7 @@ impl Agent { self.app .indexes .file - .fuzzy_path_match(repos.into_iter(), branch.as_deref(), query, langs, 50) + .skim_fuzzy_path_match(repos.into_iter(), query, branch.as_deref(), langs, 50) .await } diff --git a/server/bleep/src/agent/tools/path.rs b/server/bleep/src/agent/tools/path.rs index 35ea8b4fb6..74f8c276fc 100644 --- a/server/bleep/src/agent/tools/path.rs +++ b/server/bleep/src/agent/tools/path.rs @@ -64,7 +64,7 @@ impl Agent { let mut paths = paths .iter() - .map(|repo_path| (self.get_path_alias(repo_path), repo_path.path.to_string())) + .map(|repo_path| (self.get_path_alias(repo_path), repo_path.to_string())) .collect::>(); paths.sort_by(|a: &(usize, String), b| a.0.cmp(&b.0)); // Sort by alias diff --git a/server/bleep/src/indexes/file.rs b/server/bleep/src/indexes/file.rs index d5d3592dea..6d866d6206 100644 --- a/server/bleep/src/indexes/file.rs +++ b/server/bleep/src/indexes/file.rs @@ -1,5 +1,5 @@ use std::{ - collections::{HashMap, HashSet}, + collections::HashSet, panic::AssertUnwindSafe, path::{Path, PathBuf}, sync::atomic::{AtomicU64, Ordering}, @@ -32,7 +32,6 @@ use super::{ DocumentRead, Indexable, Indexer, }; use crate::{ - agent::Project, background::SyncHandle, cache::{CacheKeys, FileCache, FileCacheSnapshot}, intelligence::TreeSitterFile, @@ -239,136 +238,12 @@ impl Indexable for File { } impl Indexer { - /// Search this index for paths fuzzily matching a given string. - /// - /// For example, the string `Cargo` can return documents whose path is `foo/Cargo.toml`, - /// or `bar/Cargo.lock`. Constructs regexes that permit an edit-distance of 2. - /// - /// If the regex filter fails to build, an empty list is returned. - pub async fn fuzzy_path_match( - &self, - repos: impl Iterator, - branch: Option<&str>, - query_str: &str, - langs: impl Iterator, - limit: usize, - ) -> impl Iterator + '_ { - // lifted from query::compiler - let searcher = self.reader.searcher(); - let collector = TopDocs::with_limit(5 * limit); // TODO: tune this - let file_source = &self.source; - - let branch_scope = branch - .map(|b| { - trigrams(b) - .map(|token| Term::from_field_text(self.source.branches, token.as_str())) - .map(|term| TermQuery::new(term, IndexRecordOption::Basic)) - .map(Box::new) - .map(|q| q as Box) - .collect::>() - }) - .map(BooleanQuery::intersection); - - let repo_scope = BooleanQuery::union( - repos - .map(|repo| { - Box::new(TermQuery::new( - Term::from_field_text(self.source.repo_name, &repo.to_string()), - IndexRecordOption::Basic, - )) as Box - }) - .collect::>(), - ); - - // hits is a mapping between a document address and the number of trigrams in it that - // matched the query - let langs_query = BooleanQuery::union( - langs - .map(|l| Term::from_field_bytes(self.source.lang, l.as_bytes())) - .map(|t| TermQuery::new(t, IndexRecordOption::Basic)) - .map(Box::new) - .map(|q| q as Box) - .collect::>(), - ); - - let mut hits = trigrams(query_str) - .flat_map(|s| case_permutations(s.as_str())) - .map(|token| Term::from_field_text(self.source.relative_path, token.as_str())) - .map(|term| TermQuery::new(term, IndexRecordOption::Basic)) - .flat_map(|query| { - let mut q: Vec> = - vec![Box::new(repo_scope.clone()), Box::new(query)]; - q.extend(branch_scope.clone().map(|q| Box::new(q) as Box)); - q.push(Box::new(langs_query.clone())); - - searcher - .search(&BooleanQuery::intersection(q), &collector) - .expect("failed to search index") - .into_iter() - .map(move |(_, addr)| addr) - }) - .fold(HashMap::new(), |mut map: HashMap<_, usize>, hit| { - *map.entry(hit).or_insert(0) += 1; - map - }) - .into_iter() - .map(move |(addr, count)| { - let retrieved_doc = searcher - .doc(addr) - .expect("failed to get document by address"); - let doc = FileReader.read_document(file_source, retrieved_doc); - (doc, count) - }) - .collect::>(); - - // order hits in - // - decsending order of number of matched trigrams - // - alphabetical order of relative paths to break ties - // - // - // for a list of hits like so: - // - // apple.rs 2 - // ball.rs 3 - // cat.rs 2 - // - // the ordering produced is: - // - // ball.rs 3 -- highest number of hits - // apple.rs 2 -- same numeber of hits, but alphabetically preceeds cat.rs - // cat.rs 2 - // - hits.sort_by(|(this_doc, this_count), (other_doc, other_count)| { - let order_count_desc = other_count.cmp(this_count); - let order_path_asc = this_doc - .relative_path - .as_str() - .cmp(other_doc.relative_path.as_str()); - - order_count_desc.then(order_path_asc) - }); - - let regex_filter = build_fuzzy_regex_filter(query_str); - - // if the regex filter fails to build for some reason, the filter defaults to returning - // false and zero results are produced - hits.into_iter() - .map(|(doc, _)| doc) - .filter(move |doc| { - regex_filter - .as_ref() - .map(|f| f.is_match(&doc.relative_path)) - .unwrap_or_default() - }) - .filter(|doc| !doc.relative_path.ends_with('/')) // omit directories - .take(limit) - } - pub async fn skim_fuzzy_path_match( &self, repo_refs: impl IntoIterator, query_str: &str, branch: Option<&str>, + langs: impl Iterator, limit: usize, ) -> impl Iterator + '_ { let searcher = self.reader.searcher(); @@ -400,6 +275,19 @@ impl Indexer { }) .map(BooleanQuery::intersection) .map(Box::new); + + let langs_term = langs + .map(|l| Term::from_field_bytes(self.source.lang, l.as_bytes())) + .map(|t| TermQuery::new(t, IndexRecordOption::Basic)) + .map(Box::new) + .map(|q| q as Box) + .collect::>(); + + let langs_term = match langs_term.len() { + 0 => None, + _ => Some(Box::new(BooleanQuery::union(langs_term))), + }; + let search_terms = trigrams(query_str) .flat_map(|s| case_permutations(s.as_str())) .map(|token| Term::from_field_text(self.source.relative_path, token.as_str())) @@ -413,6 +301,10 @@ impl Indexer { .as_ref() .map(Box::clone) .map(|t| t as Box), + langs_term + .as_ref() + .map(Box::clone) + .map(|t| t as Box), ] .into_iter() .flatten() @@ -886,86 +778,3 @@ impl RepoFile { )) } } - -fn build_fuzzy_regex_filter(query_str: &str) -> Option { - fn additions(s: &str, i: usize, j: usize) -> String { - if i > j { - additions(s, j, i) - } else { - let mut s = s.to_owned(); - s.insert_str(j, ".?"); - s.insert_str(i, ".?"); - s - } - } - - fn replacements(s: &str, i: usize, j: usize) -> String { - if i > j { - replacements(s, j, i) - } else { - let mut s = s.to_owned(); - s.remove(j); - s.insert_str(j, ".?"); - - s.remove(i); - s.insert_str(i, ".?"); - - s - } - } - - fn one_of_each(s: &str, i: usize, j: usize) -> String { - if i > j { - one_of_each(s, j, i) - } else { - let mut s = s.to_owned(); - s.remove(j); - s.insert_str(j, ".?"); - - s.insert_str(i, ".?"); - s - } - } - - let all_regexes = (query_str.char_indices().map(|(idx, _)| idx)) - .flat_map(|i| (query_str.char_indices().map(|(idx, _)| idx)).map(move |j| (i, j))) - .filter(|(i, j)| i <= j) - .flat_map(|(i, j)| { - let mut v = vec![]; - if j != query_str.len() { - v.push(one_of_each(query_str, i, j)); - v.push(replacements(query_str, i, j)); - } - v.push(additions(query_str, i, j)); - v - }); - - regex::RegexSetBuilder::new(all_regexes) - // Increased from the default to account for long paths. At the time of writing, - // the default was `10 * (1 << 20)`. - .size_limit(10 * (1 << 25)) - .case_insensitive(true) - .build() - .ok() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn fuzzy_multibyte_should_compile() { - let multibyte_str = "查询解析器在哪"; - let filter = build_fuzzy_regex_filter(multibyte_str); - assert!(filter.is_some()); - - // tests removal of second character - assert!(filter.as_ref().unwrap().is_match("查解析器在哪")); - - // tests replacement of second character with `n` - assert!(filter.as_ref().unwrap().is_match("查n析器在哪")); - - // tests addition of character `n` - assert!(filter.as_ref().unwrap().is_match("查询解析器在哪n")); - } -} diff --git a/server/bleep/src/webserver/search.rs b/server/bleep/src/webserver/search.rs index 25b943c45a..1c88cd8e15 100644 --- a/server/bleep/src/webserver/search.rs +++ b/server/bleep/src/webserver/search.rs @@ -64,6 +64,7 @@ pub(super) async fn fuzzy_path( repo_refs, target, q.first_branch().as_deref(), + std::iter::empty(), args.page_size, ) .await