diff --git a/Cargo.lock b/Cargo.lock index 37adffa5dc..7cf92684f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,15 +76,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "aho-corasick" -version = "0.6.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" -dependencies = [ - "memchr", -] - [[package]] name = "aho-corasick" version = "0.7.20" @@ -535,10 +526,9 @@ dependencies = [ "pretty_assertions", "qdrant-client", "quick-xml 0.29.0", - "rake", "rand 0.8.5", "rayon", - "regex 1.10.2", + "regex", "regex-syntax 0.6.29", "relative-path", "reqwest", @@ -1015,7 +1005,7 @@ dependencies = [ "entities", "memchr", "once_cell", - "regex 1.10.2", + "regex", "slug", "typed-arena", "unicode_categories", @@ -1061,7 +1051,7 @@ dependencies = [ "prost-types", "serde", "serde_json", - "thread_local 1.1.7", + "thread_local", "tokio", "tokio-stream", "tonic", @@ -1217,7 +1207,7 @@ dependencies = [ "oorandom", "plotters", "rayon", - "regex 1.10.2", + "regex", "serde", "serde_derive", "serde_json", @@ -1682,7 +1672,7 @@ dependencies = [ "atty", "humantime", "log", - "regex 1.10.2", + "regex", "termcolor", ] @@ -1785,7 +1775,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2" dependencies = [ "bit-set", - "regex 1.10.2", + "regex", ] [[package]] @@ -2064,7 +2054,7 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94" dependencies = [ - "thread_local 1.1.7", + "thread_local", ] [[package]] @@ -2347,7 +2337,7 @@ dependencies = [ "gix-worktree-stream", "once_cell", "parking_lot 0.12.1", - "regex 1.10.2", + "regex", "reqwest", "signal-hook", "smallvec", @@ -3119,7 +3109,7 @@ dependencies = [ "bstr", "fnv", "log", - "regex 1.10.2", + "regex", ] [[package]] @@ -3523,7 +3513,7 @@ dependencies = [ "phf 0.11.2", "phf_codegen 0.11.2", "polyglot_tokenizer", - "regex 1.10.2", + "regex", "serde", "serde_yaml", ] @@ -3608,9 +3598,9 @@ dependencies = [ "lazy_static", "log", "memchr", - "regex 1.10.2", + "regex", "same-file", - "thread_local 1.1.7", + "thread_local", "walkdir", "winapi-util", ] @@ -3989,7 +3979,7 @@ checksum = "e723bd417b2df60a0f6a2b6825f297ea04b245d4ba52b5a22cb679bdf58b05fa" dependencies = [ "lazy-regex-proc_macros", "once_cell", - "regex 1.10.2", + "regex", ] [[package]] @@ -4000,7 +3990,7 @@ checksum = "0f0a1d9139f0ee2e862e08a9c5d0ba0470f2aa21cd1e1aa1b1562f83116c725f" dependencies = [ "proc-macro2", "quote", - "regex 1.10.2", + "regex", "syn 2.0.38", ] @@ -4082,7 +4072,7 @@ dependencies = [ "memmap2 0.5.10", "partial_sort", "rand 0.8.5", - "regex 1.10.2", + "regex", "serde", "serde_bytes", "thiserror", @@ -5532,15 +5522,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rake" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5555e13968a316e4d4eb1274246ac197a550f9f8e066ddc7076088494943f805" -dependencies = [ - "regex 0.2.11", -] - [[package]] name = "rand" version = "0.4.6" @@ -5751,19 +5732,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "regex" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9329abc99e39129fcceabd24cf5d85b4671ef7c29c50e972bc5afe32438ec384" -dependencies = [ - "aho-corasick 0.6.10", - "memchr", - "regex-syntax 0.5.6", - "thread_local 0.3.6", - "utf8-ranges", -] - [[package]] name = "regex" version = "1.10.2" @@ -5796,15 +5764,6 @@ dependencies = [ "regex-syntax 0.8.2", ] -[[package]] -name = "regex-syntax" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d707a4fa2637f2dca2ef9fd02225ec7661fe01a53623c1e6515b6916511f7a7" -dependencies = [ - "ucd-util", -] - [[package]] name = "regex-syntax" version = "0.6.29" @@ -6249,7 +6208,7 @@ checksum = "18a7b80fa1dd6830a348d38a8d3a9761179047757b7dca29aef82db0118b9670" dependencies = [ "backtrace", "once_cell", - "regex 1.10.2", + "regex", "sentry-core", ] @@ -7029,7 +6988,7 @@ dependencies = [ "once_cell", "oneshot", "rayon", - "regex 1.10.2", + "regex", "rust-stemmers", "rustc-hash", "serde", @@ -7243,7 +7202,7 @@ dependencies = [ "percent-encoding", "rand 0.8.5", "raw-window-handle", - "regex 1.10.2", + "regex", "reqwest", "rfd", "semver", @@ -7304,7 +7263,7 @@ dependencies = [ "png", "proc-macro2", "quote", - "regex 1.10.2", + "regex", "semver", "serde", "serde_json", @@ -7494,15 +7453,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "thread_local" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" -dependencies = [ - "lazy_static", -] - [[package]] name = "thread_local" version = "1.1.7" @@ -7604,7 +7554,7 @@ dependencies = [ "rand 0.8.5", "rayon", "rayon-cond 0.1.0", - "regex 1.10.2", + "regex", "regex-syntax 0.7.5", "serde", "serde_json", @@ -7637,7 +7587,7 @@ dependencies = [ "rand 0.8.5", "rayon", "rayon-cond 0.3.0", - "regex 1.10.2", + "regex", "regex-syntax 0.7.5", "serde", "serde_json", @@ -7977,10 +7927,10 @@ dependencies = [ "matchers", "nu-ansi-term", "once_cell", - "regex 1.10.2", + "regex", "sharded-slab", "smallvec", - "thread_local 1.1.7", + "thread_local", "tracing", "tracing-core", "tracing-log", @@ -7993,7 +7943,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e747b1f9b7b931ed39a548c1fae149101497de3c1fc8d9e18c62c1a66c683d3d" dependencies = [ "cc", - "regex 1.10.2", + "regex", ] [[package]] @@ -8155,12 +8105,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" -[[package]] -name = "ucd-util" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abd2fc5d32b590614af8b0a20d837f32eca055edd0bbead59a9cfe80858be003" - [[package]] name = "uluru" version = "3.0.0" @@ -8604,7 +8548,7 @@ version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aac48ef20ddf657755fdcda8dfed2a7b4fc7e4581acce6fe9b88c3d64f29dee7" dependencies = [ - "regex 1.10.2", + "regex", "serde", "serde_json", "thiserror", diff --git a/server/bleep/Cargo.toml b/server/bleep/Cargo.toml index 78a7e0a715..c7b44cfeb6 100644 --- a/server/bleep/Cargo.toml +++ b/server/bleep/Cargo.toml @@ -111,7 +111,6 @@ git-version = "0.3.5" gix = { git = "https://github.com/BloopAI/gitoxide", version="0.55.2", features = ["blocking-http-transport-reqwest-rust-tls-no-trust-dns", "pack-cache-lru-static"] } # semantic -rake = "0.1" qdrant-client = { version = "1.5.0", default-features = false } tiktoken-rs = "0.4.5" tokenizers = { version = "0.14.0", default-features = false, features = ["progressbar", "cli", "onig", "esaxx_fast"] } diff --git a/server/bleep/src/agent.rs b/server/bleep/src/agent.rs index d07c6304f0..5d470fe9e4 100644 --- a/server/bleep/src/agent.rs +++ b/server/bleep/src/agent.rs @@ -2,8 +2,6 @@ use std::{sync::Arc, time::Duration}; use anyhow::{anyhow, Context, Result}; use futures::{Future, TryStreamExt}; -use once_cell::sync::OnceCell; -use rake::*; use tokio::sync::mpsc::Sender; use tracing::{debug, error, info, instrument}; @@ -11,7 +9,7 @@ use crate::{ analytics::{EventData, QueryEvent}, indexes::reader::{ContentDocument, FileDocument}, llm_gateway::{self, api::FunctionCall}, - query::parser, + query::{parser, stopwords::remove_stopwords}, repo::RepoRef, semantic, webserver::{ @@ -43,19 +41,6 @@ mod tools { pub mod proc; } -static STOPWORDS: OnceCell = OnceCell::new(); -static STOP_WORDS_LIST: &str = include_str!("stopwords.txt"); - -fn stop_words() -> &'static StopWords { - STOPWORDS.get_or_init(|| { - let mut sw = StopWords::new(); - for w in STOP_WORDS_LIST.lines() { - sw.insert(w.to_string()); - } - sw - }) -} - pub enum Error { Timeout(Duration), Processing(anyhow::Error), @@ -196,23 +181,14 @@ impl Agent { // Always make a code search for the user query on the first exchange if self.exchanges.len() == 1 { - // Extract keywords from the query let keywords = { - let sw = stop_words(); - let r = Rake::new(sw.clone()); - let keywords = r.run(s); - - if keywords.is_empty() { + let keys = remove_stopwords(s); + if keys.is_empty() { s.clone() } else { - keywords - .iter() - .map(|k| k.keyword.clone()) - .collect::>() - .join(" ") + keys } }; - self.code_search(&keywords).await?; } s.clone() diff --git a/server/bleep/src/agent/tools/code.rs b/server/bleep/src/agent/tools/code.rs index 758e3fd02f..8f2017c831 100644 --- a/server/bleep/src/agent/tools/code.rs +++ b/server/bleep/src/agent/tools/code.rs @@ -110,6 +110,7 @@ impl Agent { .llm_gateway .clone() .model("gpt-3.5-turbo-0613") + .temperature(0.0) .chat(&prompt, None) .await?; diff --git a/server/bleep/src/query.rs b/server/bleep/src/query.rs index 06c2202f2f..6c7846ba6c 100644 --- a/server/bleep/src/query.rs +++ b/server/bleep/src/query.rs @@ -4,3 +4,4 @@ pub mod languages; pub mod parser; pub mod planner; pub mod ranking; +pub mod stopwords; diff --git a/server/bleep/src/query/stopwords.rs b/server/bleep/src/query/stopwords.rs new file mode 100644 index 0000000000..fff1137f82 --- /dev/null +++ b/server/bleep/src/query/stopwords.rs @@ -0,0 +1,73 @@ +// Portions of this code (the `phrases` function) are modifications of +// https://github.com/yaa110/rake-rs/blob/master/src/rake.rs +// licensed under the MIT License: +/* +Copyright (c) 2018 Navid Fathollahzade + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + +use lazy_regex::regex; +use once_cell::sync::Lazy; +use std::collections::HashSet; + +type StopWords = HashSet<&'static str>; + +static STOPWORDS: Lazy = Lazy::new(|| { + let word_list = include_str!("stopwords.txt"); + let mut sw = StopWords::new(); + for w in word_list.lines() { + sw.insert(w); + } + sw +}); + +/// Extract `phrases`, where each phrase is a sequence of non-stopwords +fn phrases<'a>(phrases_iter: impl IntoIterator) -> Vec> { + let phrases_iter = phrases_iter.into_iter(); + let mut phrases = Vec::with_capacity(2 * phrases_iter.size_hint().0); + for s in phrases_iter.filter(|s| !s.is_empty()) { + let mut phrase = Vec::new(); + for word in s.split_whitespace() { + if STOPWORDS.contains(word.to_lowercase().as_str()) { + if !phrase.is_empty() { + phrases.push(phrase.clone()); + phrase.clear(); + } + } else { + phrase.push(word); + } + } + if !phrase.is_empty() { + phrases.push(phrase); + } + } + phrases +} + +pub fn remove_stopwords(text: &str) -> String { + let phrases = phrases(regex!("[^a-zA-Z0-9_/ -]").split(text)); + phrases.into_iter().flatten().collect::>().join(" ") +} diff --git a/server/bleep/src/stopwords.txt b/server/bleep/src/query/stopwords.txt similarity index 100% rename from server/bleep/src/stopwords.txt rename to server/bleep/src/query/stopwords.txt