diff --git a/src/repo_context.rs b/src/repo_context.rs
index 02d99c0..2dce47c 100644
--- a/src/repo_context.rs
+++ b/src/repo_context.rs
@@ -936,17 +936,20 @@ impl RepoContextExtractor {
         file_content: &str,
         keywords: &[String],
     ) -> Vec<FileContentMatch> {
-        let lines: Vec<String> = file_content.lines().map(|s| s.to_string()).collect();
+        let lines: Vec<&str> = file_content.lines().collect();
         if lines.is_empty() {
             return Vec::new();
         }
 
+        // Pre-calculate lowercased keywords to avoid repeated allocation
+        let keywords_lower: Vec<String> = keywords.iter().map(|k| k.to_lowercase()).collect();
+
         // Find line numbers that contain any of the keywords (case-insensitive)
         let mut matching_lines = HashSet::new();
         for (line_idx, line) in lines.iter().enumerate() {
             let line_lower = line.to_lowercase();
-            for keyword in keywords {
-                if line_lower.contains(&keyword.to_lowercase()) {
+            for keyword in &keywords_lower {
+                if line_lower.contains(keyword) {
                     matching_lines.insert(line_idx);
                     break; // Found a match, no need to check other keywords for this line
                 }
@@ -985,7 +988,8 @@ impl RepoContextExtractor {
         // Convert ranges to FileContentMatch structs
         let mut matches = Vec::new();
         for (start, end) in merged_ranges {
-            let range_lines = lines[start..=end].to_vec();
+            let range_lines: Vec<String> =
+                lines[start..=end].iter().map(|&s| s.to_string()).collect();
             matches.push(FileContentMatch {
                 start_line: start + 1, // Convert to 1-based line numbering
                 end_line: end + 1,     // Convert to 1-based line numbering
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 53cd5d5..0b66f88 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -28,4 +28,6 @@ pub mod polling_test;
 #[cfg(test)]
 pub mod polling_tests;
 #[cfg(test)]
+pub mod repo_context_perf_test;
+#[cfg(test)]
 pub mod repo_context_tests;
diff --git a/src/tests/repo_context_perf_test.rs b/src/tests/repo_context_perf_test.rs
new file mode 100644
index 0000000..93a5e6f
--- /dev/null
+++ b/src/tests/repo_context_perf_test.rs
@@ -0,0 +1,75 @@
+#[cfg(test)]
+mod tests {
+    use crate::config::AppSettings;
+    use crate::file_indexer::FileIndexManager;
+    use crate::gitlab::GitlabApiClient;
+    use crate::repo_context::RepoContextExtractor;
+    use std::sync::Arc;
+    use std::time::Instant;
+
+    /// Line containing the search keyword; emitted once every 1000 lines.
+    const KEYWORD_LINE: &str = "This line contains the magic keyword TARGET.\n";
+    /// Filler line that does not contain the keyword.
+    const FILLER_LINE: &str =
+        "This is a regular line of code with some content that is not relevant.\n";
+
+    /// Smoke benchmark for `extract_relevant_file_sections` on a large input.
+    ///
+    /// Prints the elapsed time (visible with `cargo test -- --nocapture`) and only
+    /// asserts that at least one match is found; it does not enforce a time budget.
+    #[test]
+    fn test_extract_relevant_file_sections_perf() {
+        // Settings handed to the extractor; only `context_lines` is customised.
+        let extractor_settings = Arc::new(AppSettings {
+            context_lines: 5,
+            ..Default::default()
+        });
+
+        // The client makes no network calls in this test, but it still has to be
+        // constructed from populated settings to satisfy the extractor's types.
+        let client_settings = Arc::new(AppSettings {
+            gitlab_url: "https://example.com".to_string(),
+            gitlab_token: "dummy".to_string(),
+            openai_api_key: "dummy".to_string(),
+            ..Default::default()
+        });
+        let gitlab_client = Arc::new(
+            GitlabApiClient::new(client_settings)
+                .expect("dummy settings should be enough to build a client"),
+        );
+        let file_index_manager = Arc::new(FileIndexManager::new(gitlab_client.clone(), 3600));
+
+        let extractor = RepoContextExtractor::new_with_file_indexer(
+            gitlab_client,
+            extractor_settings,
+            file_index_manager,
+        );
+
+        // Build the haystack: one keyword line every 1000 lines, filler otherwise.
+        // The filler line is 71 bytes, so 100,000 lines come to roughly 7 MB.
+        let line_count = 100_000;
+        let mut content = String::with_capacity(line_count * FILLER_LINE.len());
+        for i in 0..line_count {
+            let line = if i % 1000 == 0 {
+                KEYWORD_LINE
+            } else {
+                FILLER_LINE
+            };
+            content.push_str(line);
+        }
+
+        let keywords = vec!["TARGET".to_string()];
+
+        println!("Starting benchmark with {} lines...", line_count);
+        let start = Instant::now();
+
+        // A single pass over 100k lines is large enough to show the difference.
+        let matches = extractor.extract_relevant_file_sections(&content, &keywords);
+
+        let duration = start.elapsed();
+        println!("Extraction took: {:?}", duration);
+
+        // Sanity check: the keyword lines must have been found.
+        assert!(!matches.is_empty());
+    }
+}