From aff6dc83e6397c74447a1bca8f35ab8fd1bbccdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:48:26 +0200 Subject: [PATCH 1/7] feat(agent-runtime): add code_search rollout benchmarks Measure native code_search against the shell baseline so rollout guidance reflects current fallback and index behavior. --- .../agent-runtime/benches/agent_benchmarks.rs | 6 + .../docs/design/code-search-tool.md | 6 + .../examples/code_search_rollout_benchmark.rs | 1320 +++++++++++++++++ clients/agent-runtime/src/search/tests.rs | 1 + .../agent-runtime/tools/code-search.md | 204 +++ .../docs/clients/agent-runtime/tools/core.md | 15 + .../clients/agent-runtime/tools/index.mdx | 2 +- .../agent-runtime/tools/code-search.md | 204 +++ .../es/clients/agent-runtime/tools/core.md | 15 + .../es/clients/agent-runtime/tools/index.mdx | 2 +- .../design.md | 392 +++++ .../proposal.md | 78 + .../specs/code-search-rollout/spec.md | 163 ++ .../state.yaml | 12 + .../tasks.md | 25 + .../verify-report.md | 99 ++ openspec/specs/code-search-rollout/spec.md | 163 ++ 17 files changed, 2705 insertions(+), 2 deletions(-) create mode 100644 clients/agent-runtime/examples/code_search_rollout_benchmark.rs create mode 100644 clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md create mode 100644 clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md create mode 100644 openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md create mode 100644 openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/proposal.md create mode 100644 openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/specs/code-search-rollout/spec.md create mode 100644 openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/state.yaml create mode 100644 openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md create mode 100644 
openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md create mode 100644 openspec/specs/code-search-rollout/spec.md diff --git a/clients/agent-runtime/benches/agent_benchmarks.rs b/clients/agent-runtime/benches/agent_benchmarks.rs index 3d1164db1..1b2f2fbdd 100644 --- a/clients/agent-runtime/benches/agent_benchmarks.rs +++ b/clients/agent-runtime/benches/agent_benchmarks.rs @@ -1,3 +1,9 @@ +//! Criterion microbenchmarks for hot-loop `code_search` behavior. +//! +//! These benches are intentionally scoped to low-level timing only. For rollout evidence +//! (shell baseline, no-index/cold-build/warm-index comparisons, parity checks, and docs-ready +//! reporting), run `cargo run --example code_search_rollout_benchmark --manifest-path clients/agent-runtime/Cargo.toml`. + use corvus::security::{AutonomyLevel, SecurityPolicy}; use corvus::tools::traits::Tool; use corvus::tools::CodeSearchTool; diff --git a/clients/agent-runtime/docs/design/code-search-tool.md b/clients/agent-runtime/docs/design/code-search-tool.md index 03a8bdaea..d1a60149e 100644 --- a/clients/agent-runtime/docs/design/code-search-tool.md +++ b/clients/agent-runtime/docs/design/code-search-tool.md @@ -18,6 +18,12 @@ workspace sandboxing. v1 uses brute-force directory walking via the `ignore` cra The tool returns both a human-readable grep-like `output` string and a machine-readable `structured` JSON payload, consistent with the `ToolResult` contract. +> Implementation note (2026-04-05): the current runtime behavior is no longer purely brute-force. +> Safe literal queries may use workspace trigram index narrowing when a compatible index exists, +> while regex requests still fall back from planning with `query_regex_not_supported` to +> discovery plus live verification. For rollout evidence and the canonical behavior summary, see +> `docs/clients/agent-runtime/tools/code-search.md`. + ## 1. 
Tool Schema (API Shape) Tool name: `code_search` diff --git a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs new file mode 100644 index 000000000..b285de3d8 --- /dev/null +++ b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs @@ -0,0 +1,1320 @@ +use anyhow::{Context, Result}; +use corvus::runtime::NativeRuntime; +use corvus::search::{CandidateCoverage, CandidateRequest, WorkspaceTrigramIndex}; +use corvus::security::{AutonomyLevel, NoopSandbox, SecurityPolicy}; +use corvus::tools::traits::{Tool, ToolResult}; +use corvus::tools::{CodeSearchTool, ShellTool}; +use serde_json::{json, Value}; +use std::collections::BTreeMap; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use tempfile::TempDir; + +const DEFAULT_SAMPLES: usize = 5; +const DEFAULT_COLD_BUILD_SAMPLES: usize = 2; +const DEFAULT_PATH: &str = "."; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum QueryKind { + Literal, + Regex, +} + +impl QueryKind { + fn as_str(self) -> &'static str { + match self { + Self::Literal => "literal", + Self::Regex => "regex", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ResultShape { + SmallHit, + LargeHit, + NoHit, +} + +impl ResultShape { + fn as_str(self) -> &'static str { + match self { + Self::SmallHit => "small-hit", + Self::LargeHit => "large-hit", + Self::NoHit => "no-hit", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ExecutionMode { + ShellBaseline, + NativeNoIndex, + NativeColdBuild, + NativeWarmIndex, +} + +impl ExecutionMode { + fn as_str(self) -> &'static str { + match self { + Self::ShellBaseline => "shell_baseline", + Self::NativeNoIndex => "native_no_index", + Self::NativeColdBuild => "native_cold_build", + Self::NativeWarmIndex => "native_warm_index", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PlanMode { + IndexedNarrowing, + 
FallbackDiscoveryLiveVerification, + IndexUnavailable, +} + +impl PlanMode { + fn as_str(self) -> &'static str { + match self { + Self::IndexedNarrowing => "indexed_narrowing", + Self::FallbackDiscoveryLiveVerification => "fallback_discovery_live_verification", + Self::IndexUnavailable => "index_unavailable", + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct BenchmarkCase { + id: &'static str, + query_kind: QueryKind, + result_shape: ResultShape, + pattern: &'static str, + path: &'static str, + case_sensitive: bool, + whole_word: bool, +} + +impl BenchmarkCase { + fn is_regex(&self) -> bool { + self.query_kind == QueryKind::Regex + } + + fn tool_args(&self) -> Value { + let mut args = json!({ + "pattern": self.pattern, + "is_regex": self.is_regex(), + "case_sensitive": self.case_sensitive, + "whole_word": self.whole_word, + "path": self.path, + "max_results": 500, + }); + if self.path == DEFAULT_PATH { + args.as_object_mut().unwrap().remove("path"); + } + args + } + + fn candidate_request(&self) -> CandidateRequest { + CandidateRequest { + relative_root: self.path.to_string(), + include: Vec::new(), + exclude: Vec::new(), + raw_pattern: self.pattern.to_string(), + is_regex: self.is_regex(), + case_sensitive: self.case_sensitive, + whole_word: self.whole_word, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +struct CanonicalLineMatch { + file: String, + line: usize, + content: String, +} + +#[derive(Debug, Clone)] +struct BenchmarkMeasurement { + case_id: String, + execution_mode: ExecutionMode, + plan_mode: Option, + plan_reason: String, + samples: usize, + median_ms: u64, + p95_ms: u64, + build_median_ms: Option, + search_median_ms: Option, + total_median_ms: Option, + parity_passed: Option, +} + +#[derive(Debug, Clone)] +struct WorkspaceReport { + metadata: EnvironmentMetadata, + matrix: Vec, + measurements: Vec, +} + +#[derive(Debug, Clone)] +struct EnvironmentMetadata { + workspace_label: String, + workspace_root: PathBuf, + 
workspace_kind: &'static str, + file_count: usize, + os: String, + arch: String, + cpu: String, + rust_profile: &'static str, + benchmarked_at: String, + commit_sha: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum WorkspaceSelection { + Fixture, + Repo, + Both, +} + +#[derive(Debug, Clone)] +struct CliArgs { + workspace: WorkspaceSelection, + repo_path: PathBuf, + samples: usize, + cold_build_samples: usize, +} + +#[derive(Debug)] +struct WorkspaceContext { + label: String, + kind: &'static str, + root: PathBuf, + fixture_guard: Option, + cases: Vec, +} + +#[derive(Debug, Clone)] +struct ShellExecutionSummary { + canonical: Vec, + durations: Vec, +} + +#[derive(Debug, Clone)] +struct NativeExecutionSummary { + canonical: Vec, + search_durations: Vec, + build_durations: Vec, + total_durations: Vec, + plan_mode: PlanMode, + plan_reason: String, +} + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<()> { + let args = parse_args(env::args().skip(1))?; + let mut reports = Vec::new(); + + if matches!( + args.workspace, + WorkspaceSelection::Fixture | WorkspaceSelection::Both + ) { + let workspace = create_fixture_workspace()?; + reports.push(run_workspace_suite(&workspace, args.samples, args.cold_build_samples).await?); + } + + if matches!( + args.workspace, + WorkspaceSelection::Repo | WorkspaceSelection::Both + ) { + let workspace = repo_workspace_context(&args.repo_path)?; + reports.push(run_workspace_suite(&workspace, args.samples, args.cold_build_samples).await?); + } + + for report in reports { + print_workspace_report(&report); + } + + Ok(()) +} + +fn parse_args(mut args: I) -> Result +where + I: Iterator, +{ + let mut workspace = WorkspaceSelection::Both; + let mut repo_path = repo_root_from_manifest()?; + let mut samples = DEFAULT_SAMPLES; + let mut cold_build_samples = DEFAULT_COLD_BUILD_SAMPLES; + + while let Some(arg) = args.next() { + match arg.as_str() { + "--workspace" => { + let value = args.next().context("missing 
value for --workspace")?; + workspace = match value.as_str() { + "fixture" => WorkspaceSelection::Fixture, + "repo" => WorkspaceSelection::Repo, + "both" => WorkspaceSelection::Both, + other => anyhow::bail!("unsupported --workspace value '{other}'"), + }; + } + "--repo-path" => { + repo_path = PathBuf::from(args.next().context("missing value for --repo-path")?); + } + "--samples" => { + samples = parse_positive_usize( + &args.next().context("missing value for --samples")?, + "--samples", + )?; + } + "--cold-build-samples" => { + cold_build_samples = parse_positive_usize( + &args + .next() + .context("missing value for --cold-build-samples")?, + "--cold-build-samples", + )?; + } + "--help" | "-h" => { + print_help(); + std::process::exit(0); + } + other => anyhow::bail!("unknown argument '{other}'"), + } + } + + Ok(CliArgs { + workspace, + repo_path, + samples, + cold_build_samples, + }) +} + +fn print_help() { + println!( + "code_search rollout benchmark\n\n\ +Usage:\n cargo run --example code_search_rollout_benchmark -- [options]\n\n\ +Options:\n --workspace \n --repo-path \n --samples \n --cold-build-samples \n" + ); +} + +fn parse_positive_usize(raw: &str, flag: &str) -> Result { + let value = raw + .parse::() + .with_context(|| format!("invalid {flag} value '{raw}'"))?; + anyhow::ensure!(value > 0, "{flag} must be > 0"); + Ok(value) +} + +fn repo_root_from_manifest() -> Result { + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest_dir + .parent() + .and_then(Path::parent) + .map(Path::to_path_buf) + .context("failed to derive repo root from CARGO_MANIFEST_DIR") +} + +fn fixture_cases() -> Vec { + vec![ + BenchmarkCase { + id: "literal_small_hit", + query_kind: QueryKind::Literal, + result_shape: ResultShape::SmallHit, + pattern: "fixture_small_literal_unique", + path: "src", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "literal_large_hit", + query_kind: QueryKind::Literal, + result_shape: 
ResultShape::LargeHit, + pattern: "fixture_large_literal_shared", + path: "src", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "literal_no_hit", + query_kind: QueryKind::Literal, + result_shape: ResultShape::NoHit, + pattern: "fixture_literal_rollout_no_hit", + path: "src", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "regex_small_hit", + query_kind: QueryKind::Regex, + result_shape: ResultShape::SmallHit, + pattern: "fixture_regex_unique_target", + path: "src", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "regex_large_hit", + query_kind: QueryKind::Regex, + result_shape: ResultShape::LargeHit, + pattern: "fixture_regex_bulk_case_", + path: "src", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "regex_no_hit", + query_kind: QueryKind::Regex, + result_shape: ResultShape::NoHit, + pattern: "fixture_regex_rollout_no_match_20260405", + path: "src", + case_sensitive: true, + whole_word: false, + }, + ] +} + +fn repo_cases() -> Vec { + vec![ + BenchmarkCase { + id: "literal_small_hit", + query_kind: QueryKind::Literal, + result_shape: ResultShape::SmallHit, + pattern: "pub struct ToolResult", + path: "clients/agent-runtime/src/tools", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "literal_large_hit", + query_kind: QueryKind::Literal, + result_shape: ResultShape::LargeHit, + pattern: "success:", + path: "clients/agent-runtime/src/tools", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "literal_no_hit", + query_kind: QueryKind::Literal, + result_shape: ResultShape::NoHit, + pattern: "code_search_rollout_literal_no_match_20260405", + path: "clients/agent-runtime/src/tools", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "regex_small_hit", + query_kind: QueryKind::Regex, + result_shape: ResultShape::SmallHit, + pattern: "ToolResult", + path: "clients/agent-runtime/src/tools", + 
case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "regex_large_hit", + query_kind: QueryKind::Regex, + result_shape: ResultShape::LargeHit, + pattern: "output:", + path: "clients/agent-runtime/src/tools", + case_sensitive: true, + whole_word: false, + }, + BenchmarkCase { + id: "regex_no_hit", + query_kind: QueryKind::Regex, + result_shape: ResultShape::NoHit, + pattern: "code_search_rollout_regex_no_match_20260405", + path: "clients/agent-runtime/src/tools", + case_sensitive: true, + whole_word: false, + }, + ] +} + +fn create_fixture_workspace() -> Result { + let fixture_guard = TempDir::new().context("failed to create fixture tempdir")?; + let root = fixture_guard.path(); + fs::create_dir_all(root.join("src")).context("failed to create fixture src")?; + fs::create_dir_all(root.join("docs")).context("failed to create fixture docs")?; + + fs::write( + root.join("src/small.rs"), + [ + "fn fixture_regex_unique_target() {", + " let token = \"fixture_small_literal_unique\";", + "}", + ] + .join("\n") + + "\n", + ) + .context("failed to write fixture small.rs")?; + + let mut bulk = String::new(); + for index in 0..24 { + bulk.push_str(&format!( + "fn fixture_regex_bulk_case_{index}() {{ let token = \"fixture_large_literal_shared\"; }}\n" + )); + } + fs::write(root.join("src/bulk.rs"), bulk).context("failed to write fixture bulk.rs")?; + fs::write( + root.join("src/noise.txt"), + "this file is searched too\nfixture_large_literal_shared appears once here\n", + ) + .context("failed to write fixture noise.txt")?; + fs::write(root.join("docs/notes.md"), "documentation only\n") + .context("failed to write fixture docs")?; + + Ok(WorkspaceContext { + label: "deterministic fixture".to_string(), + kind: "fixture", + root: root.to_path_buf(), + fixture_guard: Some(fixture_guard), + cases: fixture_cases(), + }) +} + +fn repo_workspace_context(repo_path: &Path) -> Result { + let root = repo_path + .canonicalize() + .with_context(|| format!("failed to 
canonicalize repo path '{}'", repo_path.display()))?; + anyhow::ensure!( + root.exists(), + "repo path '{}' does not exist", + root.display() + ); + Ok(WorkspaceContext { + label: "current repo snapshot".to_string(), + kind: "repo_snapshot", + root, + fixture_guard: None, + cases: repo_cases(), + }) +} + +async fn run_workspace_suite( + workspace: &WorkspaceContext, + samples: usize, + cold_build_samples: usize, +) -> Result { + let _fixture_guard = workspace.fixture_guard.as_ref(); + let metadata = capture_environment_metadata(workspace)?; + let security = benchmark_security(&workspace.root); + let shell = benchmark_shell_tool(security.clone()); + let code_search = CodeSearchTool::new(security.clone()); + let index = WorkspaceTrigramIndex::for_workspace(&workspace.root); + let mut measurements = Vec::new(); + + for case in &workspace.cases { + let shell_summary = run_shell_baseline(&shell, case, samples).await?; + measurements.push(BenchmarkMeasurement { + case_id: case.id.to_string(), + execution_mode: ExecutionMode::ShellBaseline, + plan_mode: None, + plan_reason: "shell_grep_baseline".to_string(), + samples, + median_ms: percentile_ms(&shell_summary.durations, 50), + p95_ms: percentile_ms(&shell_summary.durations, 95), + build_median_ms: None, + search_median_ms: None, + total_median_ms: None, + parity_passed: None, + }); + + let no_index = + run_native_no_index(&code_search, &index, security.as_ref(), case, samples).await?; + measurements.push(as_measurement( + case.id, + ExecutionMode::NativeNoIndex, + &no_index, + &shell_summary.canonical, + )); + + let cold = run_native_cold_build( + &code_search, + &index, + security.as_ref(), + case, + cold_build_samples, + ) + .await?; + measurements.push(as_measurement( + case.id, + ExecutionMode::NativeColdBuild, + &cold, + &shell_summary.canonical, + )); + + let warm = + run_native_warm_index(&code_search, &index, security.as_ref(), case, samples).await?; + measurements.push(as_measurement( + case.id, + 
ExecutionMode::NativeWarmIndex, + &warm, + &shell_summary.canonical, + )); + } + + Ok(WorkspaceReport { + metadata, + matrix: workspace.cases.clone(), + measurements, + }) +} + +fn as_measurement( + case_id: &str, + execution_mode: ExecutionMode, + native: &NativeExecutionSummary, + shell_canonical: &[CanonicalLineMatch], +) -> BenchmarkMeasurement { + BenchmarkMeasurement { + case_id: case_id.to_string(), + execution_mode, + plan_mode: Some(native.plan_mode), + plan_reason: native.plan_reason.clone(), + samples: native.search_durations.len(), + median_ms: percentile_ms(&native.total_durations, 50), + p95_ms: percentile_ms(&native.total_durations, 95), + build_median_ms: (!native.build_durations.is_empty()) + .then(|| percentile_ms(&native.build_durations, 50)), + search_median_ms: Some(percentile_ms(&native.search_durations, 50)), + total_median_ms: Some(percentile_ms(&native.total_durations, 50)), + parity_passed: Some(native.canonical == shell_canonical), + } +} + +fn benchmark_security(workspace: &Path) -> Arc { + let mut policy = SecurityPolicy { + autonomy: AutonomyLevel::Full, + workspace_dir: workspace.to_path_buf(), + max_actions_per_hour: 1_000_000, + ..SecurityPolicy::default() + }; + if !policy + .allowed_commands + .iter() + .any(|command| command == "true") + { + policy.allowed_commands.push("true".to_string()); + } + Arc::new(policy) +} + +fn benchmark_shell_tool(security: Arc) -> ShellTool { + ShellTool::new( + security, + Arc::new(NativeRuntime::new()), + Arc::new(NoopSandbox), + ) +} + +async fn run_shell_baseline( + shell: &ShellTool, + case: &BenchmarkCase, + samples: usize, +) -> Result { + let mut durations = Vec::with_capacity(samples); + let warmup = run_shell_once(shell, case).await?; + let expected = warmup.canonical; + + for _ in 0..samples { + let run = run_shell_once(shell, case).await?; + anyhow::ensure!( + run.canonical == expected, + "shell results drifted for case {}", + case.id + ); + durations.push(run.duration); + } + + 
Ok(ShellExecutionSummary { + canonical: expected, + durations, + }) +} + +struct ShellRun { + canonical: Vec, + duration: Duration, +} + +async fn run_shell_once(shell: &ShellTool, case: &BenchmarkCase) -> Result { + let command = build_grep_command(case); + let started = Instant::now(); + let result = shell + .execute(json!({ "command": command, "approved": true })) + .await + .context("shell tool execution failed")?; + let duration = started.elapsed(); + anyhow::ensure!( + result.success, + "shell baseline failed for case {}: {:?}", + case.id, + result.error + ); + + Ok(ShellRun { + canonical: canonicalize_shell_output(&result.output)?, + duration, + }) +} + +async fn run_native_no_index( + code_search: &CodeSearchTool, + index: &WorkspaceTrigramIndex, + security: &SecurityPolicy, + case: &BenchmarkCase, + samples: usize, +) -> Result { + let mut search_durations = Vec::with_capacity(samples); + let mut total_durations = Vec::with_capacity(samples); + + clear_index_artifacts(index)?; + let warmup = run_native_search_once(code_search, index, security, case, false).await?; + let canonical = warmup.canonical; + let plan_mode = warmup.plan_mode; + let plan_reason = warmup.plan_reason; + + for _ in 0..samples { + clear_index_artifacts(index)?; + let run = run_native_search_once(code_search, index, security, case, false).await?; + anyhow::ensure!( + run.canonical == canonical.as_slice(), + "native no-index parity drift for case {}", + case.id + ); + anyhow::ensure!( + run.plan_mode == plan_mode, + "native no-index plan mode drift for case {}", + case.id + ); + anyhow::ensure!( + run.plan_reason == plan_reason, + "native no-index plan reason drift for case {}", + case.id + ); + search_durations.push(run.search_duration); + total_durations.push(run.total_duration); + } + + Ok(NativeExecutionSummary { + canonical, + search_durations, + build_durations: Vec::new(), + total_durations, + plan_mode, + plan_reason, + }) +} + +async fn run_native_cold_build( + code_search: 
&CodeSearchTool, + index: &WorkspaceTrigramIndex, + security: &SecurityPolicy, + case: &BenchmarkCase, + samples: usize, +) -> Result { + let mut build_durations = Vec::with_capacity(samples); + let mut search_durations = Vec::with_capacity(samples); + let mut total_durations = Vec::with_capacity(samples); + + clear_index_artifacts(index)?; + let warmup = run_native_search_once(code_search, index, security, case, true).await?; + let canonical = warmup.canonical; + let plan_mode = warmup.plan_mode; + let plan_reason = warmup.plan_reason; + + for _ in 0..samples { + clear_index_artifacts(index)?; + let run = run_native_search_once(code_search, index, security, case, true).await?; + anyhow::ensure!( + run.canonical == canonical.as_slice(), + "native cold-build parity drift for case {}", + case.id + ); + anyhow::ensure!( + run.plan_mode == plan_mode, + "native cold-build plan mode drift for case {}", + case.id + ); + anyhow::ensure!( + run.plan_reason == plan_reason, + "native cold-build plan reason drift for case {}", + case.id + ); + build_durations.push(run.build_duration.unwrap_or_default()); + search_durations.push(run.search_duration); + total_durations.push(run.total_duration); + } + + Ok(NativeExecutionSummary { + canonical, + search_durations, + build_durations, + total_durations, + plan_mode, + plan_reason, + }) +} + +async fn run_native_warm_index( + code_search: &CodeSearchTool, + index: &WorkspaceTrigramIndex, + security: &SecurityPolicy, + case: &BenchmarkCase, + samples: usize, +) -> Result { + let security_arc = Arc::new(security.clone()); + index + .refresh_or_rebuild(security_arc) + .context("failed to prepare warm index")?; + + let mut search_durations = Vec::with_capacity(samples); + let mut total_durations = Vec::with_capacity(samples); + let warmup = run_native_search_once(code_search, index, security, case, false).await?; + let expected = warmup.canonical; + let expected_plan_mode = warmup.plan_mode; + let expected_plan_reason = 
warmup.plan_reason; + + for _ in 0..samples { + let run = run_native_search_once(code_search, index, security, case, false).await?; + anyhow::ensure!( + run.canonical == expected, + "native warm-index parity drift for case {}", + case.id + ); + anyhow::ensure!( + run.plan_mode == expected_plan_mode, + "native warm-index plan mode drift for case {}", + case.id + ); + anyhow::ensure!( + run.plan_reason == expected_plan_reason, + "native warm-index plan reason drift for case {}", + case.id + ); + search_durations.push(run.search_duration); + total_durations.push(run.total_duration); + } + + Ok(NativeExecutionSummary { + canonical: expected, + search_durations, + build_durations: Vec::new(), + total_durations, + plan_mode: expected_plan_mode, + plan_reason: expected_plan_reason, + }) +} + +struct NativeRun { + canonical: Vec, + build_duration: Option, + search_duration: Duration, + total_duration: Duration, + plan_mode: PlanMode, + plan_reason: String, +} + +async fn run_native_search_once( + code_search: &CodeSearchTool, + index: &WorkspaceTrigramIndex, + security: &SecurityPolicy, + case: &BenchmarkCase, + build_before_search: bool, +) -> Result { + let total_started = Instant::now(); + let build_duration = if build_before_search { + let build_started = Instant::now(); + index + .refresh_or_rebuild(Arc::new(security.clone())) + .context("failed to build index for cold benchmark")?; + Some(build_started.elapsed()) + } else { + None + }; + + let plan = index + .plan_candidates(security, &case.candidate_request(), 10 * 1024 * 1024) + .context("candidate planning failed")?; + let plan_mode = label_plan_mode(plan.coverage.clone(), &plan.reason); + let plan_reason = plan.reason; + + let search_started = Instant::now(); + let result = code_search + .execute(case.tool_args()) + .await + .context("code_search execution failed")?; + let search_duration = search_started.elapsed(); + let total_duration = total_started.elapsed(); + + anyhow::ensure!( + result.success, + 
"code_search failed for case {}: {:?}", + case.id, + result.error + ); + + Ok(NativeRun { + canonical: canonicalize_native_result(&result)?, + build_duration, + search_duration, + total_duration, + plan_mode, + plan_reason, + }) +} + +fn build_grep_command(case: &BenchmarkCase) -> String { + let mut parts = vec![ + "grep".to_string(), + "-R".to_string(), + "-n".to_string(), + "-H".to_string(), + ]; + parts.push(match case.query_kind { + QueryKind::Literal => "-F".to_string(), + QueryKind::Regex => "-E".to_string(), + }); + if !case.case_sensitive { + parts.push("-i".to_string()); + } + if case.whole_word { + parts.push("-w".to_string()); + } + parts.push("-e".to_string()); + parts.push(shell_quote(case.pattern)); + parts.push("--".to_string()); + parts.push(shell_quote(case.path)); + + let grep = parts.join(" "); + format!("{grep} || true") +} + +fn shell_quote(value: &str) -> String { + format!("'{}'", value.replace('\'', "'\"'\"'")) +} + +fn label_plan_mode(coverage: CandidateCoverage, reason: &str) -> PlanMode { + match coverage { + CandidateCoverage::Complete => PlanMode::IndexedNarrowing, + CandidateCoverage::Partial => PlanMode::FallbackDiscoveryLiveVerification, + CandidateCoverage::Unavailable if reason == "index_unavailable" => { + PlanMode::IndexUnavailable + } + CandidateCoverage::Unavailable => PlanMode::FallbackDiscoveryLiveVerification, + } +} + +fn canonicalize_shell_output(output: &str) -> Result> { + let mut seen = BTreeMap::<(String, usize), String>::new(); + + for line in output.lines().filter(|line| !line.trim().is_empty()) { + let mut parts = line.splitn(3, ':'); + let file = parts + .next() + .context("shell output missing file")? + .trim_start_matches("./") + .to_string(); + let line_number = parts + .next() + .context("shell output missing line number")? + .parse::() + .with_context(|| format!("invalid shell line number in '{line}'"))?; + let content = parts + .next() + .context("shell output missing content")? 
+ .to_string(); + insert_canonical_line(&mut seen, file, line_number, content)?; + } + + Ok(seen + .into_iter() + .map(|((file, line), content)| CanonicalLineMatch { + file, + line, + content, + }) + .collect()) +} + +fn canonicalize_native_result(result: &ToolResult) -> Result> { + let structured = result + .structured + .as_ref() + .context("code_search result missing structured payload")?; + let matches = structured["matches"] + .as_array() + .context("code_search structured payload missing matches array")?; + let mut seen = BTreeMap::<(String, usize), String>::new(); + + for entry in matches { + let file = entry["file"] + .as_str() + .context("native match missing file")? + .trim_start_matches("./") + .to_string(); + let line = entry["line"] + .as_u64() + .context("native match missing line")? as usize; + let content = entry["content"] + .as_str() + .context("native match missing content")? + .to_string(); + insert_canonical_line(&mut seen, file, line, content)?; + } + + Ok(seen + .into_iter() + .map(|((file, line), content)| CanonicalLineMatch { + file, + line, + content, + }) + .collect()) +} + +fn insert_canonical_line( + seen: &mut BTreeMap<(String, usize), String>, + file: String, + line: usize, + content: String, +) -> Result<()> { + match seen.get(&(file.clone(), line)) { + Some(existing) if existing == &content => Ok(()), + Some(existing) => anyhow::bail!( + "conflicting canonical content for {}:{}: {:?} != {:?}", + file, + line, + existing, + content + ), + None => { + seen.insert((file, line), content); + Ok(()) + } + } +} + +fn percentile_ms(durations: &[Duration], percentile: usize) -> u64 { + if durations.is_empty() { + return 0; + } + let mut values: Vec = durations.iter().map(Duration::as_millis).collect(); + values.sort_unstable(); + let index = ((values.len() - 1) * percentile) / 100; + u64::try_from(values[index]).unwrap_or(u64::MAX) +} + +fn clear_index_artifacts(index: &WorkspaceTrigramIndex) -> Result<()> { + let state_dir = index + 
.db_path() + .parent() + .context("index path missing state directory")?; + if !state_dir.exists() { + return Ok(()); + } + + for suffix in ["", "-shm", "-wal"] { + let candidate = index.db_path().with_file_name(format!("index.db{suffix}")); + if candidate.exists() { + fs::remove_file(&candidate) + .with_context(|| format!("failed to remove '{}'", candidate.display()))?; + } + } + Ok(()) +} + +fn capture_environment_metadata(workspace: &WorkspaceContext) -> Result { + Ok(EnvironmentMetadata { + workspace_label: workspace.label.clone(), + workspace_root: workspace.root.clone(), + workspace_kind: workspace.kind, + file_count: count_files(&workspace.root)?, + os: env::consts::OS.to_string(), + arch: env::consts::ARCH.to_string(), + cpu: detect_cpu_descriptor(), + rust_profile: if cfg!(debug_assertions) { + "debug" + } else { + "release" + }, + benchmarked_at: chrono::Utc::now().to_rfc3339(), + commit_sha: if workspace.kind == "repo_snapshot" { + git_commit_sha(&workspace.root) + } else { + None + }, + }) +} + +fn count_files(root: &Path) -> Result { + let mut stack = vec![root.to_path_buf()]; + let mut count = 0_usize; + while let Some(path) = stack.pop() { + for entry in fs::read_dir(&path) + .with_context(|| format!("failed to read directory '{}'", path.display()))? 
+ { + let entry = entry?; + let entry_path = entry.path(); + if entry.file_type()?.is_dir() { + stack.push(entry_path); + } else { + count += 1; + } + } + } + Ok(count) +} + +fn detect_cpu_descriptor() -> String { + if cfg!(target_os = "macos") { + if let Ok(output) = std::process::Command::new("sysctl") + .args(["-n", "machdep.cpu.brand_string"]) + .output() + { + if output.status.success() { + let value = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if !value.is_empty() { + return value; + } + } + } + } + + match std::thread::available_parallelism() { + Ok(parallelism) => format!("{} logical CPUs", parallelism.get()), + Err(_) => "unknown".to_string(), + } +} + +fn git_commit_sha(root: &Path) -> Option { + let output = std::process::Command::new("git") + .args(["rev-parse", "--short", "HEAD"]) + .current_dir(root) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let value = String::from_utf8_lossy(&output.stdout).trim().to_string(); + (!value.is_empty()).then_some(value) +} + +fn print_workspace_report(report: &WorkspaceReport) { + println!( + "# code_search rollout benchmark — {}", + report.metadata.workspace_label + ); + println!(); + println!("- workspace_kind: {}", report.metadata.workspace_kind); + println!( + "- workspace_root: {}", + report.metadata.workspace_root.display() + ); + println!("- file_count: {}", report.metadata.file_count); + println!("- os: {}", report.metadata.os); + println!("- arch: {}", report.metadata.arch); + println!("- cpu: {}", report.metadata.cpu); + println!("- rust_profile: {}", report.metadata.rust_profile); + println!("- benchmarked_at: {}", report.metadata.benchmarked_at); + println!( + "- commit_sha: {}", + report + .metadata + .commit_sha + .clone() + .unwrap_or_else(|| "n/a".to_string()) + ); + println!(); + println!("## Benchmark matrix"); + println!(); + println!("| Case | Query kind | Result shape | Path | Pattern |"); + println!("| --- | --- | --- | --- | --- |"); + for case in 
&report.matrix { + println!( + "| {} | {} | {} | `{}` | `{}` |", + case.id, + case.query_kind.as_str(), + case.result_shape.as_str(), + case.path, + case.pattern.replace('`', "\\`") + ); + } + println!(); + println!("## Measurements"); + println!(); + println!("| Case | Mode | Plan mode | Plan reason | Samples | Median ms | P95 ms | Build median ms | Search median ms | Total median ms | Parity |"); + println!("| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | --- |"); + for measurement in &report.measurements { + println!( + "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |", + measurement.case_id, + measurement.execution_mode.as_str(), + measurement.plan_mode.map(PlanMode::as_str).unwrap_or("—"), + if measurement.plan_reason.is_empty() { + "—" + } else { + &measurement.plan_reason + }, + measurement.samples, + measurement.median_ms, + measurement.p95_ms, + measurement + .build_median_ms + .map(|value| value.to_string()) + .unwrap_or_else(|| "—".to_string()), + measurement + .search_median_ms + .map(|value| value.to_string()) + .unwrap_or_else(|| "—".to_string()), + measurement + .total_median_ms + .map(|value| value.to_string()) + .unwrap_or_else(|| "—".to_string()), + match measurement.parity_passed { + Some(true) => "pass", + Some(false) => "FAIL", + None => "baseline", + } + ); + } + println!(); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_grep_command_is_deterministic_for_literal_cases() { + let case = BenchmarkCase { + id: "literal_small_hit", + query_kind: QueryKind::Literal, + result_shape: ResultShape::SmallHit, + pattern: "needle", + path: "src", + case_sensitive: true, + whole_word: false, + }; + + assert_eq!( + build_grep_command(&case), + "grep -R -n -H -F -e 'needle' -- 'src' || true" + ); + } + + #[test] + fn build_grep_command_is_deterministic_for_regex_cases() { + let case = BenchmarkCase { + id: "regex_small_hit", + query_kind: QueryKind::Regex, + result_shape: ResultShape::SmallHit, + pattern: 
"output:", + path: "src/lib", + case_sensitive: false, + whole_word: true, + }; + + assert_eq!( + build_grep_command(&case), + "grep -R -n -H -E -i -w -e 'output:' -- 'src/lib' || true" + ); + } + + #[test] + fn label_plan_mode_marks_regex_fallback_after_index_build() { + assert_eq!( + label_plan_mode(CandidateCoverage::Unavailable, "query_regex_not_supported"), + PlanMode::FallbackDiscoveryLiveVerification + ); + assert_eq!( + label_plan_mode(CandidateCoverage::Unavailable, "index_unavailable"), + PlanMode::IndexUnavailable + ); + assert_eq!( + label_plan_mode(CandidateCoverage::Complete, "indexed_candidates_complete"), + PlanMode::IndexedNarrowing + ); + } + + #[test] + fn canonicalize_native_result_deduplicates_same_line_entries() { + let result = ToolResult { + success: true, + output: String::new(), + error: None, + structured: Some(json!({ + "matches": [ + { "file": "src/main.rs", "line": 7, "content": "let token = \"needle\";" }, + { "file": "src/main.rs", "line": 7, "content": "let token = \"needle\";" }, + { "file": "src/main.rs", "line": 9, "content": "needle again" } + ] + })), + }; + + let canonical = canonicalize_native_result(&result).unwrap(); + assert_eq!(canonical.len(), 2); + assert_eq!(canonical[0].line, 7); + assert_eq!(canonical[1].line, 9); + } + + #[test] + fn canonicalize_native_result_rejects_conflicting_duplicate_lines() { + let result = ToolResult { + success: true, + output: String::new(), + error: None, + structured: Some(json!({ + "matches": [ + { "file": "src/main.rs", "line": 7, "content": "first" }, + { "file": "src/main.rs", "line": 7, "content": "second" } + ] + })), + }; + + let error = canonicalize_native_result(&result).unwrap_err(); + assert!(format!("{error:#}").contains("conflicting canonical content")); + } + + #[tokio::test(flavor = "current_thread")] + async fn fixture_smoke_case_has_shell_native_parity_and_measurements() { + let workspace = create_fixture_workspace().unwrap(); + let security = 
benchmark_security(&workspace.root); + let shell = benchmark_shell_tool(security.clone()); + let code_search = CodeSearchTool::new(security.clone()); + let index = WorkspaceTrigramIndex::for_workspace(&workspace.root); + let case = fixture_cases() + .into_iter() + .find(|candidate| candidate.id == "literal_small_hit") + .unwrap(); + + let shell_summary = run_shell_baseline(&shell, &case, 1).await.unwrap(); + assert!(!shell_summary.canonical.is_empty()); + assert_eq!(shell_summary.durations.len(), 1); + + let no_index = run_native_no_index(&code_search, &index, security.as_ref(), &case, 1) + .await + .unwrap(); + assert_eq!(no_index.canonical, shell_summary.canonical); + assert_eq!(no_index.search_durations.len(), 1); + + let cold = run_native_cold_build(&code_search, &index, security.as_ref(), &case, 1) + .await + .unwrap(); + assert_eq!(cold.canonical, shell_summary.canonical); + assert_eq!(cold.build_durations.len(), 1); + assert_eq!(cold.search_durations.len(), 1); + + let warm = run_native_warm_index(&code_search, &index, security.as_ref(), &case, 1) + .await + .unwrap(); + assert_eq!(warm.canonical, shell_summary.canonical); + assert_eq!(warm.search_durations.len(), 1); + } +} diff --git a/clients/agent-runtime/src/search/tests.rs b/clients/agent-runtime/src/search/tests.rs index 1059e5f55..aac280640 100644 --- a/clients/agent-runtime/src/search/tests.rs +++ b/clients/agent-runtime/src/search/tests.rs @@ -683,6 +683,7 @@ fn candidate_planner_marks_regex_and_short_patterns_unavailable() { ) .unwrap(); assert_eq!(regex_plan.coverage, CandidateCoverage::Unavailable); + assert_eq!(regex_plan.reason, "query_regex_not_supported"); let short_plan = index .plan_candidates( diff --git a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md new file mode 100644 index 000000000..74cf1bc5d --- /dev/null +++ 
b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md @@ -0,0 +1,204 @@ +--- +title: code_search +summary: Rollout guidance, benchmark evidence, and current behavior for the native code_search tool. +owner: team-runtime +status: canonical +lastReviewed: 2026-04-05 +appliesTo: main +docType: guide +--- + +# `code_search` + +`code_search` is the native workspace search tool in the Rust runtime. It supports literal and regex correctness, returns structured matches, and always treats live verification against current file contents as authoritative. + +## Current verified behavior + +- Literal queries **may** use indexed candidate narrowing when a compatible workspace trigram index is available and fresh. +- Regex queries are **supported for correctness and safety**, but indexed candidate narrowing does **not** support regex in v1. +- When indexed planning cannot narrow candidates for regex, the planner returns `query_regex_not_supported` and execution is labeled as `fallback_discovery_live_verification`. +- When no compatible index exists, the planner returns `index_unavailable` and execution is labeled `index_unavailable`. +- Final matches always come from live verification of current file contents. Indexed candidates are never treated as authoritative results by themselves. + +## What this page is for + +This page is the rollout evidence page for Issue #360. It is intentionally separate from Criterion microbenchmarks in `clients/agent-runtime/benches/agent_benchmarks.rs`. + +- **Criterion benches**: low-level microbenchmarks only. +- **Rollout benchmark runner**: real shell baseline through `ShellTool`, native no-index, native cold-build, native warm-index, plus canonical line-match parity. 
+ +Runner: + +```bash +cargo run --manifest-path clients/agent-runtime/Cargo.toml \ + --example code_search_rollout_benchmark -- \ + --workspace both \ + --repo-path /path/to/repo \ + --samples 5 \ + --cold-build-samples 2 +``` + +## Benchmark methodology + +### Shell baseline + +The shell baseline uses the real `shell` tool path with `NativeRuntime`, which executes `grep` through `sh -c`. This preserves the same shell-tool wrapping and policy checks that current agent flows use. + +### Native modes + +- `native_no_index`: delete `state/code-search/index.db` before each measured run. +- `native_cold_build`: delete the index, time `refresh_or_rebuild()`, then time the first `code_search` search. +- `native_warm_index`: build or refresh once, then time repeated searches with the reusable index present. + +### Parity rules + +Parity compares shell and native results as canonical line matches: + +```text +file + line + full line content +``` + +This rollout harness only makes recommendation claims for rows where parity passes. 
+ +### Recorded environment + +#### Deterministic fixture workspace + +- workspace kind: `fixture` +- generated by: `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` +- file count: `4` +- benchmarked at: `2026-04-05T19:34:18.060248+00:00` +- host: macOS / aarch64 / Apple M2 Max +- Rust profile: `debug` + +#### Current repo snapshot + +- workspace kind: `repo_snapshot` +- workspace root: `/Users/acosta/Dev/corvus` +- commit SHA: `82fa4896` +- file count: `234763` +- benchmarked at: `2026-04-05T19:47:11.665525+00:00` +- host: macOS / aarch64 / Apple M2 Max +- Rust profile: `debug` + +## Benchmark matrix + +The rollout runner records these six representative cases in both workspaces: + +| Case | Query kind | Result shape | Notes | +| --- | --- | --- | --- | +| `literal_small_hit` | literal | small-hit | one or few matching lines | +| `literal_large_hit` | literal | large-hit | many matching lines | +| `literal_no_hit` | literal | no-hit | zero-match literal miss | +| `regex_small_hit` | regex | small-hit | regex mode, fallback-labeled | +| `regex_large_hit` | regex | large-hit | regex mode, fallback-labeled | +| `regex_no_hit` | regex | no-hit | regex miss, fallback-labeled | + +## Recorded results + +### Deterministic fixture workspace + +| Case | Mode | Plan mode | Reason | Samples | Median ms | P95 ms | Build median ms | Search median ms | Total median ms | Parity | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| literal_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 243 | 244 | — | — | — | baseline | +| literal_small_hit | native_no_index | index_unavailable | index_unavailable | 5 | 14 | 15 | — | 14 | 14 | pass | +| literal_small_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 37 | 37 | 4 | 16 | 37 | pass | +| literal_small_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 33 | 33 | — | 16 | 33 | pass | +| literal_large_hit | shell_baseline | — 
| shell_grep_baseline | 5 | 244 | 245 | — | — | — | baseline | +| literal_large_hit | native_no_index | index_unavailable | index_unavailable | 5 | 15 | 17 | — | 15 | 15 | pass | +| literal_large_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 36 | 36 | 4 | 16 | 36 | pass | +| literal_large_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 32 | 33 | — | 16 | 32 | pass | +| literal_no_hit | shell_baseline | — | shell_grep_baseline | 5 | 249 | 250 | — | — | — | baseline | +| literal_no_hit | native_no_index | index_unavailable | index_unavailable | 5 | 14 | 15 | — | 14 | 14 | pass | +| literal_no_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 37 | 37 | 4 | 16 | 37 | pass | +| literal_no_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 32 | 33 | — | 16 | 32 | pass | +| regex_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 251 | 251 | — | — | — | baseline | +| regex_small_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 15 | — | 14 | 14 | pass | +| regex_small_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 18 | 18 | 4 | 13 | 18 | pass | +| regex_small_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 14 | — | 14 | 14 | pass | +| regex_large_hit | shell_baseline | — | shell_grep_baseline | 5 | 245 | 246 | — | — | — | baseline | +| regex_large_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 14 | — | 14 | 14 | pass | +| regex_large_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 18 | 18 | 4 | 14 | 18 | pass | +| regex_large_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 14 | — | 14 | 14 | pass | +| regex_no_hit | shell_baseline | — | 
shell_grep_baseline | 5 | 244 | 244 | — | — | — | baseline | +| regex_no_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 15 | 16 | — | 15 | 15 | pass | +| regex_no_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 18 | 18 | 4 | 13 | 18 | pass | +| regex_no_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 13 | 14 | — | 13 | 13 | pass | + +### Current repo snapshot + +| Case | Mode | Plan mode | Reason | Samples | Median ms | P95 ms | Build median ms | Search median ms | Total median ms | Parity | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| literal_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 237 | 239 | — | — | — | baseline | +| literal_small_hit | native_no_index | index_unavailable | index_unavailable | 5 | 29 | 29 | — | 29 | 29 | pass | +| literal_small_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 27897 | 27897 | 27239 | 326 | 27897 | pass | +| literal_small_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 654 | 656 | — | 326 | 654 | pass | +| literal_large_hit | shell_baseline | — | shell_grep_baseline | 5 | 246 | 257 | — | — | — | baseline | +| literal_large_hit | native_no_index | index_unavailable | index_unavailable | 5 | 33 | 33 | — | 33 | 33 | pass | +| literal_large_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 27706 | 27706 | 27039 | 338 | 27706 | pass | +| literal_large_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 662 | 667 | — | 337 | 662 | pass | +| literal_no_hit | shell_baseline | — | shell_grep_baseline | 5 | 249 | 251 | — | — | — | baseline | +| literal_no_hit | native_no_index | index_unavailable | index_unavailable | 5 | 28 | 28 | — | 28 | 28 | pass | +| literal_no_hit | native_cold_build | indexed_narrowing | 
indexed_candidates_complete | 2 | 28071 | 28071 | 27385 | 342 | 28071 | pass | +| literal_no_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 655 | 656 | — | 327 | 655 | pass | +| regex_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 244 | 246 | — | — | — | baseline | +| regex_small_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 35 | 36 | — | 35 | 35 | pass | +| regex_small_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 27751 | 27751 | 27708 | 42 | 27751 | pass | +| regex_small_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 36 | 36 | — | 36 | 36 | pass | +| regex_large_hit | shell_baseline | — | shell_grep_baseline | 5 | 248 | 250 | — | — | — | baseline | +| regex_large_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 32 | 32 | — | 32 | 32 | pass | +| regex_large_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 27997 | 27997 | 27957 | 39 | 27997 | pass | +| regex_large_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 32 | 33 | — | 32 | 32 | pass | +| regex_no_hit | shell_baseline | — | shell_grep_baseline | 5 | 248 | 323 | — | — | — | baseline | +| regex_no_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 28 | 28 | — | 28 | 28 | pass | +| regex_no_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 26974 | 26974 | 26939 | 35 | 26974 | pass | +| regex_no_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 29 | 29 | — | 29 | 29 | pass | + +## Rollout guidance + +### SHOULD prefer native `code_search` + +Use native `code_search` for the measured regex cases and for fixture-scale searches where you want 
structured output plus verified parity. + +Why: + +- every published row passed canonical parity, +- regex rows are correctly labeled as fallback (`query_regex_not_supported` → discovery + live verification), not as regex-aware indexed narrowing, +- even in the large repo snapshot, the measured regex fallback rows that did not include an index build (`native_no_index`, `native_warm_index`) were materially faster than the shell baseline in this local debug run. + +### MAY prefer native `code_search` + +Use native `code_search` for literal searches when one of these is true: + +- you already have a warm reusable index for the workspace, +- you care more about structured output and verified offsets than raw shell latency, +- the workspace is small enough that index build and warm reuse costs stay low. + +### MAY keep shell / grep + +Shell remains a reasonable choice for one-off literal searches in a large repo snapshot when you do **not** already have a reusable index. + +Why: + +- the repo-snapshot cold-build rows are dominated by full-workspace index construction in this debug-profile run, +- repo-snapshot warm-index literal rows (`654`-`662 ms`) were still slower than the shell baseline (`237`-`249 ms`) for the measured scoped literal cases, +- this change does **not** deprecate shell search and does **not** claim native preference for unmeasured cases. + +## Limitations that matter for rollout + +- Regex candidate narrowing is **not** implemented in v1. +- `query_regex_not_supported` is a planner fallback reason, not a search error. +- The current rollout runner rebuilds the index for the whole workspace, so for scoped literal searches in very large repos the cold-build rows can look worse than shell. +- These recorded numbers are from a local `debug` profile run. Re-running in another environment may change the absolute numbers, but the docs should keep the plan-mode and parity labels intact. 
+ +## Future optimizations (non-v1) + +These are **not** required for the current rollout recommendation: + +- regex-aware indexed candidate narrowing, +- case-insensitive or whole-word-aware index narrowing, +- workspace-scope-aware partial index builds, +- release-profile reruns for published rollout refreshes, +- machine-readable benchmark artifact export in addition to the markdown tables above. + +Those items are future optimization work, not blockers for the current benchmark-backed rollout guidance. diff --git a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/core.md b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/core.md index b69038e25..6604b99ef 100644 --- a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/core.md +++ b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/core.md @@ -10,6 +10,21 @@ docType: reference Core tools provide the foundation for agent autonomy, allowing interaction with the host operating system and the local workspace. +## `code_search` + +Searches workspace files for literal or regex matches and returns both human-readable output and structured match data. + +- **Security Tier:** Read-Only (Safe). +- **Execution:** Native runtime tool with optional literal-query index narrowing and mandatory live verification against current file contents. +- **Current limitation:** Regex correctness is supported, but indexed candidate narrowing does **not** support regex in v1. Regex planning falls back with `query_regex_not_supported` to discovery plus live verification. +- **Rollout evidence:** See the dedicated [`code_search` rollout page](code-search.md) for measured shell-vs-native results, fallback labels, and recommendation guidance. + +### Parameters + +See the dedicated [`code_search` page](code-search.md) for the full parameter contract, benchmark methodology, and rollout guidance. 
+ +--- + ## `shell` Executes an arbitrary shell command within the workspace directory. diff --git a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/index.mdx b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/index.mdx index 45e136e2a..994a52df2 100644 --- a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/index.mdx +++ b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/index.mdx @@ -18,7 +18,7 @@ The tool system is built on a "deny-by-default" security model, ensuring that ev Built-in tools are organized into five primary categories: -- [**Core Tools**](core.md) — System command execution (`shell`) and workspace filesystem access (`file_read`, `file_write`). +- [**Core Tools**](core.md) — System command execution (`shell`), native workspace search (`code_search`), and workspace filesystem access (`file_read`, `file_write`). - [**Web Tools**](web.md) — Web browsing (`browser`), search (`web_search_tool`), and structured API calls (`http_request`). - [**Memory Tools**](memory.md) — Long-term persistence and retrieval of facts and preferences (`memory_store`, `memory_recall`). - [**Automation Tools**](automation.md) — Git repository management, scheduled tasks (Cron/Schedule), and notifications (`pushover`). diff --git a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md new file mode 100644 index 000000000..98ff7828b --- /dev/null +++ b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md @@ -0,0 +1,204 @@ +--- +title: code_search +summary: Guía de rollout, evidencia de benchmarks y comportamiento actual de la herramienta nativa code_search. 
+owner: team-runtime +status: canonical +lastReviewed: 2026-04-05 +appliesTo: main +docType: guide +--- + +# `code_search` + +`code_search` es la herramienta nativa de búsqueda en el workspace dentro del runtime Rust. Soporta corrección para búsquedas literales y regex, devuelve coincidencias estructuradas y siempre trata la verificación en vivo sobre el contenido actual de los archivos como la fuente de verdad. + +## Comportamiento verificado actual + +- Las consultas literales **pueden** usar reducción de candidatos por índice cuando existe un índice trigram compatible y fresco. +- Las consultas regex están **soportadas para corrección y seguridad**, pero la reducción de candidatos por índice **no** soporta regex en v1. +- Cuando el planner no puede reducir candidatos para regex, devuelve `query_regex_not_supported` y la ejecución se etiqueta como `fallback_discovery_live_verification`. +- Cuando no existe un índice compatible, el planner devuelve `index_unavailable` y la ejecución se etiqueta como `index_unavailable`. +- Las coincidencias finales siempre salen de la verificación en vivo del contenido actual. Los candidatos indexados por sí solos nunca son resultados autoritativos. + +## Para qué sirve esta página + +Esta página es la fuente de evidencia de rollout para el Issue #360. Está separada intencionalmente de los microbenchmarks de Criterion en `clients/agent-runtime/benches/agent_benchmarks.rs`. + +- **Benches de Criterion**: solo microbenchmarks de bajo nivel. +- **Runner de rollout**: baseline real por `ShellTool`, nativo sin índice, nativo con cold-build, nativo con warm-index y comparación de paridad por línea canónica. 
+ +Runner: + +```bash +cargo run --manifest-path clients/agent-runtime/Cargo.toml \ + --example code_search_rollout_benchmark -- \ + --workspace both \ + --repo-path /path/to/repo \ + --samples 5 \ + --cold-build-samples 2 +``` + +## Metodología del benchmark + +### Baseline de shell + +El baseline de shell usa la ruta real de la herramienta `shell` con `NativeRuntime`, que ejecuta `grep` a través de `sh -c`. Eso preserva el mismo wrapping y los mismos chequeos de política que usan hoy los flujos del agente. + +### Modos nativos + +- `native_no_index`: elimina `state/code-search/index.db` antes de cada corrida medida. +- `native_cold_build`: elimina el índice, mide `refresh_or_rebuild()` y luego mide la primera búsqueda de `code_search`. +- `native_warm_index`: construye o refresca una vez y luego mide búsquedas repetidas con el índice reusable presente. + +### Reglas de paridad + +La paridad compara resultados de shell y nativos como coincidencias canónicas por línea: + +```text +archivo + línea + contenido completo de la línea +``` + +Este harness de rollout solo hace recomendaciones sobre filas donde la paridad pasa. 
+ +### Entorno registrado + +#### Workspace fixture determinístico + +- tipo de workspace: `fixture` +- generado por: `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` +- cantidad de archivos: `4` +- fecha del benchmark: `2026-04-05T19:34:18.060248+00:00` +- host: macOS / aarch64 / Apple M2 Max +- perfil de Rust: `debug` + +#### Snapshot actual del repo + +- tipo de workspace: `repo_snapshot` +- raíz del workspace: `/Users/acosta/Dev/corvus` +- commit SHA: `82fa4896` +- cantidad de archivos: `234763` +- fecha del benchmark: `2026-04-05T19:47:11.665525+00:00` +- host: macOS / aarch64 / Apple M2 Max +- perfil de Rust: `debug` + +## Matriz de benchmark + +El runner de rollout registra estos seis casos representativos en ambos workspaces: + +| Caso | Tipo de consulta | Forma del resultado | Nota | +| --- | --- | --- | --- | +| `literal_small_hit` | literal | small-hit | una o pocas líneas coincidentes | +| `literal_large_hit` | literal | large-hit | muchas líneas coincidentes | +| `literal_no_hit` | literal | no-hit | literal sin coincidencias | +| `regex_small_hit` | regex | small-hit | modo regex, etiquetado como fallback | +| `regex_large_hit` | regex | large-hit | modo regex, etiquetado como fallback | +| `regex_no_hit` | regex | no-hit | regex sin coincidencias, etiquetado como fallback | + +## Resultados registrados + +### Workspace fixture determinístico + +| Caso | Modo | Modo de plan | Razón | Samples | Median ms | P95 ms | Build median ms | Search median ms | Total median ms | Paridad | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| literal_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 243 | 244 | — | — | — | baseline | +| literal_small_hit | native_no_index | index_unavailable | index_unavailable | 5 | 14 | 15 | — | 14 | 14 | pass | +| literal_small_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 37 | 37 | 4 | 16 | 37 | pass | +| literal_small_hit | native_warm_index 
| indexed_narrowing | indexed_candidates_complete | 5 | 33 | 33 | — | 16 | 33 | pass | +| literal_large_hit | shell_baseline | — | shell_grep_baseline | 5 | 244 | 245 | — | — | — | baseline | +| literal_large_hit | native_no_index | index_unavailable | index_unavailable | 5 | 15 | 17 | — | 15 | 15 | pass | +| literal_large_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 36 | 36 | 4 | 16 | 36 | pass | +| literal_large_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 32 | 33 | — | 16 | 32 | pass | +| literal_no_hit | shell_baseline | — | shell_grep_baseline | 5 | 249 | 250 | — | — | — | baseline | +| literal_no_hit | native_no_index | index_unavailable | index_unavailable | 5 | 14 | 15 | — | 14 | 14 | pass | +| literal_no_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 37 | 37 | 4 | 16 | 37 | pass | +| literal_no_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 32 | 33 | — | 16 | 32 | pass | +| regex_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 251 | 251 | — | — | — | baseline | +| regex_small_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 15 | — | 14 | 14 | pass | +| regex_small_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 18 | 18 | 4 | 13 | 18 | pass | +| regex_small_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 14 | — | 14 | 14 | pass | +| regex_large_hit | shell_baseline | — | shell_grep_baseline | 5 | 245 | 246 | — | — | — | baseline | +| regex_large_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 14 | — | 14 | 14 | pass | +| regex_large_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 18 | 18 | 4 | 14 | 18 | pass | +| regex_large_hit | native_warm_index | 
fallback_discovery_live_verification | query_regex_not_supported | 5 | 14 | 14 | — | 14 | 14 | pass | +| regex_no_hit | shell_baseline | — | shell_grep_baseline | 5 | 244 | 244 | — | — | — | baseline | +| regex_no_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 15 | 16 | — | 15 | 15 | pass | +| regex_no_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 18 | 18 | 4 | 13 | 18 | pass | +| regex_no_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 13 | 14 | — | 13 | 13 | pass | + +### Snapshot actual del repo + +| Caso | Modo | Modo de plan | Razón | Samples | Median ms | P95 ms | Build median ms | Search median ms | Total median ms | Paridad | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| literal_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 237 | 239 | — | — | — | baseline | +| literal_small_hit | native_no_index | index_unavailable | index_unavailable | 5 | 29 | 29 | — | 29 | 29 | pass | +| literal_small_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 27897 | 27897 | 27239 | 326 | 27897 | pass | +| literal_small_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 654 | 656 | — | 326 | 654 | pass | +| literal_large_hit | shell_baseline | — | shell_grep_baseline | 5 | 246 | 257 | — | — | — | baseline | +| literal_large_hit | native_no_index | index_unavailable | index_unavailable | 5 | 33 | 33 | — | 33 | 33 | pass | +| literal_large_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 27706 | 27706 | 27039 | 338 | 27706 | pass | +| literal_large_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 662 | 667 | — | 337 | 662 | pass | +| literal_no_hit | shell_baseline | — | shell_grep_baseline | 5 | 249 | 251 | — | — | — | baseline | +| literal_no_hit | native_no_index | 
index_unavailable | index_unavailable | 5 | 28 | 28 | — | 28 | 28 | pass | +| literal_no_hit | native_cold_build | indexed_narrowing | indexed_candidates_complete | 2 | 28071 | 28071 | 27385 | 342 | 28071 | pass | +| literal_no_hit | native_warm_index | indexed_narrowing | indexed_candidates_complete | 5 | 655 | 656 | — | 327 | 655 | pass | +| regex_small_hit | shell_baseline | — | shell_grep_baseline | 5 | 244 | 246 | — | — | — | baseline | +| regex_small_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 35 | 36 | — | 35 | 35 | pass | +| regex_small_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 27751 | 27751 | 27708 | 42 | 27751 | pass | +| regex_small_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 36 | 36 | — | 36 | 36 | pass | +| regex_large_hit | shell_baseline | — | shell_grep_baseline | 5 | 248 | 250 | — | — | — | baseline | +| regex_large_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 32 | 32 | — | 32 | 32 | pass | +| regex_large_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 27997 | 27997 | 27957 | 39 | 27997 | pass | +| regex_large_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 32 | 33 | — | 32 | 32 | pass | +| regex_no_hit | shell_baseline | — | shell_grep_baseline | 5 | 248 | 323 | — | — | — | baseline | +| regex_no_hit | native_no_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 28 | 28 | — | 28 | 28 | pass | +| regex_no_hit | native_cold_build | fallback_discovery_live_verification | query_regex_not_supported | 2 | 26974 | 26974 | 26939 | 35 | 26974 | pass | +| regex_no_hit | native_warm_index | fallback_discovery_live_verification | query_regex_not_supported | 5 | 29 | 29 | — | 29 | 29 | pass | + +## Guía de rollout + +### SHOULD 
prefer native `code_search` + +Usa `code_search` nativo para los casos regex medidos y para búsquedas de tamaño fixture donde quieras salida estructurada con paridad verificada. + +Por qué: + +- todas las filas publicadas pasaron la paridad canónica, +- las filas regex están etiquetadas correctamente como fallback (`query_regex_not_supported` → discovery + live verification), no como reducción regex-aware por índice, +- incluso en el snapshot grande del repo, las filas regex en fallback fueron materialmente más rápidas que el baseline de shell en esta corrida local en `debug`. + +### MAY prefer native `code_search` + +Usa `code_search` nativo para búsquedas literales cuando se cumpla alguna de estas condiciones: + +- ya tienes un índice reusable y warm para el workspace, +- te importa más la salida estructurada y los offsets verificados que la latencia cruda del shell, +- el workspace es lo bastante pequeño para que el costo de build y reuse del índice siga siendo bajo. + +### MAY keep shell / grep + +Shell sigue siendo una opción razonable para búsquedas literales one-shot en un repo grande cuando **no** tienes un índice reusable listo. + +Por qué: + +- las filas cold-build del snapshot del repo están dominadas por la construcción del índice del workspace completo en esta corrida `debug`, +- las filas literales warm-index del snapshot (`654`-`662 ms`) siguieron siendo más lentas que el baseline de shell (`237`-`249 ms`) para los casos literales medidos, +- este cambio **no** depreca la búsqueda por shell y **no** reclama preferencia nativa para casos no medidos. + +## Limitaciones que importan para rollout + +- La reducción de candidatos por índice para regex **no** está implementada en v1. +- `query_regex_not_supported` es una razón de fallback del planner, no un error de búsqueda. +- El runner actual reconstruye el índice para todo el workspace, así que búsquedas literales acotadas dentro de repos muy grandes pueden verse peor que shell en las filas cold-build. 
+- Estos números vienen de una corrida local en perfil `debug`. Si se re-ejecuta en otro entorno, los números absolutos pueden cambiar, pero las etiquetas de modo de plan y paridad deben mantenerse. + +## Optimizaciones futuras (non-v1) + +Estos puntos **no** son requeridos para la recomendación actual de rollout: + +- reducción de candidatos regex-aware por índice, +- reducción por índice para búsqueda case-insensitive o whole-word, +- builds parciales del índice conscientes del scope del workspace, +- reruns en perfil `release` para refrescar la evidencia publicada, +- exportar un artefacto machine-readable además de las tablas markdown. + +Esos puntos son trabajo futuro de optimización, no bloqueadores para la guía actual respaldada por benchmarks. diff --git a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/core.md b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/core.md index 2d0e0ca80..6d00ed797 100644 --- a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/core.md +++ b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/core.md @@ -12,6 +12,21 @@ docType: reference Las herramientas core proporcionan la base para la autonomía del agente, permitiendo la interacción con el sistema operativo host y el espacio de trabajo local. +## `code_search` + +Busca coincidencias literales o regex dentro del workspace y devuelve tanto salida legible para humanos como datos estructurados de coincidencias. + +- **Nivel de Seguridad:** Solo Lectura (Segura). +- **Ejecución:** Herramienta nativa del runtime con reducción opcional de candidatos por índice para consultas literales y verificación obligatoria en vivo sobre el contenido actual. +- **Limitación actual:** La corrección regex está soportada, pero la reducción de candidatos por índice **no** soporta regex en v1. El planning regex cae en fallback con `query_regex_not_supported` hacia discovery más live verification. 
+- **Evidencia de rollout:** Consulta la página dedicada de [`code_search`](code-search.md) para resultados medidos shell-vs-native, etiquetas de fallback y guía de recomendación. + +### Parámetros + +Consulta la página dedicada de [`code_search`](code-search.md) para el contrato completo de parámetros, la metodología del benchmark y la guía de rollout. + +--- + ## `shell` Ejecuta un comando de shell arbitrario dentro del directorio del workspace. diff --git a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/index.mdx b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/index.mdx index 2d70e8145..c4a168c38 100644 --- a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/index.mdx +++ b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/index.mdx @@ -18,7 +18,7 @@ El sistema de herramientas se basa en un modelo de seguridad de "denegación por Las herramientas integradas se organizan en cinco categorías principales: -- [**Herramientas Core**](core.md) — Ejecución de comandos del sistema (`shell`) y acceso al sistema de archivos del workspace (`file_read`, `file_write`). +- [**Herramientas Core**](core.md) — Ejecución de comandos del sistema (`shell`), búsqueda nativa en el workspace (`code_search`) y acceso al sistema de archivos del workspace (`file_read`, `file_write`). - [**Herramientas Web**](web.md) — Navegación web (`browser`), búsqueda (`web_search_tool`) y llamadas estructuradas a APIs (`http_request`). - [**Herramientas de Memoria**](memory.md) — Persistencia a largo plazo y recuperación de hechos y preferencias (`memory_store`, `memory_recall`). - [**Herramientas de Automatización**](automation.md) — Gestión de repositorios Git, tareas programadas (Cron/Schedule) y notificaciones (`pushover`). 
diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md new file mode 100644 index 000000000..9b76e3f06 --- /dev/null +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md @@ -0,0 +1,392 @@ +# Design: `code_search` Benchmark Rollout and Evidence-Based Guidance + +## Technical Approach + +This change will add a dedicated rollout benchmark workflow for `code_search`, publish measured +results in runtime-tool documentation, and derive rollout guidance from those measurements without +changing search semantics or planner behavior. + +The implementation will treat rollout as an evidence pipeline with four parts: + +1. a deterministic benchmark runner that exercises the current shell baseline and native + `code_search` path against the same scenario matrix, +2. explicit native execution states (`no index`, `cold build`, `warm index`) so reviewers can see + the cost of first use versus reuse, +3. a correctness-parity pass that proves the benchmarked shell and native cases return the same + canonical match set before any preference claim is made, +4. documentation updates that record methodology, results, current fallback behavior, rollout + guidance, and deferred work. + +This maps directly to the spec by keeping v1 focused on benchmark coverage, recorded results, +current-behavior documentation, measured rollout guidance, and strict separation of deferred +optimizations. + +## Architecture Decisions + +### Decision: Use a dedicated rollout benchmark runner instead of only extending Criterion microbenches + +**Choice**: Add a dedicated Rust benchmark runner for rollout evidence, while leaving the existing +Criterion microbench file as a lower-level performance harness. + +**Alternatives considered**: +- Extend `clients/agent-runtime/benches/agent_benchmarks.rs` only. 
+- Measure ad hoc shell and native commands manually and paste results into docs. + +**Rationale**: rollout evidence needs stateful setup/teardown, shell-tool invocation, explicit +index-state control, correctness checks, and a docs-friendly summary. Criterion is good for hot +loops, but awkward for “delete index → build index → search → compare → label plan mode” flows. +Ad hoc measurements would not be reproducible enough for Issue #360. + +### Decision: Benchmark the real baseline as the `shell` tool executing grep through the native runtime + +**Choice**: Treat the shell comparator as the current generic `shell` tool path, using `ShellTool` +with `NativeRuntime`, which in turn executes commands through `sh -c`, and benchmark grep-based +commands through that path. + +**Alternatives considered**: +- Benchmark raw `grep` directly without the shell tool. +- Compare only native `code_search` states and omit the shell baseline. + +**Rationale**: the rollout decision is about replacing or preferring the current agent workflow, +not about comparing Rust code to a hypothetical external command. Using the actual shell-tool path + preserves command-validation, shell startup, and runtime wrapping overhead that users experience +today. + +### Decision: Model native execution as explicit evidence states + +**Choice**: Report native results in three separate states: +- **No index**: `state/code-search/index.db` absent; native search executes with fallback behavior. +- **Cold build**: no reusable index exists, `refresh_or_rebuild()` is timed, then the first search + is timed and reported as `build_ms`, `search_ms`, and `total_ms`. +- **Warm index**: a compatible index already exists; search is timed without rebuild work. + +**Alternatives considered**: +- Report a single “native” number. +- Benchmark only warm-index performance. + +**Rationale**: the rollout question is operational. A single number would hide the difference +between first-use cost and steady-state reuse. 
The current implementation already exposes the +necessary primitives through `WorkspaceTrigramIndex::refresh_or_rebuild()` and +`WorkspaceTrigramIndex::plan_candidates()`. + +### Decision: Gate rollout claims on canonical correctness parity for the benchmarked overlap set + +**Choice**: Add a parity comparator that canonicalizes shell and native outputs to the same +line-oriented match model and require those benchmark cases to agree before documentation makes a +preference claim. + +**Alternatives considered**: +- Use timing only and assume correctness from existing tests. +- Compare raw text output strings from shell and native tools. + +**Rationale**: rollout guidance must be evidence-based, and timing without parity would be +misleading. Raw text output is not stable enough because `code_search` returns structured matches +and grep returns line-oriented text. Canonical comparison keeps the measurement fair while leaving +broader regex semantics to the existing specs and tests. + +### Decision: Keep deferred planner/search optimizations explicitly out of v1 + +**Choice**: Document deferred items in a separate “Future optimizations / non-v1” section and do +not let them affect rollout success criteria. + +**Alternatives considered**: +- Fold regex-aware narrowing and related planner improvements into the rollout work. +- Mention future ideas inline with the recommendation. + +**Rationale**: the spec is about benchmarking and documentation for today’s verified behavior. +Mixing future planner work into v1 would blur the decision boundary and weaken the evidence trail. + +## Data Flow + +### Benchmark execution flow + +```text +Scenario matrix + │ + ├── Shell baseline case ──→ ShellTool ──→ NativeRuntime ──→ sh -c "grep ..." 
+ │ + └── Native case ──→ WorkspaceTrigramIndex::plan_candidates() + │ + ├── no index / unsupported query ──→ discovery + live verification + └── compatible literal query ──────→ indexed candidate narrowing + live verification +``` + +### Evidence publication flow + +```text +Benchmark runner + │ + ├── measurements by case/state + ├── plan-mode labels (indexed vs fallback) + ├── parity result per case + ▼ +Recorded results table + │ + ├── docs/clients/agent-runtime/tools/code-search.md + └── docs/es/clients/agent-runtime/tools/code-search.md + │ + ▼ +Rollout guidance derived from measured evidence +``` + +### Sequence diagram for one benchmark case + +```text +Runner -> Workspace setup: prepare fixture or repo snapshot metadata +Runner -> Native state prep: delete index / build index / reuse index +Runner -> Planner: plan_candidates(request) +Planner --> Runner: coverage + reason +Runner -> ShellTool: execute(grep command) [shell baseline] +ShellTool --> Runner: shell output + timing +Runner -> CodeSearchTool: execute(args) [native path] +CodeSearchTool --> Runner: structured matches + timing +Runner -> Comparator: canonicalize(shell, native) +Comparator --> Runner: parity pass/fail +Runner -> Reporter: write measurement row with state, plan mode, parity, timings +``` + +## Benchmark Environments and States + +The benchmark matrix will cover two workspace environments: + +1. **Deterministic fixture workspace** + - generated during the run, + - tuned so benchmark cases have stable hit/no-hit characteristics, + - constrained so parity cases have at most one logical match per line. + +2. **Repository snapshot workspace** + - the current checkout used for rollout measurement, + - recorded with commit SHA, file count, and benchmark date, + - used to make the recommendation relevant to real Corvus usage. 
+ +Each benchmark case will run in these native states when applicable: + +| State | Preparation | What it represents | +|------|-------------|--------------------| +| `shell_baseline` | No native state prep; execute shell grep through `ShellTool` | Current generic search workflow | +| `native_no_index` | Remove `state/code-search/index.db` before each measured run | Native search with no reusable index | +| `native_cold_build` | Remove index, time `refresh_or_rebuild()`, then time first `code_search` | First-use cost when native indexing must be built | +| `native_warm_index` | Build or refresh index once before measurement loop | Steady-state native reuse | + +Regex scenarios will still be run in `native_cold_build` and `native_warm_index`, but the runner +will label them as fallback cases because `plan_candidates()` returns +`query_regex_not_supported` and execution continues through discovery plus live verification. + +## Comparison Method + +Each benchmark row will be defined by: + +- workspace environment, +- query kind (`literal` or `regex`), +- result shape (`small-hit`, `large-hit`, `no-hit`), +- optional scope/filter settings (`path`, `include`, `exclude`), +- execution mode (`shell_baseline`, `native_no_index`, `native_cold_build`, `native_warm_index`). + +The runner will measure repeated samples per row and report: + +- `samples`, +- `median_ms`, +- `p95_ms`, +- for cold builds: `build_median_ms`, `search_median_ms`, `total_median_ms`, +- `plan_mode` (`indexed_narrowing`, `fallback_discovery_live_verification`, or + `index_unavailable`), +- `plan_reason` (for example `query_regex_not_supported` or `index_unavailable`). + +Measurement rules: + +1. Use the same immutable workspace contents for shell and native comparisons within a case. +2. Perform one untimed warm-up per row before collecting timed samples. +3. Report medians and p95s instead of means to reduce skew from filesystem jitter. +4. 
Keep shell and native command shapes equivalent at the benchmark-case level. +5. Record environment metadata alongside the results: OS, CPU, Rust profile, workspace type, + commit SHA (for repo snapshot), and file count. + +## Correctness-Parity Method + +Parity is required for benchmarked comparisons but intentionally scoped to the overlap between the +current shell grep workflow and the current `code_search` behavior. + +### Canonical comparison model + +Both engines will be normalized to: + +```rust +struct CanonicalLineMatch { + file: String, + line: usize, + content: String, +} +``` + +- Shell output will be parsed from `grep -nH` style lines. +- Native output will be derived from `structured.matches`, collapsed to unique line entries. + +### Parity scope rules + +- Benchmark parity cases MUST use queries supported by both Rust regex and the chosen grep mode. +- Regex parity claims are limited to the shared syntax subset (no backreferences, lookaround, or + other features the two engines do not both support in this workflow). +- Fixture cases MUST avoid multiple relevant matches on the same line so line-level comparison + remains exact. +- Existing unit/spec coverage remains the authority for native-only semantics and fallback + behavior beyond the benchmark overlap set. + +### Parity outcome handling + +- A benchmark row is **eligible for rollout guidance** only if parity passes. +- A parity failure is recorded in the results table and blocks any “native SHOULD be preferred” + statement for that row’s query class until explained or corrected. + +## Rollout Guidance Derivation + +Documentation will derive guidance from the recorded results using a fixed rubric rather than +narrative judgment alone. + +### Gate 1: correctness + +No recommendation is made unless the benchmarked class has passed parity for the shell/native +comparison rows that support the claim. 
+ +### Gate 2: execution-mode interpretation + +- If `plan_mode = indexed_narrowing`, the recommendation may cite indexed candidate narrowing. +- If `plan_mode = fallback_discovery_live_verification`, the recommendation MUST describe the row + as fallback behavior and MUST NOT imply regex-aware narrowing. + +### Gate 3: performance bucket + +For each benchmark class, compare native median time to shell median time: + +- **Native win**: native median is at least 20% faster (`<= 0.8x` shell median). +- **Near parity**: native median is within ±20% of shell (`> 0.8x` and `<= 1.2x`). +- **Shell win**: native median is more than 20% slower (`> 1.2x` shell median). + +### Guidance rules + +- `SHOULD prefer native` when parity passes and native is a **Native win**, or when native is + **Near parity** and the case benefits from structured output plus workspace-safe verification. +- `MAY prefer native` when parity passes but the result depends on warm-index reuse and cold/no-index + costs are materially worse. +- `MAY keep shell` when parity passes but shell is a **Shell win**, especially for ephemeral, + one-off, or regex-heavy fallback scenarios. +- `DO NOT claim native preference` for unmeasured or failed-parity cases. + +This rubric lets the docs explain recommendations with measured evidence while keeping operator +tradeoffs explicit. + +## File Changes + +| File | Action | Description | +|------|--------|-------------| +| `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` | Create | Dedicated rollout benchmark runner that prepares scenarios, executes shell/native states, records timings, labels plan mode, and checks parity. | +| `clients/agent-runtime/benches/agent_benchmarks.rs` | Modify | Add a short note or companion entrypoint reference if needed so low-level microbenches and rollout benchmarks are clearly separated. 
| +| `clients/agent-runtime/docs/design/code-search-tool.md` | Modify | Align the internal design doc with the implemented indexed-planning reality and point readers to the canonical rollout documentation for current behavior. | +| `docs/clients/agent-runtime/tools/code-search.md` | Create | Canonical English documentation for current behavior, benchmark methodology, recorded results, rollout guidance, fallback reasons, and deferred optimizations. | +| `docs/clients/agent-runtime/tools/core.md` | Modify | Add `code_search` to the core tools reference and link to the dedicated page. | +| `docs/clients/agent-runtime/tools/index.mdx` | Modify | Link the dedicated `code_search` page from the tools index. | +| `docs/es/clients/agent-runtime/tools/code-search.md` | Create | Spanish companion page with the same behavior limits, benchmark summary, rollout guidance, and deferred-work separation. | +| `docs/es/clients/agent-runtime/tools/core.md` | Modify | Add the Spanish `code_search` reference and link to the dedicated page. | +| `docs/es/clients/agent-runtime/tools/index.mdx` | Modify | Add the Spanish navigation link for the dedicated `code_search` page. | + +## Interfaces / Contracts + +The rollout benchmark runner will use explicit scenario and result structs so measurements can be +reported consistently. 
+```rust +enum QueryKind { + Literal, + Regex, +} + +enum ResultShape { + SmallHit, + LargeHit, + NoHit, +} + +enum ExecutionMode { + ShellBaseline, + NativeNoIndex, + NativeColdBuild, + NativeWarmIndex, +} + +enum PlanMode { + IndexedNarrowing, + FallbackDiscoveryLiveVerification, + IndexUnavailable, +} + +struct BenchmarkCase { + id: &'static str, + query_kind: QueryKind, + result_shape: ResultShape, + pattern: String, + is_regex: bool, + path: String, + include: Vec<String>, + exclude: Vec<String>, + case_sensitive: bool, + whole_word: bool, +} + +struct BenchmarkMeasurement { + case_id: String, + execution_mode: ExecutionMode, + plan_mode: PlanMode, + plan_reason: String, + samples: usize, + median_ms: u64, + p95_ms: u64, + build_median_ms: Option<u64>, + search_median_ms: Option<u64>, + parity_passed: bool, +} +``` + +Shell command construction contract: + +- literal cases use grep fixed-string mode, +- regex cases use grep extended-regex mode, +- path/include filters are mapped only for the supported benchmark overlap set, +- command generation is deterministic and test-covered. + +## Testing Strategy + +| Layer | What to Test | Approach | +|-------|--------------|----------| +| Unit | Shell command builder for literal/regex/path/include cases | Example-local tests that verify generated grep commands match the intended benchmark case shape. | +| Unit | Plan-mode labeling | Tests that map `CandidateCoverage` + reason (`query_regex_not_supported`, `index_unavailable`, etc.) to the reported benchmark label. | +| Unit | Canonical comparator | Feed shell lines and native structured matches into the comparator and verify pass/fail behavior, including duplicate-line rejection. | +| Integration | Native state preparation | Run a temp-workspace case that proves `native_no_index`, `native_cold_build`, and `native_warm_index` produce the expected planner state transitions. 
| +| Integration | Regex fallback labeling | Use a built index plus a regex case and verify the benchmark row is labeled `fallback_discovery_live_verification` with reason `query_regex_not_supported`. | +| Integration | End-to-end benchmark row | Execute one small fixture case through both shell and native modes and assert parity plus non-empty measurement output. | +| Docs | Recorded results consistency | Manual review during this change: benchmark tables in English and Spanish docs must describe the same measured recommendation and explicitly mark deferred items as non-v1. | + +## Migration / Rollout + +No migration is required. + +Rollout is documentation-led: + +1. implement the benchmark runner, +2. execute the benchmark matrix on the chosen environments, +3. record the methodology and results in the dedicated docs page, +4. derive the recommendation using the rubric above, +5. keep deferred optimizations in a separate section clearly labeled out of scope for v1. + +The docs will explicitly separate: + +- **Current measured recommendation**, +- **Current limitations and fallback reasons**, +- **Future optimizations (non-v1)** such as regex-aware index narrowing, case-insensitive index + narrowing, whole-word index narrowing, or other planner/search-engine changes. + +## Open Questions + +- [ ] Confirm whether the rollout benchmark should check in a machine-readable artifact in addition + to the recorded markdown tables, or whether the docs page alone is the repository source of + truth for measured results. +- [ ] Confirm the exact benchmark sample counts for cold-build versus warm-index rows so runtime + cost stays practical in CI/local execution without weakening the evidence. 
diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/proposal.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/proposal.md new file mode 100644 index 000000000..3afcec217 --- /dev/null +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/proposal.md @@ -0,0 +1,78 @@ +# Proposal: `code_search` Benchmark Rollout and Documentation + +**Issue**: #360 + +## Intent + +Corvus now has a native `code_search` path with verified regex correctness and safety, plus an indexed candidate-planning path for supported literal queries. However, regex requests do not yet participate in indexed narrowing: `plan_candidates()` returns `query_regex_not_supported`, and regex searches safely fall back to workspace discovery plus live verification. + +This change exists to measure whether the native path is worth preferring over the current shell/grep-based workflow, document exactly when and why fallback happens, and provide rollout guidance grounded in representative data rather than assumptions. + +## Scope + +### In Scope +- Define representative benchmark coverage for `code_search` versus the current shell/grep-based search workflow across realistic repository query shapes. +- Capture benchmark results for both cold-index and warm-index native search behavior, including regex requests that currently fall back from index planning to discovery plus live verification. +- Document native `code_search` usage, current limitations, safety/correctness guarantees, and fallback behavior for unsupported index-planning query shapes. +- Document rollout guidance that explains when the native tool should be preferred over shell search and what evidence supports that recommendation. +- Separate non-v1 optimizations and future search-performance ideas from the rollout decision so the initial recommendation is based on today's verified behavior. 
+ +### Out of Scope +- Changing regex semantics, safety rules, or the current fallback behavior itself. +- Adding regex-aware index planning, case-insensitive index narrowing, whole-word index narrowing, or other search-engine optimizations as part of this change. +- Replacing live verification with index-only matches. +- Broad redesign of the `code_search` API or shell tool behavior beyond the benchmark/documentation work needed for rollout. + +## Approach + +Treat this as an evidence-and-guidance change, not a search-engine rewrite. + +1. Establish a benchmark matrix that compares native `code_search` against the shell/grep-based workflow for representative literal and regex searches, including small-hit, large-hit, and no-hit cases. +2. Measure native behavior in at least two index states: cold/no reusable index and warm/compatible reusable index. Regex cases must be called out explicitly because they currently bypass indexed narrowing and execute through discovery plus live verification. +3. Publish benchmark results and methodology in repository documentation so reviewers can reproduce or interpret the rollout recommendation. +4. Update runtime tool documentation to explain tool usage, known limits, index-planning support boundaries, authoritative live verification, and current fallback reasons such as `query_regex_not_supported`. +5. Add a separate future-optimizations section so follow-up opportunities are visible without being mistaken for v1 rollout requirements. + +The decision standard is practical: the rollout guidance should recommend the native path only where the measured behavior, correctness guarantees, and operational ergonomics justify it today. 
+ +## Affected Areas + +| Area | Impact | Description | +|------|--------|-------------| +| `clients/agent-runtime/src/tools/code_search.rs` | Referenced / possibly modified later | Source of truth for current native tool behavior, structured output, and fallback orchestration that benchmarks and docs must describe accurately. | +| `clients/agent-runtime/src/search/index.rs` | Referenced / possibly modified later | Documents the current candidate-planning boundary, including `query_regex_not_supported` for regex queries and other unsupported narrowing modes. | +| `clients/agent-runtime/src/search/tests.rs` | Possibly modified later | Likely home for regression or benchmark-adjacent validation proving documented fallback and index-state behavior remain accurate. | +| `docs/clients/agent-runtime/tools/` | Modified | Add or update English documentation for `code_search` usage, benchmark findings, fallback behavior, rollout guidance, and deferred optimizations. | +| `docs/es/clients/agent-runtime/tools/` | Modified | Keep Spanish runtime-tool documentation aligned if rollout guidance is surfaced in localized docs. | +| `openspec/specs/regex-semantics/spec.md` | Referenced in later phases | Existing spec establishes regex correctness and live-verification authority that rollout docs must reflect. | +| `openspec/specs/workspace-index/spec.md` | Referenced in later phases | Existing spec defines the indexed candidate model, advisory status, and cases where narrowing cannot safely apply. | + +## Risks + +| Risk | Likelihood | Mitigation | +|------|------------|------------| +| Benchmarks are not representative and lead to a bad rollout recommendation | Medium | Define a benchmark matrix with realistic literal/regex, hit/no-hit, and cold/warm scenarios tied to actual repository usage patterns. 
| +| Documentation overstates native index support for regex queries | Medium | Explicitly document that regex correctness is supported, but regex candidate narrowing is not part of v1 and currently falls back from index planning to discovery plus live verification. | +| Results become stale as search internals evolve | Medium | Document methodology, environment assumptions, and the exact behavior/version being measured so follow-up phases can refresh the numbers cleanly. | +| Rollout guidance is interpreted as a hard deprecation of shell search | Low | Frame the recommendation as conditional guidance with clear tradeoffs, not a blanket removal of shell-based workflows. | +| Future optimization ideas blur into current commitments | Medium | Keep a dedicated non-v1 section that clearly labels deferred work and excludes it from rollout success criteria. | + +## Rollback Plan + +If the benchmark methodology, results, or rollout recommendation prove misleading, revert the benchmark and documentation updates and keep the existing shell-search guidance unchanged. Because this change is intended to produce evidence and documentation rather than alter the core search semantics, rollback is limited to removing or correcting the published benchmark and rollout artifacts. + +## Dependencies + +- Existing native search behavior in `clients/agent-runtime/src/tools/code_search.rs` +- Existing candidate-planning constraints in `clients/agent-runtime/src/search/index.rs` +- Existing correctness guarantees in `openspec/specs/regex-semantics/spec.md` +- Existing indexed-candidate and freshness guarantees in `openspec/specs/workspace-index/spec.md` +- Issue #360 acceptance criteria and verified finding that regex queries currently fall back from index planning via `query_regex_not_supported` + +## Success Criteria + +- [ ] The change defines a representative benchmark matrix comparing native `code_search` and the current shell/grep-based workflow. 
+- [ ] Benchmark results capture cold-index and warm-index native behavior and explicitly account for regex queries that currently bypass indexed narrowing. +- [ ] Documentation explains native tool usage, correctness guarantees, known limitations, and fallback behavior, including the current regex-planning limitation. +- [ ] Rollout guidance states when the native path should be preferred, when shell search may still be appropriate, and why. +- [ ] Deferred optimizations are listed separately from v1 rollout expectations so future work does not distort the current recommendation. diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/specs/code-search-rollout/spec.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/specs/code-search-rollout/spec.md new file mode 100644 index 000000000..4e997125a --- /dev/null +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/specs/code-search-rollout/spec.md @@ -0,0 +1,163 @@ +# code_search Benchmark Rollout Specification + +## Purpose + +Defines the evidence, documentation, and rollout guidance required to decide when Corvus should +prefer the native `code_search` tool over the current shell/grep-based workflow for Issue #360. + +This specification covers benchmark coverage, recorded benchmark results, documented current +behavior and limits, rollout recommendations grounded in measured data, and explicit separation of +future optimizations from v1 rollout requirements. It does not change `code_search` matching +semantics, safety guarantees, or index-planning behavior. + +## Requirements + +### Requirement: REQ-CSR-001 Benchmark Matrix Coverage + +The change MUST define a benchmark matrix that compares native `code_search` with the current +shell/grep-based workflow across representative repository search shapes. 
+ +The benchmark matrix MUST include, at minimum: + +- literal searches and regex searches, +- small-hit, large-hit, and no-hit queries, +- native search runs in cold/no-reusable-index and warm/reusable-index states, +- explicit coverage for regex requests that currently bypass indexed candidate narrowing and fall + back from planning with `query_regex_not_supported`, +- enough query and repository context for a reviewer to understand what each benchmark is intended + to represent. + +#### Scenario: Benchmark matrix covers representative native and shell comparisons + +- GIVEN the benchmark plan for `code_search` rollout +- WHEN a reviewer inspects the documented benchmark matrix +- THEN the matrix MUST include native-versus-shell comparisons for both literal and regex queries +- AND it MUST include small-hit, large-hit, and no-hit cases +- AND it MUST distinguish cold-index native runs from warm-index native runs + +#### Scenario: Regex fallback cases are benchmarked explicitly + +- GIVEN the verified v1 behavior that regex correctness is supported but indexed narrowing is not +- WHEN the benchmark matrix documents regex scenarios +- THEN it MUST identify those scenarios as fallback cases from index planning +- AND it MUST describe them as discovery plus live verification rather than indexed narrowing +- AND it MUST NOT describe regex benchmarks as evidence of native regex-aware candidate planning + +### Requirement: REQ-CSR-002 Recorded Results and Methodology + +The change MUST publish recorded benchmark results together with enough methodology and environment +context for reviewers to interpret or reproduce the rollout recommendation. 
+ +The recorded results MUST, at minimum: + +- preserve the measured outcomes for each benchmark matrix entry, +- distinguish shell results from native cold-index and native warm-index results where applicable, +- identify which native cases executed through indexed candidate narrowing versus fallback + discovery plus live verification, +- record the repository, environment assumptions, and measurement method used for the published + results, +- avoid presenting anecdotal or partial observations as rollout guidance. + +#### Scenario: Recorded results preserve comparison outcomes by execution mode + +- GIVEN a completed benchmark run for the rollout change +- WHEN the benchmark results are published +- THEN each documented case MUST preserve the measured result for the shell workflow +- AND it MUST preserve the measured result for native search in each relevant index state +- AND it MUST indicate whether the native case used indexed candidate narrowing or fallback + discovery plus live verification + +#### Scenario: Results remain interpretable when behavior evolves later + +- GIVEN benchmark results published for the current version of `code_search` +- WHEN a later reviewer compares those results to newer runtime behavior +- THEN the published methodology MUST identify the measured behavior and environment assumptions +- AND the reviewer MUST be able to tell that the recorded numbers apply to the documented v1 + behavior rather than an unspecified future implementation + +### Requirement: REQ-CSR-003 Documentation of Current Behavior, Limits, and Fallbacks + +The runtime-tool documentation MUST explain the current `code_search` behavior relevant to rollout +without overstating native search capabilities. 
+ +That documentation MUST, at minimum: + +- describe the expected search behavior and current correctness guarantees, +- state that live verification against current file contents remains authoritative, +- state that regex correctness and safety are supported, +- state that indexed candidate narrowing currently does NOT support regex requests and that + `query_regex_not_supported` causes fallback to discovery plus live verification, +- document any known v1 limitations that materially affect rollout interpretation, +- distinguish current verified behavior from deferred optimization ideas. + +#### Scenario: Documentation describes regex support without overstating index support + +- GIVEN the runtime-tool documentation for `code_search` +- WHEN a reader looks for regex behavior details +- THEN the documentation MUST state that regex matching is supported with the existing correctness + and safety guarantees +- AND it MUST state that regex candidate narrowing is not part of v1 indexed planning +- AND it MUST describe `query_regex_not_supported` as a fallback reason rather than a search error + +#### Scenario: Documentation preserves live verification as the source of truth + +- GIVEN a reader evaluating whether indexed search results are authoritative on their own +- WHEN the reader consults the rollout documentation +- THEN the documentation MUST state that final matches come from live verification of current file + contents +- AND it MUST NOT imply that indexed candidates alone are sufficient to report a match + +### Requirement: REQ-CSR-004 Evidence-Based Rollout Guidance + +The change MUST provide rollout guidance that explains when native `code_search` SHOULD be +preferred over shell/grep-based search and when shell search MAY still be appropriate. + +That guidance MUST be grounded in the recorded benchmark results and documented current behavior. 
+It MUST, at minimum: + +- identify the benchmark-supported cases where native search is recommended, +- identify cases where shell search remains reasonable or preferred, +- explain the tradeoffs using current evidence instead of assumptions, +- avoid presenting the recommendation as a blanket deprecation of shell search, +- align the recommendation with current v1 behavior, including regex fallback and live + verification. + +#### Scenario: Rollout guidance recommends native search only where evidence supports it + +- GIVEN published benchmark results for native and shell workflows +- WHEN the rollout recommendation is written +- THEN the recommendation MUST identify the situations where native `code_search` SHOULD be + preferred based on the measured outcomes and documented ergonomics +- AND it MUST explain why those situations justify preference today +- AND it MUST NOT claim preference for unmeasured or unsupported cases + +#### Scenario: Rollout guidance leaves room for shell search where appropriate + +- GIVEN the current shell/grep-based workflow remains available +- WHEN the rollout guidance discusses non-preferred native cases or operator tradeoffs +- THEN it MUST identify cases where shell search MAY remain appropriate +- AND it MUST frame that guidance as conditional tradeoffs rather than removal of shell-based + workflows + +### Requirement: REQ-CSR-005 Deferred Optimization Separation + +The change MUST separate deferred optimization opportunities from v1 rollout requirements. + +Deferred items MAY include future search-performance or index-planning improvements, but they MUST +be labeled as non-v1 work and MUST NOT be required for the current rollout recommendation. 
+ +#### Scenario: Deferred optimization ideas are documented separately from v1 requirements + +- GIVEN the rollout artifacts for this change +- WHEN a reviewer reads about future opportunities +- THEN those items MUST appear in a clearly separate deferred or future-optimization section +- AND they MUST be labeled as outside the v1 rollout requirements +- AND their absence MUST NOT invalidate the current benchmark-backed recommendation + +#### Scenario: v1 rollout decision does not imply search-engine behavior changes + +- GIVEN this change is limited to benchmarks, documentation, and rollout guidance +- WHEN a reviewer inspects the specification and artifacts +- THEN the v1 requirements MUST NOT require regex-aware indexed candidate narrowing or other new + search-engine optimizations +- AND the rollout decision MUST be expressed using the currently verified behavior \ No newline at end of file diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/state.yaml b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/state.yaml new file mode 100644 index 000000000..b61a549fb --- /dev/null +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/state.yaml @@ -0,0 +1,12 @@ +change: code-search-benchmark-rollout +current_phase: verify +completed: + - explore + - propose + - spec + - design + - tasks + - apply + - verify +next: archive +updated: 2026-04-05T00:00:00Z diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md new file mode 100644 index 000000000..3bcfb78e7 --- /dev/null +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md @@ -0,0 +1,25 @@ +# Tasks: `code_search` Benchmark Rollout and Documentation + +## Phase 1: Benchmark Runner Foundation + +- [x] 1.1 Create `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` with the benchmark case/result structs, 
environment metadata capture, and CLI entry flow for fixture and repo-snapshot runs. +- [x] 1.2 In `clients/agent-runtime/examples/code_search_rollout_benchmark.rs`, implement deterministic shell command generation plus native execution-state prep for `shell_baseline`, `native_no_index`, `native_cold_build`, and `native_warm_index`. +- [x] 1.3 Add example-local tests in `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` for shell command building, plan-mode labeling, and canonical line-match normalization. + +## Phase 2: Parity and Runtime Verification + +- [x] 2.1 Extend `clients/agent-runtime/src/search/tests.rs` with coverage proving regex requests still label as `fallback_discovery_live_verification` with reason `query_regex_not_supported` after index build. +- [x] 2.2 Add an end-to-end smoke case in `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` that runs one fixture scenario through shell and native paths and asserts canonical parity plus non-empty measurements. +- [x] 2.3 Update `clients/agent-runtime/benches/agent_benchmarks.rs` to separate Criterion microbenches from the rollout benchmark entrypoint so contributors run the right harness. + +## Phase 3: Recorded Results and Rollout Docs + +- [x] 3.1 Run `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` on the deterministic fixture workspace and the current repo snapshot, then record the measured rows and environment metadata in `docs/clients/agent-runtime/tools/code-search.md`. +- [x] 3.2 In `docs/clients/agent-runtime/tools/code-search.md`, document current behavior, live verification authority, regex fallback semantics, rollout recommendations, and a clearly separate non-v1 optimizations section. +- [x] 3.3 Update `clients/agent-runtime/docs/design/code-search-tool.md`, `docs/clients/agent-runtime/tools/core.md`, and `docs/clients/agent-runtime/tools/index.mdx` so internal and public references match the verified planner behavior and link the new page. 
+- [x] 3.4 Mirror the published benchmark summary, fallback wording, rollout guidance, and navigation updates in `docs/es/clients/agent-runtime/tools/code-search.md`, `docs/es/clients/agent-runtime/tools/core.md`, and `docs/es/clients/agent-runtime/tools/index.mdx`. + +## Phase 4: Final Verification + +- [x] 4.1 Run targeted Rust validation for the new runner and search coverage in `clients/agent-runtime` and fix any parity, planner-label, or example-test regressions before marking the change complete. +- [x] 4.2 Review the English and Spanish docs against `openspec/changes/code-search-benchmark-rollout/specs/code-search-rollout/spec.md` and confirm they never imply indexed regex narrowing or blanket shell deprecation. diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md new file mode 100644 index 000000000..9e01ffc3c --- /dev/null +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md @@ -0,0 +1,99 @@ +# Verification Report + +**Change**: code-search-benchmark-rollout +**Date**: 2026-04-05 + +## Completeness + +| Metric | Value | +|---|---:| +| Tasks total | 12 | +| Tasks complete | 12 | +| Tasks incomplete | 0 | + +All tasks in `openspec/changes/code-search-benchmark-rollout/tasks.md` are marked complete. + +## Build & Test Execution + +### Executed during verification + +1. `cargo test --manifest-path clients/agent-runtime/Cargo.toml --example code_search_rollout_benchmark` + - Result: ✅ passed + - Evidence: 6/6 example tests passed, including shell-command generation, plan-mode labeling, canonicalization, and fixture smoke parity. + +2. `cargo test --manifest-path clients/agent-runtime/Cargo.toml candidate_planner_marks_regex_and_short_patterns_unavailable` + - Result: ✅ passed + - Evidence: targeted planner regression passed; verified `query_regex_not_supported` remains the planner reason for regex.
+ +3. `cargo run --manifest-path clients/agent-runtime/Cargo.toml --example code_search_rollout_benchmark -- --workspace fixture --samples 1 --cold-build-samples 1` + - Result: ✅ passed + - Evidence: fixture benchmark executed end-to-end, emitted shell/native rows for literal and regex cases, and labeled regex rows as `fallback_discovery_live_verification` with `query_regex_not_supported`. + +4. `cargo fmt --manifest-path clients/agent-runtime/Cargo.toml --all -- --check` + - Result: ✅ passed + - Evidence: formatting re-check completed successfully during re-verification. + +### Not rerun during verification + +- Full repo-snapshot benchmark (`--workspace repo` / `both` with published sample counts) was **not** rerun successfully in this verify pass. +- Orchestration context states that the repo benchmark rerun still times out; this verify pass relies on the recorded repo-snapshot results already published in the docs. + +## Spec Compliance Matrix + +| Requirement | Scenario | Evidence | Result | +|---|---|---|---| +| REQ-CSR-001 | Benchmark matrix covers representative native and shell comparisons | `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` fixture/repo case matrix + successful fixture benchmark run + published matrix tables in docs | ✅ COMPLIANT | +| REQ-CSR-001 | Regex fallback cases are benchmarked explicitly | example runner labels regex fallback; planner regression test; docs label regex as fallback with `query_regex_not_supported` and not regex-aware narrowing | ✅ COMPLIANT | +| REQ-CSR-002 | Recorded results preserve comparison outcomes by execution mode | published English/Spanish result tables include shell, no-index, cold-build, warm-index, plan mode, reason, parity | ✅ COMPLIANT | +| REQ-CSR-002 | Results remain interpretable when behavior evolves later | docs record workspace kind, SHA, host, file count, timestamps, debug profile, methodology | ✅ COMPLIANT | +| REQ-CSR-003 | Documentation describes regex support without 
overstating index support | English/Spanish `code-search.md`, tool references, and internal design doc explicitly preserve regex correctness + `query_regex_not_supported` fallback semantics | ✅ COMPLIANT | +| REQ-CSR-003 | Documentation preserves live verification as the source of truth | canonical docs explicitly state final matches come from live verification | ✅ COMPLIANT | +| REQ-CSR-004 | Rollout guidance recommends native only where evidence supports it | guidance distinguishes regex measured wins, literal warm/cold tradeoffs, and unmeasured-case limits | ✅ COMPLIANT | +| REQ-CSR-004 | Rollout guidance leaves room for shell where appropriate | docs retain conditional `MAY keep shell / grep` guidance and avoid blanket deprecation | ✅ COMPLIANT | +| REQ-CSR-005 | Deferred optimization ideas are documented separately from v1 requirements | dedicated non-v1 future optimization sections in English/Spanish docs | ✅ COMPLIANT | +| REQ-CSR-005 | v1 rollout decision does not imply search-engine behavior changes | docs and code preserve current planner reason/limits; no regex-aware narrowing added | ✅ COMPLIANT | + +Compliance summary: 10/10 scenarios compliant based on executed tests plus artifact inspection. + +## Correctness (Static) + +| Requirement | Status | Notes | +|---|---|---| +| REQ-CSR-001 | ✅ Implemented | Dedicated rollout runner defines literal/regex, hit-shape, and execution-state coverage for fixture and repo snapshot workspaces. | +| REQ-CSR-002 | ✅ Implemented | Docs publish measured tables, environment metadata, parity, plan mode, and plan reason. | +| REQ-CSR-003 | ✅ Implemented | Docs and tests correctly preserve regex support without implying regex-aware narrowing. | +| REQ-CSR-004 | ✅ Implemented | Rollout guidance is conditional and evidence-based, not blanket shell deprecation. | +| REQ-CSR-005 | ✅ Implemented | Future optimizations are clearly separated from v1 rollout guidance. | + +## Coherence (Design) + +| Decision | Followed? 
| Notes | +|---|---|---| +| Dedicated rollout benchmark runner | ✅ Yes | Implemented at `clients/agent-runtime/examples/code_search_rollout_benchmark.rs`. | +| Shell baseline through `ShellTool` + `NativeRuntime` | ✅ Yes | Runner instantiates `ShellTool::new(... NativeRuntime ...)` and benchmarks `grep` via shell. | +| Explicit native evidence states | ✅ Yes | `native_no_index`, `native_cold_build`, and `native_warm_index` are implemented and documented. | +| Canonical correctness parity gate | ✅ Yes | Example tests and fixture benchmark validate canonical shell/native parity. | +| Deferred optimizations kept out of v1 | ✅ Yes | Docs isolate non-v1 items in dedicated future-optimization sections. | +| File changes table alignment | ✅ Yes | Runner, bench note, planner regression, internal doc, English docs, and Spanish docs are present. | + +## Issues Found + +### CRITICAL + +None. + +### WARNING + +1. Repo-snapshot benchmark evidence was not rerun successfully in this verify pass; published repo numbers were reviewed from docs, but fresh execution evidence is still missing because the rerun timed out. +2. The recorded benchmark results live in docs only; there is still no separate machine-readable artifact for independent diffing or replay. + +### SUGGESTION + +1. Add a narrower repo benchmark mode (subset or case filter) so verify can rerun repo evidence without timing out. +2. Add a docs-consistency check or exported artifact to reduce risk of table drift between measured output and published docs. + +## Verdict + +**PASS WITH WARNINGS** + +The implementation matches the proposal/spec/design/tasks and preserves the required regex fallback semantics (`query_regex_not_supported` with no regex-aware narrowing). Formatting is now clean, targeted behavioral evidence still passes, and the remaining gap is limited to fresh repo-snapshot benchmark reproduction. 
\ No newline at end of file diff --git a/openspec/specs/code-search-rollout/spec.md b/openspec/specs/code-search-rollout/spec.md new file mode 100644 index 000000000..57dc3a0cf --- /dev/null +++ b/openspec/specs/code-search-rollout/spec.md @@ -0,0 +1,163 @@ +# code_search Benchmark Rollout Specification + +## Purpose + +Defines the evidence, documentation, and rollout guidance required to decide when Corvus should +prefer the native `code_search` tool over the current shell/grep-based workflow for Issue #360. + +This specification covers benchmark coverage, recorded benchmark results, documented current +behavior and limits, rollout recommendations grounded in measured data, and explicit separation of +future optimizations from v1 rollout requirements. It does not change `code_search` matching +semantics, safety guarantees, or index-planning behavior. + +## Requirements + +### Requirement: REQ-CSR-001 Benchmark Matrix Coverage + +The change MUST define a benchmark matrix that compares native `code_search` with the current +shell/grep-based workflow across representative repository search shapes. + +The benchmark matrix MUST include, at minimum: + +- literal searches and regex searches, +- small-hit, large-hit, and no-hit queries, +- native search runs in cold/no-reusable-index and warm/reusable-index states, +- explicit coverage for regex requests that currently bypass indexed candidate narrowing and fall + back from planning with `query_regex_not_supported`, +- enough query and repository context for a reviewer to understand what each benchmark is intended + to represent. 
+ +#### Scenario: Benchmark matrix covers representative native and shell comparisons + +- GIVEN the benchmark plan for `code_search` rollout +- WHEN a reviewer inspects the documented benchmark matrix +- THEN the matrix MUST include native-versus-shell comparisons for both literal and regex queries +- AND it MUST include small-hit, large-hit, and no-hit cases +- AND it MUST distinguish cold-index native runs from warm-index native runs + +#### Scenario: Regex fallback cases are benchmarked explicitly + +- GIVEN the verified v1 behavior that regex correctness is supported but indexed narrowing is not +- WHEN the benchmark matrix documents regex scenarios +- THEN it MUST identify those scenarios as fallback cases from index planning +- AND it MUST describe them as discovery plus live verification rather than indexed narrowing +- AND it MUST NOT describe regex benchmarks as evidence of native regex-aware candidate planning + +### Requirement: REQ-CSR-002 Recorded Results and Methodology + +The change MUST publish recorded benchmark results together with enough methodology and environment +context for reviewers to interpret or reproduce the rollout recommendation. + +The recorded results MUST, at minimum: + +- preserve the measured outcomes for each benchmark matrix entry, +- distinguish shell results from native cold-index and native warm-index results where applicable, +- identify which native cases executed through indexed candidate narrowing versus fallback + discovery plus live verification, +- record the repository, environment assumptions, and measurement method used for the published + results, +- avoid presenting anecdotal or partial observations as rollout guidance. 
+ +#### Scenario: Recorded results preserve comparison outcomes by execution mode + +- GIVEN a completed benchmark run for the rollout change +- WHEN the benchmark results are published +- THEN each documented case MUST preserve the measured result for the shell workflow +- AND it MUST preserve the measured result for native search in each relevant index state +- AND it MUST indicate whether the native case used indexed candidate narrowing or fallback + discovery plus live verification + +#### Scenario: Results remain interpretable when behavior evolves later + +- GIVEN benchmark results published for the current version of `code_search` +- WHEN a later reviewer compares those results to newer runtime behavior +- THEN the published methodology MUST identify the measured behavior and environment assumptions +- AND the reviewer MUST be able to tell that the recorded numbers apply to the documented v1 + behavior rather than an unspecified future implementation + +### Requirement: REQ-CSR-003 Documentation of Current Behavior, Limits, and Fallbacks + +The runtime-tool documentation MUST explain the current `code_search` behavior relevant to rollout +without overstating native search capabilities. + +That documentation MUST, at minimum: + +- describe the expected search behavior and current correctness guarantees, +- state that live verification against current file contents remains authoritative, +- state that regex correctness and safety are supported, +- state that indexed candidate narrowing currently does NOT support regex requests and that + `query_regex_not_supported` causes fallback to discovery plus live verification, +- document any known v1 limitations that materially affect rollout interpretation, +- distinguish current verified behavior from deferred optimization ideas. 
+ +#### Scenario: Documentation describes regex support without overstating index support + +- GIVEN the runtime-tool documentation for `code_search` +- WHEN a reader looks for regex behavior details +- THEN the documentation MUST state that regex matching is supported with the existing correctness + and safety guarantees +- AND it MUST state that regex candidate narrowing is not part of v1 indexed planning +- AND it MUST describe `query_regex_not_supported` as a fallback reason rather than a search error + +#### Scenario: Documentation preserves live verification as the source of truth + +- GIVEN a reader evaluating whether indexed search results are authoritative on their own +- WHEN the reader consults the rollout documentation +- THEN the documentation MUST state that final matches come from live verification of current file + contents +- AND it MUST NOT imply that indexed candidates alone are sufficient to report a match + +### Requirement: REQ-CSR-004 Evidence-Based Rollout Guidance + +The change MUST provide rollout guidance that explains when native `code_search` SHOULD be +preferred over shell/grep-based search and when shell search MAY still be appropriate. + +That guidance MUST be grounded in the recorded benchmark results and documented current behavior. +It MUST, at minimum: + +- identify the benchmark-supported cases where native search is recommended, +- identify cases where shell search remains reasonable or preferred, +- explain the tradeoffs using current evidence instead of assumptions, +- avoid presenting the recommendation as a blanket deprecation of shell search, +- align the recommendation with current v1 behavior, including regex fallback and live + verification. 
+ +#### Scenario: Rollout guidance recommends native search only where evidence supports it + +- GIVEN published benchmark results for native and shell workflows +- WHEN the rollout recommendation is written +- THEN the recommendation MUST identify the situations where native `code_search` SHOULD be + preferred based on the measured outcomes and documented ergonomics +- AND it MUST explain why those situations justify preference today +- AND it MUST NOT claim preference for unmeasured or unsupported cases + +#### Scenario: Rollout guidance leaves room for shell search where appropriate + +- GIVEN the current shell/grep-based workflow remains available +- WHEN the rollout guidance discusses non-preferred native cases or operator tradeoffs +- THEN it MUST identify cases where shell search MAY remain appropriate +- AND it MUST frame that guidance as conditional tradeoffs rather than removal of shell-based + workflows + +### Requirement: REQ-CSR-005 Deferred Optimization Separation + +The change MUST separate deferred optimization opportunities from v1 rollout requirements. + +Deferred items MAY include future search-performance or index-planning improvements, but they MUST +be labeled as non-v1 work and MUST NOT be required for the current rollout recommendation. 
+ +#### Scenario: Deferred optimization ideas are documented separately from v1 requirements + +- GIVEN the rollout artifacts for this change +- WHEN a reviewer reads about future opportunities +- THEN those items MUST appear in a clearly separate deferred or future-optimization section +- AND they MUST be labeled as outside the v1 rollout requirements +- AND their absence MUST NOT invalidate the current benchmark-backed recommendation + +#### Scenario: v1 rollout decision does not imply search-engine behavior changes + +- GIVEN this change is limited to benchmarks, documentation, and rollout guidance +- WHEN a reviewer inspects the specification and artifacts +- THEN the v1 requirements MUST NOT require regex-aware indexed candidate narrowing or other new + search-engine optimizations +- AND the rollout decision MUST be expressed using the currently verified behavior From b4afa501acd495ee39273000755344d6789299e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:49:34 +0200 Subject: [PATCH 2/7] fix(agent-runtime): satisfy rollout benchmark lint Rename rollout benchmark result-shape variants so the example passes the repo push checks without changing benchmark behavior. 
--- .../examples/code_search_rollout_benchmark.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs index b285de3d8..676d0ea24 100644 --- a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs +++ b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs @@ -34,17 +34,17 @@ impl QueryKind { #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum ResultShape { - SmallHit, - LargeHit, - NoHit, + Small, + Large, + Miss, } impl ResultShape { fn as_str(self) -> &'static str { match self { - Self::SmallHit => "small-hit", - Self::LargeHit => "large-hit", - Self::NoHit => "no-hit", + Self::Small => "small-hit", + Self::Large => "large-hit", + Self::Miss => "no-hit", } } } @@ -323,7 +323,7 @@ fn fixture_cases() -> Vec { BenchmarkCase { id: "literal_small_hit", query_kind: QueryKind::Literal, - result_shape: ResultShape::SmallHit, + result_shape: ResultShape::Small, pattern: "fixture_small_literal_unique", path: "src", case_sensitive: true, @@ -332,7 +332,7 @@ fn fixture_cases() -> Vec { BenchmarkCase { id: "literal_large_hit", query_kind: QueryKind::Literal, - result_shape: ResultShape::LargeHit, + result_shape: ResultShape::Large, pattern: "fixture_large_literal_shared", path: "src", case_sensitive: true, @@ -341,7 +341,7 @@ fn fixture_cases() -> Vec { BenchmarkCase { id: "literal_no_hit", query_kind: QueryKind::Literal, - result_shape: ResultShape::NoHit, + result_shape: ResultShape::Miss, pattern: "fixture_literal_rollout_no_hit", path: "src", case_sensitive: true, From 8780e523e195c8da926e5166ff377cb88b30efc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:50:02 +0200 Subject: [PATCH 3/7] fix(agent-runtime): complete rollout benchmark lint fix Update the remaining rollout benchmark enum references so 
the example compiles under the repo push checks. --- .../agent-runtime/examples/code_search_rollout_benchmark.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs index 676d0ea24..767650616 100644 --- a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs +++ b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs @@ -350,7 +350,7 @@ fn fixture_cases() -> Vec { BenchmarkCase { id: "regex_small_hit", query_kind: QueryKind::Regex, - result_shape: ResultShape::SmallHit, + result_shape: ResultShape::Small, pattern: "fixture_regex_unique_target", path: "src", case_sensitive: true, @@ -359,7 +359,7 @@ fn fixture_cases() -> Vec { BenchmarkCase { id: "regex_large_hit", query_kind: QueryKind::Regex, - result_shape: ResultShape::LargeHit, + result_shape: ResultShape::Large, pattern: "fixture_regex_bulk_case_", path: "src", case_sensitive: true, @@ -368,7 +368,7 @@ fn fixture_cases() -> Vec { BenchmarkCase { id: "regex_no_hit", query_kind: QueryKind::Regex, - result_shape: ResultShape::NoHit, + result_shape: ResultShape::Miss, pattern: "fixture_regex_rollout_no_match_20260405", path: "src", case_sensitive: true, From 8b786d98a552f80d4eae147bcfcc232f7c1d65d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:51:26 +0200 Subject: [PATCH 4/7] fix(agent-runtime): finish rollout benchmark enum rename Update the remaining benchmark runner and test references so the renamed result-shape variants match the lint-safe enum. 
--- .../examples/code_search_rollout_benchmark.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs index 767650616..631d7c064 100644 --- a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs +++ b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs @@ -382,7 +382,7 @@ fn repo_cases() -> Vec { BenchmarkCase { id: "literal_small_hit", query_kind: QueryKind::Literal, - result_shape: ResultShape::SmallHit, + result_shape: ResultShape::Small, pattern: "pub struct ToolResult", path: "clients/agent-runtime/src/tools", case_sensitive: true, @@ -391,7 +391,7 @@ fn repo_cases() -> Vec { BenchmarkCase { id: "literal_large_hit", query_kind: QueryKind::Literal, - result_shape: ResultShape::LargeHit, + result_shape: ResultShape::Large, pattern: "success:", path: "clients/agent-runtime/src/tools", case_sensitive: true, @@ -400,7 +400,7 @@ fn repo_cases() -> Vec { BenchmarkCase { id: "literal_no_hit", query_kind: QueryKind::Literal, - result_shape: ResultShape::NoHit, + result_shape: ResultShape::Miss, pattern: "code_search_rollout_literal_no_match_20260405", path: "clients/agent-runtime/src/tools", case_sensitive: true, @@ -409,7 +409,7 @@ fn repo_cases() -> Vec { BenchmarkCase { id: "regex_small_hit", query_kind: QueryKind::Regex, - result_shape: ResultShape::SmallHit, + result_shape: ResultShape::Small, pattern: "ToolResult", path: "clients/agent-runtime/src/tools", case_sensitive: true, @@ -418,7 +418,7 @@ fn repo_cases() -> Vec { BenchmarkCase { id: "regex_large_hit", query_kind: QueryKind::Regex, - result_shape: ResultShape::LargeHit, + result_shape: ResultShape::Large, pattern: "output:", path: "clients/agent-runtime/src/tools", case_sensitive: true, @@ -427,7 +427,7 @@ fn repo_cases() -> Vec { BenchmarkCase { id: "regex_no_hit", query_kind: QueryKind::Regex, - result_shape: 
ResultShape::NoHit, + result_shape: ResultShape::Miss, pattern: "code_search_rollout_regex_no_match_20260405", path: "clients/agent-runtime/src/tools", case_sensitive: true, @@ -1196,7 +1196,7 @@ mod tests { let case = BenchmarkCase { id: "literal_small_hit", query_kind: QueryKind::Literal, - result_shape: ResultShape::SmallHit, + result_shape: ResultShape::Small, pattern: "needle", path: "src", case_sensitive: true, @@ -1214,7 +1214,7 @@ mod tests { let case = BenchmarkCase { id: "regex_small_hit", query_kind: QueryKind::Regex, - result_shape: ResultShape::SmallHit, + result_shape: ResultShape::Small, pattern: "output:", path: "src/lib", case_sensitive: false, From 1db2c8d04f3a9a6d4ba1c469128587a4046c585a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:54:39 +0200 Subject: [PATCH 5/7] fix(web): align CSS imports with stylelint Update shared and app stylesheet imports to the url() form required by the current import-notation rule so pre-push checks can pass. 
--- clients/web/apps/chat/src/style.css | 4 ++-- clients/web/apps/dashboard/src/style.css | 4 ++-- clients/web/apps/docs/src/styles/custom.css | 4 ++-- clients/web/apps/marketing/src/styles/global.css | 4 ++-- clients/web/packages/shared/app-shell.css | 4 ++-- clients/web/packages/shared/tokens.css | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/clients/web/apps/chat/src/style.css b/clients/web/apps/chat/src/style.css index 5889ce2d1..c110c1b4c 100644 --- a/clients/web/apps/chat/src/style.css +++ b/clients/web/apps/chat/src/style.css @@ -1,5 +1,5 @@ -@import "@corvus/shared/app-shell.css"; -@import "tailwindcss"; +@import url("@corvus/shared/app-shell.css"); +@import url("tailwindcss"); /* ── Animations ─────────────────────────────────────────────── */ diff --git a/clients/web/apps/dashboard/src/style.css b/clients/web/apps/dashboard/src/style.css index 97304def6..a1c37edef 100644 --- a/clients/web/apps/dashboard/src/style.css +++ b/clients/web/apps/dashboard/src/style.css @@ -1,5 +1,5 @@ -@import "@corvus/shared/app-shell.css"; -@import "tailwindcss"; +@import url("@corvus/shared/app-shell.css"); +@import url("tailwindcss"); body { background: diff --git a/clients/web/apps/docs/src/styles/custom.css b/clients/web/apps/docs/src/styles/custom.css index 92459873a..6ac2729b5 100644 --- a/clients/web/apps/docs/src/styles/custom.css +++ b/clients/web/apps/docs/src/styles/custom.css @@ -4,8 +4,8 @@ ───────────────────────────────────────────── */ /* ── Shared Brand Tokens ── */ -@import "@corvus/shared/base.css"; -@import "@corvus/shared/theme.css"; +@import url("@corvus/shared/base.css"); +@import url("@corvus/shared/theme.css"); /* ── Root Theme Tokens ── */ :root { diff --git a/clients/web/apps/marketing/src/styles/global.css b/clients/web/apps/marketing/src/styles/global.css index 0531925d7..c46d16059 100644 --- a/clients/web/apps/marketing/src/styles/global.css +++ b/clients/web/apps/marketing/src/styles/global.css @@ -1,5 +1,5 @@ -@import 
"@corvus/shared/theme.css"; -@import "@corvus/shared/base.css"; +@import url("@corvus/shared/theme.css"); +@import url("@corvus/shared/base.css"); :root { --surface: rgb(10 15 25 / 84%); diff --git a/clients/web/packages/shared/app-shell.css b/clients/web/packages/shared/app-shell.css index 13b187453..a073f02ea 100644 --- a/clients/web/packages/shared/app-shell.css +++ b/clients/web/packages/shared/app-shell.css @@ -1,5 +1,5 @@ -@import "./theme.css"; -@import "./base.css"; +@import url("./theme.css"); +@import url("./base.css"); /* Shared dark application shell for the Vue apps. App-specific layout and visuals should live in each app stylesheet. */ diff --git a/clients/web/packages/shared/tokens.css b/clients/web/packages/shared/tokens.css index 1f5989a0f..1c812bfe6 100644 --- a/clients/web/packages/shared/tokens.css +++ b/clients/web/packages/shared/tokens.css @@ -1 +1 @@ -@import "./theme.css"; +@import url("./theme.css"); From a2cb9356e4a75f35285f35884b3f1bb4ab9a5993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Sun, 5 Apr 2026 23:15:00 +0200 Subject: [PATCH 6/7] docs(agent-runtime): add code_search page descriptions Provide the required frontmatter descriptions so docs metadata validation passes for the new rollout guide pages. 
--- .../src/content/docs/clients/agent-runtime/tools/code-search.md | 1 + .../content/docs/es/clients/agent-runtime/tools/code-search.md | 1 + 2 files changed, 2 insertions(+) diff --git a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md index 74cf1bc5d..846b9aaab 100644 --- a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md +++ b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md @@ -1,6 +1,7 @@ --- title: code_search summary: Rollout guidance, benchmark evidence, and current behavior for the native code_search tool. +description: Benchmark-backed rollout guide for native code_search, including fallback behavior, limitations, and when to prefer it over shell search. owner: team-runtime status: canonical lastReviewed: 2026-04-05 diff --git a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md index 98ff7828b..b98b5aac3 100644 --- a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md +++ b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md @@ -1,6 +1,7 @@ --- title: code_search summary: Guía de rollout, evidencia de benchmarks y comportamiento actual de la herramienta nativa code_search. +description: Guía respaldada por benchmarks para el rollout de code_search nativo, incluyendo fallback, limitaciones y cuándo preferirlo sobre búsqueda por shell. 
owner: team-runtime status: canonical lastReviewed: 2026-04-05 From 775bb508f1db1f0e8728b74abb96e1c38af07453 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Sun, 5 Apr 2026 23:27:10 +0200 Subject: [PATCH 7/7] docs(code-search): update documentation for regex query handling and index behavior --- .../docs/design/code-search-tool.md | 68 +++++++++++-------- .../examples/code_search_rollout_benchmark.rs | 53 ++++++++++----- .../agent-runtime/tools/code-search.md | 18 ++--- .../agent-runtime/tools/code-search.md | 18 ++--- .../design.md | 16 ++--- .../tasks.md | 2 +- .../verify-report.md | 18 +++-- 7 files changed, 116 insertions(+), 77 deletions(-) diff --git a/clients/agent-runtime/docs/design/code-search-tool.md b/clients/agent-runtime/docs/design/code-search-tool.md index d1a60149e..5ac5881bc 100644 --- a/clients/agent-runtime/docs/design/code-search-tool.md +++ b/clients/agent-runtime/docs/design/code-search-tool.md @@ -12,8 +12,11 @@ Add a native `code_search` tool to the Corvus agent runtime that performs workspace-scoped text and regex search across source files. The tool follows the same `Tool` trait pattern as `file_read` and reuses the existing `SecurityPolicy` for path validation, rate limiting, and -workspace sandboxing. v1 uses brute-force directory walking via the `ignore` crate (for -`.gitignore` awareness) combined with the `regex` crate for pattern matching. No index is built. +workspace sandboxing. v1 uses directory discovery via the `ignore` crate (for `.gitignore` +awareness) combined with the `regex` crate for matching and live verification. Compatible literal +queries may use workspace trigram index narrowing when a compatible index exists, while regex +queries fall back from planning with `query_regex_not_supported` to discovery plus live +verification. 
The tool returns both a human-readable grep-like `output` string and a machine-readable `structured` JSON payload, consistent with the `ToolResult` contract. @@ -22,7 +25,7 @@ The tool returns both a human-readable grep-like `output` string and a machine-r > Safe literal queries may use workspace trigram index narrowing when a compatible index exists, > while regex requests still fall back from planning with `query_regex_not_supported` to > discovery plus live verification. For rollout evidence and the canonical behavior summary, see -> `docs/clients/agent-runtime/tools/code-search.md`. +> `clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md`. ## 1. Tool Schema (API Shape) @@ -486,47 +489,56 @@ with temp directories and `test_security()` / `test_security_with()` helpers. ## 7. Freshness Strategy -v1 has no index, no cache, and no in-memory result store. Every `code_search` invocation walks -the workspace directory from scratch and reads each file from disk at the moment of execution. +v1 may use workspace trigram index narrowing for compatible literal queries, but final matches +still come from live verification against current file contents. Regex queries do not use trigram +index narrowing in v1: planning returns `query_regex_not_supported`, then execution continues via +discovery plus live verification. When an otherwise index-eligible literal query has no compatible +index available, planning reports `index_unavailable` and execution also continues via discovery +plus live verification. ### Guarantee: reads reflect the latest writes -Because there is no intermediate data store that could become stale, the agent can rely on the -following read-after-write ordering for files that a later `code_search` is still allowed to scan: +Because live verification is authoritative, the agent can rely on the following read-after-write +ordering for files that a later `code_search` is still allowed to scan: 1. 
`file_write` completes the write through the runtime's file tool path (without promising an explicit `fsync`). -2. The next `code_search` invocation opens that same file from the OS filesystem. -3. The match result reflects the content written in step 1. +2. The next `code_search` invocation plans candidates (trigram index narrowing for an + eligible literal query when available, or fallback planning for regex / index-unavailable + cases). +3. Candidate files are read from the OS filesystem during discovery plus live verification. +4. The match result reflects the content written in step 1. This guarantee is scoped to files that the subsequent `code_search` is allowed to scan (i.e., within the invoked `path` and `include` filters, and not excluded by `exclude` patterns, `.gitignore` rules, binary detection, or resource limits). Binary detection, ignore rules, and -resource limits can prevent the search from seeing the fresh write even under v1's "always read -from disk" model. No warm-up, index rebuild, or explicit invalidation step is needed between a -write and a subsequent search for files within the search scope. +resource limits can prevent the search from seeing the fresh write even when live verification is +working correctly. No manual warm-up step is needed between a write and a later search; the +runtime either refreshes eligible trigram index narrowing state or falls back to discovery plus +live verification. ### Implications for agent workflows - An agent that writes a file and immediately searches for a symbol it just added can expect `code_search` to observe that content when the file remains inside the requested `path`/`include` scope, is not ignored, is not detected as binary, and is not skipped by size or - other resource limits — there is no propagation delay for eligible files. + other resource limits — trigram index narrowing is optional, but live verification stays + authoritative.
+- Regex searches keep the same freshness contract because `query_regex_not_supported` routes them + through discovery plus live verification instead of regex-aware narrowing. - Concurrent writes from other processes may or may not be visible depending on OS buffering, but this is outside the scope of the agent's execution model (agents are single-threaded in their tool-call loop). - The 30-second execution timeout is a per-invocation bound, not a freshness window. -### Why v2 requires an explicit freshness strategy +### Future freshness work -If a future version adds a persistent trigram index (v2+), the index will become a second source -of truth that can diverge from the filesystem. That version must define: +Future search work may still need an explicit freshness strategy for more advanced planner features, +including: -- **Write-through**: every `file_write` call triggers an index update for the affected file. -- **Invalidation horizon**: maximum age a cached index entry may have before re-reading the file. -- **Rebuild trigger**: conditions under which the full index is discarded and rebuilt. - -Until then, v1's "always read from disk" model is the simplest possible freshness guarantee. +- regex-aware index narrowing, +- case-insensitive or whole-word-aware narrowed plans, +- additional cached planner state beyond the current trigram index narrowing flow. ## Migration / Rollout @@ -540,20 +552,22 @@ No migration required.
The tool is additive: ## v1 vs Future Scope -### v1 (this design) +### v1 (current runtime behavior) -- Brute-force directory walk + regex/literal scan -- `.gitignore`-aware via `ignore` crate +- `.gitignore`-aware discovery via `ignore` crate - Structured results with context lines +- Trigram index narrowing for compatible literal queries when a compatible index exists +- Regex planning fallback via `query_regex_not_supported` to discovery plus live verification - All safety constraints defined above - Single-line matching only (pattern matches within one line) ### v2+ (future — explicitly NOT in v1) -- Sparse n-gram index for sub-100ms searches on large repos -- Probabilistic bloom/mask filters for fast rejection +- Regex-aware index narrowing +- Case-insensitive or whole-word-aware narrowed plans +- Probabilistic bloom/mask filters for faster rejection - `mmap`-based file reading for reduced memory pressure -- Incremental index updates on file watch events +- Additional incremental planner/index optimizations beyond the current trigram index narrowing flow - Multi-line pattern matching (spanning line boundaries) - Search history / caching layer - AST-aware search (search by symbol kind: function, class, etc.) 
diff --git a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs index 631d7c064..16dad7daf 100644 --- a/clients/agent-runtime/examples/code_search_rollout_benchmark.rs +++ b/clients/agent-runtime/examples/code_search_rollout_benchmark.rs @@ -161,7 +161,6 @@ struct WorkspaceReport { #[derive(Debug, Clone)] struct EnvironmentMetadata { workspace_label: String, - workspace_root: PathBuf, workspace_kind: &'static str, file_count: usize, os: String, @@ -351,7 +350,7 @@ fn fixture_cases() -> Vec { id: "regex_small_hit", query_kind: QueryKind::Regex, result_shape: ResultShape::Small, - pattern: "fixture_regex_unique_target", + pattern: "fixture_regex_unique_.+", path: "src", case_sensitive: true, whole_word: false, @@ -360,7 +359,7 @@ fn fixture_cases() -> Vec { id: "regex_large_hit", query_kind: QueryKind::Regex, result_shape: ResultShape::Large, - pattern: "fixture_regex_bulk_case_", + pattern: "fixture_regex_bulk_case_.+", path: "src", case_sensitive: true, whole_word: false, @@ -369,7 +368,7 @@ fn fixture_cases() -> Vec { id: "regex_no_hit", query_kind: QueryKind::Regex, result_shape: ResultShape::Miss, - pattern: "fixture_regex_rollout_no_match_20260405", + pattern: "fixture_regex_rollout_no_match_.+", path: "src", case_sensitive: true, whole_word: false, @@ -410,7 +409,7 @@ fn repo_cases() -> Vec { id: "regex_small_hit", query_kind: QueryKind::Regex, result_shape: ResultShape::Small, - pattern: "ToolResult", + pattern: "pub +struct +ToolResult", path: "clients/agent-runtime/src/tools", case_sensitive: true, whole_word: false, @@ -419,7 +418,7 @@ fn repo_cases() -> Vec { id: "regex_large_hit", query_kind: QueryKind::Regex, result_shape: ResultShape::Large, - pattern: "output:", + pattern: "output( .+)? 
*:", path: "clients/agent-runtime/src/tools", case_sensitive: true, whole_word: false, @@ -428,7 +427,7 @@ fn repo_cases() -> Vec { id: "regex_no_hit", query_kind: QueryKind::Regex, result_shape: ResultShape::Miss, - pattern: "code_search_rollout_regex_no_match_20260405", + pattern: "code_search_rollout_regex_no_match_.+", path: "clients/agent-runtime/src/tools", case_sensitive: true, whole_word: false, @@ -1005,10 +1004,23 @@ fn percentile_ms(durations: &[Duration], percentile: usize) -> u64 { if durations.is_empty() { return 0; } - let mut values: Vec = durations.iter().map(Duration::as_millis).collect(); - values.sort_unstable(); - let index = ((values.len() - 1) * percentile) / 100; - u64::try_from(values[index]).unwrap_or(u64::MAX) + let mut values: Vec = durations + .iter() + .map(|duration| duration.as_millis() as f64) + .collect(); + values.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal)); + + let rank = (percentile as f64 / 100.0) * (values.len().saturating_sub(1) as f64); + let lower_index = rank.floor() as usize; + let upper_index = rank.ceil() as usize; + let weight = rank - lower_index as f64; + let interpolated = if lower_index == upper_index { + values[lower_index] + } else { + values[lower_index] + (values[upper_index] - values[lower_index]) * weight + }; + + interpolated.round().clamp(0.0, u64::MAX as f64) as u64 } fn clear_index_artifacts(index: &WorkspaceTrigramIndex) -> Result<()> { @@ -1033,7 +1045,6 @@ fn clear_index_artifacts(index: &WorkspaceTrigramIndex) -> Result<()> { fn capture_environment_metadata(workspace: &WorkspaceContext) -> Result { Ok(EnvironmentMetadata { workspace_label: workspace.label.clone(), - workspace_root: workspace.root.clone(), workspace_kind: workspace.kind, file_count: count_files(&workspace.root)?, os: env::consts::OS.to_string(), @@ -1113,10 +1124,7 @@ fn print_workspace_report(report: &WorkspaceReport) { ); println!(); println!("- workspace_kind: {}", 
report.metadata.workspace_kind); - println!( - "- workspace_root: {}", - report.metadata.workspace_root.display() - ); + println!("- workspace_root: "); println!("- file_count: {}", report.metadata.file_count); println!("- os: {}", report.metadata.os); println!("- arch: {}", report.metadata.arch); @@ -1227,6 +1235,19 @@ mod tests { ); } + #[test] + fn percentile_ms_interpolates_between_neighboring_samples() { + let durations = [ + Duration::from_millis(10), + Duration::from_millis(20), + Duration::from_millis(30), + Duration::from_millis(40), + ]; + + assert_eq!(percentile_ms(&durations, 95), 39); + assert_eq!(percentile_ms(&durations, 50), 25); + } + #[test] fn label_plan_mode_marks_regex_fallback_after_index_build() { assert_eq!( diff --git a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md index 846b9aaab..b1b74a16a 100644 --- a/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md +++ b/clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md @@ -15,10 +15,10 @@ docType: guide ## Current verified behavior -- Literal queries **may** use indexed candidate narrowing when a compatible workspace trigram index is available and fresh. -- Regex queries are **supported for correctness and safety**, but indexed candidate narrowing does **not** support regex in v1. -- When indexed planning cannot narrow candidates for regex, the planner returns `query_regex_not_supported` and execution is labeled as `fallback_discovery_live_verification`. -- When no compatible index exists, the planner returns `index_unavailable` and execution is labeled `index_unavailable`. +- Indexed planning is attempted only for queries eligible for workspace trigram index narrowing: compatible literal queries when a compatible, fresh trigram index exists. 
+- Regex queries are **supported for correctness and safety**, but trigram index narrowing does **not** support regex in v1. +- Regex planning returns `query_regex_not_supported` before index loading, and execution is labeled `fallback_discovery_live_verification` while continuing through discovery plus live verification. +- `index_unavailable` applies only when an otherwise index-eligible literal query cannot find a compatible trigram index; those runs are labeled `index_unavailable` and continue through discovery plus live verification. - Final matches always come from live verification of current file contents. Indexed candidates are never treated as authoritative results by themselves. ## What this page is for @@ -75,7 +75,7 @@ This rollout harness only makes recommendation claims for rows where parity pass #### Current repo snapshot - workspace kind: `repo_snapshot` -- workspace root: `/Users/acosta/Dev/corvus` +- workspace root: `` - commit SHA: `82fa4896` - file count: `234763` - benchmarked at: `2026-04-05T19:47:11.665525+00:00` @@ -159,13 +159,13 @@ The rollout runner records these six representative cases in both workspaces: ### SHOULD prefer native `code_search` -Use native `code_search` for the measured regex cases and for fixture-scale searches where you want structured output plus verified parity. +Prefer native `code_search` for measured regex runs when parity passes and execution stays in `native_no_index` or `native_warm_index` fallback mode. Do **not** extend this recommendation to `native_cold_build`, where the current measured repo-snapshot rows still favor shell because index construction dominates the run. Why: -- every published row passed canonical parity, -- regex rows are correctly labeled as fallback (`query_regex_not_supported` → discovery + live verification), not as regex-aware indexed narrowing, -- even in the large repo snapshot, measured regex fallback rows were materially faster than the shell baseline in this local debug run. 
+- every published recommendation row passed canonical parity, +- the supported regex rows are explicitly labeled as fallback (`query_regex_not_supported` → discovery plus live verification), not as regex-aware trigram index narrowing, +- the measured `native_no_index` and `native_warm_index` regex rows were materially faster than the shell baseline in this local debug run, while `native_cold_build` remained the exception. ### MAY prefer native `code_search` diff --git a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md index b98b5aac3..0c61e3740 100644 --- a/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md +++ b/clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md @@ -15,10 +15,10 @@ docType: guide ## Comportamiento verificado actual -- Las consultas literales **pueden** usar reducción de candidatos por índice cuando existe un índice trigram compatible y fresco. -- Las consultas regex están **soportadas para corrección y seguridad**, pero la reducción de candidatos por índice **no** soporta regex en v1. -- Cuando el planner no puede reducir candidatos para regex, devuelve `query_regex_not_supported` y la ejecución se etiqueta como `fallback_discovery_live_verification`. -- Cuando no existe un índice compatible, el planner devuelve `index_unavailable` y la ejecución se etiqueta como `index_unavailable`. +- El planning indexado solo se intenta para consultas elegibles para reducción por índice trigram: consultas literales compatibles cuando existe un índice trigram compatible y fresco. +- Las consultas regex están **soportadas para corrección y seguridad**, pero la reducción de candidatos trigram por índice **no** soporta regex en v1. 
+- El planning regex devuelve `query_regex_not_supported` antes de cargar índice, y la ejecución se etiqueta como `fallback_discovery_live_verification` mientras continúa por discovery más live verification. +- `index_unavailable` aplica solo cuando una consulta literal que sí sería elegible para índice no encuentra un índice trigram compatible; esas corridas se etiquetan como `index_unavailable` y continúan por discovery más live verification. - Las coincidencias finales siempre salen de la verificación en vivo del contenido actual. Los candidatos indexados por sí solos nunca son resultados autoritativos. ## Para qué sirve esta página @@ -75,7 +75,7 @@ Este harness de rollout solo hace recomendaciones sobre filas donde la paridad p #### Snapshot actual del repo - tipo de workspace: `repo_snapshot` -- raíz del workspace: `/Users/acosta/Dev/corvus` +- raíz del workspace: `` - commit SHA: `82fa4896` - cantidad de archivos: `234763` - fecha del benchmark: `2026-04-05T19:47:11.665525+00:00` @@ -159,13 +159,13 @@ El runner de rollout registra estos seis casos representativos en ambos workspac ### SHOULD prefer native `code_search` -Usa `code_search` nativo para los casos regex medidos y para búsquedas de tamaño fixture donde quieras salida estructurada con paridad verificada. +Prefiere `code_search` nativo para corridas regex medidas cuando la paridad pasa y la ejecución se queda en fallback `native_no_index` o `native_warm_index`. No extiendas esta recomendación a `native_cold_build`, donde las filas medidas del snapshot del repo todavía favorecen a shell porque la construcción del índice domina la corrida. 
Por qué: -- todas las filas publicadas pasaron la paridad canónica, -- las filas regex están etiquetadas correctamente como fallback (`query_regex_not_supported` → discovery + live verification), no como reducción regex-aware por índice, -- incluso en el snapshot grande del repo, las filas regex en fallback fueron materialmente más rápidas que el baseline de shell en esta corrida local en `debug`. +- cada fila usada para recomendar pasó la paridad canónica, +- las filas regex soportadas están etiquetadas explícitamente como fallback (`query_regex_not_supported` → discovery más live verification), no como reducción trigram regex-aware por índice, +- en esta corrida local en `debug`, las filas regex de `native_no_index` y `native_warm_index` fueron materialmente más rápidas que el baseline de shell, mientras `native_cold_build` quedó como la excepción. ### MAY prefer native `code_search` diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md index 9b76e3f06..91ef06e9c 100644 --- a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/design.md @@ -122,8 +122,8 @@ Benchmark runner ▼ Recorded results table │ - ├── docs/clients/agent-runtime/tools/code-search.md - └── docs/es/clients/agent-runtime/tools/code-search.md + ├── clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md + └── clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md │ ▼ Rollout guidance derived from measured evidence @@ -281,12 +281,12 @@ tradeoffs explicit. | `clients/agent-runtime/examples/code_search_rollout_benchmark.rs` | Create | Dedicated rollout benchmark runner that prepares scenarios, executes shell/native states, records timings, labels plan mode, and checks parity. 
| | `clients/agent-runtime/benches/agent_benchmarks.rs` | Modify | Add a short note or companion entrypoint reference if needed so low-level microbenches and rollout benchmarks are clearly separated. | | `clients/agent-runtime/docs/design/code-search-tool.md` | Modify | Align the internal design doc with the implemented indexed-planning reality and point readers to the canonical rollout documentation for current behavior. | -| `docs/clients/agent-runtime/tools/code-search.md` | Create | Canonical English documentation for current behavior, benchmark methodology, recorded results, rollout guidance, fallback reasons, and deferred optimizations. | -| `docs/clients/agent-runtime/tools/core.md` | Modify | Add `code_search` to the core tools reference and link to the dedicated page. | -| `docs/clients/agent-runtime/tools/index.mdx` | Modify | Link the dedicated `code_search` page from the tools index. | -| `docs/es/clients/agent-runtime/tools/code-search.md` | Create | Spanish companion page with the same behavior limits, benchmark summary, rollout guidance, and deferred-work separation. | -| `docs/es/clients/agent-runtime/tools/core.md` | Modify | Add the Spanish `code_search` reference and link to the dedicated page. | -| `docs/es/clients/agent-runtime/tools/index.mdx` | Modify | Add the Spanish navigation link for the dedicated `code_search` page. | +| `clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/code-search.md` | Create | Canonical English documentation for current behavior, benchmark methodology, recorded results, rollout guidance, fallback reasons, and deferred optimizations. | +| `clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/core.md` | Modify | Add `code_search` to the core tools reference and link to the dedicated page. | +| `clients/web/apps/docs/src/content/docs/clients/agent-runtime/tools/index.mdx` | Modify | Link the dedicated `code_search` page from the tools index. 
| +| `clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/code-search.md` | Create | Spanish companion page with the same behavior limits, benchmark summary, rollout guidance, and deferred-work separation. | +| `clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/core.md` | Modify | Add the Spanish `code_search` reference and link to the dedicated page. | +| `clients/web/apps/docs/src/content/docs/es/clients/agent-runtime/tools/index.mdx` | Modify | Add the Spanish navigation link for the dedicated `code_search` page. | ## Interfaces / Contracts diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md index 3bcfb78e7..b79c32519 100644 --- a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/tasks.md @@ -21,5 +21,5 @@ ## Phase 4: Final Verification -- [x] 4.1 Run targeted Rust validation for the new runner and search coverage in `clients/agent-runtime` and fix any parity, planner-label, or example-test regressions before marking the change complete. +- [x] 4.1 Run Rust validation for `clients/agent-runtime/**/*.rs` with `cargo fmt --all -- --check`, `cargo clippy --all-targets -- -D warnings`, and `cargo test`, or record an explicit skip rationale for any omitted check; fix any parity, planner-label, or example-test regressions before marking the change complete. - [x] 4.2 Review the English and Spanish docs against `openspec/changes/code-search-benchmark-rollout/specs/code-search-rollout/spec.md` and confirm they never imply indexed regex narrowing or blanket shell deprecation. 
diff --git a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md index 9e01ffc3c..91b228bee 100644 --- a/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md +++ b/openspec/changes/archive/2026-04-05-code-search-benchmark-rollout/verify-report.md @@ -17,21 +17,25 @@ All tasks in `openspec/changes/code-search-benchmark-rollout/tasks.md` are marke ### Executed during verification -1. `cargo test --manifest-path clients/agent-runtime/Cargo.toml --example code_search_rollout_benchmark` +1. `cargo fmt --manifest-path clients/agent-runtime/Cargo.toml --all -- --check` - Result: ✅ passed - - Evidence: 6/6 example tests passed, including shell-command generation, plan-mode labeling, canonicalization, and fixture smoke parity. + - Evidence: formatting re-check completed successfully for `clients/agent-runtime/**/*.rs` during re-verification. -2. `cargo test --manifest-path clients/agent-runtime/Cargo.toml candidate_planner_marks_regex_and_short_patterns_unavailable` +2. `cargo clippy --manifest-path clients/agent-runtime/Cargo.toml --all-targets -- -D warnings` - Result: ✅ passed - - Evidence: targeted planner regression passed; verified `query_regex_not_supported` remains the planner reason for regex. + - Evidence: Clippy completed cleanly for `clients/agent-runtime/**/*.rs`, including the rollout benchmark example target after removing the leaked absolute-path output and fixing the percentile / regex benchmark findings. -3. `cargo run --manifest-path clients/agent-runtime/Cargo.toml --example code_search_rollout_benchmark -- --workspace fixture --samples 1 --cold-build-samples 1` +3. 
`cargo test --manifest-path clients/agent-runtime/Cargo.toml` + - Result: ✅ passed + - Evidence: full `clients/agent-runtime` test coverage passed, including the main lib test binaries (`3402` and `3429` passing tests) plus the rollout benchmark example tests. + +4. `cargo run --manifest-path clients/agent-runtime/Cargo.toml --example code_search_rollout_benchmark -- --workspace fixture --samples 1 --cold-build-samples 1` - Result: ✅ passed - Evidence: fixture benchmark executed end-to-end, emitted shell/native rows for literal and regex cases, and labeled regex rows as `fallback_discovery_live_verification` with `query_regex_not_supported`. -4. `cargo fmt --manifest-path clients/agent-runtime/Cargo.toml --all -- --check` +5. `node ../../../../scripts/validate-docs-metadata.mjs` - Result: ✅ passed - - Evidence: formatting re-check completed successfully during re-verification. + - Evidence: documentation metadata validation passed for the English and Spanish `code_search` rollout pages after the required frontmatter fixes. ### Not rerun during verification