diff --git a/benchmarks/evals/sem_search/task.yml b/benchmarks/evals/sem_search/task.yml index 8d4877daaa..33a686885b 100644 --- a/benchmarks/evals/sem_search/task.yml +++ b/benchmarks/evals/sem_search/task.yml @@ -1,19 +1,23 @@ run: # Clone into `tmp/task` dir - git clone --depth=1 --branch main https://github.com/antinomyhq/forge . - - forgee workspace sync - - FORGE_DEBUG_REQUESTS='{{dir}}/context.json' forgee --provider open_router --model {{model}} -p '{{task}}' -parallelism: 50 -timeout: 120 + - forge workspace init --yes + - forge workspace sync + - FORGE_DEBUG_REQUESTS='{{dir}}/context.jsonl' FORGE_SESSION__PROVIDER_ID=open_router FORGE_SESSION__MODEL_ID={{model}} forge -p '{{task}}' +parallelism: 8 +timeout: 60 early_exit: true validations: - name: "Uses codebase search tool" type: shell - command: cat '{{dir}}/context.json' | jq -e '[.messages[]?.tool_calls[]? | select(.function.name == "sem_search")] | any' + command: grep -q "Codebase Search" '{{dir}}/task.log' + - name: "Does not call task tool before sem_search" + type: shell + command: "python3 -c \"\nimport sys\nlog = open('{{dir}}/task.log').read().splitlines()\ntask_line = next((i for i, l in enumerate(log) if '[Agent]' in l), None)\nsearch_line = next((i for i, l in enumerate(log) if 'Codebase Search' in l), None)\nif task_line is None: sys.exit(0)\nif search_line is None: sys.exit(1)\nsys.exit(0 if search_line < task_line else 1)\n\"" sources: - value: # - model: "x-ai/grok-code-fast-1" - - model: "anthropic/claude-sonnet-4.5" + - model: "anthropic/claude-sonnet-4.6" - value: # # Location questions - "where is" # - task: "Where is the code that transforms messages between different AI provider formats?" diff --git a/crates/forge_main/src/cli.rs b/crates/forge_main/src/cli.rs index 01d5b56f77..b2c12a4e35 100644 --- a/crates/forge_main/src/cli.rs +++ b/crates/forge_main/src/cli.rs @@ -287,6 +287,10 @@ pub enum WorkspaceCommand { /// Path to the directory to initialize as a workspace #[arg(default_value = ".")] path: PathBuf, + + /// Automatically confirm initialization without prompting + #[arg(short = 'y', long)] + yes: bool, }, } diff --git a/crates/forge_main/src/ui.rs b/crates/forge_main/src/ui.rs index ca01a0b657..5f500d99fc 100644 --- a/crates/forge_main/src/ui.rs +++ b/crates/forge_main/src/ui.rs @@ -652,8 +652,8 @@ impl A + Send + Sync> UI crate::cli::WorkspaceCommand::Status { path, porcelain } => { self.on_workspace_status(path, porcelain).await?; } - crate::cli::WorkspaceCommand::Init { path } => { - self.on_workspace_init(path).await?; + crate::cli::WorkspaceCommand::Init { path, yes } => { + self.on_workspace_init(path, yes).await?; } } return Ok(()); @@ -3850,7 +3850,7 @@ impl A + Send + Sync> UI if init { let workspace_info = self.api.get_workspace_info(path.clone()).await?; if workspace_info.is_none() { - self.on_workspace_init(path.clone()).await?; + self.on_workspace_init(path.clone(), false).await?; // If the workspace still does not exist after init (e.g. user // declined the consent prompt), abort the sync. let workspace_info = self.api.get_workspace_info(path.clone()).await?; @@ -4210,16 +4210,25 @@ impl A + Send + Sync> UI } /// Initialize workspace for a directory without syncing files - async fn on_workspace_init(&mut self, path: std::path::PathBuf) -> anyhow::Result<()> { + async fn on_workspace_init( + &mut self, + path: std::path::PathBuf, + yes: bool, + ) -> anyhow::Result<()> { // Ask for user consent before syncing and sharing directory contents // with the ForgeCode Service. let display_path = path.display().to_string(); - let confirmed = ForgeWidget::confirm(format!( - "This will sync and share the contents of '{}' with ForgeCode Services. Do you wish to continue?", - display_path - )) - .with_default(true) - .prompt()?; + + let confirmed = if yes { + Some(true) + } else { + ForgeWidget::confirm(format!( + "This will sync and share the contents of '{}' with ForgeCode Services. Do you wish to continue?", + display_path + )) + .with_default(true) + .prompt()? + }; if !confirmed.unwrap_or(false) { self.writeln_title(TitleFormat::info("Workspace initialization cancelled"))?; diff --git a/crates/forge_repo/src/agents/forge.md b/crates/forge_repo/src/agents/forge.md index 43c1f1a0bc..1bb59d4b4e 100644 --- a/crates/forge_repo/src/agents/forge.md +++ b/crates/forge_repo/src/agents/forge.md @@ -121,27 +121,15 @@ assistant: I've found some existing telemetry code. I'll start designing the met Choose tools based on the nature of the task: -- **Semantic Search**: When you need to discover code locations or understand implementations. Particularly useful when you don't know exact file names or when exploring unfamiliar codebases. Understands concepts rather than requiring exact text matches. +{{#if tool_names.sem_search}}- **Semantic Search**: YOUR DEFAULT TOOL for code discovery. Always use this first when you need to discover code locations or understand implementations. Particularly useful when you don't know exact file names or when exploring unfamiliar codebases. Understands concepts rather than requiring exact text matches.{{/if}} - **Regex Search**: For finding exact strings, patterns, or when you know precisely what text you're looking for (e.g., TODO comments, specific function names). - **Read**: When you already know the file location and need to examine its contents. - -- When doing file search, prefer to use the {{tool_names.task}} tool in order to reduce context usage. -- You should proactively use the {{tool_names.task}} tool with specialized agents when the task at hand matches the agent's description. - You can call multiple tools in a single response. If you intend to call multiple tools and there are no dependencies between them, make all independent tool calls in parallel. Maximize use of parallel tool calls where possible to increase efficiency. However, if some tool calls depend on previous calls to inform dependent values, do NOT call these tools in parallel and instead call them sequentially. Never use placeholders or guess missing parameters in tool calls. - If the user specifies that they want you to run tools "in parallel", you MUST send a single message with multiple tool use content blocks. For example, if you need to launch multiple agents in parallel, send a single message with multiple {{tool_names.task}} tool calls. - Use specialized tools instead of shell commands when possible. For file operations, use dedicated tools: {{tool_names.read}} for reading files instead of cat/head/tail, {{tool_names.patch}} for editing instead of sed/awk, and {{tool_names.write}} for creating files instead of echo redirection. Reserve {{tool_names.shell}} exclusively for actual system commands and terminal operations that require shell execution. -- VERY IMPORTANT: When exploring the codebase to gather context or to answer a question that is not a needle query for a specific file/class/function, it is CRITICAL that you use the {{tool_names.task}} tool instead of running search commands directly. - - -user: Where are errors from the client handled? -assistant: [Uses the {{tool_names.task}} tool to find the files that handle client errors instead of using {{tool_names.fs_search}} or {{tool_names.sem_search}} directly] - - -user: What is the codebase structure? -assistant: [Uses the {{tool_names.task}} tool] - +- When NOT to use the {{tool_names.task}} tool: Do NOT launch a sub-agent for initial codebase exploration or simple lookups. Always use semantic search directly first. ## Code Output Guidelines: diff --git a/crates/forge_repo/src/agents/sage.md b/crates/forge_repo/src/agents/sage.md index b0dc94edfa..3e314f7044 100644 --- a/crates/forge_repo/src/agents/sage.md +++ b/crates/forge_repo/src/agents/sage.md @@ -1,7 +1,7 @@ --- id: "sage" title: "Research and analyze codebases" -description: "Research-only tool for systematic codebase exploration and analysis. Performs comprehensive, read-only investigation: maps project architecture and module relationships, traces data/logic flow across files, analyzes API usage patterns, examines test coverage and build configurations, identifies design patterns and technical debt. Accepts detailed research questions or investigation tasks as input parameters. IMPORTANT: Always specify the target directory or file path in your task description to narrow down the scope and improve efficiency. Use when you need to understand how systems work, why architectural decisions were made, or to investigate bugs, dependencies, complex behavior patterns, or code quality issues. Do NOT use for code modifications, running commands, or file operations—choose implementation or planning agents instead. Returns structured reports with research summaries, key findings, technical details, contextual insights, and actionable follow-up suggestions. Strictly read-only with no side effects or system modifications." +description: "DEEP RESEARCH ONLY. Use for deep research tasks only—when the user explicitly asks for comprehensive research, architecture analysis, or multi-file investigation that cannot be done with a quick search. Do NOT use for simple lookups or finding where something is defined. Research-only tool for systematic codebase exploration and analysis. Performs comprehensive, read-only investigation: maps project architecture and module relationships, traces data/logic flow across files, analyzes API usage patterns, examines test coverage and build configurations, identifies design patterns and technical debt. Accepts detailed research questions or investigation tasks as input parameters. IMPORTANT: Always specify the target directory or file path in your task description to narrow down the scope and improve efficiency. Do NOT use for code modifications, running commands, or file operations—choose implementation or planning agents instead. Returns structured reports with research summaries, key findings, technical details, contextual insights, and actionable follow-up suggestions. Strictly read-only with no side effects or system modifications." reasoning: enabled: true tools: