From ad9989504eb1713ac53a582a3cbdc63ec7fa8170 Mon Sep 17 00:00:00 2001 From: Stefan Broenner Date: Thu, 19 Feb 2026 11:56:25 +0100 Subject: [PATCH 1/3] fix: replace sys.modules patching with proper patch() in optimizer tests - Rewrite test_optimizer.py using patch('...PydanticAgent', ...) cleanly - Add TestAzureEntraModel class (3 tests covering default gpt-5.2-chat deployment) - Update test_optimizer_integration.py to use azure_entra_model() (gpt-5.2-chat) instead of OPENAI_API_KEY skip guard -- all 3 now pass against real Azure - Verified full test->optimize->test loop end-to-end: 3/3 passed in 64s Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/pytest_codingagents/__init__.py | 7 +- src/pytest_codingagents/copilot/optimizer.py | 98 +++++-- tests/test_optimizer_integration.py | 28 +- tests/unit/test_optimizer.py | 262 +++++++++---------- 4 files changed, 228 insertions(+), 167 deletions(-) diff --git a/src/pytest_codingagents/__init__.py b/src/pytest_codingagents/__init__.py index 2d93271..8110341 100644 --- a/src/pytest_codingagents/__init__.py +++ b/src/pytest_codingagents/__init__.py @@ -4,13 +4,18 @@ from pytest_codingagents.copilot.agent import CopilotAgent from pytest_codingagents.copilot.agents import load_custom_agent, load_custom_agents -from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction +from pytest_codingagents.copilot.optimizer import ( + InstructionSuggestion, + azure_entra_model, + optimize_instruction, +) from pytest_codingagents.copilot.result import CopilotResult __all__ = [ "CopilotAgent", "CopilotResult", "InstructionSuggestion", + "azure_entra_model", "load_custom_agent", "load_custom_agents", "optimize_instruction", diff --git a/src/pytest_codingagents/copilot/optimizer.py b/src/pytest_codingagents/copilot/optimizer.py index 1d2fb99..c97c6c4 100644 --- a/src/pytest_codingagents/copilot/optimizer.py +++ b/src/pytest_codingagents/copilot/optimizer.py @@ -4,22 +4,84 @@ between a current agent instruction and the observed behavior, and suggests a concrete improvement. -Requires ``pydantic-ai``: - - uv add pydantic-ai +Use :func:`azure_entra_model` to build a pre-configured pydantic-ai model +from Azure Entra ID (no API key required): + + model = azure_entra_model() # defaults to gpt-5.2-chat + suggestion = await optimize_instruction( + agent.instructions or "", + result, + "Agent should add docstrings.", + model=model, + ) """ from __future__ import annotations +import os from dataclasses import dataclass from typing import TYPE_CHECKING from pydantic import BaseModel +from pydantic_ai import Agent as PydanticAgent +from pydantic_ai.models import Model if TYPE_CHECKING: from pytest_codingagents.copilot.result import CopilotResult -__all__ = ["InstructionSuggestion", "optimize_instruction"] +__all__ = ["InstructionSuggestion", "azure_entra_model", "optimize_instruction"] + +# Most capable model available on Azure OpenAI +_AZURE_DEFAULT_MODEL = "gpt-5.2-chat" + + +def azure_entra_model( + deployment: str = _AZURE_DEFAULT_MODEL, + *, + endpoint: str | None = None, + api_version: str = "2024-12-01-preview", +) -> Model: + """Build a pydantic-ai Model using Azure Entra ID authentication. + + No API key required — uses ``DefaultAzureCredential`` (works with + ``az login`` locally and managed identity in CI). + + Args: + deployment: Azure OpenAI deployment name. Defaults to + ``"gpt-5.2-chat"`` — the most capable model available. + endpoint: Azure OpenAI endpoint URL. 
Defaults to the + ``AZURE_OPENAI_ENDPOINT`` environment variable. + api_version: Azure OpenAI API version string. + + Returns: + A pydantic-ai ``Model`` ready to pass to ``optimize_instruction()``. + + Example:: + + model = azure_entra_model() + suggestion = await optimize_instruction( + agent.instructions or "", + result, + "Agent should add docstrings.", + model=model, + ) + """ + from azure.identity import DefaultAzureCredential, get_bearer_token_provider + from openai import AsyncAzureOpenAI + from pydantic_ai.models.openai import OpenAIChatModel + from pydantic_ai.providers.openai import OpenAIProvider + + azure_endpoint = endpoint or os.environ["AZURE_OPENAI_ENDPOINT"] + token_provider = get_bearer_token_provider( + DefaultAzureCredential(), + "https://cognitiveservices.azure.com/.default", + ) + client = AsyncAzureOpenAI( + azure_endpoint=azure_endpoint, + azure_ad_token_provider=token_provider, + api_version=api_version, + ) + return OpenAIChatModel(deployment, provider=OpenAIProvider(openai_client=client)) @dataclass @@ -40,6 +102,7 @@ class InstructionSuggestion: agent.instructions, result, "Agent should add docstrings to all functions.", + model=azure_entra_model(), ) pytest.fail(f"No docstrings found.\\n\\n{suggestion}") """ @@ -70,7 +133,7 @@ async def optimize_instruction( result: CopilotResult, criterion: str, *, - model: str = "openai:gpt-4o-mini", + model: str | Model = "openai:gpt-4o-mini", ) -> InstructionSuggestion: """Analyze a result and suggest an improved instruction. @@ -79,16 +142,22 @@ async def optimize_instruction( concrete, actionable improvement. Designed to drop into ``pytest.fail()`` so the failure message - contains a ready-to-use fix: + contains a ready-to-use fix. + + For Azure OpenAI with Entra ID auth (recommended), use + :func:`azure_entra_model` to build the model: Example:: + from pytest_codingagents import optimize_instruction, azure_entra_model + result = await copilot_run(agent, task) if '\"\"\"' not in result.file("main.py"): suggestion = await optimize_instruction( agent.instructions or "", result, "Agent should add docstrings to all functions.", + model=azure_entra_model(), # gpt-5.2-chat via Entra ID ) pytest.fail(f"No docstrings found.\\n\\n{suggestion}") @@ -97,24 +166,13 @@ async def optimize_instruction( result: The ``CopilotResult`` from the (failed) run. criterion: What the agent *should* have done — the test expectation in plain English (e.g. ``"Always write docstrings"``). - model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"`` - or ``"anthropic:claude-3-haiku-20240307"``). + model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"``) + **or** a pre-configured pydantic-ai ``Model`` object built with + :func:`azure_entra_model` or any other provider. Returns: An :class:`InstructionSuggestion` with the improved instruction. - - Raises: - ImportError: If pydantic-ai is not installed. """ - try: - from pydantic_ai import Agent as PydanticAgent - except ImportError as exc: - msg = ( - "pydantic-ai is required for optimize_instruction(). 
" - "Install it with: uv add pydantic-ai" - ) - raise ImportError(msg) from exc - final_output = result.final_response or "(no response)" tool_calls = ", ".join(sorted(result.tool_names_called)) or "none" diff --git a/tests/test_optimizer_integration.py b/tests/test_optimizer_integration.py index b1b6ea7..2fb2f1e 100644 --- a/tests/test_optimizer_integration.py +++ b/tests/test_optimizer_integration.py @@ -2,9 +2,9 @@ These tests require: - GitHub Copilot credentials (for copilot_run to produce a real result) -- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model) +- AZURE_OPENAI_ENDPOINT env var set (for the optimizer LLM via Azure Entra ID) -Skipped automatically when the required API key is absent. +Skipped automatically when AZURE_OPENAI_ENDPOINT is absent. """ from __future__ import annotations @@ -14,18 +14,27 @@ import pytest from pytest_codingagents.copilot.agent import CopilotAgent -from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction +from pytest_codingagents.copilot.optimizer import ( + InstructionSuggestion, + azure_entra_model, + optimize_instruction, +) + + +def _model(): + """Build Azure Entra ID model for optimizer tests.""" + return azure_entra_model() # defaults to gpt-5.2-chat @pytest.mark.copilot class TestOptimizeInstructionIntegration: - """Integration tests for optimize_instruction() with real LLM calls.""" + """Integration tests for optimize_instruction() with real Azure LLM calls.""" @pytest.fixture(autouse=True) - def require_openai_key(self): - """Skip entire class when OPENAI_API_KEY is not set.""" - if not os.environ.get("OPENAI_API_KEY"): - pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests") + def require_azure_endpoint(self): + """Skip entire class when AZURE_OPENAI_ENDPOINT is not set.""" + if not os.environ.get("AZURE_OPENAI_ENDPOINT"): + pytest.skip("AZURE_OPENAI_ENDPOINT not set — skipping optimizer integration tests") async def test_returns_valid_suggestion(self, copilot_run, tmp_path): """optimize_instruction returns an InstructionSuggestion with non-empty fields.""" @@ -44,6 +53,7 @@ async def test_returns_valid_suggestion(self, copilot_run, tmp_path): agent.instructions or "", result, "Every function must have a Google-style docstring.", + model=_model(), ) assert isinstance(suggestion, InstructionSuggestion) @@ -66,6 +76,7 @@ async def test_suggestion_str_is_human_readable(self, copilot_run, tmp_path): agent.instructions or "", result, "Add type hints to all function parameters and return values.", + model=_model(), ) text = str(suggestion) @@ -91,6 +102,7 @@ async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path): agent.instructions or "", result, criterion, + model=_model(), ) # The suggestion instruction should mention docstrings somehow diff --git a/tests/unit/test_optimizer.py b/tests/unit/test_optimizer.py index 81e797c..1d2871f 100644 --- a/tests/unit/test_optimizer.py +++ b/tests/unit/test_optimizer.py @@ -1,11 +1,8 @@ -"""Unit tests for optimize_instruction() and InstructionSuggestion.""" +"""Unit tests for optimize_instruction(), azure_entra_model(), and InstructionSuggestion.""" from __future__ import annotations -import sys -from unittest.mock import AsyncMock, MagicMock - -import pytest +from unittest.mock import AsyncMock, MagicMock, patch from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction from pytest_codingagents.copilot.result import CopilotResult, ToolCall, Turn @@ -20,27 
+17,21 @@ def _make_result( tool_calls = [ToolCall(name=t, arguments={}) for t in (tools or [])] return CopilotResult( success=success, - turns=[ - Turn(role="assistant", content=final_response, tool_calls=tool_calls), - ], + turns=[Turn(role="assistant", content=final_response, tool_calls=tool_calls)], ) def _make_agent_mock(instruction: str, reasoning: str, changes: str) -> MagicMock: - """Build a pydantic-ai Agent mock that returns a structured suggestion.""" - output = MagicMock() - output.instruction = instruction - output.reasoning = reasoning - output.changes = changes - - run_result = MagicMock() - run_result.output = output - + """Return a MagicMock that behaves like pydantic-ai Agent class.""" + output = MagicMock(instruction=instruction, reasoning=reasoning, changes=changes) + run_result = MagicMock(output=output) agent_instance = MagicMock() agent_instance.run = AsyncMock(return_value=run_result) + return MagicMock(return_value=agent_instance) + - agent_class = MagicMock(return_value=agent_instance) - return agent_class +# Patch target: PydanticAgent as imported in the optimizer module +_AGENT_PATCH = "pytest_codingagents.copilot.optimizer.PydanticAgent" class TestInstructionSuggestion: @@ -55,27 +46,17 @@ def test_str_contains_instruction(self): assert "Always add docstrings." in str(s) def test_str_contains_reasoning(self): - s = InstructionSuggestion( - instruction="inst", - reasoning="because reasons", - changes="changed x", - ) + s = InstructionSuggestion(instruction="inst", reasoning="because reasons", changes="x") assert "because reasons" in str(s) def test_str_contains_changes(self): s = InstructionSuggestion( - instruction="inst", - reasoning="reason", - changes="Added docstring mandate.", + instruction="inst", reasoning="reason", changes="Added docstring mandate." ) assert "Added docstring mandate." in str(s) def test_fields_accessible(self): - s = InstructionSuggestion( - instruction="inst", - reasoning="reason", - changes="changes", - ) + s = InstructionSuggestion(instruction="inst", reasoning="reason", changes="changes") assert s.instruction == "inst" assert s.reasoning == "reason" assert s.changes == "changes" @@ -85,22 +66,15 @@ class TestOptimizeInstruction: """Tests for optimize_instruction().""" async def test_returns_instruction_suggestion(self): - """optimize_instruction returns an InstructionSuggestion.""" agent_class = _make_agent_mock( instruction="Always add Google-style docstrings.", reasoning="The original instruction omits documentation.", changes="Added docstring mandate.", ) - - # patch pydantic_ai.Agent in the module where it's imported - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = await optimize_instruction( - "Write Python code.", - _make_result(), - "Agent should add docstrings.", - ) - + with patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction( + "Write Python code.", _make_result(), "Agent should add docstrings." + ) assert isinstance(result, InstructionSuggestion) assert result.instruction == "Always add Google-style docstrings." assert result.reasoning == "The original instruction omits documentation." 
@@ -109,127 +83,139 @@ async def test_returns_instruction_suggestion(self): async def test_uses_default_model(self): """optimize_instruction defaults to openai:gpt-4o-mini.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction("inst", _make_result(), "criterion") - - agent_class.assert_called_once() + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction("inst", _make_result(), "criterion") assert agent_class.call_args[0][0] == "openai:gpt-4o-mini" - async def test_accepts_custom_model(self): + async def test_accepts_custom_model_string(self): """optimize_instruction accepts a custom model string.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "inst", - _make_result(), - "criterion", - model="anthropic:claude-3-haiku-20240307", - ) - + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", + _make_result(), + "criterion", + model="anthropic:claude-3-haiku-20240307", + ) assert agent_class.call_args[0][0] == "anthropic:claude-3-haiku-20240307" + async def test_accepts_model_object(self): + """optimize_instruction accepts a pre-built Model object (e.g. azure_entra_model()).""" + agent_class = _make_agent_mock("inst", "reason", "changes") + fake_model = MagicMock() + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction("inst", _make_result(), "criterion", model=fake_model) + assert agent_class.call_args[0][0] is fake_model + async def test_includes_criterion_in_prompt(self): - """The LLM prompt includes the criterion text.""" agent_class = _make_agent_mock("improved", "reason", "change") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "Write code.", - _make_result(), - "Agent must use type hints on all functions.", - ) - - prompt = agent_instance.run.call_args[0][0] - assert "type hints" in prompt + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "Write code.", _make_result(), "Agent must use type hints on all functions." 
+ ) + assert "type hints" in agent_instance.run.call_args[0][0] async def test_includes_current_instruction_in_prompt(self): - """The LLM prompt contains the current instruction.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "Always use FastAPI for web APIs.", - _make_result(), - "criterion", - ) - - prompt = agent_instance.run.call_args[0][0] - assert "FastAPI" in prompt + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "Always use FastAPI for web APIs.", _make_result(), "criterion" + ) + assert "FastAPI" in agent_instance.run.call_args[0][0] async def test_includes_agent_output_in_prompt(self): - """The LLM prompt contains the agent's final response.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = _make_result(final_response="def add(a, b): return a + b") - await optimize_instruction("inst", result, "criterion") - - prompt = agent_instance.run.call_args[0][0] - assert "def add" in prompt + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", _make_result(final_response="def add(a, b): return a + b"), "criterion" + ) + assert "def add" in agent_instance.run.call_args[0][0] async def test_handles_no_final_response(self): - """optimize_instruction handles results with no turns gracefully.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - empty_result = CopilotResult(success=False, turns=[]) - result = await optimize_instruction("inst", empty_result, "criterion") - + with patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction( + "inst", CopilotResult(success=False, turns=[]), "criterion" + ) assert isinstance(result, InstructionSuggestion) async def test_handles_empty_instruction(self): - """optimize_instruction handles empty current instruction.""" agent_class = _make_agent_mock("new inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = await optimize_instruction("", _make_result(), "criterion") + with patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction("", _make_result(), "criterion") assert isinstance(result, InstructionSuggestion) async def test_includes_tool_calls_in_prompt(self): - """The LLM prompt includes tool call information.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = _make_result(tools=["create_file", "read_file"]) - await optimize_instruction("inst", result, "criterion") - - prompt = agent_instance.run.call_args[0][0] - assert "create_file" in prompt - - -class TestOptimizeInstructionImportError: - """Test ImportError when pydantic-ai is not installed.""" - - async def test_raises_import_error_when_pydantic_ai_missing(self): - """optimize_instruction raises ImportError if pydantic-ai not installed.""" - saved = sys.modules.get("pydantic_ai") - try: - sys.modules["pydantic_ai"] = None # type: ignore - - with pytest.raises(ImportError, match="pydantic-ai"): - await optimize_instruction("inst", _make_result(), "criterion") - finally: - if saved is not None: - 
sys.modules["pydantic_ai"] = saved - else: - del sys.modules["pydantic_ai"] - - async def test_import_error_includes_install_hint(self): - """ImportError message includes the uv add install hint.""" - saved = sys.modules.get("pydantic_ai") - try: - sys.modules["pydantic_ai"] = None # type: ignore - - with pytest.raises(ImportError, match="uv add pydantic-ai"): - await optimize_instruction("inst", _make_result(), "criterion") - finally: - if saved is not None: - sys.modules["pydantic_ai"] = saved - else: - del sys.modules["pydantic_ai"] + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", _make_result(tools=["create_file", "read_file"]), "criterion" + ) + assert "create_file" in agent_instance.run.call_args[0][0] + + +class TestAzureEntraModel: + """Tests for azure_entra_model().""" + + # Patch targets: lazy imports inside the function body live in their home modules + _PATCHES = [ + ("azure.identity.DefaultAzureCredential", MagicMock()), + ("azure.identity.get_bearer_token_provider", MagicMock()), + ("openai.AsyncAzureOpenAI", MagicMock()), + ("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + ] + + def test_returns_model_object(self): + """azure_entra_model() returns a pydantic-ai Model-compatible object.""" + from pytest_codingagents.copilot.optimizer import azure_entra_model + + fake_model = MagicMock() + with ( + patch("azure.identity.DefaultAzureCredential", MagicMock()), + patch("azure.identity.get_bearer_token_provider", MagicMock()), + patch("openai.AsyncAzureOpenAI", MagicMock()), + patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + patch("pydantic_ai.models.openai.OpenAIChatModel", return_value=fake_model), + ): + result = azure_entra_model(endpoint="https://test.openai.azure.com/") + assert result is fake_model + + def test_default_deployment_is_gpt52(self): + """azure_entra_model() defaults to gpt-5.2-chat.""" + from pytest_codingagents.copilot.optimizer import azure_entra_model + + captured: list[str] = [] + with ( + patch("azure.identity.DefaultAzureCredential", MagicMock()), + patch("azure.identity.get_bearer_token_provider", MagicMock()), + patch("openai.AsyncAzureOpenAI", MagicMock()), + patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + patch( + "pydantic_ai.models.openai.OpenAIChatModel", + side_effect=lambda name, **kw: captured.append(name) or MagicMock(), + ), + ): + azure_entra_model(endpoint="https://test.openai.azure.com/") + assert captured == ["gpt-5.2-chat"] + + def test_custom_deployment_name(self): + """azure_entra_model() uses the provided deployment name.""" + from pytest_codingagents.copilot.optimizer import azure_entra_model + + captured: list[str] = [] + with ( + patch("azure.identity.DefaultAzureCredential", MagicMock()), + patch("azure.identity.get_bearer_token_provider", MagicMock()), + patch("openai.AsyncAzureOpenAI", MagicMock()), + patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + patch( + "pydantic_ai.models.openai.OpenAIChatModel", + side_effect=lambda name, **kw: captured.append(name) or MagicMock(), + ), + ): + azure_entra_model("gpt-4.1", endpoint="https://test.openai.azure.com/") + assert captured == ["gpt-4.1"] From febc007cb12468ec5f0a2d79764d056410b2ed87 Mon Sep 17 00:00:00 2001 From: Stefan Broenner Date: Thu, 19 Feb 2026 15:46:11 +0100 Subject: [PATCH 2/3] feat: add IDE personas for VS Code, Claude Code, and Copilot CLI Introduces a Persona concept that simulates each IDE's native tool environment during testing: - VSCodePersona (default): 
polyfills runSubagent + auto-loads .github/copilot-instructions.md from working_directory - CopilotCLIPersona: task+skill already native; auto-loads .github/copilot-instructions.md - ClaudeCodePersona: polyfills task-dispatch + auto-loads CLAUDE.md - HeadlessPersona: raw SDK baseline, no polyfills, no file loading Also: - from_copilot_config() now discovers agents recursively (rglob) so agents under subagents/ subdirectories are found automatically - EventMapper gains record_subagent_start/complete/failed public methods - runSubagent injection moved from runner.py into personas.py Bumps version to 0.2.1. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 2 +- docs/how-to/copilot-config.md | 5 +- docs/how-to/ide-personas.md | 108 +++++ docs/how-to/index.md | 1 + docs/reference/api.md | 22 + docs/reference/configuration.md | 1 + pyproject.toml | 2 +- src/pytest_codingagents/__init__.py | 12 + src/pytest_codingagents/copilot/agent.py | 26 +- src/pytest_codingagents/copilot/events.py | 25 ++ src/pytest_codingagents/copilot/personas.py | 440 ++++++++++++++++++++ src/pytest_codingagents/copilot/runner.py | 6 +- 12 files changed, 643 insertions(+), 7 deletions(-) create mode 100644 docs/how-to/ide-personas.md create mode 100644 src/pytest_codingagents/copilot/personas.py diff --git a/README.md b/README.md index fdff77f..c6572d2 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ pytest-codingagents gives you a complete **test→optimize→test loop** for Git 4. **A/B confirm** — use `ab_run` to prove the change actually helps 5. **Ship it** — you now have evidence, not vibes -Currently supports **GitHub Copilot** via [copilot-sdk](https://www.npmjs.com/package/github-copilot-sdk). More agents (Claude Code, etc.) coming soon. +Currently supports **GitHub Copilot** via [copilot-sdk](https://www.npmjs.com/package/github-copilot-sdk) with **IDE personas** for VS Code, Claude Code, and Copilot CLI environments. ```python from pytest_codingagents import CopilotAgent, optimize_instruction diff --git a/docs/how-to/copilot-config.md b/docs/how-to/copilot-config.md index 0ed74ae..52f9d5f 100644 --- a/docs/how-to/copilot-config.md +++ b/docs/how-to/copilot-config.md @@ -9,7 +9,9 @@ test fixture project, a shared team config repo, or anything else. | Source | Path (relative to the root you point at) | Maps to | |--------|------------------------------------------|---------| | Instructions | `.github/copilot-instructions.md` | `instructions` | -| Custom agents | `.github/agents/*.agent.md` | `custom_agents` | +| Custom agents | `.github/agents/**/*.agent.md` (recursive) | `custom_agents` | + +Agent files are discovered recursively — agents in `subagents/` subdirectories (e.g. `.github/agents/hve-core/subagents/`) are included automatically. ## Basic usage @@ -102,6 +104,7 @@ The Markdown body becomes the agent's prompt. 
## See also +- [IDE Personas Guide](ide-personas.md) — Simulate VS Code, Claude Code, or Copilot CLI environments - [A/B Testing Guide](ab-testing.md) - [GitHub Copilot custom agents docs](https://docs.github.com/en/copilot/how-tos/copilot-cli/customize-copilot/create-custom-agents-for-cli) - [Custom agents configuration reference](https://docs.github.com/en/copilot/reference/custom-agents-configuration) diff --git a/docs/how-to/ide-personas.md b/docs/how-to/ide-personas.md new file mode 100644 index 0000000..6185bef --- /dev/null +++ b/docs/how-to/ide-personas.md @@ -0,0 +1,108 @@ +# IDE Personas + +Agents written for VS Code, Claude Code, or the Copilot CLI each expect a +different native tool set. A `Persona` tells `pytest-codingagents` which +runtime environment to simulate so your tests run the agent the same way +the IDE would. + +## The problem + +An agent like `rpi-agent` is written for VS Code, where `runSubagent` is a +native tool. In the Copilot SDK headless mode `runSubagent` does not exist, +so the agent silently falls back to direct implementation — the RPI pipeline +never fires, and the test proves nothing. + +A persona solves this by: + +1. **Injecting polyfill tools** — e.g. a Python-side `runSubagent` that + dispatches registered custom agents as nested SDK runs. +2. **Auto-loading custom instructions** — VS Code and Copilot CLI read + `.github/copilot-instructions.md`; Claude Code reads `CLAUDE.md`. The + persona does the same, prepending the file to the session's system + message when `working_directory` is set. +3. **Setting IDE context** — adds a system-message fragment so the model + knows which environment it is in. + +## Built-in personas + +| Persona | Auto-loaded file | Polyfilled tools | Use for | +|---|---|---|---| +| `VSCodePersona` *(default)* | `.github/copilot-instructions.md` | `runSubagent` | VS Code Copilot agents | +| `CopilotCLIPersona` | `.github/copilot-instructions.md` | none — `task` + `skill` are native | Copilot terminal agents | +| `ClaudeCodePersona` | `CLAUDE.md` | `task`-dispatch | Claude Code agents | +| `HeadlessPersona` | nothing | none | Raw SDK baseline | + +## Usage + +```python +from pytest_codingagents import CopilotAgent, VSCodePersona, CopilotCLIPersona, ClaudeCodePersona, HeadlessPersona + +# VS Code agent — auto-loads .github/copilot-instructions.md, polyfills runSubagent +agent = CopilotAgent( + persona=VSCodePersona(), + working_directory=str(workspace), + custom_agents=my_agents, +) + +# Default — VSCodePersona is used automatically +agent = CopilotAgent(custom_agents=my_agents) + +# Copilot CLI — same instructions file; task+skill already native, no polyfill needed +agent = CopilotAgent(persona=CopilotCLIPersona(), working_directory=str(workspace)) + +# Claude Code — loads CLAUDE.md, polyfills task-dispatch +agent = CopilotAgent( + persona=ClaudeCodePersona(), + working_directory=str(workspace), + custom_agents=my_agents, +) + +# Headless baseline — no IDE context, no file loaded, no polyfills +agent = CopilotAgent(persona=HeadlessPersona()) +``` + +## Custom instructions loading + +Custom instruction loading is **automatic and additive**: + +- Fires only when `agent.working_directory` is set +- Fires only when the target file exists in that directory +- Prepends the file content to the session system message (before any + `instructions` you set on the agent) +- If the file is absent, the persona works exactly as without it + +This means the same test works against a workspace that has +`.github/copilot-instructions.md` and 
one that does not — the persona +adapts silently. + +## `runSubagent` polyfill + +`VSCodePersona` injects `runSubagent` as a Python-side tool when +`agent.custom_agents` is non-empty. The tool dispatches the named agent +as a nested `run_copilot` call, so the model's sub-agent invocations +produce real results — not stub responses. + +The polyfill is a no-op when `custom_agents` is empty. + +## Extending personas + +Subclass `Persona` and override `apply()`: + +```python +from pytest_codingagents import Persona, CopilotAgent + +class MyPersona(Persona): + def apply(self, agent, session_config, mapper): + # Add your tool polyfills or system message additions here + session_config.setdefault("system_message", {})["content"] = ( + "Custom context. " + + session_config.get("system_message", {}).get("content", "") + ) + +agent = CopilotAgent(persona=MyPersona()) +``` + +## See also + +- [Load from Copilot Config](copilot-config.md) +- [Tool Control](tool-control.md) diff --git a/docs/how-to/index.md b/docs/how-to/index.md index b1840ea..2ad9ce9 100644 --- a/docs/how-to/index.md +++ b/docs/how-to/index.md @@ -6,6 +6,7 @@ Practical guides for common tasks. - [Optimize Instructions](optimize.md) — Use AI to turn test failures into actionable instruction improvements - [Assertions](assertions.md) — File helpers and semantic assertions with `llm_assert` - [Load from Copilot Config](copilot-config.md) — Build a `CopilotAgent` from your real `.github/` config files +- [IDE Personas](ide-personas.md) — Simulate VS Code, Claude Code, or Copilot CLI tool environments - [Skill Testing](skills.md) — Measure the impact of domain knowledge - [MCP Server Testing](mcp-servers.md) — Test that the agent uses your custom tools - [CLI Tool Testing](cli-tools.md) — Verify the agent operates CLI tools correctly diff --git a/docs/reference/api.md b/docs/reference/api.md index 8f16a53..8e1f381 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -15,3 +15,25 @@ ::: pytest_codingagents.InstructionSuggestion options: show_source: false + +## IDE Personas + +::: pytest_codingagents.Persona + options: + show_source: false + +::: pytest_codingagents.VSCodePersona + options: + show_source: false + +::: pytest_codingagents.CopilotCLIPersona + options: + show_source: false + +::: pytest_codingagents.ClaudeCodePersona + options: + show_source: false + +::: pytest_codingagents.HeadlessPersona + options: + show_source: false diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index d103819..32f8f2f 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -4,6 +4,7 @@ | Field | Type | Default | Description | |-------|------|---------|-------------| +| `persona` | `Persona` | `VSCodePersona()` | IDE runtime persona — controls polyfill tools and auto-loads IDE-specific custom instructions. 
See [IDE Personas](../how-to/ide-personas.md) | | `name` | `str` | `"copilot"` | Agent identifier for reports | | `model` | `str \| None` | `None` | Model to use (e.g., `claude-sonnet-4`) | | `instructions` | `str \| None` | `None` | Instructions for the agent | diff --git a/pyproject.toml b/pyproject.toml index d7f4de9..2e53803 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "pytest-codingagents" -version = "0.2.0" +version = "0.2.1" description = "Pytest plugin for testing real coding agents via their SDK" readme = "README.md" license = { text = "MIT" } diff --git a/src/pytest_codingagents/__init__.py b/src/pytest_codingagents/__init__.py index 182169f..828c111 100644 --- a/src/pytest_codingagents/__init__.py +++ b/src/pytest_codingagents/__init__.py @@ -8,12 +8,24 @@ InstructionSuggestion, optimize_instruction, ) +from pytest_codingagents.copilot.personas import ( + ClaudeCodePersona, + CopilotCLIPersona, + HeadlessPersona, + Persona, + VSCodePersona, +) from pytest_codingagents.copilot.result import CopilotResult __all__ = [ "CopilotAgent", "CopilotResult", "InstructionSuggestion", + "ClaudeCodePersona", + "CopilotCLIPersona", + "HeadlessPersona", + "Persona", + "VSCodePersona", "load_custom_agent", "load_custom_agents", "optimize_instruction", diff --git a/src/pytest_codingagents/copilot/agent.py b/src/pytest_codingagents/copilot/agent.py index 9521c24..306c8c9 100644 --- a/src/pytest_codingagents/copilot/agent.py +++ b/src/pytest_codingagents/copilot/agent.py @@ -4,10 +4,13 @@ from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal import yaml +if TYPE_CHECKING: + from pytest_codingagents.copilot.personas import Persona + def _parse_agent_file(path: Path) -> dict[str, Any]: """Parse a ``.agent.md`` file into a ``CustomAgentConfig`` dict. @@ -130,6 +133,12 @@ class CopilotAgent: # SDK passthrough for unmapped fields extra_config: dict[str, Any] = field(default_factory=dict) + # IDE persona — controls which polyfill tools are injected to simulate + # the target runtime environment (VS Code, Claude Code, Copilot CLI, etc.) + # VSCodePersona is the default: it polyfills runSubagent when custom_agents + # are present, matching VS Code's native behaviour. + persona: "Persona" = field(default_factory=lambda: _default_persona()) + def build_session_config(self) -> dict[str, Any]: """Build a SessionConfig dict for the Copilot SDK. @@ -243,11 +252,11 @@ def from_copilot_config( if instructions_file.exists(): instructions = instructions_file.read_text(encoding="utf-8").strip() or None - # Load custom agents + # Load custom agents — recursive so subagents/ subdirectories are included agents: list[dict[str, Any]] = [] agents_dir = github_dir / "agents" if agents_dir.exists(): - for agent_file in sorted(agents_dir.glob("*.agent.md")): + for agent_file in sorted(agents_dir.rglob("*.agent.md")): agents.append(_parse_agent_file(agent_file)) config: dict[str, Any] = { @@ -256,3 +265,14 @@ def from_copilot_config( } config.update(overrides) return cls(**config) + + +def _default_persona() -> "Persona": + """Return the default persona (VSCodePersona). + + Defined as a function to avoid a circular-import at module level: + ``personas.py`` imports ``agent.py``, so we defer the import. 
+ """ + from pytest_codingagents.copilot.personas import VSCodePersona # noqa: PLC0415 + + return VSCodePersona() diff --git a/src/pytest_codingagents/copilot/events.py b/src/pytest_codingagents/copilot/events.py index 9298c52..514d381 100644 --- a/src/pytest_codingagents/copilot/events.py +++ b/src/pytest_codingagents/copilot/events.py @@ -274,6 +274,31 @@ def _handle_tool_execution_complete(self, event: SessionEvent) -> None: result_text = tc.result if tc else str(result_data) self._turns.append(Turn(role="tool", content=f"[{tool_name}] {result_text or ''}")) + # ── Subagent recording (used by runSubagent tool handler) ── + + def record_subagent_start(self, name: str) -> None: + """Record a subagent invocation dispatched via the runSubagent tool.""" + self._subagent_start_times[name] = time.monotonic() + self._subagents.append(SubagentInvocation(name=name, status="started")) + + def record_subagent_complete(self, name: str) -> None: + """Mark a previously started subagent invocation as completed.""" + start = self._subagent_start_times.pop(name, None) + duration = (time.monotonic() - start) * 1000 if start else None + for sa in self._subagents: + if sa.name == name and sa.status == "started": + sa.status = "completed" + sa.duration_ms = duration + return + + def record_subagent_failed(self, name: str) -> None: + """Mark a previously started subagent invocation as failed.""" + self._subagent_start_times.pop(name, None) + for sa in self._subagents: + if sa.name == name and sa.status == "started": + sa.status = "failed" + return + # ── Subagent events ── def _handle_subagent_selected(self, event: SessionEvent) -> None: diff --git a/src/pytest_codingagents/copilot/personas.py b/src/pytest_codingagents/copilot/personas.py new file mode 100644 index 0000000..501142c --- /dev/null +++ b/src/pytest_codingagents/copilot/personas.py @@ -0,0 +1,440 @@ +"""IDE Personas for pytest-codingagents. + +A ``Persona`` defines the runtime environment in which an agent under test +is expected to run. Each persona ensures the agent has the correct tool set +for its target IDE by injecting polyfill tools and adding a system-message +fragment that sets context. + +Built-in personas +----------------- +``VSCodePersona`` (default) + Simulates the VS Code Copilot extension. Polyfills ``runSubagent`` so + that agents written for VS Code dispatch sub-agents correctly. + +``ClaudeCodePersona`` + Simulates Claude Code. Polyfills a ``task``-dispatch tool (same + mechanism as ``runSubagent``, named ``task`` to match Claude Code's + native API). + +``CopilotCLIPersona`` + Simulates the GitHub Copilot terminal agent. No polyfills are needed — + ``task`` and ``skill`` are already in the SDK's native 16-tool set. + Adds a system-message fragment so the model knows its environment. + +``HeadlessPersona`` + Raw SDK headless mode — no polyfills, no extra system message. Use + when you want to test exactly what the SDK exposes with no IDE context. 
+ +Usage:: + + from pytest_codingagents import CopilotAgent, VSCodePersona, ClaudeCodePersona + + # Explicit — recommended for clarity + agent = CopilotAgent(persona=VSCodePersona(), custom_agents=[...]) + + # Default — VSCodePersona is used automatically + agent = CopilotAgent(custom_agents=[...]) + + # Headless — no IDE context, no polyfills + agent = CopilotAgent(persona=HeadlessPersona()) +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from copilot.types import Tool, ToolInvocation, ToolResult + + from pytest_codingagents.copilot.agent import CopilotAgent + from pytest_codingagents.copilot.events import EventMapper + + +# --------------------------------------------------------------------------- +# Base class +# --------------------------------------------------------------------------- + + +class Persona: + """Base class for IDE runtime personas. + + Override ``apply()`` to inject polyfill tools and system-message + additions that match your target IDE's native tool set. + + The ``apply()`` method is called by the runner *after* + ``agent.build_session_config()`` and *before* the session is created, + so modifications to ``session_config`` take effect immediately. + + Phase-2 extension point: override ``create_client()`` to swap the + underlying SDK backend (e.g. Anthropic SDK for Claude Code). + """ + + def apply( + self, + agent: "CopilotAgent", + session_config: dict[str, Any], + mapper: "EventMapper", + ) -> None: + """Modify *session_config* in-place to match this persona's environment. + + Args: + agent: The ``CopilotAgent`` being executed (read-only). + session_config: The session config dict built from ``agent``. + Mutate this to inject tools, update system_message, etc. + mapper: The ``EventMapper`` for the current run. Pass to + tool handlers that need to record subagent events. + """ + + # ------------------------------------------------------------------ + # Phase-2 extension point (not yet used) + # ------------------------------------------------------------------ + + # async def create_client(self, agent: CopilotAgent) -> CopilotClient: + # """Override to swap the SDK backend for this persona.""" + # from copilot import CopilotClient + # return CopilotClient(...) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}()" + + +# --------------------------------------------------------------------------- +# Headless (raw SDK baseline) +# --------------------------------------------------------------------------- + + +class HeadlessPersona(Persona): + """Raw SDK headless mode — no polyfills, no IDE system message. + + Use this when you want to test exactly what the Copilot SDK exposes + with no runtime context added. This is the minimal baseline. + """ + + +# --------------------------------------------------------------------------- +# GitHub Copilot CLI +# --------------------------------------------------------------------------- + + +class CopilotCLIPersona(Persona): + """GitHub Copilot terminal agent persona. + + ``task`` and ``skill`` are already in the SDK's native 16-tool set, so + no polyfills are needed. This persona only adds a system-message + fragment so the model knows it is running inside the Copilot CLI and + can use ``task`` for sub-task dispatch. + """ + + _SYSTEM_MSG = "You are running inside GitHub Copilot CLI." 
+ _INSTRUCTIONS_FILE = Path(".github") / "copilot-instructions.md" + + def apply( + self, + agent: "CopilotAgent", + session_config: dict[str, Any], + mapper: "EventMapper", + ) -> None: + _prepend_system_message(session_config, self._SYSTEM_MSG) + if agent.working_directory: + custom = _load_custom_instructions_file( + Path(agent.working_directory) / self._INSTRUCTIONS_FILE + ) + if custom: + _prepend_system_message(session_config, custom) + + +# --------------------------------------------------------------------------- +# VS Code +# --------------------------------------------------------------------------- + + +class VSCodePersona(Persona): + """VS Code Copilot extension persona. + + Polyfills ``runSubagent`` so agents written for VS Code (where + ``runSubagent`` is a native tool) can dispatch custom sub-agents + correctly during testing. + + The polyfill is only injected when ``agent.custom_agents`` is non-empty, + so using this persona with a plain agent has no side-effects. + """ + + _SYSTEM_MSG = "You are running inside VS Code." + _INSTRUCTIONS_FILE = Path(".github") / "copilot-instructions.md" + + def apply( + self, + agent: "CopilotAgent", + session_config: dict[str, Any], + mapper: "EventMapper", + ) -> None: + _prepend_system_message(session_config, self._SYSTEM_MSG) + if agent.working_directory: + custom = _load_custom_instructions_file( + Path(agent.working_directory) / self._INSTRUCTIONS_FILE + ) + if custom: + _prepend_system_message(session_config, custom) + if agent.custom_agents: + tool = _make_runsubagent_tool(agent, agent.custom_agents, mapper) + _inject_tool(session_config, tool) + + +# --------------------------------------------------------------------------- +# Claude Code +# --------------------------------------------------------------------------- + + +class ClaudeCodePersona(Persona): + """Claude Code persona. + + Polyfills a ``task``-dispatch tool (same dispatch mechanism as + ``runSubagent``, named ``task`` to match Claude Code's native API) so + agents written for Claude Code can dispatch sub-agents during testing. + + The polyfill is only injected when ``agent.custom_agents`` is non-empty. + """ + + _SYSTEM_MSG = "You are running inside Claude Code." + _INSTRUCTIONS_FILE = Path("CLAUDE.md") + + def apply( + self, + agent: "CopilotAgent", + session_config: dict[str, Any], + mapper: "EventMapper", + ) -> None: + _prepend_system_message(session_config, self._SYSTEM_MSG) + if agent.working_directory: + custom = _load_custom_instructions_file( + Path(agent.working_directory) / self._INSTRUCTIONS_FILE + ) + if custom: + _prepend_system_message(session_config, custom) + if agent.custom_agents: + tool = _make_task_tool(agent, agent.custom_agents, mapper) + _inject_tool(session_config, tool) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _load_custom_instructions_file(file_path: Path) -> str | None: + """Read a custom instructions file and return its content, or None if absent.""" + if file_path.exists(): + content = file_path.read_text(encoding="utf-8").strip() + return content or None + return None + + +def _prepend_system_message(session_config: dict[str, Any], message: str) -> None: + """Prepend *message* to the system_message in *session_config*. + + If no system_message is set, creates one in "append" mode so it is + added to the CLI's built-in system message rather than replacing it. 
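+
+    Example of the combining behaviour (the new message goes first and the
+    existing mode is kept)::
+
+        cfg = {"system_message": {"mode": "append", "content": "Existing."}}
+        _prepend_system_message(cfg, "New context.")
+        # content is now "New context.", a blank line, then "Existing."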
+ """ + existing = session_config.get("system_message") or {} + existing_content: str = existing.get("content") or "" + mode: str = existing.get("mode") or "append" + combined = f"{message}\n\n{existing_content}".strip() + session_config["system_message"] = {"mode": mode, "content": combined} + + +def _inject_tool(session_config: dict[str, Any], tool: "Tool") -> None: + """Append *tool* to the tools list in *session_config*.""" + existing: list[Any] = list(session_config.get("tools") or []) + session_config["tools"] = existing + [tool] + + +def _make_runsubagent_tool( + parent_agent: "CopilotAgent", + custom_agents: list[dict[str, Any]], + mapper: "EventMapper", +) -> "Tool": + """Build a ``runSubagent`` polyfill tool for the VS Code persona. + + The Copilot CLI does not natively expose ``runSubagent`` in SDK headless + mode. This factory creates a Python-side ``Tool`` that dispatches + registered custom agents as nested ``run_copilot`` calls. + """ + from copilot.types import Tool, ToolResult + + from pytest_codingagents.copilot.agent import CopilotAgent as _CopilotAgent + from pytest_codingagents.copilot.runner import run_copilot + + agent_map: dict[str, dict[str, Any]] = {a["name"]: a for a in custom_agents} + + async def _handler(invocation: "ToolInvocation") -> "ToolResult": + args: dict[str, Any] = invocation.get("arguments") or {} # type: ignore[assignment] + + agent_name: str | None = ( + args.get("agent_name") or args.get("agent") or args.get("agentName") + ) + prompt_text: str = args.get("prompt") or args.get("message") or args.get("task") or "" + + if not agent_name: + available = sorted(agent_map) + return ToolResult( + textResultForLlm=(f"Error: agent_name is required. Available agents: {available}"), + resultType="failure", + ) + + agent_cfg = agent_map.get(agent_name) + if agent_cfg is None: + available = sorted(agent_map) + return ToolResult( + textResultForLlm=(f"Error: agent '{agent_name}' not found. Available: {available}"), + resultType="failure", + ) + + mapper.record_subagent_start(agent_name) + + sub_agent = _CopilotAgent( + name=agent_name, + model=parent_agent.model, + instructions=agent_cfg.get("prompt"), + working_directory=parent_agent.working_directory, + timeout_s=min(parent_agent.timeout_s, 600.0), + max_turns=min(parent_agent.max_turns, 30), + auto_confirm=True, + ) + + sub_result = await run_copilot(sub_agent, prompt_text) + + if sub_result.success: + mapper.record_subagent_complete(agent_name) + return ToolResult( + textResultForLlm=sub_result.final_response or "Sub-agent completed.", + resultType="success", + ) + + mapper.record_subagent_failed(agent_name) + return ToolResult( + textResultForLlm=f"Sub-agent '{agent_name}' failed: {sub_result.error}", + resultType="failure", + ) + + return Tool( + name="runSubagent", + description=( + "Dispatch a named custom agent to perform a task. " + "The agent runs with its own instructions and returns its " + "final response. " + f"Available agents: {sorted(agent_map)}" + ), + handler=_handler, + parameters={ + "type": "object", + "properties": { + "agent_name": { + "type": "string", + "description": "Name of the agent to dispatch.", + "enum": sorted(agent_map), + }, + "prompt": { + "type": "string", + "description": "Task or message to send to the agent.", + }, + }, + "required": ["agent_name", "prompt"], + }, + ) + + +def _make_task_tool( + parent_agent: "CopilotAgent", + custom_agents: list[dict[str, Any]], + mapper: "EventMapper", +) -> "Tool": + """Build a ``task`` polyfill tool for the Claude Code persona. 
+ + Identical dispatch mechanism to ``_make_runsubagent_tool`` but named + ``task`` to match Claude Code's native sub-agent dispatch API. + """ + from copilot.types import Tool, ToolResult + + from pytest_codingagents.copilot.agent import CopilotAgent as _CopilotAgent + from pytest_codingagents.copilot.runner import run_copilot + + agent_map: dict[str, dict[str, Any]] = {a["name"]: a for a in custom_agents} + + async def _handler(invocation: "ToolInvocation") -> "ToolResult": + args: dict[str, Any] = invocation.get("arguments") or {} # type: ignore[assignment] + + agent_name: str | None = ( + args.get("agent_name") or args.get("agent") or args.get("agentName") + ) + prompt_text: str = ( + args.get("prompt") or args.get("message") or args.get("description") or "" + ) + + if not agent_name: + available = sorted(agent_map) + return ToolResult( + textResultForLlm=(f"Error: agent_name is required. Available agents: {available}"), + resultType="failure", + ) + + agent_cfg = agent_map.get(agent_name) + if agent_cfg is None: + available = sorted(agent_map) + return ToolResult( + textResultForLlm=(f"Error: agent '{agent_name}' not found. Available: {available}"), + resultType="failure", + ) + + mapper.record_subagent_start(agent_name) + + sub_agent = _CopilotAgent( + name=agent_name, + model=parent_agent.model, + instructions=agent_cfg.get("prompt"), + working_directory=parent_agent.working_directory, + timeout_s=min(parent_agent.timeout_s, 600.0), + max_turns=min(parent_agent.max_turns, 30), + auto_confirm=True, + ) + + sub_result = await run_copilot(sub_agent, prompt_text) + + if sub_result.success: + mapper.record_subagent_complete(agent_name) + return ToolResult( + textResultForLlm=sub_result.final_response or "Sub-agent completed.", + resultType="success", + ) + + mapper.record_subagent_failed(agent_name) + return ToolResult( + textResultForLlm=f"Sub-agent '{agent_name}' failed: {sub_result.error}", + resultType="failure", + ) + + return Tool( + name="task", + description=( + "Dispatch a named agent to perform a task. " + "The agent runs with its own instructions and returns its " + "final response. " + f"Available agents: {sorted(agent_map)}" + ), + handler=_handler, + parameters={ + "type": "object", + "properties": { + "agent_name": { + "type": "string", + "description": "Name of the agent to dispatch.", + "enum": sorted(agent_map), + }, + "prompt": { + "type": "string", + "description": "Task or message to send to the agent.", + }, + }, + "required": ["agent_name", "prompt"], + }, + ) diff --git a/src/pytest_codingagents/copilot/runner.py b/src/pytest_codingagents/copilot/runner.py index 813205a..6eafe99 100644 --- a/src/pytest_codingagents/copilot/runner.py +++ b/src/pytest_codingagents/copilot/runner.py @@ -100,7 +100,7 @@ def _is_transient_error(error: str | None) -> bool: return any(pattern in error for pattern in _TRANSIENT_PATTERNS) -async def _run_copilot_once(agent: CopilotAgent, prompt: str) -> CopilotResult: +async def _run_copilot_once(agent: "CopilotAgent", prompt: str) -> "CopilotResult": """Execute a single attempt of a prompt against GitHub Copilot.""" client_options: dict[str, Any] = { "cwd": agent.working_directory or ".", @@ -127,6 +127,10 @@ async def _run_copilot_once(agent: CopilotAgent, prompt: str) -> CopilotResult: # Build session config from agent session_config = agent.build_session_config() + # Apply the persona: injects polyfill tools and system-message + # additions that match the target IDE environment. 
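+    # With the default VSCodePersona this prepends the IDE context fragment,
+    # loads .github/copilot-instructions.md from working_directory when present,
+    # and injects the runSubagent polyfill if custom_agents are configured.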
+ agent.persona.apply(agent, session_config, mapper) + # Install permission handler if auto_confirm is enabled if agent.auto_confirm: session_config["on_permission_request"] = _auto_approve_handler From 150aa968d9da210207e9af807b3edfbdcd7c3463 Mon Sep 17 00:00:00 2001 From: Stefan Broenner Date: Thu, 19 Feb 2026 18:46:59 +0100 Subject: [PATCH 3/3] feat: inject XML block into system prompt to mirror VS Code orchestrator behavior When an agent has custom_agents configured, VSCodePersona and ClaudeCodePersona now inject an XML block listing available subagents by name/description. This mirrors the behavior of computeAutomaticInstructions.ts in microsoft/vscode, which guides the model to use runSubagent/task for delegation rather than implementing directly. Bump version to 0.2.2. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pyproject.toml | 2 +- src/pytest_codingagents/copilot/personas.py | 42 +++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2e53803..4296774 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "pytest-codingagents" -version = "0.2.1" +version = "0.2.2" description = "Pytest plugin for testing real coding agents via their SDK" readme = "README.md" license = { text = "MIT" } diff --git a/src/pytest_codingagents/copilot/personas.py b/src/pytest_codingagents/copilot/personas.py index 501142c..92632e2 100644 --- a/src/pytest_codingagents/copilot/personas.py +++ b/src/pytest_codingagents/copilot/personas.py @@ -179,6 +179,8 @@ def apply( if agent.custom_agents: tool = _make_runsubagent_tool(agent, agent.custom_agents, mapper) _inject_tool(session_config, tool) + agents_block = _build_agents_block(agent.custom_agents, tool_name="runSubagent") + _prepend_system_message(session_config, agents_block) # --------------------------------------------------------------------------- @@ -215,6 +217,8 @@ def apply( if agent.custom_agents: tool = _make_task_tool(agent, agent.custom_agents, mapper) _inject_tool(session_config, tool) + agents_block = _build_agents_block(agent.custom_agents, tool_name="task") + _prepend_system_message(session_config, agents_block) # --------------------------------------------------------------------------- @@ -249,6 +253,44 @@ def _inject_tool(session_config: dict[str, Any], tool: "Tool") -> None: session_config["tools"] = existing + [tool] +def _build_agents_block(custom_agents: list[dict[str, Any]], tool_name: str = "runSubagent") -> str: + """Build the XML block that VS Code injects into the system prompt. + + Mirrors ``computeAutomaticInstructions.ts`` in ``microsoft/vscode``: + lists available subagents by name and description so the model knows + which agents to dispatch and how to call them. + + Args: + custom_agents: List of custom agent config dicts (each with at least + a ``name`` key, optionally ``description`` and ``argument_hint``). + tool_name: Name of the dispatch tool (``runSubagent`` for VS Code, + ``task`` for Claude Code). + + Returns: + The ```` XML string to prepend to the system message. + """ + lines: list[str] = [ + "", + "Here is a list of agents that can be used when running a subagent.", + ( + "Each agent has optionally a description with the agent's purpose " + "and expertise. When asked to run a subagent, choose the most " + "appropriate agent from this list." 
+ ), + f"Use the {tool_name} tool with the agent name to run the subagent.", + ] + for a in custom_agents: + lines.append("") + lines.append(f"{a['name']}") + if desc := a.get("description"): + lines.append(f"{desc}") + if hint := a.get("argument_hint") or a.get("argumentHint"): + lines.append(f"{hint}") + lines.append("") + lines.append("") + return "\n".join(lines) + + def _make_runsubagent_tool( parent_agent: "CopilotAgent", custom_agents: list[dict[str, Any]],
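To make the third patch concrete, here is a minimal sketch of rendering the injected block for one hypothetical agent entry. The preamble sentences and the name/description/argument-hint handling come from `_build_agents_block()` itself; the literal wrapper tag names shown in the expected output are assumptions, since the patch only states that an XML block mirroring VS Code's `computeAutomaticInstructions.ts` is produced.

```python
# Sketch only: _build_agents_block is a private helper added by this patch series.
# The agent entry below is hypothetical; wrapper tag names in the comment are assumed.
from pytest_codingagents.copilot.personas import _build_agents_block

block = _build_agents_block(
    [{"name": "rpi-agent", "description": "Plans and implements changes via the RPI pipeline."}],
    tool_name="runSubagent",
)
print(block)
# Intended shape, roughly:
#   <agents>
#   Here is a list of agents that can be used when running a subagent.
#   Each agent has optionally a description ... choose the most appropriate agent from this list.
#   Use the runSubagent tool with the agent name to run the subagent.
#   <agent>
#   <name>rpi-agent</name>
#   <description>Plans and implements changes via the RPI pipeline.</description>
#   </agent>
#   </agents>
```

VSCodePersona prepends this block to the session's system message (alongside the IDE context fragment and any `.github/copilot-instructions.md` content), while ClaudeCodePersona emits the same block with `tool_name="task"`.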