From 2426156dafbb6360274a7892efff5986313429ef Mon Sep 17 00:00:00 2001 From: Lucio Cunha Tinoco Date: Tue, 3 Mar 2026 16:52:27 -0500 Subject: [PATCH 1/2] fix(copilot): add permission handler, event logging, and idle detection race fix Three fixes for the Copilot SDK provider: 1. Add on_permission_request auto-approve handler to session config. The Copilot SDK now requires this handler when creating sessions. Without it, every workflow fails with 'An on_permission_request handler is required'. Auto-approve is correct for non-interactive conductor workflows. 2. Add SDK event logging via logger.debug() in the on_event callback. Every SDK event (including tool execution starts) is logged at DEBUG level, visible when --log-file is passed. This enables post-mortem debugging of session stalls. Works with --web-bg because bg_runner.py already forwards --log-file to the background process. 3. Fix done.clear() race condition in _wait_with_idle_detection. If session.idle arrives between a timeout check and done.clear(), the completion signal is lost, causing the loop to wait another full idle timeout (5 minutes) before checking again. Added done.is_set() guards at three locations before done.clear() and an early-return check at the top of the loop. --- src/conductor/providers/copilot.py | 31 ++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/conductor/providers/copilot.py b/src/conductor/providers/copilot.py index d1368c7..56d179f 100644 --- a/src/conductor/providers/copilot.py +++ b/src/conductor/providers/copilot.py @@ -411,6 +411,9 @@ async def _execute_sdk_call( # Build session config with MCP servers from workflow configuration session_config: dict[str, Any] = { "model": model, + # Auto-approve all permission requests (shell, write, mcp, read, url) + # since Conductor workflows run non-interactively. + "on_permission_request": lambda _req, _ctx: {"kind": "approved"}, } # Add temperature if configured @@ -621,6 +624,16 @@ def on_event(event: Any) -> None: nonlocal response_content, error_message event_type = event.type.value if hasattr(event.type, "value") else str(event.type) + # Log every SDK event for debugging stalls (visible via --log-file) + if logger.isEnabledFor(logging.DEBUG): + tool_info = "" + if event_type == "tool.execution_start": + tn = getattr(event.data, "tool_name", None) or getattr( + event.data, "name", "?" + ) + tool_info = f" tool={tn}" + logger.debug("sdk_event: %s%s", event_type, tool_info) + # Update last activity on EVERY event (this is key for idle detection!) last_activity_ref[0] = event_type last_activity_ref[2] = time.monotonic() @@ -1267,6 +1280,11 @@ async def _wait_with_idle_detection( idle_timeout = self._idle_recovery_config.idle_timeout_seconds while True: + # Check if done was already set (avoids race where session.idle + # arrived between a previous done.clear() and the next wait). + if done.is_set(): + return + try: # Wait for done with idle timeout await asyncio.wait_for( @@ -1288,7 +1306,11 @@ async def _wait_with_idle_detection( # just hasn't finished yet. Reset recovery counter (new task) # and keep waiting. recovery_attempts = 0 - done.clear() + # Only clear if done hasn't been set in the meantime + # (prevents race where session.idle arrives right as we + # check time_since_last_event). + if not done.is_set(): + done.clear() continue # Genuinely idle — no events for the full timeout period @@ -1321,9 +1343,10 @@ async def _wait_with_idle_detection( recovery_prompt = self._build_recovery_prompt(last_event_type, last_tool_call) await session.send({"prompt": recovery_prompt}) - # Reset the done event to wait again - # (it may have been set by a previous partial response) - done.clear() + # Reset the done event to wait again — but only if it hasn't + # been set since the recovery prompt was sent. + if not done.is_set(): + done.clear() async def _ensure_client_started(self) -> None: """Ensure the Copilot client is started.""" From a5071ffd6f3f49ce27c996585c9bcdfedd228c60 Mon Sep 17 00:00:00 2001 From: Lucio Cunha Tinoco Date: Tue, 3 Mar 2026 18:52:27 -0500 Subject: [PATCH 2/2] =?UTF-8?q?fix(copilot):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20extract=20permission=20handler,=20fix=20resume=5Fse?= =?UTF-8?q?ssion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract inline lambda to static `_default_permission_handler` with proper type hints and docstring (fixes ruff format CI failure) - Pass `on_permission_request` to `resume_session()` (was only on `create_session`, causing permission denied on resumed workflows) - Add `logger.debug` audit logging on permission approvals - Add defensive `hasattr(event, "data")` guard in event logging Co-Authored-By: Claude Opus 4.6 --- src/conductor/providers/copilot.py | 33 ++++++++++++++++----- tests/test_providers/test_copilot_resume.py | 10 +++++-- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/conductor/providers/copilot.py b/src/conductor/providers/copilot.py index 56d179f..12cbef7 100644 --- a/src/conductor/providers/copilot.py +++ b/src/conductor/providers/copilot.py @@ -163,6 +163,20 @@ def __init__( self._interrupted_session: Any = None self._abort_supported: bool | None = None + @staticmethod + def _default_permission_handler( + request: dict[str, Any], + invocation: dict[str, str], + ) -> dict[str, Any]: + """Default permission handler that approves all requests. + + SDK v0.1.28+ requires a permission handler on session creation. + In orchestration mode, we approve all tool permissions since the + workflow author controls which tools are available to each agent. + """ + logger.debug("auto-approved permission request: %s", request) + return {"kind": "approved"} + async def execute( self, agent: AgentDef, @@ -411,9 +425,7 @@ async def _execute_sdk_call( # Build session config with MCP servers from workflow configuration session_config: dict[str, Any] = { "model": model, - # Auto-approve all permission requests (shell, write, mcp, read, url) - # since Conductor workflows run non-interactively. - "on_permission_request": lambda _req, _ctx: {"kind": "approved"}, + "on_permission_request": self._default_permission_handler, } # Add temperature if configured @@ -429,7 +441,10 @@ async def _execute_sdk_call( resume_sid = self._resume_session_ids.get(agent.name) if resume_sid is not None: try: - session = await self._client.resume_session(resume_sid) + session = await self._client.resume_session( + resume_sid, + {"on_permission_request": self._default_permission_handler}, + ) logger.info(f"Resumed Copilot session {resume_sid} for agent '{agent.name}'") except Exception as exc: logger.warning( @@ -627,10 +642,12 @@ def on_event(event: Any) -> None: # Log every SDK event for debugging stalls (visible via --log-file) if logger.isEnabledFor(logging.DEBUG): tool_info = "" - if event_type == "tool.execution_start": - tn = getattr(event.data, "tool_name", None) or getattr( - event.data, "name", "?" - ) + if ( + event_type == "tool.execution_start" + and hasattr(event, "data") + and event.data is not None + ): + tn = getattr(event.data, "tool_name", None) or getattr(event.data, "name", "?") tool_info = f" tool={tn}" logger.debug("sdk_event: %s%s", event_type, tool_info) diff --git a/tests/test_providers/test_copilot_resume.py b/tests/test_providers/test_copilot_resume.py index 94cbc5d..a7132b0 100644 --- a/tests/test_providers/test_copilot_resume.py +++ b/tests/test_providers/test_copilot_resume.py @@ -161,7 +161,10 @@ async def _fake_send(msg: Any) -> None: ): await provider.execute(agent, {}, "Continue research") - mock_client.resume_session.assert_called_once_with(resumed_sid) + mock_client.resume_session.assert_called_once_with( + resumed_sid, + {"on_permission_request": CopilotProvider._default_permission_handler}, + ) mock_client.create_session.assert_not_called() @pytest.mark.asyncio @@ -201,7 +204,10 @@ async def _fake_send(msg: Any) -> None: ): await provider.execute(agent, {}, "Continue research") - mock_client.resume_session.assert_called_once_with("stale-sid") + mock_client.resume_session.assert_called_once_with( + "stale-sid", + {"on_permission_request": CopilotProvider._default_permission_handler}, + ) mock_client.create_session.assert_called_once() # Session ID should now reflect the new session assert provider.get_session_ids()["researcher"] == "sess-new"