Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 45 additions & 5 deletions src/conductor/providers/copilot.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,20 @@ def __init__(
self._interrupted_session: Any = None
self._abort_supported: bool | None = None

@staticmethod
def _default_permission_handler(
request: dict[str, Any],
invocation: dict[str, str],
) -> dict[str, Any]:
"""Default permission handler that approves all requests.

SDK v0.1.28+ requires a permission handler on session creation.
In orchestration mode, we approve all tool permissions since the
workflow author controls which tools are available to each agent.
"""
logger.debug("auto-approved permission request: %s", request)
return {"kind": "approved"}

async def execute(
self,
agent: AgentDef,
Expand Down Expand Up @@ -411,6 +425,7 @@ async def _execute_sdk_call(
# Build session config with MCP servers from workflow configuration
session_config: dict[str, Any] = {
"model": model,
"on_permission_request": self._default_permission_handler,
}

# Add temperature if configured
Expand All @@ -426,7 +441,10 @@ async def _execute_sdk_call(
resume_sid = self._resume_session_ids.get(agent.name)
if resume_sid is not None:
try:
session = await self._client.resume_session(resume_sid)
session = await self._client.resume_session(
resume_sid,
{"on_permission_request": self._default_permission_handler},
)
logger.info(f"Resumed Copilot session {resume_sid} for agent '{agent.name}'")
except Exception as exc:
logger.warning(
Expand Down Expand Up @@ -621,6 +639,18 @@ def on_event(event: Any) -> None:
nonlocal response_content, error_message
event_type = event.type.value if hasattr(event.type, "value") else str(event.type)

# Log every SDK event for debugging stalls (visible via --log-file)
if logger.isEnabledFor(logging.DEBUG):
tool_info = ""
if (
event_type == "tool.execution_start"
and hasattr(event, "data")
and event.data is not None
):
tn = getattr(event.data, "tool_name", None) or getattr(event.data, "name", "?")
tool_info = f" tool={tn}"
logger.debug("sdk_event: %s%s", event_type, tool_info)

# Update last activity on EVERY event (this is key for idle detection!)
last_activity_ref[0] = event_type
last_activity_ref[2] = time.monotonic()
Expand Down Expand Up @@ -1267,6 +1297,11 @@ async def _wait_with_idle_detection(
idle_timeout = self._idle_recovery_config.idle_timeout_seconds

while True:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good fix. This early-return guard closes a real race window where session.idle could arrive between a previous done.clear() and the await wait_for(), causing the completion signal to be lost and an unnecessary full idle-timeout wait.

# Check if done was already set (avoids race where session.idle
# arrived between a previous done.clear() and the next wait).
if done.is_set():
return

try:
# Wait for done with idle timeout
await asyncio.wait_for(
Expand All @@ -1288,7 +1323,11 @@ async def _wait_with_idle_detection(
# just hasn't finished yet. Reset recovery counter (new task)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good fix. Without this guard, if wait_for raises TimeoutError but done was set concurrently (e.g., session.idle arrived during the timeout window), the unconditional clear() would erase the legitimate completion signal. The check-then-act is safe here since there's no await between is_set() and clear().

# and keep waiting.
recovery_attempts = 0
done.clear()
# Only clear if done hasn't been set in the meantime
# (prevents race where session.idle arrives right as we
# check time_since_last_event).
if not done.is_set():
done.clear()
continue

# Genuinely idle — no events for the full timeout period
Expand Down Expand Up @@ -1321,9 +1360,10 @@ async def _wait_with_idle_detection(
recovery_prompt = self._build_recovery_prompt(last_event_type, last_tool_call)
await session.send({"prompt": recovery_prompt})

# Reset the done event to wait again
# (it may have been set by a previous partial response)
done.clear()
# Reset the done event to wait again — but only if it hasn't
# been set since the recovery prompt was sent.
if not done.is_set():
done.clear()

async def _ensure_client_started(self) -> None:
"""Ensure the Copilot client is started."""
Expand Down
10 changes: 8 additions & 2 deletions tests/test_providers/test_copilot_resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,10 @@ async def _fake_send(msg: Any) -> None:
):
await provider.execute(agent, {}, "Continue research")

mock_client.resume_session.assert_called_once_with(resumed_sid)
mock_client.resume_session.assert_called_once_with(
resumed_sid,
{"on_permission_request": CopilotProvider._default_permission_handler},
)
mock_client.create_session.assert_not_called()

@pytest.mark.asyncio
Expand Down Expand Up @@ -201,7 +204,10 @@ async def _fake_send(msg: Any) -> None:
):
await provider.execute(agent, {}, "Continue research")

mock_client.resume_session.assert_called_once_with("stale-sid")
mock_client.resume_session.assert_called_once_with(
"stale-sid",
{"on_permission_request": CopilotProvider._default_permission_handler},
)
mock_client.create_session.assert_called_once()
# Session ID should now reflect the new session
assert provider.get_session_ids()["researcher"] == "sess-new"
Expand Down
Loading