From 8a9aee756069055bd72c93db21dd8f2fff5e5f10 Mon Sep 17 00:00:00 2001 From: Mohamed Abdelkader Date: Thu, 26 Mar 2026 21:07:30 -0700 Subject: [PATCH 01/17] initial commit --- .../W365ComputerUseSample.sln | 25 ++ .../sample-agent/Agent/MyAgent.cs | 293 ++++++++++++++++ .../sample-agent/AspNetExtensions.cs | 170 ++++++++++ .../ComputerUse/ComputerUseOrchestrator.cs | 321 ++++++++++++++++++ .../ComputerUse/Models/ComputerUseModels.cs | 85 +++++ .../w365-computer-use/sample-agent/Program.cs | 98 ++++++ .../sample-agent/ServiceExtensions.cs | 41 +++ .../sample-agent/Telemetry/A365OtelWrapper.cs | 77 +++++ .../sample-agent/Telemetry/AgentMetrics.cs | 114 +++++++ .../sample-agent/W365ComputerUseSample.csproj | 32 ++ .../sample-agent/appsettings.json | 69 ++++ 11 files changed, 1325 insertions(+) create mode 100644 dotnet/w365-computer-use/W365ComputerUseSample.sln create mode 100644 dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs create mode 100644 dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs create mode 100644 dotnet/w365-computer-use/sample-agent/Program.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs create mode 100644 dotnet/w365-computer-use/sample-agent/Telemetry/A365OtelWrapper.cs create mode 100644 dotnet/w365-computer-use/sample-agent/Telemetry/AgentMetrics.cs create mode 100644 dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj create mode 100644 dotnet/w365-computer-use/sample-agent/appsettings.json diff --git a/dotnet/w365-computer-use/W365ComputerUseSample.sln b/dotnet/w365-computer-use/W365ComputerUseSample.sln new file mode 100644 index 00000000..67fe015d --- /dev/null +++ b/dotnet/w365-computer-use/W365ComputerUseSample.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 
12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.14.36623.8 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "W365ComputerUseSample", "sample-agent\W365ComputerUseSample.csproj", "{B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D4E5F6A7-B8C9-0D1E-2F3A-4B5C6D7E8F90} + EndGlobalSection +EndGlobal diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs new file mode 100644 index 00000000..8232d36e --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -0,0 +1,293 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+using W365ComputerUseSample.ComputerUse;
+using W365ComputerUseSample.Telemetry;
+using Microsoft.Agents.A365.Observability.Caching;
+using Microsoft.Agents.A365.Runtime.Utils;
+using Microsoft.Agents.A365.Tooling.Extensions.AgentFramework.Services;
+using Microsoft.Agents.Builder;
+using Microsoft.Agents.Builder.App;
+using Microsoft.Agents.Builder.State;
+using Microsoft.Agents.Core;
+using Microsoft.Agents.Core.Models;
+using Microsoft.Extensions.AI;
+
+namespace W365ComputerUseSample.Agent;
+
+/// <summary>
+/// Agent application that greets users, handles install/uninstall events, and forwards
+/// user messages to <see cref="ComputerUseOrchestrator"/> using the W365 MCP tools.
+/// </summary>
+public class MyAgent : AgentApplication
+{
+    private const string AgentWelcomeMessage = "Hello! I can help you perform tasks on a Windows 365 Cloud PC. Tell me what you'd like to do.";
+    private const string AgentHireMessage = "Thank you for hiring me! I can control a Windows desktop to accomplish tasks for you.";
+    private const string AgentFarewellMessage = "Thank you for your time, I enjoyed working with you.";
+
+    private readonly IConfiguration _configuration;
+    private readonly IExporterTokenCache<AgenticTokenStruct>? _agentTokenCache;
+    private readonly ILogger<MyAgent> _logger;
+    private readonly IMcpToolRegistrationService _toolService;
+    private readonly ComputerUseOrchestrator _orchestrator;
+
+    private readonly string? _agenticAuthHandlerName;
+    private readonly string? _oboAuthHandlerName;
+
+    /// <summary>
+    /// Reads the BEARER_TOKEN environment variable (development/testing aid).
+    /// </summary>
+    /// <param name="bearerToken">The token value, or null/empty when the variable is not set.</param>
+    /// <returns>True when a non-empty token was found.</returns>
+    public static bool TryGetBearerTokenForDevelopment(out string? bearerToken)
+    {
+        bearerToken = Environment.GetEnvironmentVariable("BEARER_TOKEN");
+        return !string.IsNullOrEmpty(bearerToken);
+    }
+
+    /// <summary>
+    /// Checks if graceful fallback is enabled when MCP tools fail to load.
+    /// Only allowed in Development + SKIP_TOOLING_ON_ERRORS=true.
+    /// </summary>
+    private static bool ShouldSkipToolingOnErrors()
+    {
+        var environment = Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT") ??
+                          Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT") ??
+                          "Production";
+
+        var skipToolingOnErrors = Environment.GetEnvironmentVariable("SKIP_TOOLING_ON_ERRORS");
+
+        return environment.Equals("Development", StringComparison.OrdinalIgnoreCase) &&
+               string.Equals(skipToolingOnErrors, "true", StringComparison.OrdinalIgnoreCase);
+    }
+
+    /// <summary>
+    /// Wires up the conversation-update, installation-update and message routes.
+    /// </summary>
+    public MyAgent(
+        AgentApplicationOptions options,
+        IConfiguration configuration,
+        IExporterTokenCache<AgenticTokenStruct> agentTokenCache,
+        IMcpToolRegistrationService toolService,
+        ComputerUseOrchestrator orchestrator,
+        ILogger<MyAgent> logger) : base(options)
+    {
+        _configuration = configuration;
+        _agentTokenCache = agentTokenCache;
+        _logger = logger;
+        _toolService = toolService;
+        _orchestrator = orchestrator;
+
+        _agenticAuthHandlerName = _configuration.GetValue<string>("AgentApplication:AgenticAuthHandlerName");
+        _oboAuthHandlerName = _configuration.GetValue<string>("AgentApplication:OboAuthHandlerName");
+
+        // Greet when members are added
+        OnConversationUpdate(ConversationUpdateEvents.MembersAdded, WelcomeMessageAsync);
+
+        // Compute auth handler arrays once
+        var agenticHandlers = !string.IsNullOrEmpty(_agenticAuthHandlerName) ? [_agenticAuthHandlerName] : Array.Empty<string>();
+        var oboHandlers = !string.IsNullOrEmpty(_oboAuthHandlerName) ? [_oboAuthHandlerName] : Array.Empty<string>();
+
+        // Handle install/uninstall
+        OnActivity(ActivityTypes.InstallationUpdate, OnInstallationUpdateAsync, isAgenticOnly: true, autoSignInHandlers: agenticHandlers);
+        OnActivity(ActivityTypes.InstallationUpdate, OnInstallationUpdateAsync, isAgenticOnly: false);
+
+        // Handle messages — MUST BE AFTER any other message handlers
+        OnActivity(ActivityTypes.Message, OnMessageAsync, isAgenticOnly: true, autoSignInHandlers: agenticHandlers);
+        OnActivity(ActivityTypes.Message, OnMessageAsync, isAgenticOnly: false, autoSignInHandlers: oboHandlers);
+    }
+
+    /// <summary>
+    /// Sends the welcome message once for every added member other than the agent itself.
+    /// </summary>
+    protected async Task WelcomeMessageAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken)
+    {
+        await AgentMetrics.InvokeObservedAgentOperation(
+            "WelcomeMessage",
+            turnContext,
+            async () =>
+            {
+                var recipientId = turnContext.Activity.Recipient?.Id;
+
+                // MembersAdded may be absent on some conversation updates; treat that as "nobody added".
+                foreach (ChannelAccount member in turnContext.Activity.MembersAdded ?? [])
+                {
+                    if (member.Id != recipientId)
+                    {
+                        await turnContext.SendActivityAsync(MessageFactory.Text(AgentWelcomeMessage), cancellationToken);
+                    }
+                }
+            });
+    }
+
+    /// <summary>
+    /// Logs the installation update and replies with a hire or farewell message
+    /// depending on whether the agent was added or removed.
+    /// </summary>
+    protected async Task OnInstallationUpdateAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken)
+    {
+        await AgentMetrics.InvokeObservedAgentOperation(
+            "InstallationUpdate",
+            turnContext,
+            async () =>
+            {
+                _logger.LogInformation(
+                    "InstallationUpdate received — Action: '{Action}', DisplayName: '{Name}', UserId: '{Id}'",
+                    turnContext.Activity.Action ?? "(none)",
+                    turnContext.Activity.From?.Name ?? "(unknown)",
+                    turnContext.Activity.From?.Id ?? "(unknown)");
+
+                if (turnContext.Activity.Action == InstallationUpdateActionTypes.Add)
+                {
+                    await turnContext.SendActivityAsync(MessageFactory.Text(AgentHireMessage), cancellationToken);
+                }
+                else if (turnContext.Activity.Action == InstallationUpdateActionTypes.Remove)
+                {
+                    await turnContext.SendActivityAsync(MessageFactory.Text(AgentFarewellMessage), cancellationToken);
+                }
+            });
+    }
+
+    /// <summary>
+    /// Handles a user message: picks the auth handler for the request type and runs the
+    /// computer-use workflow inside an observed operation.
+    /// </summary>
+    /// <exception cref="ArgumentNullException"><paramref name="turnContext"/> is null.</exception>
+    protected async Task OnMessageAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken)
+    {
+        ArgumentNullException.ThrowIfNull(turnContext);
+
+        var fromAccount = turnContext.Activity.From;
+        _logger.LogDebug(
+            "Turn received from user — DisplayName: '{Name}', UserId: '{Id}', AadObjectId: '{AadObjectId}'",
+            fromAccount?.Name ?? "(unknown)",
+            fromAccount?.Id ?? "(unknown)",
+            fromAccount?.AadObjectId ?? "(none)");
+
+        // The same handler is used for observability and for tool access; it only depends on the request type.
+        var authHandlerName = turnContext.IsAgenticRequest() ? _agenticAuthHandlerName : _oboAuthHandlerName;
+
+        await A365OtelWrapper.InvokeObservedAgentOperation(
+            "MessageProcessor",
+            turnContext,
+            turnState,
+            _agentTokenCache,
+            UserAuthorization,
+            authHandlerName ?? string.Empty,
+            _logger,
+            () => ProcessMessageAsync(turnContext, authHandlerName, cancellationToken));
+    }
+
+    /// <summary>
+    /// Acknowledges the message, keeps a typing indicator alive, runs the CUA loop and
+    /// always closes the streaming response.
+    /// </summary>
+    private async Task ProcessMessageAsync(ITurnContext turnContext, string? toolAuthHandlerName, CancellationToken cancellationToken)
+    {
+        // Immediate acknowledgment
+        await turnContext.SendActivityAsync(MessageFactory.Text("Got it — working on it…"), cancellationToken).ConfigureAwait(false);
+
+        // Typing indicator
+        await turnContext.SendActivityAsync(Activity.CreateTypingActivity(), cancellationToken).ConfigureAwait(false);
+
+        using var typingCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+        var typingTask = RunTypingLoopAsync(turnContext, typingCts.Token);
+
+        try
+        {
+            var userText = turnContext.Activity.Text?.Trim() ?? string.Empty;
+            if (userText.Length == 0)
+            {
+                // Nothing to do: avoid provisioning a Cloud PC session for an empty prompt.
+                await turnContext.SendActivityAsync(
+                    MessageFactory.Text("Please tell me what you'd like me to do on the Cloud PC."),
+                    cancellationToken).ConfigureAwait(false);
+                return;
+            }
+
+            // Get W365 MCP tools via the A365 SDK
+            var w365Tools = await GetW365ToolsAsync(turnContext, toolAuthHandlerName).ConfigureAwait(false);
+
+            if (w365Tools == null || w365Tools.Count == 0)
+            {
+                await turnContext.SendActivityAsync(
+                    MessageFactory.Text("Unable to connect to the W365 Computer Use service. Please check your configuration."),
+                    cancellationToken).ConfigureAwait(false);
+                return;
+            }
+
+            await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Connected to W365. Working on your request...").ConfigureAwait(false);
+
+            // The orchestrator callback is synchronous, so status updates are queued in the
+            // background; failures are logged instead of being left unobserved.
+            async Task QueueStatusAsync(string status)
+            {
+                try
+                {
+                    await turnContext.StreamingResponse.QueueInformativeUpdateAsync(status).ConfigureAwait(false);
+                }
+                catch (Exception ex)
+                {
+                    _logger.LogWarning(ex, "Failed to queue status update.");
+                }
+            }
+
+            // Run the CUA loop — the MCP server manages sessions automatically
+            var response = await _orchestrator.RunAsync(
+                userText,
+                w365Tools,
+                onStatusUpdate: status => _ = QueueStatusAsync(status),
+                cancellationToken: cancellationToken).ConfigureAwait(false);
+
+            // Send the response
+            turnContext.StreamingResponse.QueueTextChunk(response);
+        }
+        finally
+        {
+            typingCts.Cancel();
+
+            // RunTypingLoopAsync handles its own exceptions, so this await cannot mask an
+            // exception from the try block or prevent the stream from being closed.
+            await typingTask.ConfigureAwait(false);
+            await turnContext.StreamingResponse.EndStreamAsync(cancellationToken).ConfigureAwait(false);
+        }
+    }
+
+    /// <summary>
+    /// Re-sends a typing activity every four seconds until <paramref name="cancellationToken"/> is cancelled.
+    /// Never throws: cancellation is expected and other failures are only logged.
+    /// </summary>
+    private async Task RunTypingLoopAsync(ITurnContext turnContext, CancellationToken cancellationToken)
+    {
+        try
+        {
+            while (!cancellationToken.IsCancellationRequested)
+            {
+                await Task.Delay(TimeSpan.FromSeconds(4), cancellationToken).ConfigureAwait(false);
+                await turnContext.SendActivityAsync(Activity.CreateTypingActivity(), cancellationToken).ConfigureAwait(false);
+            }
+        }
+        catch (OperationCanceledException)
+        {
+            // Expected when the turn completes.
+        }
+        catch (Exception ex)
+        {
+            // A broken typing indicator must not fail the whole turn.
+            _logger.LogWarning(ex, "Typing indicator loop stopped unexpectedly.");
+        }
+    }
+
+    /// <summary>
+    /// Get the W365 MCP tools via the A365 Tooling SDK.
+    /// Returns the tools as AITool wrappers that can invoke MCP server methods.
+    /// </summary>
+    /// <returns>
+    /// The tools whose name starts with "W365_", or null when no token/agent identity is
+    /// available or when tool loading failed and SKIP_TOOLING_ON_ERRORS applies.
+    /// </returns>
+    private async Task<IList<AITool>?> GetW365ToolsAsync(ITurnContext context, string? authHandlerName)
+    {
+        // Acquire access token
+        string? accessToken = null;
+        string? agentId = null;
+
+        if (!string.IsNullOrEmpty(authHandlerName))
+        {
+            accessToken = await UserAuthorization.GetTurnTokenAsync(context, authHandlerName);
+            agentId = Utility.ResolveAgentIdentity(context, accessToken);
+        }
+        else if (TryGetBearerTokenForDevelopment(out var bearerToken))
+        {
+            _logger.LogInformation("Using bearer token from environment.");
+            accessToken = bearerToken;
+            agentId = Utility.ResolveAgentIdentity(context, accessToken!);
+        }
+
+        if (string.IsNullOrEmpty(accessToken) || string.IsNullOrEmpty(agentId))
+        {
+            _logger.LogWarning("No auth token or agent identity available. Cannot connect to MCP.");
+            return null;
+        }
+
+        try
+        {
+            var handlerForMcp = !string.IsNullOrEmpty(authHandlerName)
+                ? authHandlerName
+                : _oboAuthHandlerName ?? _agenticAuthHandlerName ?? string.Empty;
+
+            // The environment token is only passed explicitly when no auth handler produced it.
+            var tokenOverride = string.IsNullOrEmpty(authHandlerName) ? accessToken : null;
+
+            var allTools = await _toolService.GetMcpToolsAsync(agentId, UserAuthorization, handlerForMcp, context, tokenOverride).ConfigureAwait(false);
+
+            // Filter to only W365 tools
+            var w365Tools = allTools?.Where(t =>
+            {
+                var name = (t as AIFunction)?.Name ?? t.ToString() ?? string.Empty;
+                return name.StartsWith("W365_", StringComparison.OrdinalIgnoreCase);
+            }).ToList();
+
+            if (w365Tools != null && w365Tools.Count > 0)
+            {
+                _logger.LogInformation("Found {ToolCount} W365 Computer Use tools", w365Tools.Count);
+            }
+            else
+            {
+                _logger.LogWarning("No W365 tools found among {TotalCount} MCP tools", allTools?.Count ?? 0);
+            }
+
+            return w365Tools;
+        }
+        catch (Exception ex)
+        {
+            if (ShouldSkipToolingOnErrors())
+            {
+                _logger.LogWarning(ex, "Failed to connect to MCP servers. Continuing without tools (SKIP_TOOLING_ON_ERRORS=true).");
+                return null;
+            }
+
+            _logger.LogError(ex, "Failed to connect to MCP servers.");
+            throw;
+        }
+    }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs b/dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs
new file mode 100644
index 00000000..94e1c1a7
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs
@@ -0,0 +1,170 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using Microsoft.Agents.Authentication;
+using Microsoft.Agents.Core;
+using Microsoft.AspNetCore.Authentication.JwtBearer;
+using Microsoft.IdentityModel.Protocols;
+using Microsoft.IdentityModel.Protocols.OpenIdConnect;
+using Microsoft.IdentityModel.Tokens;
+using Microsoft.IdentityModel.Validators;
+using System.Collections.Concurrent;
+using System.Globalization;
+using System.IdentityModel.Tokens.Jwt;
+
+namespace W365ComputerUseSample;
+
+/// <summary>
+/// Registers JWT bearer authentication for incoming agent requests.
+/// </summary>
+public static class AspNetExtensions
+{
+    // One metadata manager per OpenID metadata URL, shared by all requests.
+    private static readonly ConcurrentDictionary<string, ConfigurationManager<OpenIdConnectConfiguration>> _openIdMetadataCache = new();
+
+    /// <summary>
+    /// Adds JWT bearer authentication using the options bound from the given configuration section.
+    /// </summary>
+    /// <param name="services">The service collection to register authentication on.</param>
+    /// <param name="configuration">Application configuration.</param>
+    /// <param name="tokenValidationSectionName">Name of the section holding <see cref="TokenValidationOptions"/>.</param>
+    public static void AddAgentAspNetAuthentication(this IServiceCollection services, IConfiguration configuration, string tokenValidationSectionName = "TokenValidation")
+    {
+        IConfigurationSection tokenValidationSection = configuration.GetSection(tokenValidationSectionName);
+
+        // NOTE(review): authentication is skipped entirely when the section is missing or
+        // Enabled=false, and the only signal is a trace line. Confirm this is acceptable
+        // outside local development.
+        if (!tokenValidationSection.Exists() || !tokenValidationSection.GetValue("Enabled", true))
+        {
+            System.Diagnostics.Trace.WriteLine("AddAgentAspNetAuthentication: Auth disabled");
+            return;
+        }
+
+        services.AddAgentAspNetAuthentication(tokenValidationSection.Get<TokenValidationOptions>()!);
+    }
+
+    /// <summary>
+    /// Adds JWT bearer authentication, filling in default issuers and OpenID metadata URLs
+    /// on <paramref name="validationOptions"/> when they are not supplied.
+    /// </summary>
+    /// <param name="services">The service collection to register authentication on.</param>
+    /// <param name="validationOptions">Validation settings; missing issuers/metadata URLs are populated in place.</param>
+    /// <exception cref="ArgumentException">No audience is configured, or an audience is not a GUID.</exception>
+    public static void AddAgentAspNetAuthentication(this IServiceCollection services, TokenValidationOptions validationOptions)
+    {
+        AssertionHelpers.ThrowIfNull(validationOptions, nameof(validationOptions));
+
+        if (validationOptions.Audiences == null || validationOptions.Audiences.Count == 0)
+        {
+            throw new ArgumentException($"{nameof(TokenValidationOptions)}:Audiences requires at least one ClientId");
+        }
+
+        foreach (var audience in validationOptions.Audiences)
+        {
+            if (!Guid.TryParse(audience, out _))
+            {
+                throw new ArgumentException($"{nameof(TokenValidationOptions)}:Audiences values must be a GUID");
+            }
+        }
+
+        if (validationOptions.ValidIssuers == null || validationOptions.ValidIssuers.Count == 0)
+        {
+            validationOptions.ValidIssuers =
+            [
+                "https://api.botframework.com",
+                "https://sts.windows.net/d6d49420-f39b-4df7-a1dc-d59a935871db/",
+                "https://login.microsoftonline.com/d6d49420-f39b-4df7-a1dc-d59a935871db/v2.0",
+                "https://sts.windows.net/f8cdef31-a31e-4b4a-93e4-5f571e91255a/",
+                "https://login.microsoftonline.com/f8cdef31-a31e-4b4a-93e4-5f571e91255a/v2.0",
+                "https://sts.windows.net/69e9b82d-4842-4902-8d1e-abc5b98a55e8/",
+                "https://login.microsoftonline.com/69e9b82d-4842-4902-8d1e-abc5b98a55e8/v2.0",
+            ];
+
+            if (!string.IsNullOrEmpty(validationOptions.TenantId) && Guid.TryParse(validationOptions.TenantId, out _))
+            {
+                validationOptions.ValidIssuers.Add(string.Format(CultureInfo.InvariantCulture, AuthenticationConstants.ValidTokenIssuerUrlTemplateV1, validationOptions.TenantId));
+                validationOptions.ValidIssuers.Add(string.Format(CultureInfo.InvariantCulture, AuthenticationConstants.ValidTokenIssuerUrlTemplateV2, validationOptions.TenantId));
+            }
+        }
+
+        if (string.IsNullOrEmpty(validationOptions.AzureBotServiceOpenIdMetadataUrl))
+        {
+            validationOptions.AzureBotServiceOpenIdMetadataUrl = validationOptions.IsGov ? AuthenticationConstants.GovAzureBotServiceOpenIdMetadataUrl : AuthenticationConstants.PublicAzureBotServiceOpenIdMetadataUrl;
+        }
+
+        if (string.IsNullOrEmpty(validationOptions.OpenIdMetadataUrl))
+        {
+            validationOptions.OpenIdMetadataUrl = validationOptions.IsGov ? AuthenticationConstants.GovOpenIdMetadataUrl : AuthenticationConstants.PublicOpenIdMetadataUrl;
+        }
+
+        // Snapshot the resolved values so the per-request callback works with non-null strings.
+        string botServiceMetadataUrl = validationOptions.AzureBotServiceOpenIdMetadataUrl;
+        string openIdMetadataUrl = validationOptions.OpenIdMetadataUrl;
+        var openIdMetadataRefresh = validationOptions.OpenIdMetadataRefresh ?? BaseConfigurationManager.DefaultAutomaticRefreshInterval;
+
+        // Returns the cached metadata manager for a URL, creating it on first use.
+        ConfigurationManager<OpenIdConnectConfiguration> GetMetadataManager(string metadataUrl) =>
+            _openIdMetadataCache.GetOrAdd(metadataUrl, url =>
+                new ConfigurationManager<OpenIdConnectConfiguration>(url, new OpenIdConnectConfigurationRetriever(), new HttpClient())
+                {
+                    AutomaticRefreshInterval = openIdMetadataRefresh
+                });
+
+        _ = services.AddAuthentication(options =>
+        {
+            options.DefaultAuthenticateScheme = JwtBearerDefaults.AuthenticationScheme;
+            options.DefaultChallengeScheme = JwtBearerDefaults.AuthenticationScheme;
+        })
+        .AddJwtBearer(options =>
+        {
+            options.SaveToken = true;
+            options.TokenValidationParameters = new TokenValidationParameters
+            {
+                ValidateIssuer = true,
+                ValidateAudience = true,
+                ValidateLifetime = true,
+                ClockSkew = TimeSpan.FromMinutes(5),
+                ValidIssuers = validationOptions.ValidIssuers,
+                ValidAudiences = validationOptions.Audiences,
+                ValidateIssuerSigningKey = true,
+                RequireSignedTokens = true,
+            };
+
+            options.TokenValidationParameters.EnableAadSigningKeyIssuerValidation();
+
+            options.Events = new JwtBearerEvents
+            {
+                // Chooses the signing-key metadata source per request, based on the token issuer.
+                OnMessageReceived = context =>
+                {
+                    // When the token cannot be inspected, keep the handler's default metadata
+                    // source and let the normal validation pipeline reject the request.
+                    void UseDefaultConfigurationManager() =>
+                        context.Options.TokenValidationParameters.ConfigurationManager ??= options.ConfigurationManager as BaseConfigurationManager;
+
+                    string authorizationHeader = context.Request.Headers.Authorization.ToString();
+
+                    if (string.IsNullOrEmpty(authorizationHeader))
+                    {
+                        UseDefaultConfigurationManager();
+                        return Task.CompletedTask;
+                    }
+
+                    // The auth scheme is compared case-insensitively, as the JwtBearer handler does.
+                    string[] parts = authorizationHeader.Split(' ');
+                    if (parts.Length != 2 || !parts[0].Equals("Bearer", StringComparison.OrdinalIgnoreCase))
+                    {
+                        UseDefaultConfigurationManager();
+                        return Task.CompletedTask;
+                    }
+
+                    string? issuer;
+                    try
+                    {
+                        JwtSecurityToken token = new(parts[1]);
+                        issuer = token.Claims.FirstOrDefault(claim => claim.Type == AuthenticationConstants.IssuerClaim)?.Value;
+                    }
+                    catch (Exception ex) when (ex is ArgumentException or SecurityTokenException)
+                    {
+                        // Malformed token: must end as a 401 from validation, not an unhandled exception here.
+                        UseDefaultConfigurationManager();
+                        return Task.CompletedTask;
+                    }
+
+                    var useBotServiceMetadata =
+                        validationOptions.AzureBotServiceTokenHandling &&
+                        AuthenticationConstants.BotFrameworkTokenIssuer.Equals(issuer);
+
+                    context.Options.TokenValidationParameters.ConfigurationManager =
+                        GetMetadataManager(useBotServiceMetadata ? botServiceMetadataUrl : openIdMetadataUrl);
+
+                    return Task.CompletedTask;
+                },
+                OnTokenValidated = context => Task.CompletedTask,
+                OnForbidden = context => Task.CompletedTask,
+                OnAuthenticationFailed = context => Task.CompletedTask
+            };
+        });
+    }
+
+    /// <summary>
+    /// Settings bound from configuration that control token validation.
+    /// </summary>
+    public class TokenValidationOptions
+    {
+        public IList<string>? Audiences { get; set; }
+        public string? TenantId { get; set; }
+        public IList<string>? ValidIssuers { get; set; }
+        public bool IsGov { get; set; } = false;
+        public string? AzureBotServiceOpenIdMetadataUrl { get; set; }
+        public string? OpenIdMetadataUrl { get; set; }
+        public bool AzureBotServiceTokenHandling { get; set; } = true;
+        public TimeSpan? OpenIdMetadataRefresh { get; set; }
+    }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs
new file mode 100644
index 00000000..c570c171
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs
@@ -0,0 +1,321 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text;
+using System.Text.Json;
+using Microsoft.Extensions.AI;
+using W365ComputerUseSample.ComputerUse.Models;
+
+namespace W365ComputerUseSample.ComputerUse;
+
+///
+/// Thin protocol adapter between OpenAI's computer-use-preview model and W365 MCP tools.
+/// The model emits computer_call actions; this class translates them to MCP tool calls
+/// and feeds back screenshots. The MCP server manages sessions automatically.
+///
+public class ComputerUseOrchestrator
+{
+    // Upper bound for the best-effort W365_EndSession call made during cleanup.
+    private static readonly TimeSpan SessionCleanupTimeout = TimeSpan.FromSeconds(30);
+
+    // Cached: allocating JsonSerializerOptions per request defeats the serializer's metadata cache.
+    private static readonly JsonSerializerOptions RequestJsonOptions = new()
+    {
+        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull
+    };
+
+    // Property names probed, in order, when extracting the image from a screenshot tool result.
+    private static readonly string[] ScreenshotPropertyNames = ["screenshotData", "image", "data"];
+
+    private readonly HttpClient _httpClient;
+    private readonly ILogger<ComputerUseOrchestrator> _logger;
+    private readonly string _endpoint;
+    private readonly string _apiKey;
+    private readonly string _deploymentName;
+    private readonly int _maxIterations;
+    private readonly List<object> _tools;
+
+    private const string SystemInstructions = """
+        You are a computer-using agent that can control a Windows desktop computer.
+        After each action, examine the screenshot to verify it worked.
+        If you see browser setup or sign-in dialogs, dismiss them (Escape, X, or Skip).
+        Once you have completed the task, call the OnTaskComplete function.
+        Do NOT continue looping after the task is done.
+        """;
+
+    /// <summary>
+    /// Reads the Azure OpenAI and ComputerUse settings and builds the tool list sent to the model.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The endpoint or API key setting is missing.</exception>
+    public ComputerUseOrchestrator(
+        IHttpClientFactory httpClientFactory,
+        IConfiguration configuration,
+        ILogger<ComputerUseOrchestrator> logger)
+    {
+        _httpClient = httpClientFactory.CreateClient("WebClient");
+        _logger = logger;
+
+        _endpoint = configuration["AIServices:AzureOpenAI:Endpoint"]
+            ?? throw new InvalidOperationException("AIServices:AzureOpenAI:Endpoint is required.");
+        _apiKey = configuration["AIServices:AzureOpenAI:ApiKey"]
+            ?? throw new InvalidOperationException("AIServices:AzureOpenAI:ApiKey is required.");
+        _deploymentName = configuration["AIServices:AzureOpenAI:DeploymentName"] ?? "computer-use-preview";
+        _maxIterations = configuration.GetValue("ComputerUse:MaxIterations", 30);
+
+        _tools =
+        [
+            new ComputerUseTool
+            {
+                DisplayWidth = configuration.GetValue("ComputerUse:DisplayWidth", 1024),
+                DisplayHeight = configuration.GetValue("ComputerUse:DisplayHeight", 768),
+                Environment = "windows"
+            },
+            new FunctionToolDefinition
+            {
+                Name = "OnTaskComplete",
+                Description = "Call this function when the given task has been completed successfully."
+            }
+        ];
+    }
+
+    /// <summary>
+    /// Run the CUA loop. The MCP server auto-manages sessions per user context.
+    /// </summary>
+    /// <param name="userMessage">The task description sent to the model as the first user message.</param>
+    /// <param name="w365Tools">The W365 MCP tools used to drive the desktop.</param>
+    /// <param name="onStatusUpdate">Optional callback invoked with short progress messages.</param>
+    /// <param name="cancellationToken">Cancels the loop.</param>
+    /// <returns>The model's final message, or a fixed completion/failure text.</returns>
+    /// <remarks>
+    /// The session opened by W365_QuickStartSession is ended on every exit path, including
+    /// exceptions and cancellation.
+    /// </remarks>
+    public async Task<string> RunAsync(
+        string userMessage,
+        IList<AITool> w365Tools,
+        Action<string>? onStatusUpdate = null,
+        CancellationToken cancellationToken = default)
+    {
+        ArgumentNullException.ThrowIfNull(userMessage);
+        ArgumentNullException.ThrowIfNull(w365Tools);
+
+        _logger.LogInformation("Starting CUA loop for: {Message}", Truncate(userMessage, 100));
+
+        // Start a W365 session — the server auto-discovers pools and provisions a VM.
+        // Session is tied to the user's identity; no session ID tracking needed.
+        onStatusUpdate?.Invoke("Starting W365 computing session...");
+        await InvokeToolAsync(w365Tools, "W365_QuickStartSession", new Dictionary<string, object?>(), cancellationToken);
+        _logger.LogInformation("W365 session started via QuickStartSession");
+
+        try
+        {
+            var conversation = new List<JsonElement> { CreateUserMessage(userMessage) };
+
+            for (var i = 0; i < _maxIterations; i++)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+
+                var response = await CallModelAsync(conversation, cancellationToken);
+                if (response?.Output is not { Count: > 0 } output)
+                {
+                    return "The model returned no output, so the task could not be completed.";
+                }
+
+                var hasActions = false;
+                string? messageText = null;
+
+                foreach (var item in output)
+                {
+                    var type = item.GetProperty("type").GetString();
+                    if (type == "reasoning") continue;
+
+                    conversation.Add(item);
+
+                    switch (type)
+                    {
+                        case "message":
+                            // Remember the text but keep processing: the same response may
+                            // also carry an action that must be executed and answered.
+                            messageText = ExtractText(item);
+                            break;
+
+                        case "computer_call":
+                            hasActions = true;
+                            conversation.Add(await HandleComputerCallAsync(item, w365Tools, onStatusUpdate, cancellationToken));
+                            break;
+
+                        case "function_call":
+                            hasActions = true;
+                            conversation.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!));
+                            if (item.GetProperty("name").GetString() == "OnTaskComplete")
+                            {
+                                return "Task completed successfully.";
+                            }
+                            break;
+                    }
+                }
+
+                // No action requested: the model is done talking, so its message is the answer.
+                if (!hasActions)
+                {
+                    return messageText ?? "The model stopped without completing the task.";
+                }
+            }
+
+            return "The task could not be completed within the allowed number of steps.";
+        }
+        finally
+        {
+            // Use a fresh token: the caller's token may already be cancelled, and the
+            // session must still be released.
+            using var cleanupCts = new CancellationTokenSource(SessionCleanupTimeout);
+            await EndSessionAsync(w365Tools, cleanupCts.Token);
+        }
+    }
+
+    /// <summary>
+    /// Best-effort call to W365_EndSession; failures are logged and never thrown.
+    /// </summary>
+    private async Task EndSessionAsync(IList<AITool> tools, CancellationToken ct)
+    {
+        try
+        {
+            await InvokeToolAsync(tools, "W365_EndSession", new Dictionary<string, object?>(), ct);
+            _logger.LogInformation("W365 session ended");
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Failed to end W365 session");
+        }
+    }
+
+    /// <summary>
+    /// Posts the conversation to the deployment's responses endpoint and deserializes the reply.
+    /// </summary>
+    /// <exception cref="HttpRequestException">The endpoint returned a non-success status code.</exception>
+    private async Task<ComputerUseResponse?> CallModelAsync(List<JsonElement> conversation, CancellationToken ct)
+    {
+        var body = JsonSerializer.Serialize(new ComputerUseRequest
+        {
+            Model = _deploymentName,
+            Instructions = SystemInstructions,
+            Input = conversation,
+            Tools = _tools,
+            Truncation = "auto"
+        }, RequestJsonOptions);
+
+        var url = $"{_endpoint.TrimEnd('/')}/openai/deployments/{_deploymentName}/responses?api-version=2025-03-01-preview";
+        using var req = new HttpRequestMessage(HttpMethod.Post, url);
+        req.Content = new StringContent(body, Encoding.UTF8, "application/json");
+        req.Headers.Add("api-key", _apiKey);
+
+        using var resp = await _httpClient.SendAsync(req, ct);
+        if (!resp.IsSuccessStatusCode)
+        {
+            var err = await resp.Content.ReadAsStringAsync(ct);
+            throw new HttpRequestException($"Model API returned {resp.StatusCode}: {err}");
+        }
+
+        return JsonSerializer.Deserialize<ComputerUseResponse>(await resp.Content.ReadAsStringAsync(ct));
+    }
+
+    /// <summary>
+    /// Translate a computer_call into an MCP tool call, capture screenshot, return computer_call_output.
+    /// </summary>
+    /// <exception cref="NotSupportedException">The action type has no W365 tool mapping.</exception>
+    private async Task<JsonElement> HandleComputerCallAsync(
+        JsonElement call, IList<AITool> tools, Action<string>? onStatus, CancellationToken ct)
+    {
+        var callId = call.GetProperty("call_id").GetString()!;
+        var action = call.GetProperty("action");
+        var actionType = action.GetProperty("type").GetString()!;
+
+        onStatus?.Invoke($"Performing: {actionType}...");
+
+        // Execute the action (unless it's just requesting a screenshot)
+        if (actionType != "screenshot")
+        {
+            var (toolName, args) = MapActionToMcpTool(actionType, action);
+            await InvokeToolAsync(tools, toolName, args, ct);
+        }
+
+        // Always capture screenshot after action
+        var screenshot = await CaptureScreenshotAsync(tools, ct);
+
+        // NOTE(review): every pending safety check is acknowledged automatically, without
+        // asking the user. Confirm this is intended before using this outside a sample.
+        var safetyChecks = call.TryGetProperty("pending_safety_checks", out var sc)
+            ? sc : JsonSerializer.Deserialize<JsonElement>("[]");
+
+        return ToJsonElement(new
+        {
+            type = "computer_call_output",
+            call_id = callId,
+            acknowledged_safety_checks = safetyChecks,
+            output = new { type = "computer_screenshot", image_url = $"data:image/png;base64,{screenshot}" }
+        });
+    }
+
+    /// <summary>
+    /// Map OpenAI computer_call action types to W365 MCP tool names and arguments.
+    /// sessionId is omitted — the MCP server resolves sessions by user context.
+    /// </summary>
+    /// <exception cref="NotSupportedException">The action type is not one of the mapped actions.</exception>
+    private static (string ToolName, Dictionary<string, object?> Args) MapActionToMcpTool(string actionType, JsonElement action)
+    {
+        return actionType.ToLowerInvariant() switch
+        {
+            "click" => ("W365_Click2", new Dictionary<string, object?>
+            {
+                ["x"] = action.GetProperty("x").GetInt32(),
+                ["y"] = action.GetProperty("y").GetInt32(),
+                ["button"] = action.TryGetProperty("button", out var b) ? b.GetString() : "left"
+            }),
+            "double_click" => ("W365_DoubleClick", new Dictionary<string, object?>
+            {
+                ["x"] = action.GetProperty("x").GetInt32(),
+                ["y"] = action.GetProperty("y").GetInt32()
+            }),
+            "type" => ("W365_WriteText", new Dictionary<string, object?>
+            {
+                ["text"] = action.GetProperty("text").GetString()
+            }),
+            "key" or "keys" or "keypress" => ("W365_MultiKeyPress", new Dictionary<string, object?>
+            {
+                ["keys"] = ExtractKeys(action)
+            }),
+            "scroll" => ("W365_Scroll", new Dictionary<string, object?>
+            {
+                ["atX"] = action.GetProperty("x").GetInt32(),
+                ["atY"] = action.GetProperty("y").GetInt32(),
+                ["deltaX"] = action.TryGetProperty("scroll_x", out var sx) ? sx.GetInt32() : 0,
+                ["deltaY"] = action.TryGetProperty("scroll_y", out var sy) ? sy.GetInt32() : 0
+            }),
+            "move" => ("W365_MoveMouse", new Dictionary<string, object?>
+            {
+                ["toX"] = action.GetProperty("x").GetInt32(),
+                ["toY"] = action.GetProperty("y").GetInt32()
+            }),
+            "wait" => ("W365_Wait", new Dictionary<string, object?>
+            {
+                ["milliseconds"] = action.TryGetProperty("ms", out var ms) ? ms.GetInt32() : 500
+            }),
+            "open_url" => ("W365_OpenUrl", new Dictionary<string, object?>
+            {
+                ["url"] = action.GetProperty("url").GetString()
+            }),
+            _ => throw new NotSupportedException($"Unsupported action: {actionType}")
+        };
+    }
+
+    /// <summary>
+    /// Invokes W365_CaptureScreenshot and returns the image data found in the result.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">No screenshot data could be found in the tool result.</exception>
+    private async Task<string> CaptureScreenshotAsync(IList<AITool> tools, CancellationToken ct)
+    {
+        var result = await InvokeToolAsync(tools, "W365_CaptureScreenshot", new Dictionary<string, object?>(), ct);
+        var payload = result?.ToString() ?? "";
+        var parsedAsJson = false;
+
+        try
+        {
+            using var doc = JsonDocument.Parse(payload);
+            parsedAsJson = true;
+
+            var root = doc.RootElement;
+            if (root.ValueKind == JsonValueKind.Object)
+            {
+                foreach (var propertyName in ScreenshotPropertyNames)
+                {
+                    if (root.TryGetProperty(propertyName, out var value))
+                    {
+                        return value.GetString() ?? "";
+                    }
+                }
+            }
+            else if (root.ValueKind == JsonValueKind.String)
+            {
+                // A bare JSON string is taken to be the image data itself.
+                return root.GetString() ?? "";
+            }
+        }
+        catch (JsonException)
+        {
+            // Not JSON: handled by the raw-payload fallback below.
+        }
+
+        // Fallback: result might be raw base64. Valid JSON without a known property is
+        // not image data, so it is rejected instead of being sent to the model.
+        if (!parsedAsJson && payload.Length > 100) return payload;
+
+        throw new InvalidOperationException("Failed to extract screenshot from MCP response.");
+    }
+
+    /// <summary>
+    /// Finds the tool by name (case-insensitive) and invokes it with the given arguments.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">No tool with that name is available.</exception>
+    private static async Task<object?> InvokeToolAsync(
+        IList<AITool> tools, string name, Dictionary<string, object?> args, CancellationToken ct)
+    {
+        var tool = tools.OfType<AIFunction>().FirstOrDefault(t => t.Name.Equals(name, StringComparison.OrdinalIgnoreCase))
+            ?? throw new InvalidOperationException($"Tool '{name}' not found.");
+        return await tool.InvokeAsync(new AIFunctionArguments(args), ct);
+    }
+
+    /// <summary>
+    /// Reads the key list from an action: "keys" (array or single string) or "key"; empty when neither is present.
+    /// </summary>
+    private static string[] ExtractKeys(JsonElement action)
+    {
+        if (action.TryGetProperty("keys", out var k))
+        {
+            if (k.ValueKind == JsonValueKind.Array)
+                return k.EnumerateArray().Select(e => e.GetString() ?? "").ToArray();
+            if (k.ValueKind == JsonValueKind.String)
+                return [k.GetString() ?? ""];
+        }
+        if (action.TryGetProperty("key", out var single) && single.ValueKind == JsonValueKind.String)
+            return [single.GetString() ?? ""];
+        return [];
+    }
+
+    /// <summary>
+    /// Returns the first "text" value found in the message's content array, or an empty string.
+    /// </summary>
+    private static string ExtractText(JsonElement msg)
+    {
+        if (msg.TryGetProperty("content", out var c) && c.ValueKind == JsonValueKind.Array)
+        {
+            foreach (var item in c.EnumerateArray())
+            {
+                if (item.TryGetProperty("text", out var t))
+                    return t.GetString() ?? "";
+            }
+        }
+        return "";
+    }
+
+    private static JsonElement CreateUserMessage(string text) => ToJsonElement(new
+    {
+        type = "message", role = "user",
+        content = new[] { new { type = "input_text", text } }
+    });
+
+    private static JsonElement CreateFunctionOutput(string callId) => ToJsonElement(new
+    {
+        type = "function_call_output", call_id = callId, output = "success"
+    });
+
+    private static JsonElement ToJsonElement(object obj) =>
+        JsonSerializer.SerializeToElement(obj);
+
+    private static string Truncate(string v, int max) => v.Length <= max ? v : v[..max] + "...";
+}
diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs
new file mode 100644
index 00000000..c6a08935
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs
@@ -0,0 +1,85 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace W365ComputerUseSample.ComputerUse.Models;
+
+/// <summary>
+/// Response from the OpenAI Computer Use API.
+/// </summary>
+public class ComputerUseResponse
+{
+    [JsonPropertyName("id")]
+    public string? Id { get; set; }
+
+    [JsonPropertyName("object")]
+    public string? Object { get; set; }
+
+    [JsonPropertyName("created_at")]
+    public long CreatedAt { get; set; }
+
+    [JsonPropertyName("model")]
+    public string? Model { get; set; }
+
+    [JsonPropertyName("output")]
+    public List<JsonElement>? Output { get; set; }
+}
+
+/// <summary>
+/// Request to the OpenAI Computer Use API.
+/// </summary>
+public class ComputerUseRequest
+{
+    [JsonPropertyName("model")]
+    public string Model { get; set; } = "computer-use-preview-2025-03-11";
+
+    [JsonPropertyName("truncation")]
+    public string Truncation { get; set; } = "auto";
+
+    [JsonPropertyName("instructions")]
+    public string? Instructions { get; set; }
+
+    [JsonPropertyName("input")]
+    public List<JsonElement> Input { get; set; } = [];
+
+    [JsonPropertyName("tools")]
+    public List<object> Tools { get; set; } = [];
+}
+
+/// <summary>
+/// Defines the computer_use_preview tool for the OpenAI Responses API.
+/// </summary>
+public class ComputerUseTool
+{
+    [JsonPropertyName("type")]
+    public string Type { get; set; } = "computer_use_preview";
+
+    [JsonPropertyName("display_width")]
+    public int DisplayWidth { get; set; } = 1024;
+
+    [JsonPropertyName("display_height")]
+    public int DisplayHeight { get; set; } = 768;
+
+    [JsonPropertyName("environment")]
+    public string Environment { get; set; } = "windows";
+}
+
+/// <summary>
+/// Defines a function tool for the OpenAI Responses API.
+/// </summary>
+public class FunctionToolDefinition
+{
+    [JsonPropertyName("type")]
+    public string Type { get; set; } = "function";
+
+    [JsonPropertyName("name")]
+    public string Name { get; set; } = string.Empty;
+
+    [JsonPropertyName("description")]
+    public string Description { get; set; } = string.Empty;
+
+    [JsonPropertyName("parameters")]
+    public object? Parameters { get; set; }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/Program.cs b/dotnet/w365-computer-use/sample-agent/Program.cs
new file mode 100644
index 00000000..687f9ee9
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/Program.cs
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using W365ComputerUseSample;
+using W365ComputerUseSample.Agent;
+using W365ComputerUseSample.ComputerUse;
+using W365ComputerUseSample.Telemetry;
+using Microsoft.Agents.A365.Observability;
+using Microsoft.Agents.A365.Observability.Extensions.AgentFramework;
+using Microsoft.Agents.A365.Observability.Runtime;
+using Microsoft.Agents.A365.Tooling.Extensions.AgentFramework.Services;
+using Microsoft.Agents.A365.Tooling.Services;
+using Microsoft.Agents.Builder;
+using Microsoft.Agents.Core;
+using Microsoft.Agents.Hosting.AspNetCore;
+using Microsoft.Agents.Storage;
+using System.Reflection;
+
+var builder = WebApplication.CreateBuilder(args);
+
+// Configure OpenTelemetry tracing and metrics (see ServiceExtensions.ConfigureOpenTelemetry)
+builder.ConfigureOpenTelemetry();
+
+builder.Configuration.AddUserSecrets(Assembly.GetExecutingAssembly());
+builder.Services.AddControllers();
+builder.Services.AddHttpClient("WebClient", client => client.Timeout = TimeSpan.FromSeconds(600));
+builder.Services.AddHttpContextAccessor();
+builder.Logging.AddConsole();
+
+// ********** Configure A365 Services **********
+// Configure observability.
+builder.Services.AddAgenticTracingExporter(clusterCategory: "production");
+
+// Add A365 tracing with Agent Framework integration
+builder.AddA365Tracing(config =>
+{
+    config.WithAgentFramework();
+});
+
+// Add A365 Tooling Server integration (NOTE(review): confirm the implementation types registered below)
+builder.Services.AddSingleton<IMcpToolRegistrationService, McpToolRegistrationService>();
+builder.Services.AddSingleton<IMcpToolServerConfigurationService, McpToolServerConfigurationService>();
+// ********** END Configure A365 Services **********
+
+// Register the Computer Use orchestrator
+builder.Services.AddSingleton<ComputerUseOrchestrator>();
+
+// Add AspNet token validation
+builder.Services.AddAgentAspNetAuthentication(builder.Configuration);
+
+// Register IStorage. For development, MemoryStorage is suitable.
+builder.Services.AddSingleton<IStorage, MemoryStorage>();
+
+// Add AgentApplicationOptions from config.
+builder.AddAgentApplicationOptions();
+
+// Add the bot (which is transient)
+builder.AddAgent<MyAgent>();
+
+var app = builder.Build();
+
+if (app.Environment.IsDevelopment())
+{
+    app.UseDeveloperExceptionPage();
+}
+
+app.UseRouting();
+app.UseAuthentication();
+app.UseAuthorization();
+
+// Map the /api/messages endpoint to the AgentApplication
+app.MapPost("/api/messages", async (HttpRequest request, HttpResponse response, IAgentHttpAdapter adapter, IAgent agent, CancellationToken cancellationToken) =>
+{
+    await AgentMetrics.InvokeObservedHttpOperation("agent.process_message", async () =>
+    {
+        await adapter.ProcessAsync(request, response, agent, cancellationToken);
+    }).ConfigureAwait(false);
+});
+
+// Health check endpoint for CI/CD pipelines and monitoring
+app.MapGet("/api/health", () => Results.Ok(new { status = "healthy", timestamp = DateTime.UtcNow }));
+
+if (app.Environment.IsDevelopment() || app.Environment.EnvironmentName == "Playground")
+{
+    app.MapGet("/", () => "W365 Computer Use Sample Agent");
+    app.UseDeveloperExceptionPage(); // also reached in "Playground", which the IsDevelopment() check above does not cover
+    app.MapControllers().AllowAnonymous();
+
+    // Hard coded for brevity and ease of testing.
+    // In production, this should be set in configuration.
+    app.Urls.Add("http://localhost:3978");
+}
+else
+{
+    app.MapControllers();
+}
+
+app.Run();
diff --git a/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs b/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs
new file mode 100644
index 00000000..17ee7f95
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using Microsoft.Agents.A365.Observability;
+using Microsoft.Agents.A365.Observability.Extensions.AgentFramework;
+using Microsoft.Agents.A365.Observability.Runtime;
+using OpenTelemetry.Metrics;
+using OpenTelemetry.Resources;
+using OpenTelemetry.Trace;
+using W365ComputerUseSample.Telemetry;
+
+namespace W365ComputerUseSample;
+
+public static class ServiceExtensions
+{
+    public static void ConfigureOpenTelemetry(this WebApplicationBuilder builder)
+    {
+        builder.Services.AddOpenTelemetry()
+            .ConfigureResource(resource => resource.AddService("W365ComputerUseSample"))
+            .WithTracing(tracing =>
+            {
+                tracing
+                    .AddSource(AgentMetrics.SourceName)
+                    .AddAspNetCoreInstrumentation()
+                    .AddHttpClientInstrumentation();
+
+                if (builder.Environment.IsDevelopment())
+                {
+                    tracing.AddConsoleExporter();
+                }
+            })
+            .WithMetrics(metrics =>
+            {
+                metrics
+                    .AddMeter(AgentMetrics.SourceName)
+                    .AddAspNetCoreInstrumentation()
+                    .AddHttpClientInstrumentation()
+                    .AddRuntimeInstrumentation();
+            });
+    }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/Telemetry/A365OtelWrapper.cs b/dotnet/w365-computer-use/sample-agent/Telemetry/A365OtelWrapper.cs
new file mode 100644
index 00000000..6e83aa41
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/Telemetry/A365OtelWrapper.cs
@@ -0,0 +1,91 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using Microsoft.Agents.A365.Observability.Caching;
+using Microsoft.Agents.A365.Observability.Runtime.Common;
+using Microsoft.Agents.A365.Runtime.Utils;
+using Microsoft.Agents.Builder;
+using Microsoft.Agents.Builder.App.UserAuth;
+using Microsoft.Agents.Builder.State;
+using W365ComputerUseSample.Telemetry;
+
+namespace W365ComputerUseSample;
+
+/// <summary>
+/// Runs an agent operation inside the AgentMetrics activity with tenant and agent baggage attached.
+/// </summary>
+public static class A365OtelWrapper
+{
+    /// <summary>
+    /// Resolves the agent and tenant ids for the turn, sets them as baggage, registers the turn
+    /// with the exporter token cache (best effort), and then awaits <paramref name="func"/>.
+    /// </summary>
+    public static async Task InvokeObservedAgentOperation(
+        string operationName,
+        ITurnContext turnContext,
+        ITurnState turnState,
+        IExporterTokenCache<AgenticTokenStruct>? agentTokenCache,
+        UserAuthorization authSystem,
+        string authHandlerName,
+        ILogger? logger,
+        Func<Task> func)
+    {
+        await AgentMetrics.InvokeObservedAgentOperation(
+            operationName,
+            turnContext,
+            async () =>
+            {
+                (string agentId, string tenantId) = await ResolveTenantAndAgentId(turnContext, authSystem, authHandlerName);
+
+                using var baggageScope = new BaggageBuilder()
+                    .TenantId(tenantId)
+                    .AgentId(agentId)
+                    .Build();
+
+                try
+                {
+                    agentTokenCache?.RegisterObservability(agentId, tenantId, new AgenticTokenStruct
+                    {
+                        UserAuthorization = authSystem,
+                        TurnContext = turnContext,
+                        AuthHandlerName = authHandlerName
+                    }, EnvironmentUtils.GetObservabilityAuthenticationScope());
+                }
+                catch (Exception ex)
+                {
+                    // Best effort: a failed registration must not block the turn itself.
+                    logger?.LogWarning(ex, "There was an error registering for observability: {Message}", ex.Message);
+                }
+
+                await func().ConfigureAwait(false);
+            }).ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// Returns the agent and tenant ids for the turn, substituting <see cref="Guid.Empty"/>
+    /// for whichever one cannot be determined.
+    /// </summary>
+    private static async Task<(string agentId, string tenantId)> ResolveTenantAndAgentId(ITurnContext turnContext, UserAuthorization authSystem, string authHandlerName)
+    {
+        string? agentId = null;
+        if (turnContext.Activity.IsAgenticRequest())
+        {
+            agentId = turnContext.Activity.GetAgenticInstanceId();
+        }
+        else if (authSystem != null && !string.IsNullOrEmpty(authHandlerName))
+        {
+            agentId = Utility.ResolveAgentIdentity(turnContext, await authSystem.GetTurnTokenAsync(turnContext, authHandlerName));
+        }
+
+        // An empty id (not agentic and no auth handler) gets the same Guid.Empty fallback as null.
+        if (string.IsNullOrEmpty(agentId))
+        {
+            agentId = Guid.Empty.ToString();
+        }
+
+        string? tempTenantId = turnContext?.Activity?.Conversation?.TenantId ?? turnContext?.Activity?.Recipient?.TenantId;
+        string tenantId = tempTenantId ?? Guid.Empty.ToString();
+
+        return (agentId, tenantId);
+    }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/Telemetry/AgentMetrics.cs b/dotnet/w365-computer-use/sample-agent/Telemetry/AgentMetrics.cs
new file mode 100644
index 00000000..9329ec3f
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/Telemetry/AgentMetrics.cs
@@ -0,0 +1,167 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using Microsoft.Agents.Builder;
+using Microsoft.Agents.Core;
+using System.Diagnostics;
+using System.Diagnostics.Metrics;
+
+namespace W365ComputerUseSample.Telemetry;
+
+/// <summary>
+/// Activity source, meter, and wrappers used to trace and time agent message handling.
+/// </summary>
+public static class AgentMetrics
+{
+    public static readonly string SourceName = "A365.W365ComputerUse";
+
+    public static readonly ActivitySource ActivitySource = new(SourceName);
+
+    // Same name as the ActivitySource so one constant drives both AddSource and AddMeter.
+    private static readonly Meter Meter = new(SourceName, "1.0.0");
+
+    public static readonly Counter<long> MessageProcessedCounter = Meter.CreateCounter<long>(
+        "agent.messages.processed",
+        "messages",
+        "Number of messages processed by the agent");
+
+    public static readonly Histogram<long> MessageProcessingDuration = Meter.CreateHistogram<long>(
+        "agent.message.processing.duration",
+        "ms",
+        "Duration of message processing in milliseconds");
+
+    public static readonly Counter<long> CuaActionsExecuted = Meter.CreateCounter<long>(
+        "agent.cua.actions.executed",
+        "actions",
+        "Number of CUA computer actions executed");
+
+    /// <summary>
+    /// Starts an activity named <paramref name="handlerName"/> and tags it with the turn's
+    /// activity type, caller, conversation, and channel.
+    /// </summary>
+    /// <remarks>
+    /// <c>StartActivity</c> returns null when nothing is listening to the source, so the result
+    /// can be null at run time despite the non-nullable return type.
+    /// </remarks>
+    public static Activity InitializeMessageHandlingActivity(string handlerName, ITurnContext context)
+    {
+        var activity = ActivitySource.StartActivity(handlerName);
+        activity?.SetTag("Activity.Type", context.Activity.Type.ToString());
+        activity?.SetTag("Agent.IsAgentic", context.IsAgenticRequest());
+        activity?.SetTag("Caller.Id", context.Activity.From?.Id);
+        activity?.SetTag("Conversation.Id", context.Activity.Conversation?.Id);
+        activity?.SetTag("Channel.Id", context.Activity.ChannelId?.ToString());
+
+        return activity!;
+    }
+
+    /// <summary>
+    /// Records the processing duration, sets the final status, and stops the activity.
+    /// </summary>
+    public static void FinalizeMessageHandlingActivity(Activity activity, ITurnContext context, long duration, bool success)
+    {
+        MessageProcessingDuration.Record(duration,
+            new("Conversation.Id", context.Activity.Conversation?.Id ?? "unknown"),
+            new("Channel.Id", context.Activity.ChannelId?.ToString() ?? "unknown"));
+
+        if (success)
+        {
+            activity?.SetStatus(ActivityStatusCode.Ok);
+        }
+        else if (activity?.Status != ActivityStatusCode.Error)
+        {
+            // Keep an Error status (and its description) that the caller already recorded.
+            activity?.SetStatus(ActivityStatusCode.Error);
+        }
+
+        activity?.Stop();
+        activity?.Dispose();
+    }
+
+    /// <summary>
+    /// Runs a synchronous delegate inside an activity and records any exception on it.
+    /// </summary>
+    /// <remarks>
+    /// Do not pass an async lambda typed as <see cref="Action"/>: it becomes <c>async void</c>
+    /// and the activity ends before the work does. Async lambdas bind to the
+    /// <see cref="Func{TResult}"/> overload instead.
+    /// </remarks>
+    public static Task InvokeObservedHttpOperation(string operationName, Action func)
+    {
+        using var activity = ActivitySource.StartActivity(operationName);
+        try
+        {
+            func();
+            activity?.SetStatus(ActivityStatusCode.Ok);
+        }
+        catch (Exception ex)
+        {
+            RecordException(activity, ex);
+            throw;
+        }
+
+        return Task.CompletedTask;
+    }
+
+    /// <summary>
+    /// Runs an asynchronous delegate inside an activity that stays open until the returned task
+    /// completes, and records any exception on it.
+    /// </summary>
+    public static async Task InvokeObservedHttpOperation(string operationName, Func<Task> func)
+    {
+        using var activity = ActivitySource.StartActivity(operationName);
+        try
+        {
+            await func().ConfigureAwait(false);
+            activity?.SetStatus(ActivityStatusCode.Ok);
+        }
+        catch (Exception ex)
+        {
+            RecordException(activity, ex);
+            throw;
+        }
+    }
+
+    /// <summary>
+    /// Counts the message, then awaits <paramref name="func"/> inside a message-handling activity,
+    /// recording the elapsed time and whether it succeeded.
+    /// </summary>
+    public static async Task InvokeObservedAgentOperation(string operationName, ITurnContext context, Func<Task> func)
+    {
+        MessageProcessedCounter.Add(1);
+        var activity = InitializeMessageHandlingActivity(operationName, context);
+        var stopwatch = Stopwatch.StartNew();
+        var succeeded = false;
+        try
+        {
+            // Await here so the duration and status cover the whole operation, not just its
+            // synchronous prefix.
+            await func().ConfigureAwait(false);
+            succeeded = true;
+        }
+        catch (Exception ex)
+        {
+            RecordException(activity, ex);
+            throw;
+        }
+        finally
+        {
+            stopwatch.Stop();
+            FinalizeMessageHandlingActivity(activity, context, stopwatch.ElapsedMilliseconds, succeeded);
+        }
+    }
+
+    /// <summary>
+    /// Marks the activity as failed and attaches an "exception" event describing the exception.
+    /// </summary>
+    private static void RecordException(Activity? activity, Exception ex)
+    {
+        activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
+        activity?.AddEvent(new ActivityEvent("exception", DateTimeOffset.UtcNow, new()
+        {
+            ["exception.type"] = ex.GetType().FullName,
+            ["exception.message"] = ex.Message,
+            ["exception.stacktrace"] = ex.StackTrace
+        }));
+    }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj
new file mode 100644
index 00000000..0ab72faa
--- /dev/null
+++
b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj @@ -0,0 +1,32 @@ + + + + net8.0 + enable + a3c1d2e4-f567-8901-abcd-ef0123456789 + enable + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dotnet/w365-computer-use/sample-agent/appsettings.json b/dotnet/w365-computer-use/sample-agent/appsettings.json new file mode 100644 index 00000000..5b184e4b --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/appsettings.json @@ -0,0 +1,69 @@ +{ + "AgentApplication": { + "StartTypingTimer": false, + "RemoveRecipientMention": false, + "NormalizeMentions": false, + "AgenticAuthHandlerName": "agentic", + "UserAuthorization": { + "AutoSignin": false, + "Handlers": { + "agentic": { + "Type": "AgenticUserAuthorization", + "Settings": { + "Scopes": [ + "https://graph.microsoft.com/.default" + ] + } + } + } + } + }, + + "TokenValidation": { + "Audiences": [ + "{{ClientId}}" + ] + }, + + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning", + "Microsoft.Agents": "Warning", + "Microsoft.Hosting.Lifetime": "Information" + } + }, + "AllowedHosts": "*", + "Connections": { + "ServiceConnection": { + "Settings": { + "AuthType": "UserManagedIdentity", + "AuthorityEndpoint": "https://login.microsoftonline.com/{{BOT_TENANT_ID}}", + "ClientId": "{{BOT_ID}}", + "Scopes": [ + "5a807f24-c9de-44ee-a3a7-329e88a00ffc/.default" + ] + } + } + }, + "ConnectionsMap": [ + { + "ServiceUrl": "*", + "Connection": "ServiceConnection" + } + ], + + "AIServices": { + "AzureOpenAI": { + "DeploymentName": "computer-use-preview", + "Endpoint": "<>", + "ApiKey": "<>" + } + }, + + "ComputerUse": { + "MaxIterations": 30, + "DisplayWidth": 1024, + "DisplayHeight": 768 + } +} From 6a0f6bc5c5047c94a9a48491031c0e62fd811fc8 Mon Sep 17 00:00:00 2001 From: Mohamed Abdelkader Date: Sun, 29 Mar 2026 21:22:21 -0700 Subject: [PATCH 02/17] add dual model support - 5.4 and computer-use-preview --- .../w365-computer-use/sample-agent/.gitignore | 2 + 
.../sample-agent/Agent/MyAgent.cs | 133 ++++-- .../ComputerUse/AzureOpenAIModelProvider.cs | 64 +++ .../ComputerUse/ComputerUseOrchestrator.cs | 388 +++++++++++++++--- .../ComputerUse/CustomEndpointProvider.cs | 119 ++++++ .../ComputerUse/ICuaModelProvider.cs | 17 + .../ComputerUse/LocalModelProvider.cs | 119 ++++++ .../ComputerUse/Models/ComputerUseModels.cs | 11 + .../w365-computer-use/sample-agent/Program.cs | 18 + .../Properties/launchSettings.json | 11 + .../w365-computer-use/sample-agent/README.md | 199 +++++++++ .../sample-agent/ToolingManifest.json | 8 + .../sample-agent/W365ComputerUseSample.csproj | 9 +- .../sample-agent/appsettings.json | 28 +- 14 files changed, 1018 insertions(+), 108 deletions(-) create mode 100644 dotnet/w365-computer-use/sample-agent/.gitignore create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/CustomEndpointProvider.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs create mode 100644 dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json create mode 100644 dotnet/w365-computer-use/sample-agent/README.md create mode 100644 dotnet/w365-computer-use/sample-agent/ToolingManifest.json diff --git a/dotnet/w365-computer-use/sample-agent/.gitignore b/dotnet/w365-computer-use/sample-agent/.gitignore new file mode 100644 index 00000000..3b58674a --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/.gitignore @@ -0,0 +1,2 @@ +appsettings.Development.json +Screenshots/ diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index 8232d36e..be8f3be0 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -1,6 +1,7 @@ // Copyright (c) 
Microsoft Corporation. // Licensed under the MIT License. +using System.Net.Http.Headers; using W365ComputerUseSample.ComputerUse; using W365ComputerUseSample.Telemetry; using Microsoft.Agents.A365.Observability.Caching; @@ -12,6 +13,7 @@ using Microsoft.Agents.Core; using Microsoft.Agents.Core.Models; using Microsoft.Extensions.AI; +using ModelContextProtocol.Client; namespace W365ComputerUseSample.Agent; @@ -21,11 +23,11 @@ public class MyAgent : AgentApplication private const string AgentHireMessage = "Thank you for hiring me! I can control a Windows desktop to accomplish tasks for you."; private const string AgentFarewellMessage = "Thank you for your time, I enjoyed working with you."; - private readonly IConfiguration _configuration; private readonly IExporterTokenCache? _agentTokenCache; private readonly ILogger _logger; private readonly IMcpToolRegistrationService _toolService; private readonly ComputerUseOrchestrator _orchestrator; + private readonly string? _mcpServerUrl; private readonly string? AgenticAuthHandlerName; private readonly string? OboAuthHandlerName; @@ -64,14 +66,14 @@ public MyAgent( ComputerUseOrchestrator orchestrator, ILogger logger) : base(options) { - _configuration = configuration; _agentTokenCache = agentTokenCache; _logger = logger; _toolService = toolService; _orchestrator = orchestrator; + _mcpServerUrl = configuration["McpServer:Url"]; - AgenticAuthHandlerName = _configuration.GetValue("AgentApplication:AgenticAuthHandlerName"); - OboAuthHandlerName = _configuration.GetValue("AgentApplication:OboAuthHandlerName"); + AgenticAuthHandlerName = configuration.GetValue("AgentApplication:AgenticAuthHandlerName"); + OboAuthHandlerName = configuration.GetValue("AgentApplication:OboAuthHandlerName"); // Greet when members are added OnConversationUpdate(ConversationUpdateEvents.MembersAdded, WelcomeMessageAsync); @@ -190,28 +192,51 @@ await A365OtelWrapper.InvokeObservedAgentOperation( { var userText = turnContext.Activity.Text?.Trim() ?? 
string.Empty; - // Get W365 MCP tools via the A365 SDK - var w365Tools = await GetW365ToolsAsync(turnContext, ToolAuthHandlerName); + // Get W365 MCP tools — direct connection in Dev, SDK in Production + var (w365Tools, mcpClient) = await GetW365ToolsAsync(turnContext, ToolAuthHandlerName); - if (w365Tools == null || w365Tools.Count == 0) + try { - await turnContext.SendActivityAsync( - MessageFactory.Text("Unable to connect to the W365 Computer Use service. Please check your configuration."), - cancellationToken); - return; - } + if (w365Tools == null || w365Tools.Count == 0) + { + await turnContext.SendActivityAsync( + MessageFactory.Text("Unable to connect to the W365 Computer Use service. Please check your configuration."), + cancellationToken); + return; + } - await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Connected to W365. Working on your request...").ConfigureAwait(false); + await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Working on your request...").ConfigureAwait(false); - // Run the CUA loop — the MCP server manages sessions automatically - var response = await _orchestrator.RunAsync( - userText, - w365Tools, - onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status).ConfigureAwait(false), - cancellationToken: cancellationToken); + // Get Graph token for OneDrive screenshot upload. + // In production: acquired via agentic auth (UserAuthorization). + // In development: set GRAPH_TOKEN env var with a token that has Files.ReadWrite scope. + string? 
graphToken = null; + if (!string.IsNullOrEmpty(ToolAuthHandlerName)) + { + graphToken = await UserAuthorization.GetTurnTokenAsync(turnContext, ToolAuthHandlerName); + } + if (string.IsNullOrEmpty(graphToken)) + { + graphToken = Environment.GetEnvironmentVariable("GRAPH_TOKEN"); + } - // Send the response - turnContext.StreamingResponse.QueueTextChunk(response); + // Run the CUA loop — session is already active + var response = await _orchestrator.RunAsync( + userText, + w365Tools, + mcpClient: mcpClient, + graphAccessToken: graphToken, + onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status).ConfigureAwait(false), + cancellationToken: cancellationToken); + + // Send the response + turnContext.StreamingResponse.QueueTextChunk(response); + } + finally + { + // Don't dispose the MCP client — it's reused across messages and + // needed for EndSession on shutdown. It will be disposed with the app. + } } finally { @@ -224,10 +249,11 @@ await turnContext.SendActivityAsync( } /// - /// Get the W365 MCP tools via the A365 Tooling SDK. - /// Returns the tools as AITool wrappers that can invoke MCP server methods. + /// Get the W365 MCP tools. In Development mode with a bearer token, connects directly + /// to the MCP server URL from appsettings.json. In Production, uses the A365 SDK + /// to discover servers via the Tooling Gateway. /// - private async Task?> GetW365ToolsAsync(ITurnContext context, string? authHandlerName) + private async Task<(IList? Tools, IMcpClient? Client)> GetW365ToolsAsync(ITurnContext context, string? authHandlerName) { // Acquire access token string? accessToken = null; @@ -248,11 +274,20 @@ await turnContext.SendActivityAsync( if (string.IsNullOrEmpty(accessToken) || string.IsNullOrEmpty(agentId)) { _logger.LogWarning("No auth token or agent identity available. 
Cannot connect to MCP."); - return null; + return (null, null); } try { + // Development with bearer token: use orchestrator's cached MCP connection + if (TryGetBearerTokenForDevelopment(out _) && IsDevelopment()) + { + if (string.IsNullOrEmpty(_mcpServerUrl)) + throw new InvalidOperationException("McpServer:Url is required in appsettings.json for Development mode."); + return await _orchestrator.GetOrCreateMcpConnectionAsync(_mcpServerUrl, accessToken!); + } + + // Production: use the A365 SDK's tooling gateway for server discovery var handlerForMcp = !string.IsNullOrEmpty(authHandlerName) ? authHandlerName : OboAuthHandlerName ?? AgenticAuthHandlerName ?? string.Empty; @@ -261,33 +296,47 @@ await turnContext.SendActivityAsync( var allTools = await _toolService.GetMcpToolsAsync(agentId, UserAuthorization, handlerForMcp, context, tokenOverride).ConfigureAwait(false); // Filter to only W365 tools - var w365Tools = allTools?.Where(t => - { - var name = (t as AIFunction)?.Name ?? t.ToString() ?? string.Empty; - return name.StartsWith("W365_", StringComparison.OrdinalIgnoreCase); - }).ToList(); - - if (w365Tools != null && w365Tools.Count > 0) - { - _logger.LogInformation("Found {ToolCount} W365 Computer Use tools", w365Tools.Count); - } - else - { - _logger.LogWarning("No W365 tools found among {TotalCount} MCP tools", allTools?.Count ?? 0); - } - - return w365Tools; + var w365Tools = FilterW365Tools(allTools); + return (w365Tools, null); } catch (Exception ex) { if (ShouldSkipToolingOnErrors()) { _logger.LogWarning(ex, "Failed to connect to MCP servers. Continuing without tools (SKIP_TOOLING_ON_ERRORS=true)."); - return null; + return (null, null); } _logger.LogError(ex, "Failed to connect to MCP servers."); throw; } } + + private static bool IsDevelopment() + { + var env = Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT") + ?? Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT") + ?? 
"Production"; + return env.Equals("Development", StringComparison.OrdinalIgnoreCase); + } + + private IList? FilterW365Tools(IList? allTools) + { + var w365Tools = allTools?.Where(t => + { + var name = (t as AIFunction)?.Name ?? t.ToString() ?? string.Empty; + return name.StartsWith("W365_", StringComparison.OrdinalIgnoreCase); + }).ToList(); + + if (w365Tools != null && w365Tools.Count > 0) + { + _logger.LogInformation("Found {ToolCount} W365 Computer Use tools", w365Tools.Count); + } + else + { + _logger.LogWarning("No W365 tools found among {TotalCount} MCP tools", allTools?.Count ?? 0); + } + + return w365Tools; + } } diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs new file mode 100644 index 00000000..96a2a7c5 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Text; + +namespace W365ComputerUseSample.ComputerUse; + +/// +/// Sends CUA model requests to Azure OpenAI using an API key. +/// This is the default provider for external customers. +/// +public class AzureOpenAIModelProvider : ICuaModelProvider +{ + private readonly HttpClient _httpClient; + private readonly string _url; + private readonly string _apiKey; + private readonly ILogger _logger; + + public string ModelName { get; } + + public AzureOpenAIModelProvider(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger logger) + { + _httpClient = httpClientFactory.CreateClient("WebClient"); + _logger = logger; + var endpoint = configuration["AIServices:AzureOpenAI:Endpoint"] + ?? throw new InvalidOperationException("AIServices:AzureOpenAI:Endpoint is required."); + _apiKey = configuration["AIServices:AzureOpenAI:ApiKey"] + ?? 
throw new InvalidOperationException("AIServices:AzureOpenAI:ApiKey is required."); + var apiVersion = configuration["AIServices:AzureOpenAI:ApiVersion"] ?? "2025-04-01-preview"; + + // DeploymentName = deployment-based URL; ModelName = model-based URL (model sent in body) + var deploymentName = configuration["AIServices:AzureOpenAI:DeploymentName"]; + ModelName = configuration["AIServices:AzureOpenAI:ModelName"] + ?? deploymentName + ?? "computer-use-preview"; + + if (!string.IsNullOrEmpty(deploymentName)) + { + _url = $"{endpoint.TrimEnd('/')}/openai/deployments/{deploymentName}/responses?api-version={apiVersion}"; + } + else + { + // Model-based endpoint — model name goes in the request body, not the URL + _url = $"{endpoint.TrimEnd('/')}/openai/responses?api-version={apiVersion}"; + } + } + + public async Task SendAsync(string requestBody, CancellationToken cancellationToken) + { + _logger.LogInformation("Azure OpenAI request URL: {Url}", _url); + using var req = new HttpRequestMessage(HttpMethod.Post, _url); + req.Headers.Add("api-key", _apiKey); + req.Content = new StringContent(requestBody, Encoding.UTF8, "application/json"); + + var resp = await _httpClient.SendAsync(req, cancellationToken); + if (!resp.IsSuccessStatusCode) + { + var err = await resp.Content.ReadAsStringAsync(cancellationToken); + throw new HttpRequestException($"Azure OpenAI returned {resp.StatusCode}: {err}"); + } + + return await resp.Content.ReadAsStringAsync(cancellationToken); + } +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index c570c171..fefb0228 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -1,9 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
-using System.Text; +using System.Net.Http.Headers; using System.Text.Json; using Microsoft.Extensions.AI; +using ModelContextProtocol.Client; using W365ComputerUseSample.ComputerUse.Models; namespace W365ComputerUseSample.ComputerUse; @@ -15,14 +16,40 @@ namespace W365ComputerUseSample.ComputerUse; /// public class ComputerUseOrchestrator { + private readonly ICuaModelProvider _modelProvider; private readonly HttpClient _httpClient; private readonly ILogger _logger; - private readonly string _endpoint; - private readonly string _apiKey; - private readonly string _deploymentName; private readonly int _maxIterations; + private readonly string? _screenshotPath; + private readonly string? _oneDriveFolder; + private readonly string? _oneDriveUserId; + private readonly string _toolType; private readonly List _tools; + /// + /// Conversation history persisted across user messages. + /// This allows the model to maintain context across multiple turns + /// (e.g., "now save the file" after a previous "type hello" command). + /// + private readonly List _conversationHistory = []; + + /// + /// Whether a W365 session has been started. Tracked here (singleton) because + /// MyAgent is transient — a new agent instance is created per HTTP request. + /// + private bool _sessionStarted; + + /// + /// Cached reference to the last-used W365 tools, used for shutdown cleanup. + /// + private IList? _cachedTools; + + /// + /// Cached MCP client reference — kept alive for the app lifetime so EndSession + /// can be called on shutdown. Disposed when the app stops. + /// + private IMcpClient? _cachedMcpClient; + private const string SystemInstructions = """ You are a computer-using agent that can control a Windows desktop computer. After each action, examine the screenshot to verify it worked. @@ -32,28 +59,43 @@ Do NOT continue looping after the task is done. 
"""; public ComputerUseOrchestrator( + ICuaModelProvider modelProvider, IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger logger) { + _modelProvider = modelProvider; _httpClient = httpClientFactory.CreateClient("WebClient"); _logger = logger; - - _endpoint = configuration["AIServices:AzureOpenAI:Endpoint"] - ?? throw new InvalidOperationException("AIServices:AzureOpenAI:Endpoint is required."); - _apiKey = configuration["AIServices:AzureOpenAI:ApiKey"] - ?? throw new InvalidOperationException("AIServices:AzureOpenAI:ApiKey is required."); - _deploymentName = configuration["AIServices:AzureOpenAI:DeploymentName"] ?? "computer-use-preview"; _maxIterations = configuration.GetValue("ComputerUse:MaxIterations", 30); + _screenshotPath = configuration["Screenshots:LocalPath"]; + _oneDriveFolder = configuration["Screenshots:OneDriveFolder"]; + _oneDriveUserId = configuration["Screenshots:OneDriveUserId"]; + + _toolType = configuration["ComputerUse:ToolType"] ?? ""; + if (string.IsNullOrEmpty(_toolType)) + { + // Auto-derive from model name: gpt-* models use "computer", others use "computer_use_preview" + var modelName = _modelProvider.ModelName; + _toolType = modelName.StartsWith("gpt-", StringComparison.OrdinalIgnoreCase) ? 
"computer" : "computer_use_preview"; + } + var displayWidth = configuration.GetValue("ComputerUse:DisplayWidth", 1024); + var displayHeight = configuration.GetValue("ComputerUse:DisplayHeight", 768); + + // Build the computer tool definition based on the tool type: + // "computer_use_preview" — computer-use-preview model: display_width, display_height, environment + // "computer" — GPT-5.4+ models (Azure OpenAI): bare type, no params + object computerTool = _toolType switch + { + "computer" => new ComputerToolV2(), + _ => new ComputerUseTool { DisplayWidth = displayWidth, DisplayHeight = displayHeight } + }; + + _logger.LogInformation("CUA tool type: {ToolType}, display: {Width}x{Height}", _toolType, displayWidth, displayHeight); _tools = [ - new ComputerUseTool - { - DisplayWidth = configuration.GetValue("ComputerUse:DisplayWidth", 1024), - DisplayHeight = configuration.GetValue("ComputerUse:DisplayHeight", 768), - Environment = "windows" - }, + computerTool, new FunctionToolDefinition { Name = "OnTaskComplete", @@ -63,29 +105,58 @@ public ComputerUseOrchestrator( } /// - /// Run the CUA loop. The MCP server auto-manages sessions per user context. + /// Run the CUA loop. Session must already be started by the caller. /// public async Task RunAsync( string userMessage, IList w365Tools, + IMcpClient? mcpClient = null, + string? graphAccessToken = null, Action? onStatusUpdate = null, CancellationToken cancellationToken = default) { _logger.LogInformation("Starting CUA loop for: {Message}", Truncate(userMessage, 100)); + _cachedTools = w365Tools; + if (mcpClient != null) _cachedMcpClient = mcpClient; - // Start a W365 session — the server auto-discovers pools and provisions a VM. - // Session is tied to the user's identity; no session ID tracking needed. 
- onStatusUpdate?.Invoke("Starting W365 computing session..."); - await InvokeToolAsync(w365Tools, "W365_QuickStartSession", new Dictionary(), cancellationToken); - _logger.LogInformation("W365 session started via QuickStartSession"); + // Start session once — reuse across all messages + if (!_sessionStarted) + { + onStatusUpdate?.Invoke("Starting W365 computing session..."); + await StartSessionAsync(w365Tools, _logger, cancellationToken); + _sessionStarted = true; + } - var conversation = new List { CreateUserMessage(userMessage) }; + // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message + // so the model can see the screen. On subsequent messages, the history already has + // computer_call_output screenshots, so adding another input_image would cause a 400 error. + if (_toolType == "computer" && _conversationHistory.Count == 0) + { + var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, cancellationToken); + var initialName = $"{++_screenshotCounter:D3}_initial"; + SaveScreenshotToDisk(initialScreenshot!, initialName); + await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken); + _conversationHistory.Add(ToJsonElement(new + { + type = "message", + role = "user", + content = new object[] + { + new { type = "input_text", text = userMessage }, + new { type = "input_image", image_url = $"data:image/png;base64,{initialScreenshot}" } + } + })); + } + else + { + _conversationHistory.Add(CreateUserMessage(userMessage)); + } for (var i = 0; i < _maxIterations; i++) { cancellationToken.ThrowIfCancellationRequested(); - var response = await CallModelAsync(conversation, cancellationToken); + var response = await CallModelAsync(_conversationHistory, cancellationToken); if (response?.Output == null || response.Output.Count == 0) break; @@ -96,7 +167,7 @@ public async Task RunAsync( var type = item.GetProperty("type").GetString(); if (type == "reasoning") continue; - 
conversation.Add(item); + _conversationHistory.Add(item); switch (type) { @@ -105,15 +176,17 @@ public async Task RunAsync( case "computer_call": hasActions = true; - conversation.Add(await HandleComputerCallAsync(item, w365Tools, onStatusUpdate, cancellationToken)); + _logger.LogInformation("CUA iteration {Iteration}: {Action}", i + 1, Truncate(item.GetRawText(), 200)); + _conversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, graphAccessToken, onStatusUpdate, cancellationToken)); break; case "function_call": hasActions = true; - conversation.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); - if (item.GetProperty("name").GetString() == "OnTaskComplete") + var funcName = item.GetProperty("name").GetString(); + _logger.LogInformation("CUA iteration {Iteration}: function_call {Name}", i + 1, funcName); + _conversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); + if (funcName == "OnTaskComplete") { - await EndSessionAsync(w365Tools, cancellationToken); return "Task completed successfully."; } break; @@ -123,70 +196,159 @@ public async Task RunAsync( if (!hasActions) break; } - await EndSessionAsync(w365Tools, cancellationToken); return "The task could not be completed within the allowed number of steps."; } - private async Task EndSessionAsync(IList tools, CancellationToken ct) + /// + /// End the W365 session. Called by the agent on shutdown or explicit end. 
+ /// + public static async Task EndSessionAsync(IList tools, ILogger logger, CancellationToken ct) { try { await InvokeToolAsync(tools, "W365_EndSession", new Dictionary(), ct); - _logger.LogInformation("W365 session ended"); + logger.LogInformation("W365 session ended"); + } + catch (ObjectDisposedException) + { + // MCP client already disposed (dev mode SSE connection closed) — session will time out on server + logger.LogInformation("MCP client already disposed — W365 session will be released by server timeout"); } catch (Exception ex) { - _logger.LogWarning(ex, "Failed to end W365 session"); + logger.LogWarning(ex, "Failed to end W365 session"); + } + } + + /// + /// End the session using cached tools. Called from app shutdown hook. + /// + public async Task EndSessionOnShutdownAsync() + { + if (_cachedTools == null || !_sessionStarted) + { + _logger.LogInformation("No active session to end on shutdown"); + return; + } + + await EndSessionAsync(_cachedTools, _logger, CancellationToken.None); + _cachedTools = null; + _sessionStarted = false; + + // Dispose the MCP client if we own it + if (_cachedMcpClient != null) + { + await _cachedMcpClient.DisposeAsync(); + _cachedMcpClient = null; } } + /// + /// Start a W365 session. Called by the agent on first message. + /// + public static async Task StartSessionAsync(IList tools, ILogger logger, CancellationToken ct) + { + await InvokeToolAsync(tools, "W365_QuickStartSession", new Dictionary(), ct); + logger.LogInformation("W365 session started via QuickStartSession"); + } + + /// + /// Get or create the MCP client and tool list. Creates the connection once on first call, + /// then returns the cached result on subsequent calls. This ensures the SSE connection + /// stays alive across messages (MyAgent is transient, but this orchestrator is singleton). + /// + public async Task<(IList Tools, IMcpClient? 
Client)> GetOrCreateMcpConnectionAsync( + string mcpUrl, string accessToken) + { + if (_cachedTools != null) + return (_cachedTools, _cachedMcpClient); + + var httpClient = _httpClient; + httpClient.DefaultRequestHeaders.Authorization = + new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", accessToken); + + var transport = new SseClientTransport(new SseClientTransportOptions + { + Endpoint = new Uri(mcpUrl), + TransportMode = HttpTransportMode.AutoDetect, + }, httpClient); + + _cachedMcpClient = await McpClientFactory.CreateAsync(transport); + var allTools = (await _cachedMcpClient.ListToolsAsync()).Cast().ToList(); + + // Filter to W365 tools only + _cachedTools = allTools.Where(t => + { + var name = (t as AIFunction)?.Name ?? t.ToString() ?? string.Empty; + return name.StartsWith("W365_", StringComparison.OrdinalIgnoreCase); + }).ToList(); + + _logger.LogInformation("Connected to MCP server at {Url}, loaded {Count} W365 tools", mcpUrl, _cachedTools.Count); + return (_cachedTools, _cachedMcpClient); + } + private async Task CallModelAsync(List conversation, CancellationToken ct) { var body = JsonSerializer.Serialize(new ComputerUseRequest { - Model = _deploymentName, + Model = _modelProvider.ModelName, Instructions = SystemInstructions, Input = conversation, Tools = _tools, Truncation = "auto" }, new JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull }); - var url = $"{_endpoint.TrimEnd('/')}/openai/deployments/{_deploymentName}/responses?api-version=2025-03-01-preview"; - using var req = new HttpRequestMessage(HttpMethod.Post, url); - req.Content = new StringContent(body, Encoding.UTF8, "application/json"); - req.Headers.Add("api-key", _apiKey); - - var resp = await _httpClient.SendAsync(req, ct); - if (!resp.IsSuccessStatusCode) - { - var err = await resp.Content.ReadAsStringAsync(ct); - throw new HttpRequestException($"Model API returned {resp.StatusCode}: {err}"); - } - - return 
JsonSerializer.Deserialize(await resp.Content.ReadAsStringAsync(ct)); + var responseJson = await _modelProvider.SendAsync(body, ct); + return JsonSerializer.Deserialize(responseJson); } /// /// Translate a computer_call into an MCP tool call, capture screenshot, return computer_call_output. /// private async Task HandleComputerCallAsync( - JsonElement call, IList tools, Action? onStatus, CancellationToken ct) + JsonElement call, IList tools, IMcpClient? mcpClient, string? graphAccessToken, Action? onStatus, CancellationToken ct) { var callId = call.GetProperty("call_id").GetString()!; - var action = call.GetProperty("action"); - var actionType = action.GetProperty("type").GetString()!; - onStatus?.Invoke($"Performing: {actionType}..."); + // GPT-5.4 uses "actions" (non-empty array), older models use "action" (singular). + // Some models return both: "action": {...}, "actions": [] — so we must check the array is non-empty. + if (call.TryGetProperty("actions", out var actionsArray) + && actionsArray.ValueKind == JsonValueKind.Array + && actionsArray.GetArrayLength() > 0) + { + // Process batch actions (GPT-5.4 format) + foreach (var action in actionsArray.EnumerateArray()) + { + var actionType = action.GetProperty("type").GetString()!; + onStatus?.Invoke($"Performing: {actionType}..."); - // Execute the action (unless it's just requesting a screenshot) - if (actionType != "screenshot") + if (actionType != "screenshot") + { + var (toolName, args) = MapActionToMcpTool(actionType, action); + await InvokeToolAsync(tools, toolName, args, ct); + } + } + } + else if (call.TryGetProperty("action", out var singleAction)) { - var (toolName, args) = MapActionToMcpTool(actionType, action); - await InvokeToolAsync(tools, toolName, args, ct); + // Single action (computer-use-preview format) + var actionType = singleAction.GetProperty("type").GetString()!; + onStatus?.Invoke($"Performing: {actionType}..."); + + if (actionType != "screenshot") + { + var (toolName, args) = 
MapActionToMcpTool(actionType, singleAction); + await InvokeToolAsync(tools, toolName, args, ct); + } } // Always capture screenshot after action - var screenshot = await CaptureScreenshotAsync(tools, ct); + var screenshot = await CaptureScreenshotAsync(tools, mcpClient, ct); + + // Save screenshot locally and/or upload to OneDrive + var stepName = $"{++_screenshotCounter:D3}_step"; + SaveScreenshotToDisk(screenshot!, stepName); + await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken); var safetyChecks = call.TryGetProperty("pending_safety_checks", out var sc) ? sc : JsonSerializer.Deserialize("[]"); @@ -251,10 +413,29 @@ private static (string ToolName, Dictionary Args) MapActionToMc }; } - private async Task CaptureScreenshotAsync(IList tools, CancellationToken ct) + private async Task CaptureScreenshotAsync(IList tools, IMcpClient? mcpClient, CancellationToken ct) { - var result = await InvokeToolAsync(tools, "W365_CaptureScreenshot", new Dictionary(), ct); - var str = result?.ToString() ?? ""; + // Use direct MCP client when available — AIFunction wrappers drop image content blocks + if (mcpClient != null) + { + var result = await mcpClient.CallToolAsync("W365_CaptureScreenshot", new Dictionary(), cancellationToken: ct); + foreach (var item in result.Content) + { + if (item.Type == "image" && !string.IsNullOrEmpty(item.Data)) + return item.Data; + if (item.Type == "text" && !string.IsNullOrEmpty(item.Text)) + { + var nested = ExtractBase64FromText(item.Text); + if (!string.IsNullOrEmpty(nested)) return nested; + } + } + + throw new InvalidOperationException($"Screenshot MCP response had {result.Content.Count} content blocks but no extractable image data."); + } + + // Fallback: AIFunction wrapper (may lose image content) + var aiResult = await InvokeToolAsync(tools, "W365_CaptureScreenshot", new Dictionary(), ct); + var str = aiResult?.ToString() ?? 
""; try { @@ -266,13 +447,26 @@ private async Task CaptureScreenshotAsync(IList tools, Cancellat } catch (JsonException) { } - // Fallback: result might be raw base64 if (str.Length > 100) return str; + throw new InvalidOperationException($"Failed to extract screenshot. Response length: {str.Length}"); + } - throw new InvalidOperationException("Failed to extract screenshot from MCP response."); + private static string? ExtractBase64FromText(string? text) + { + if (string.IsNullOrEmpty(text)) return null; + try + { + using var doc = JsonDocument.Parse(text); + var root = doc.RootElement; + if (root.TryGetProperty("screenshotData", out var sd)) return sd.GetString(); + if (root.TryGetProperty("image", out var img)) return img.GetString(); + if (root.TryGetProperty("data", out var d)) return d.GetString(); + } + catch (JsonException) { } + return null; } - private static async Task InvokeToolAsync( + internal static async Task InvokeToolAsync( IList tools, string name, Dictionary args, CancellationToken ct) { var tool = tools.OfType().FirstOrDefault(t => t.Name.Equals(name, StringComparison.OrdinalIgnoreCase)) @@ -318,4 +512,74 @@ private static JsonElement ToJsonElement(object obj) => JsonSerializer.Deserialize(JsonSerializer.Serialize(obj)); private static string Truncate(string v, int max) => v.Length <= max ? v : v[..max] + "..."; + + private int _screenshotCounter; + + private void SaveScreenshotToDisk(string base64Data, string name) + { + if (string.IsNullOrEmpty(base64Data) || string.IsNullOrEmpty(_screenshotPath)) return; + try + { + Directory.CreateDirectory(_screenshotPath); + var path = Path.Combine(_screenshotPath, $"{name}.png"); + File.WriteAllBytes(path, Convert.FromBase64String(base64Data)); + _logger.LogInformation("Screenshot saved: {Path}", path); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to save screenshot"); + } + } + + /// + /// Upload a screenshot to the user's OneDrive via Microsoft Graph. 
+ /// Requires a Graph access token with Files.ReadWrite scope. + /// Files are uploaded to /CUA-Sessions/{date}/ folder. + /// + private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fileName, string? graphAccessToken) + { + if (string.IsNullOrEmpty(graphAccessToken)) + { + _logger.LogDebug("OneDrive upload skipped: no Graph token"); + return; + } + if (string.IsNullOrEmpty(base64Data)) + { + _logger.LogDebug("OneDrive upload skipped: no screenshot data"); + return; + } + if (string.IsNullOrEmpty(_oneDriveFolder)) + { + _logger.LogDebug("OneDrive upload skipped: OneDriveFolder not configured"); + return; + } + + try + { + // Use /me/drive for token owner, or /users/{id}/drive for a specific user + var driveBase = string.IsNullOrEmpty(_oneDriveUserId) + ? "https://graph.microsoft.com/v1.0/me/drive" + : $"https://graph.microsoft.com/v1.0/users/{_oneDriveUserId}/drive"; + var url = $"{driveBase}/root:/{_oneDriveFolder.TrimStart('/')}/{fileName}:/content"; + + using var request = new HttpRequestMessage(HttpMethod.Put, url); + request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + request.Content = new ByteArrayContent(Convert.FromBase64String(base64Data)); + request.Content.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("image/png"); + + var response = await _httpClient.SendAsync(request); + if (response.IsSuccessStatusCode) + { + _logger.LogInformation("Screenshot uploaded to OneDrive: {Folder}/{FileName}", _oneDriveFolder, fileName); + } + else + { + _logger.LogWarning("OneDrive upload failed: {Status}", response.StatusCode); + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to upload screenshot to OneDrive"); + } + } } diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/CustomEndpointProvider.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/CustomEndpointProvider.cs new file mode 100644 index 00000000..de7dba54 --- /dev/null +++ 
b/dotnet/w365-computer-use/sample-agent/ComputerUse/CustomEndpointProvider.cs @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Net.Http.Headers; +using System.Security.Cryptography.X509Certificates; +using System.Text; +using System.Text.Json; +using Microsoft.Identity.Client; + +namespace W365ComputerUseSample.ComputerUse; + +/// +/// Sends CUA model requests via a local or custom model endpoint. +/// Supports certificate-based MSAL authentication for secured endpoints. +/// +public class CustomEndpointProvider : ICuaModelProvider +{ + private readonly HttpClient _httpClient; + private readonly string _endpoint; + private readonly string _customerId; + private readonly string? _modelTenantId; + private readonly string? _clientPrincipalId; + private readonly string? _partnerSource; + private readonly IConfidentialClientApplication? _msalApp; + private readonly string _scope; + private string? _cachedToken; + private DateTime _tokenExpiry = DateTime.MinValue; + + public string ModelName { get; } + + public CustomEndpointProvider(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger logger) + { + _httpClient = httpClientFactory.CreateClient("WebClient"); + _endpoint = configuration["AIServices:CustomEndpoint:Endpoint"] + ?? throw new InvalidOperationException("AIServices:CustomEndpoint:Endpoint is required."); + _customerId = configuration["AIServices:CustomEndpoint:CustomerId"] + ?? throw new InvalidOperationException("AIServices:CustomEndpoint:CustomerId is required."); + _scope = configuration["AIServices:CustomEndpoint:Scope"] + ?? throw new InvalidOperationException("AIServices:CustomEndpoint:Scope is required."); + ModelName = configuration["AIServices:CustomEndpoint:Model"] ?? 
"computer-use-preview-2025-03-11"; + _modelTenantId = configuration["AIServices:CustomEndpoint:ModelTenantId"]; + _clientPrincipalId = configuration["AIServices:CustomEndpoint:ClientPrincipalId"]; + _partnerSource = configuration["AIServices:CustomEndpoint:PartnerSource"]; + + // Initialize MSAL with certificate + var certSubject = configuration["AIServices:CustomEndpoint:CertificateSubject"] ?? ""; + var clientId = configuration["AIServices:CustomEndpoint:ClientId"] ?? ""; + var tenantId = configuration["AIServices:CustomEndpoint:TenantId"] ?? ""; + + var cert = LoadCertificate(certSubject); + if (cert != null) + { + _msalApp = ConfidentialClientApplicationBuilder + .Create(clientId) + .WithAuthority($"https://login.microsoftonline.com/{tenantId}") + .WithCertificate(cert) + .Build(); + logger.LogInformation("CustomEndpoint MSAL initialized with certificate '{Subject}'", certSubject); + } + else + { + logger.LogWarning("CustomEndpoint certificate '{Subject}' not found. Auth will fail at runtime.", certSubject); + } + } + + public async Task SendAsync(string requestBody, CancellationToken cancellationToken) + { + var url = $"{_endpoint.TrimEnd('/')}/v0/resourceproxy/tenantId.{_customerId}/azureopenai/responses"; + var token = await GetTokenAsync(); + + using var req = new HttpRequestMessage(HttpMethod.Post, url); + req.Headers.Authorization = new AuthenticationHeaderValue("Bearer", token); + req.Headers.TryAddWithoutValidation("x-ms-client-principal-id", _clientPrincipalId); + req.Headers.TryAddWithoutValidation("x-ms-client-tenant-id", _modelTenantId); + req.Headers.TryAddWithoutValidation("X-ms-Source", + JsonSerializer.Serialize(new { consumptionSource = "Api", partnerSource = _partnerSource ?? 
"BICEvaluationService" })); + req.Content = new StringContent(requestBody, Encoding.UTF8, "application/json"); + + var resp = await _httpClient.SendAsync(req, cancellationToken); + if (!resp.IsSuccessStatusCode) + { + var err = await resp.Content.ReadAsStringAsync(cancellationToken); + throw new HttpRequestException($"CustomEndpoint returned {resp.StatusCode}: {err}"); + } + + return await resp.Content.ReadAsStringAsync(cancellationToken); + } + + private async Task GetTokenAsync() + { + if (!string.IsNullOrEmpty(_cachedToken) && DateTime.UtcNow < _tokenExpiry.AddMinutes(-5)) + return _cachedToken; + + if (_msalApp == null) + throw new InvalidOperationException("MSAL not initialized. Check CustomEndpoint certificate configuration."); + + var result = await _msalApp + .AcquireTokenForClient(new[] { _scope }) + .WithSendX5C(true) + .ExecuteAsync(); + + _cachedToken = result.AccessToken; + _tokenExpiry = result.ExpiresOn.DateTime; + return _cachedToken; + } + + private static X509Certificate2? LoadCertificate(string subject) + { + if (string.IsNullOrEmpty(subject)) return null; + foreach (var location in new[] { StoreLocation.CurrentUser, StoreLocation.LocalMachine }) + { + using var store = new X509Store(StoreName.My, location); + store.Open(OpenFlags.ReadOnly); + var certs = store.Certificates.Find(X509FindType.FindBySubjectName, subject, false); + if (certs.Count > 0) return certs[0]; + } + return null; + } +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs new file mode 100644 index 00000000..b26db535 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace W365ComputerUseSample.ComputerUse; + +/// +/// Abstraction for sending requests to a CUA-capable model (OpenAI Responses API). 
+/// Implementations handle authentication and endpoint differences. +/// +public interface ICuaModelProvider +{ + /// The model name to include in the request body. + string ModelName { get; } + + /// Send a serialized request body and return the raw JSON response. + Task SendAsync(string requestBody, CancellationToken cancellationToken); +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs new file mode 100644 index 00000000..684b112a --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Net.Http.Headers; +using System.Security.Cryptography.X509Certificates; +using System.Text; +using System.Text.Json; +using Microsoft.Identity.Client; + +namespace W365ComputerUseSample.ComputerUse; + +/// +/// Sends CUA model requests via a local or custom model endpoint. +/// Supports certificate-based MSAL authentication for secured endpoints. +/// +public class LocalModelProvider : ICuaModelProvider +{ + private readonly HttpClient _httpClient; + private readonly string _endpoint; + private readonly string _customerId; + private readonly string? _modelTenantId; + private readonly string? _clientPrincipalId; + private readonly string? _partnerSource; + private readonly IConfidentialClientApplication? _msalApp; + private readonly string _scope; + private string? _cachedToken; + private DateTime _tokenExpiry = DateTime.MinValue; + + public string ModelName { get; } + + public LocalModelProvider(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger logger) + { + _httpClient = httpClientFactory.CreateClient("WebClient"); + _endpoint = configuration["AIServices:LocalModel:Endpoint"] + ?? 
throw new InvalidOperationException("AIServices:LocalModel:Endpoint is required."); + _customerId = configuration["AIServices:LocalModel:CustomerId"] + ?? throw new InvalidOperationException("AIServices:LocalModel:CustomerId is required."); + _scope = configuration["AIServices:LocalModel:Scope"] + ?? throw new InvalidOperationException("AIServices:LocalModel:Scope is required."); + ModelName = configuration["AIServices:LocalModel:Model"] ?? "computer-use-preview-2025-03-11"; + _modelTenantId = configuration["AIServices:LocalModel:ModelTenantId"]; + _clientPrincipalId = configuration["AIServices:LocalModel:ClientPrincipalId"]; + _partnerSource = configuration["AIServices:LocalModel:PartnerSource"]; + + // Initialize MSAL with certificate + var certSubject = configuration["AIServices:LocalModel:CertificateSubject"] ?? ""; + var clientId = configuration["AIServices:LocalModel:ClientId"] ?? ""; + var tenantId = configuration["AIServices:LocalModel:TenantId"] ?? ""; + + var cert = LoadCertificate(certSubject); + if (cert != null) + { + _msalApp = ConfidentialClientApplicationBuilder + .Create(clientId) + .WithAuthority($"https://login.microsoftonline.com/{tenantId}") + .WithCertificate(cert) + .Build(); + logger.LogInformation("LocalModel MSAL initialized with certificate '{Subject}'", certSubject); + } + else + { + logger.LogWarning("LocalModel certificate '{Subject}' not found. 
Auth will fail at runtime.", certSubject); + } + } + + public async Task SendAsync(string requestBody, CancellationToken cancellationToken) + { + var url = $"{_endpoint.TrimEnd('/')}/v0/resourceproxy/tenantId.{_customerId}/azureopenai/responses"; + var token = await GetTokenAsync(); + + using var req = new HttpRequestMessage(HttpMethod.Post, url); + req.Headers.Authorization = new AuthenticationHeaderValue("Bearer", token); + req.Headers.TryAddWithoutValidation("x-ms-client-principal-id", _clientPrincipalId); + req.Headers.TryAddWithoutValidation("x-ms-client-tenant-id", _modelTenantId); + req.Headers.TryAddWithoutValidation("X-ms-Source", + JsonSerializer.Serialize(new { consumptionSource = "Api", partnerSource = _partnerSource ?? "BICEvaluationService" })); + req.Content = new StringContent(requestBody, Encoding.UTF8, "application/json"); + + var resp = await _httpClient.SendAsync(req, cancellationToken); + if (!resp.IsSuccessStatusCode) + { + var err = await resp.Content.ReadAsStringAsync(cancellationToken); + throw new HttpRequestException($"LocalModel returned {resp.StatusCode}: {err}"); + } + + return await resp.Content.ReadAsStringAsync(cancellationToken); + } + + private async Task GetTokenAsync() + { + if (!string.IsNullOrEmpty(_cachedToken) && DateTime.UtcNow < _tokenExpiry.AddMinutes(-5)) + return _cachedToken; + + if (_msalApp == null) + throw new InvalidOperationException("MSAL not initialized. Check LocalModel certificate configuration."); + + var result = await _msalApp + .AcquireTokenForClient(new[] { _scope }) + .WithSendX5C(true) + .ExecuteAsync(); + + _cachedToken = result.AccessToken; + _tokenExpiry = result.ExpiresOn.DateTime; + return _cachedToken; + } + + private static X509Certificate2? 
LoadCertificate(string subject) + { + if (string.IsNullOrEmpty(subject)) return null; + foreach (var location in new[] { StoreLocation.CurrentUser, StoreLocation.LocalMachine }) + { + using var store = new X509Store(StoreName.My, location); + store.Open(OpenFlags.ReadOnly); + var certs = store.Certificates.Find(X509FindType.FindBySubjectName, subject, false); + if (certs.Count > 0) return certs[0]; + } + return null; + } +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs index c6a08935..1f774b33 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs @@ -50,6 +50,7 @@ public class ComputerUseRequest /// /// Defines the computer_use_preview tool for the OpenAI Responses API. +/// Used by computer-use-preview models. /// public class ComputerUseTool { @@ -66,6 +67,16 @@ public class ComputerUseTool public string Environment { get; set; } = "windows"; } +/// +/// Defines the "computer" tool for GPT-5.4+ models. +/// Bare type with no parameters — the model infers screen dimensions from screenshots. +/// +public class ComputerToolV2 +{ + [JsonPropertyName("type")] + public string Type { get; set; } = "computer"; +} + /// /// Defines a function tool for the OpenAI Responses API. /// diff --git a/dotnet/w365-computer-use/sample-agent/Program.cs b/dotnet/w365-computer-use/sample-agent/Program.cs index 687f9ee9..a43258d2 100644 --- a/dotnet/w365-computer-use/sample-agent/Program.cs +++ b/dotnet/w365-computer-use/sample-agent/Program.cs @@ -42,6 +42,17 @@ builder.Services.AddSingleton(); // ********** END Configure A365 Services ********** +// Register the model provider based on configuration +var aiProvider = builder.Configuration["AIServices:Provider"] ?? 
"AzureOpenAI"; +if (aiProvider.Equals("CustomEndpoint", StringComparison.OrdinalIgnoreCase)) +{ + builder.Services.AddSingleton(); +} +else +{ + builder.Services.AddSingleton(); +} + // Register the Computer Use orchestrator builder.Services.AddSingleton(); @@ -95,4 +106,11 @@ await AgentMetrics.InvokeObservedHttpOperation("agent.process_message", async () app.MapControllers(); } +// End active W365 session on shutdown to release the VM back to the pool +app.Lifetime.ApplicationStopping.Register(() => +{ + var orchestrator = app.Services.GetRequiredService(); + orchestrator.EndSessionOnShutdownAsync().GetAwaiter().GetResult(); +}); + app.Run(); diff --git a/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json b/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json new file mode 100644 index 00000000..427cd153 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json @@ -0,0 +1,11 @@ +{ + "profiles": { + "W365ComputerUseSample": { + "commandName": "Project", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + }, + "applicationUrl": "http://localhost:3978" + } + } +} diff --git a/dotnet/w365-computer-use/sample-agent/README.md b/dotnet/w365-computer-use/sample-agent/README.md new file mode 100644 index 00000000..2a3ddf71 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/README.md @@ -0,0 +1,199 @@ +# W365 Computer Use Sample + +## Overview + +This sample demonstrates how to build an agent that controls a Windows 365 Cloud PC using the OpenAI Responses API and the W365 Computer Use MCP server. + +The agent receives a natural language task from the user, provisions a W365 desktop session via MCP tools, then runs a CUA (Computer Use Agent) loop: the model sees screenshots, decides actions (click, type, scroll), and the MCP server executes them on the VM. 
+ +It supports two model types: +- **`computer-use-preview`** - The original CUA model on Azure OpenAI +- **`gpt-5.4` / `gpt-5.4-mini`** - Newer GPT models with built-in computer use capability + +## Architecture + +``` +User Message + | +MyAgent (Agent Framework) + | connects to MCP server +W365 MCP Tools (QuickStartSession, CaptureScreenshot, Click, Type, etc.) + | provisions and controls +Windows 365 Cloud PC + | screenshots fed back to +CUA Model (Azure OpenAI) + | emits computer_call actions +ComputerUseOrchestrator (translates actions to MCP tool calls) + | loop until task complete +Response to User +``` + +**Key components:** + +| File | Purpose | +|------|---------| +| `Agent/MyAgent.cs` | Message handler - acquires tokens, connects to MCP, runs orchestrator | +| `ComputerUse/ComputerUseOrchestrator.cs` | CUA loop - sends screenshots to model, maps actions to MCP tools | +| `ComputerUse/ICuaModelProvider.cs` | Abstraction for the CUA model API | +| `ComputerUse/AzureOpenAIModelProvider.cs` | Azure OpenAI Responses API provider | + +## Prerequisites + +- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later +- Azure OpenAI resource with a CUA-capable model deployment: + - `computer-use-preview` or `gpt-5.4` / `gpt-5.4-mini` + - [Request access to gpt-5.4](https://aka.ms/OAI/gpt54access) if needed +- Access to the W365 Computer Use MCP server (via [Agent 365 MCP Platform](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/)) +- A bearer token with `McpServers.W365ComputerUse.All` scope + +## Setup + +### 1. Clone the repository + +```bash +git clone https://github.com/microsoft/Agent365-Samples.git +cd Agent365-Samples/dotnet/w365-computer-use/sample-agent +``` + +### 2. Restore dependencies + +```bash +dotnet restore +``` + +### 3. 
Create your local configuration + +Create `appsettings.Development.json` (this file is gitignored): + +**For `computer-use-preview` model:** +```json +{ + "AIServices": { + "Provider": "AzureOpenAI", + "AzureOpenAI": { + "DeploymentName": "computer-use-preview", + "Endpoint": "https://your-resource.openai.azure.com", + "ApiKey": "your-api-key" + } + }, + "McpServer": { + "Url": "http://localhost:52857/mcp/environments/Default-{your-tenant-id}/servers/mcp_W365ComputerUse" + } +} +``` + +**For `gpt-5.4-mini` model:** +```json +{ + "AIServices": { + "Provider": "AzureOpenAI", + "AzureOpenAI": { + "ModelName": "gpt-5.4-mini", + "Endpoint": "https://your-resource.openai.azure.com", + "ApiKey": "your-api-key" + } + }, + "McpServer": { + "Url": "http://localhost:52857/mcp/environments/Default-{your-tenant-id}/servers/mcp_W365ComputerUse" + } +} +``` + +### 4. Obtain a bearer token + +Get a token with the `McpServers.W365ComputerUse.All` scope for your tenant. See the [Agent 365 MCP Platform docs](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/) for details. + +### 5. Start the MCP Platform server + +Ensure the MCP Platform is running locally on port 52857, or update the `McpServer:Url` in your config. + +### 6. Run the agent + +```powershell +# Run from the sample-agent directory (already entered in step 1) +$env:ASPNETCORE_ENVIRONMENT = "Development" +$env:BEARER_TOKEN = "<your-mcp-bearer-token>" +$env:GRAPH_TOKEN = "<your-graph-token>" +dotnet run +``` + +### 7. Test with Agents Playground + +1. Open [Microsoft 365 Agents Playground](https://dev.agents.cloud.microsoft/) +2. Connect to `http://localhost:3978/api/messages` +3. Send a message like: *"Open Notepad and type Hello World"* +4. 
Screenshots are saved to `./Screenshots/` automatically + +## Configuration Reference + +| Setting | Description | Default | +|---------|-------------|---------| +| `AIServices:Provider` | Model provider | `AzureOpenAI` | +| `AIServices:AzureOpenAI:Endpoint` | Azure OpenAI resource URL | - | +| `AIServices:AzureOpenAI:ApiKey` | API key | - | +| `AIServices:AzureOpenAI:DeploymentName` | Deployment name (for deployment-based URLs) | `computer-use-preview` | +| `AIServices:AzureOpenAI:ModelName` | Model name (for model-based URLs, e.g., `gpt-5.4-mini`) | - | +| `McpServer:Url` | MCP server URL (dev only; omit for production) | - | +| `ComputerUse:MaxIterations` | Max CUA loop iterations | `30` | +| `ComputerUse:DisplayWidth` | Display width for computer_use_preview tool | `1024` | +| `ComputerUse:DisplayHeight` | Display height for computer_use_preview tool | `768` | +| `Screenshots:LocalPath` | Local path to save screenshots | `./Screenshots` | +| `Screenshots:OneDriveFolder` | OneDrive folder for screenshot upload | `CUA-Sessions` | +| `Screenshots:OneDriveUserId` | UPN/email to upload screenshots to a specific user's OneDrive (instead of token owner) | - | +| `BEARER_TOKEN` (env var) | MCP Platform token with `McpServers.W365ComputerUse.All` scope (dev only) | - | +| `GRAPH_TOKEN` (env var) | Graph API token with `Files.ReadWrite` scope for OneDrive upload (dev only) | - | + +## Supported Models + +| Model | Tool Type | Config | Notes | +|-------|-----------|--------|-------| +| `computer-use-preview` | `computer_use_preview` | `DeploymentName: "computer-use-preview"` | Uses `display_width`, `display_height`, `environment` params | +| `gpt-5.4` / `gpt-5.4-mini` | `computer` | `ModelName: "gpt-5.4-mini"` | Bare `{"type": "computer"}`. Initial screenshot sent with first message | + +The tool type is auto-derived from the model name (`gpt-*` -> `computer`, otherwise -> `computer_use_preview`). + +## How It Works + +1. 
**User sends a message** -> `MyAgent.OnMessageAsync` +2. **MCP connection** established (direct SSE in dev, A365 SDK gateway in prod) +3. **QuickStartSession** provisions a W365 Cloud PC (once per app lifetime, reused across messages) +4. **CUA loop** in `ComputerUseOrchestrator.RunAsync`: + - User message + conversation history sent to the model + - Model returns `computer_call` actions (click, type, scroll, etc.) + - Actions translated to MCP tool calls (`W365_Click2`, `W365_WriteText`, etc.) + - Screenshot captured after each action and fed back to the model + - Loop continues until model calls `OnTaskComplete` or max iterations reached +5. **Response** sent back to user +6. **Session persists** across messages for follow-up tasks +7. **EndSession** called on app shutdown (Ctrl+C) to release the VM + +## Session Management + +- Sessions are started **once** on the first message and reused across all subsequent messages +- Conversation history accumulates across messages, giving the model context for follow-up tasks +- On app shutdown (`Ctrl+C`), the agent calls `EndSession` to release the VM back to the pool +- If the app crashes, sessions auto-expire after ~30 minutes on the W365 backend + +## Production Deployment + +1. Register an Azure Bot and configure the agent +2. Set `AIServices` config with your Azure OpenAI credentials +3. Remove `McpServer:Url` - the A365 SDK will discover the MCP server via the Tooling Gateway +4. 
Deploy and install the agent in Teams / M365 + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `McpServer:Url is required` | Create `appsettings.Development.json` with the MCP server URL | +| `BEARER_TOKEN` not set | Set `$env:BEARER_TOKEN` before running | +| Model returns 400 | Check that the tool type matches your model (see Supported Models table) | +| Screenshot extraction fails | Ensure MCP server returns image content blocks | +| Session orphaned after crash | Sessions auto-expire after ~30 min on the W365 backend | +| Multiple sessions started | Ensure only one agent instance is running per MCP server | + +## Links + +- [Microsoft Agent 365 Developer Documentation](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/) +- [Microsoft 365 Agents SDK](https://learn.microsoft.com/microsoft-365/agents-sdk/) +- [Azure OpenAI Computer Use Guide](https://learn.microsoft.com/en-us/azure/foundry-classic/openai/how-to/computer-use) \ No newline at end of file diff --git a/dotnet/w365-computer-use/sample-agent/ToolingManifest.json b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json new file mode 100644 index 00000000..b2c5acbf --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json @@ -0,0 +1,8 @@ +{ + "mcpServers": [ + { + "mcpServerName": "mcp_W365ComputerUse", + "url": "mcp_W365ComputerUse" + } + ] +} diff --git a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj index 0ab72faa..217f5ab6 100644 --- a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj +++ b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj @@ -12,9 +12,10 @@ + + + - - @@ -29,4 +30,8 @@ + + + + diff --git a/dotnet/w365-computer-use/sample-agent/appsettings.json b/dotnet/w365-computer-use/sample-agent/appsettings.json index 5b184e4b..4c0aad31 100644 --- a/dotnet/w365-computer-use/sample-agent/appsettings.json +++ 
b/dotnet/w365-computer-use/sample-agent/appsettings.json @@ -20,6 +20,7 @@ }, "TokenValidation": { + "Enabled": false, "Audiences": [ "{{ClientId}}" ] @@ -54,10 +55,27 @@ ], "AIServices": { + "Provider": "AzureOpenAI", + "AzureOpenAI": { - "DeploymentName": "computer-use-preview", + "DeploymentName": "<>", + "ModelName": "", "Endpoint": "<>", - "ApiKey": "<>" + "ApiKey": "<>", + "ApiVersion": "2025-04-01-preview" + }, + + "CustomEndpoint": { + "Endpoint": "<>", + "CertificateSubject": "<>", + "ClientId": "<>", + "TenantId": "<>", + "Scope": "<>", + "Model": "computer-use-preview-2025-03-11", + "ModelTenantId": "<>", + "ClientPrincipalId": "<>", + "PartnerSource": "<>", + "CustomerId": "<>" } }, @@ -65,5 +83,11 @@ "MaxIterations": 30, "DisplayWidth": 1024, "DisplayHeight": 768 + }, + + "Screenshots": { + "LocalPath": "./Screenshots", + "OneDriveFolder": "CUA-Sessions", + "OneDriveUserId": "" } } From 2581f90ce120091d258dc4fdfadc2158c746f096 Mon Sep 17 00:00:00 2001 From: Mohamed Abdelkader Date: Mon, 30 Mar 2026 10:13:07 -0700 Subject: [PATCH 03/17] fix nits --- .../ComputerUse/LocalModelProvider.cs | 119 ------------------ .../w365-computer-use/sample-agent/Program.cs | 2 +- 2 files changed, 1 insertion(+), 120 deletions(-) delete mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs deleted file mode 100644 index 684b112a..00000000 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/LocalModelProvider.cs +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
- -using System.Net.Http.Headers; -using System.Security.Cryptography.X509Certificates; -using System.Text; -using System.Text.Json; -using Microsoft.Identity.Client; - -namespace W365ComputerUseSample.ComputerUse; - -/// -/// Sends CUA model requests via a local or custom model endpoint. -/// Supports certificate-based MSAL authentication for secured endpoints. -/// -public class LocalModelProvider : ICuaModelProvider -{ - private readonly HttpClient _httpClient; - private readonly string _endpoint; - private readonly string _customerId; - private readonly string? _modelTenantId; - private readonly string? _clientPrincipalId; - private readonly string? _partnerSource; - private readonly IConfidentialClientApplication? _msalApp; - private readonly string _scope; - private string? _cachedToken; - private DateTime _tokenExpiry = DateTime.MinValue; - - public string ModelName { get; } - - public LocalModelProvider(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger logger) - { - _httpClient = httpClientFactory.CreateClient("WebClient"); - _endpoint = configuration["AIServices:LocalModel:Endpoint"] - ?? throw new InvalidOperationException("AIServices:LocalModel:Endpoint is required."); - _customerId = configuration["AIServices:LocalModel:CustomerId"] - ?? throw new InvalidOperationException("AIServices:LocalModel:CustomerId is required."); - _scope = configuration["AIServices:LocalModel:Scope"] - ?? throw new InvalidOperationException("AIServices:LocalModel:Scope is required."); - ModelName = configuration["AIServices:LocalModel:Model"] ?? "computer-use-preview-2025-03-11"; - _modelTenantId = configuration["AIServices:LocalModel:ModelTenantId"]; - _clientPrincipalId = configuration["AIServices:LocalModel:ClientPrincipalId"]; - _partnerSource = configuration["AIServices:LocalModel:PartnerSource"]; - - // Initialize MSAL with certificate - var certSubject = configuration["AIServices:LocalModel:CertificateSubject"] ?? 
""; - var clientId = configuration["AIServices:LocalModel:ClientId"] ?? ""; - var tenantId = configuration["AIServices:LocalModel:TenantId"] ?? ""; - - var cert = LoadCertificate(certSubject); - if (cert != null) - { - _msalApp = ConfidentialClientApplicationBuilder - .Create(clientId) - .WithAuthority($"https://login.microsoftonline.com/{tenantId}") - .WithCertificate(cert) - .Build(); - logger.LogInformation("LocalModel MSAL initialized with certificate '{Subject}'", certSubject); - } - else - { - logger.LogWarning("LocalModel certificate '{Subject}' not found. Auth will fail at runtime.", certSubject); - } - } - - public async Task SendAsync(string requestBody, CancellationToken cancellationToken) - { - var url = $"{_endpoint.TrimEnd('/')}/v0/resourceproxy/tenantId.{_customerId}/azureopenai/responses"; - var token = await GetTokenAsync(); - - using var req = new HttpRequestMessage(HttpMethod.Post, url); - req.Headers.Authorization = new AuthenticationHeaderValue("Bearer", token); - req.Headers.TryAddWithoutValidation("x-ms-client-principal-id", _clientPrincipalId); - req.Headers.TryAddWithoutValidation("x-ms-client-tenant-id", _modelTenantId); - req.Headers.TryAddWithoutValidation("X-ms-Source", - JsonSerializer.Serialize(new { consumptionSource = "Api", partnerSource = _partnerSource ?? 
"BICEvaluationService" })); - req.Content = new StringContent(requestBody, Encoding.UTF8, "application/json"); - - var resp = await _httpClient.SendAsync(req, cancellationToken); - if (!resp.IsSuccessStatusCode) - { - var err = await resp.Content.ReadAsStringAsync(cancellationToken); - throw new HttpRequestException($"LocalModel returned {resp.StatusCode}: {err}"); - } - - return await resp.Content.ReadAsStringAsync(cancellationToken); - } - - private async Task GetTokenAsync() - { - if (!string.IsNullOrEmpty(_cachedToken) && DateTime.UtcNow < _tokenExpiry.AddMinutes(-5)) - return _cachedToken; - - if (_msalApp == null) - throw new InvalidOperationException("MSAL not initialized. Check LocalModel certificate configuration."); - - var result = await _msalApp - .AcquireTokenForClient(new[] { _scope }) - .WithSendX5C(true) - .ExecuteAsync(); - - _cachedToken = result.AccessToken; - _tokenExpiry = result.ExpiresOn.DateTime; - return _cachedToken; - } - - private static X509Certificate2? LoadCertificate(string subject) - { - if (string.IsNullOrEmpty(subject)) return null; - foreach (var location in new[] { StoreLocation.CurrentUser, StoreLocation.LocalMachine }) - { - using var store = new X509Store(StoreName.My, location); - store.Open(OpenFlags.ReadOnly); - var certs = store.Certificates.Find(X509FindType.FindBySubjectName, subject, false); - if (certs.Count > 0) return certs[0]; - } - return null; - } -} diff --git a/dotnet/w365-computer-use/sample-agent/Program.cs b/dotnet/w365-computer-use/sample-agent/Program.cs index a43258d2..56d030a9 100644 --- a/dotnet/w365-computer-use/sample-agent/Program.cs +++ b/dotnet/w365-computer-use/sample-agent/Program.cs @@ -18,7 +18,7 @@ var builder = WebApplication.CreateBuilder(args); -// Setup Aspire service defaults, including OpenTelemetry, Service Discovery, Resilience, and Health Checks +// Setup ASP service defaults, including OpenTelemetry, Service Discovery, Resilience, and Health Checks builder.ConfigureOpenTelemetry(); 
builder.Configuration.AddUserSecrets(Assembly.GetExecutingAssembly()); From 64b2121c9952cfc47abba0dc1918d955b418cc5d Mon Sep 17 00:00:00 2001 From: Mohamed Abdelkader Date: Thu, 2 Apr 2026 14:55:41 -0700 Subject: [PATCH 04/17] add comment --- .../ComputerUse/ComputerUseOrchestrator.cs | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index fefb0228..b0c15b80 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -122,9 +122,15 @@ public async Task RunAsync( // Start session once — reuse across all messages if (!_sessionStarted) { + _logger.LogInformation("No active session — calling QuickStartSession (first message)"); onStatusUpdate?.Invoke("Starting W365 computing session..."); await StartSessionAsync(w365Tools, _logger, cancellationToken); _sessionStarted = true; + _logger.LogInformation("Session started successfully, _sessionStarted={Started}", _sessionStarted); + } + else + { + _logger.LogInformation("Reusing existing session (_sessionStarted={Started})", _sessionStarted); } // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message @@ -248,8 +254,17 @@ public async Task EndSessionOnShutdownAsync() /// public static async Task StartSessionAsync(IList tools, ILogger logger, CancellationToken ct) { - await InvokeToolAsync(tools, "W365_QuickStartSession", new Dictionary(), ct); - logger.LogInformation("W365 session started via QuickStartSession"); + logger.LogInformation("Starting W365 session via QuickStartSession..."); + try + { + var result = await InvokeToolAsync(tools, "W365_QuickStartSession", new Dictionary(), ct); + logger.LogInformation("W365 QuickStartSession result: {Result}", result?.ToString()?[..Math.Min(500, 
result?.ToString()?.Length ?? 0)]); + } + catch (Exception ex) + { + logger.LogError(ex, "W365 QuickStartSession FAILED"); + throw; + } } /// @@ -421,8 +436,13 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien var result = await mcpClient.CallToolAsync("W365_CaptureScreenshot", new Dictionary(), cancellationToken: ct); foreach (var item in result.Content) { + _logger.LogDebug("Screenshot content block: Type={Type}, DataLen={DataLen}, TextLen={TextLen}, MimeType={Mime}", + item.Type, item.Data?.Length ?? 0, item.Text?.Length ?? 0, item.MimeType); + if (item.Type == "image" && !string.IsNullOrEmpty(item.Data)) return item.Data; + if (!string.IsNullOrEmpty(item.Data)) + return item.Data; if (item.Type == "text" && !string.IsNullOrEmpty(item.Text)) { var nested = ExtractBase64FromText(item.Text); @@ -430,6 +450,10 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien } } + // Log full content for debugging + foreach (var item in result.Content) + _logger.LogWarning("Unhandled screenshot block: Type={Type}, Text={Preview}", item.Type, item.Text?[..Math.Min(200, item.Text.Length)]); + throw new InvalidOperationException($"Screenshot MCP response had {result.Content.Count} content blocks but no extractable image data."); } From f11bfc36b60d6f58a9e2ce574f8afab324426fa2 Mon Sep 17 00:00:00 2001 From: Mohamed Abdelkader Date: Sun, 5 Apr 2026 19:53:57 -0700 Subject: [PATCH 05/17] feat: multi-session handling per conversation - Add ConversationSession class to track per-conversation W365 sessions - Refactor ComputerUseOrchestrator to use ConcurrentDictionary keyed by conversationId - Parse sessionId from QuickStartSession response and pass to all MCP tool calls - Pass conversationId from turnContext to orchestrator - Add deployment artifacts to .gitignore (a365 configs, app.zip, publish/) --- .../w365-computer-use/sample-agent/.gitignore | 4 + .../sample-agent/Agent/MyAgent.cs | 4 +- .../ComputerUse/ComputerUseOrchestrator.cs | 212 
++++++++++++------ 3 files changed, 153 insertions(+), 67 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/.gitignore b/dotnet/w365-computer-use/sample-agent/.gitignore index 3b58674a..72ffa741 100644 --- a/dotnet/w365-computer-use/sample-agent/.gitignore +++ b/dotnet/w365-computer-use/sample-agent/.gitignore @@ -1,2 +1,6 @@ appsettings.Development.json Screenshots/ +a365.config.json +a365.generated.config.json +app.zip +publish/ diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index be8f3be0..b1e61b84 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -220,8 +220,10 @@ await turnContext.SendActivityAsync( graphToken = Environment.GetEnvironmentVariable("GRAPH_TOKEN"); } - // Run the CUA loop — session is already active + // Run the CUA loop — session is managed per conversation + var conversationId = turnContext.Activity.Conversation?.Id ?? Guid.NewGuid().ToString(); var response = await _orchestrator.RunAsync( + conversationId, userText, w365Tools, mcpClient: mcpClient, diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index b0c15b80..6160816e 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System.Collections.Concurrent; using System.Net.Http.Headers; using System.Text.Json; using Microsoft.Extensions.AI; @@ -12,7 +13,7 @@ namespace W365ComputerUseSample.ComputerUse; /// /// Thin protocol adapter between OpenAI's computer-use-preview model and W365 MCP tools. 
/// The model emits computer_call actions; this class translates them to MCP tool calls -/// and feeds back screenshots. The MCP server manages sessions automatically. +/// and feeds back screenshots. Supports multiple concurrent sessions keyed by conversation ID. /// public class ComputerUseOrchestrator { @@ -27,29 +28,21 @@ public class ComputerUseOrchestrator private readonly List _tools; /// - /// Conversation history persisted across user messages. - /// This allows the model to maintain context across multiple turns - /// (e.g., "now save the file" after a previous "type hello" command). + /// Per-conversation session state. Each conversation (user chat) gets its own + /// W365 session, conversation history, and screenshot counter. /// - private readonly List _conversationHistory = []; + private readonly ConcurrentDictionary _sessions = new(); /// - /// Whether a W365 session has been started. Tracked here (singleton) because - /// MyAgent is transient — a new agent instance is created per HTTP request. + /// Shared MCP client — one SSE connection reused across all conversations. /// - private bool _sessionStarted; + private IMcpClient? _cachedMcpClient; /// - /// Cached reference to the last-used W365 tools, used for shutdown cleanup. + /// Shared tool list — same tools for all conversations. /// private IList? _cachedTools; - /// - /// Cached MCP client reference — kept alive for the app lifetime so EndSession - /// can be called on shutdown. Disposed when the app stops. - /// - private IMcpClient? _cachedMcpClient; - private const string SystemInstructions = """ You are a computer-using agent that can control a Windows desktop computer. After each action, examine the screenshot to verify it worked. @@ -105,9 +98,10 @@ public ComputerUseOrchestrator( } /// - /// Run the CUA loop. Session must already be started by the caller. + /// Run the CUA loop for a specific conversation. 
/// public async Task RunAsync( + string conversationId, string userMessage, IList w365Tools, IMcpClient? mcpClient = null, @@ -115,34 +109,34 @@ public async Task RunAsync( Action? onStatusUpdate = null, CancellationToken cancellationToken = default) { - _logger.LogInformation("Starting CUA loop for: {Message}", Truncate(userMessage, 100)); + _logger.LogInformation("Starting CUA loop for conversation {ConversationId}: {Message}", conversationId, Truncate(userMessage, 100)); _cachedTools = w365Tools; if (mcpClient != null) _cachedMcpClient = mcpClient; - // Start session once — reuse across all messages - if (!_sessionStarted) + var session = _sessions.GetOrAdd(conversationId, _ => new ConversationSession()); + + // Start session once per conversation + if (!session.SessionStarted) { - _logger.LogInformation("No active session — calling QuickStartSession (first message)"); + _logger.LogInformation("No active session for conversation {ConversationId} — calling QuickStartSession", conversationId); onStatusUpdate?.Invoke("Starting W365 computing session..."); - await StartSessionAsync(w365Tools, _logger, cancellationToken); - _sessionStarted = true; - _logger.LogInformation("Session started successfully, _sessionStarted={Started}", _sessionStarted); + session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); + session.SessionStarted = true; + _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); } else { - _logger.LogInformation("Reusing existing session (_sessionStarted={Started})", _sessionStarted); + _logger.LogInformation("Reusing session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); } // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message - // so the model can see the screen. 
On subsequent messages, the history already has - // computer_call_output screenshots, so adding another input_image would cause a 400 error. - if (_toolType == "computer" && _conversationHistory.Count == 0) + if (_toolType == "computer" && session.ConversationHistory.Count == 0) { - var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, cancellationToken); - var initialName = $"{++_screenshotCounter:D3}_initial"; + var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); + var initialName = $"{conversationId[..8]}_{++session.ScreenshotCounter:D3}_initial"; SaveScreenshotToDisk(initialScreenshot!, initialName); await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken); - _conversationHistory.Add(ToJsonElement(new + session.ConversationHistory.Add(ToJsonElement(new { type = "message", role = "user", @@ -155,14 +149,14 @@ public async Task RunAsync( } else { - _conversationHistory.Add(CreateUserMessage(userMessage)); + session.ConversationHistory.Add(CreateUserMessage(userMessage)); } for (var i = 0; i < _maxIterations; i++) { cancellationToken.ThrowIfCancellationRequested(); - var response = await CallModelAsync(_conversationHistory, cancellationToken); + var response = await CallModelAsync(session.ConversationHistory, cancellationToken); if (response?.Output == null || response.Output.Count == 0) break; @@ -173,7 +167,7 @@ public async Task RunAsync( var type = item.GetProperty("type").GetString(); if (type == "reasoning") continue; - _conversationHistory.Add(item); + session.ConversationHistory.Add(item); switch (type) { @@ -183,14 +177,14 @@ public async Task RunAsync( case "computer_call": hasActions = true; _logger.LogInformation("CUA iteration {Iteration}: {Action}", i + 1, Truncate(item.GetRawText(), 200)); - _conversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, graphAccessToken, onStatusUpdate, 
cancellationToken)); + session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, cancellationToken)); break; case "function_call": hasActions = true; var funcName = item.GetProperty("name").GetString(); _logger.LogInformation("CUA iteration {Iteration}: function_call {Name}", i + 1, funcName); - _conversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); + session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); if (funcName == "OnTaskComplete") { return "Task completed successfully."; @@ -208,16 +202,18 @@ public async Task RunAsync( /// /// End the W365 session. Called by the agent on shutdown or explicit end. /// - public static async Task EndSessionAsync(IList tools, ILogger logger, CancellationToken ct) + public static async Task EndSessionAsync(IList tools, ILogger logger, string? sessionId, CancellationToken ct) { try { - await InvokeToolAsync(tools, "W365_EndSession", new Dictionary(), ct); - logger.LogInformation("W365 session ended"); + var args = new Dictionary(); + if (!string.IsNullOrEmpty(sessionId)) + args["sessionId"] = sessionId; + await InvokeToolAsync(tools, "W365_EndSession", args, ct); + logger.LogInformation("W365 session ended (sessionId={SessionId})", sessionId); } catch (ObjectDisposedException) { - // MCP client already disposed (dev mode SSE connection closed) — session will time out on server logger.LogInformation("MCP client already disposed — W365 session will be released by server timeout"); } catch (Exception ex) @@ -227,21 +223,28 @@ public static async Task EndSessionAsync(IList tools, ILogger logger, Ca } /// - /// End the session using cached tools. Called from app shutdown hook. + /// End all active sessions on shutdown. 
/// public async Task EndSessionOnShutdownAsync() { - if (_cachedTools == null || !_sessionStarted) + if (_cachedTools == null) { - _logger.LogInformation("No active session to end on shutdown"); + _logger.LogInformation("No tools cached — nothing to clean up on shutdown"); return; } - await EndSessionAsync(_cachedTools, _logger, CancellationToken.None); + foreach (var (convId, session) in _sessions) + { + if (session.SessionStarted) + { + _logger.LogInformation("Ending session for conversation {ConversationId}, W365SessionId={SessionId}", convId, session.W365SessionId); + await EndSessionAsync(_cachedTools, _logger, session.W365SessionId, CancellationToken.None); + } + } + + _sessions.Clear(); _cachedTools = null; - _sessionStarted = false; - // Dispose the MCP client if we own it if (_cachedMcpClient != null) { await _cachedMcpClient.DisposeAsync(); @@ -250,15 +253,43 @@ public async Task EndSessionOnShutdownAsync() } /// - /// Start a W365 session. Called by the agent on first message. + /// Start a W365 session and return the sessionId. /// - public static async Task StartSessionAsync(IList tools, ILogger logger, CancellationToken ct) + public static async Task StartSessionAsync(IList tools, ILogger logger, CancellationToken ct) { logger.LogInformation("Starting W365 session via QuickStartSession..."); try { var result = await InvokeToolAsync(tools, "W365_QuickStartSession", new Dictionary(), ct); - logger.LogInformation("W365 QuickStartSession result: {Result}", result?.ToString()?[..Math.Min(500, result?.ToString()?.Length ?? 0)]); + var resultStr = result?.ToString() ?? 
""; + logger.LogInformation("W365 QuickStartSession result: {Result}", resultStr[..Math.Min(500, resultStr.Length)]); + + // Parse sessionId from response + try + { + using var doc = JsonDocument.Parse(resultStr); + if (doc.RootElement.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array) + { + foreach (var block in content.EnumerateArray()) + { + if (block.TryGetProperty("text", out var text)) + { + var textStr = text.GetString() ?? ""; + try + { + using var innerDoc = JsonDocument.Parse(textStr); + if (innerDoc.RootElement.TryGetProperty("sessionId", out var sid)) + return sid.GetString(); + } + catch (JsonException) { } + } + } + } + } + catch (JsonException) { } + + logger.LogWarning("Could not parse sessionId from QuickStartSession response — using default session"); + return null; } catch (Exception ex) { @@ -321,17 +352,16 @@ public static async Task StartSessionAsync(IList tools, ILogger logger, /// Translate a computer_call into an MCP tool call, capture screenshot, return computer_call_output. /// private async Task HandleComputerCallAsync( - JsonElement call, IList tools, IMcpClient? mcpClient, string? graphAccessToken, Action? onStatus, CancellationToken ct) + JsonElement call, IList tools, IMcpClient? mcpClient, ConversationSession session, string? graphAccessToken, Action? onStatus, CancellationToken ct) { var callId = call.GetProperty("call_id").GetString()!; + var sessionId = session.W365SessionId; // GPT-5.4 uses "actions" (non-empty array), older models use "action" (singular). - // Some models return both: "action": {...}, "actions": [] — so we must check the array is non-empty. 
if (call.TryGetProperty("actions", out var actionsArray) && actionsArray.ValueKind == JsonValueKind.Array && actionsArray.GetArrayLength() > 0) { - // Process batch actions (GPT-5.4 format) foreach (var action in actionsArray.EnumerateArray()) { var actionType = action.GetProperty("type").GetString()!; @@ -339,35 +369,44 @@ private async Task HandleComputerCallAsync( if (actionType != "screenshot") { - var (toolName, args) = MapActionToMcpTool(actionType, action); + var (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); await InvokeToolAsync(tools, toolName, args, ct); } } } else if (call.TryGetProperty("action", out var singleAction)) { - // Single action (computer-use-preview format) var actionType = singleAction.GetProperty("type").GetString()!; onStatus?.Invoke($"Performing: {actionType}..."); if (actionType != "screenshot") { - var (toolName, args) = MapActionToMcpTool(actionType, singleAction); + var (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); await InvokeToolAsync(tools, toolName, args, ct); } } // Always capture screenshot after action - var screenshot = await CaptureScreenshotAsync(tools, mcpClient, ct); + var screenshot = await CaptureScreenshotAsync(tools, mcpClient, sessionId, ct); - // Save screenshot locally and/or upload to OneDrive - var stepName = $"{++_screenshotCounter:D3}_step"; + var stepName = $"{++session.ScreenshotCounter:D3}_step"; SaveScreenshotToDisk(screenshot!, stepName); await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken); var safetyChecks = call.TryGetProperty("pending_safety_checks", out var sc) ? 
sc : JsonSerializer.Deserialize("[]"); + // "computer" tool type (gpt-5.4+) doesn't support acknowledged_safety_checks + if (_toolType == "computer") + { + return ToJsonElement(new + { + type = "computer_call_output", + call_id = callId, + output = new { type = "computer_screenshot", image_url = $"data:image/png;base64,{screenshot}" } + }); + } + return ToJsonElement(new { type = "computer_call_output", @@ -379,11 +418,11 @@ private async Task HandleComputerCallAsync( /// /// Map OpenAI computer_call action types to W365 MCP tool names and arguments. - /// sessionId is omitted — the MCP server resolves sessions by user context. + /// Includes sessionId so the MCP server uses the correct session. /// - private static (string ToolName, Dictionary Args) MapActionToMcpTool(string actionType, JsonElement action) + private static (string ToolName, Dictionary Args) MapActionToMcpTool(string actionType, JsonElement action, string? sessionId) { - return actionType.ToLowerInvariant() switch + var (toolName, args) = actionType.ToLowerInvariant() switch { "click" => ("W365_Click2", new Dictionary { @@ -426,14 +465,24 @@ private static (string ToolName, Dictionary Args) MapActionToMc }), _ => throw new NotSupportedException($"Unsupported action: {actionType}") }; + + // Add sessionId to all tool calls so the MCP server routes to the correct session + if (!string.IsNullOrEmpty(sessionId)) + args["sessionId"] = sessionId; + + return (toolName, args); } - private async Task CaptureScreenshotAsync(IList tools, IMcpClient? mcpClient, CancellationToken ct) + private async Task CaptureScreenshotAsync(IList tools, IMcpClient? mcpClient, string? 
sessionId, CancellationToken ct) { + var screenshotArgs = new Dictionary(); + if (!string.IsNullOrEmpty(sessionId)) + screenshotArgs["sessionId"] = sessionId; + // Use direct MCP client when available — AIFunction wrappers drop image content blocks if (mcpClient != null) { - var result = await mcpClient.CallToolAsync("W365_CaptureScreenshot", new Dictionary(), cancellationToken: ct); + var result = await mcpClient.CallToolAsync("W365_CaptureScreenshot", screenshotArgs, cancellationToken: ct); foreach (var item in result.Content) { _logger.LogDebug("Screenshot content block: Type={Type}, DataLen={DataLen}, TextLen={TextLen}, MimeType={Mime}", @@ -458,9 +507,12 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien } // Fallback: AIFunction wrapper (may lose image content) - var aiResult = await InvokeToolAsync(tools, "W365_CaptureScreenshot", new Dictionary(), ct); + var aiResult = await InvokeToolAsync(tools, "W365_CaptureScreenshot", screenshotArgs, ct); var str = aiResult?.ToString() ?? ""; + _logger.LogInformation("Screenshot fallback: result type={Type}, length={Length}, preview={Preview}", + aiResult?.GetType().Name ?? "null", str.Length, str[..Math.Min(200, str.Length)]); + try { using var doc = JsonDocument.Parse(str); @@ -468,10 +520,28 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien if (root.TryGetProperty("screenshotData", out var sd)) return sd.GetString() ?? ""; if (root.TryGetProperty("image", out var img)) return img.GetString() ?? ""; if (root.TryGetProperty("data", out var d)) return d.GetString() ?? 
""; + + // Try nested content array (SDK gateway format) + if (root.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array) + { + foreach (var block in content.EnumerateArray()) + { + if (block.TryGetProperty("data", out var blockData) && !string.IsNullOrEmpty(blockData.GetString())) + return blockData.GetString(); + if (block.TryGetProperty("text", out var blockText)) + { + var extracted = ExtractBase64FromText(blockText.GetString()); + if (!string.IsNullOrEmpty(extracted)) return extracted; + } + } + } } catch (JsonException) { } - if (str.Length > 100) return str; + // Last resort: if it looks like raw base64 (long string, no JSON), use it directly + if (str.Length > 1000 && !str.StartsWith("{") && !str.StartsWith("[")) + return str; + throw new InvalidOperationException($"Failed to extract screenshot. Response length: {str.Length}"); } @@ -537,8 +607,6 @@ private static JsonElement ToJsonElement(object obj) => private static string Truncate(string v, int max) => v.Length <= max ? v : v[..max] + "..."; - private int _screenshotCounter; - private void SaveScreenshotToDisk(string base64Data, string name) { if (string.IsNullOrEmpty(base64Data) || string.IsNullOrEmpty(_screenshotPath)) return; @@ -606,4 +674,16 @@ private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fil _logger.LogWarning(ex, "Failed to upload screenshot to OneDrive"); } } + + /// + /// Per-conversation session state. Holds the W365 session ID, conversation history, + /// and screenshot counter for a single user conversation. + /// + private sealed class ConversationSession + { + public bool SessionStarted { get; set; } + public string? 
W365SessionId { get; set; } + public List ConversationHistory { get; } = []; + public int ScreenshotCounter { get; set; } + } } From 407b43c318c9b70aeca789adcba3ed416ce1daba Mon Sep 17 00:00:00 2001 From: Mohamed Abdelkader Date: Mon, 6 Apr 2026 15:55:58 -0700 Subject: [PATCH 06/17] allow ending sessions and conversational messages, fix nuget and package refs --- .../sample-agent/Agent/MyAgent.cs | 2 +- .../ComputerUse/ComputerUseOrchestrator.cs | 171 ++++++++++++++---- .../sample-agent/W365ComputerUseSample.csproj | 4 +- .../sample-agent/nuget.config | 7 + 4 files changed, 147 insertions(+), 37 deletions(-) create mode 100644 dotnet/w365-computer-use/sample-agent/nuget.config diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index b1e61b84..9e8ef532 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -191,6 +191,7 @@ await A365OtelWrapper.InvokeObservedAgentOperation( try { var userText = turnContext.Activity.Text?.Trim() ?? string.Empty; + var conversationId = turnContext.Activity.Conversation?.Id ?? Guid.NewGuid().ToString(); // Get W365 MCP tools — direct connection in Dev, SDK in Production var (w365Tools, mcpClient) = await GetW365ToolsAsync(turnContext, ToolAuthHandlerName); @@ -221,7 +222,6 @@ await turnContext.SendActivityAsync( } // Run the CUA loop — session is managed per conversation - var conversationId = turnContext.Activity.Conversation?.Id ?? 
Guid.NewGuid().ToString(); var response = await _orchestrator.RunAsync( conversationId, userText, diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index 6160816e..a2072387 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -49,6 +49,8 @@ You are a computer-using agent that can control a Windows desktop computer. If you see browser setup or sign-in dialogs, dismiss them (Escape, X, or Skip). Once you have completed the task, call the OnTaskComplete function. Do NOT continue looping after the task is done. + If the user wants to end, quit, or disconnect their session, call the EndSession function. + If the user sends a casual greeting or question that does not require computer use, reply with a helpful text message. """; public ComputerUseOrchestrator( @@ -93,6 +95,11 @@ public ComputerUseOrchestrator( { Name = "OnTaskComplete", Description = "Call this function when the given task has been completed successfully." + }, + new FunctionToolDefinition + { + Name = "EndSession", + Description = "Call this function when the user wants to end, quit, disconnect, or release their computer session." 
} ]; } @@ -115,42 +122,14 @@ public async Task RunAsync( var session = _sessions.GetOrAdd(conversationId, _ => new ConversationSession()); - // Start session once per conversation - if (!session.SessionStarted) - { - _logger.LogInformation("No active session for conversation {ConversationId} — calling QuickStartSession", conversationId); - onStatusUpdate?.Invoke("Starting W365 computing session..."); - session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); - session.SessionStarted = true; - _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); - } - else + if (session.SessionStarted) { _logger.LogInformation("Reusing session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); } - // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message - if (_toolType == "computer" && session.ConversationHistory.Count == 0) - { - var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); - var initialName = $"{conversationId[..8]}_{++session.ScreenshotCounter:D3}_initial"; - SaveScreenshotToDisk(initialScreenshot!, initialName); - await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken); - session.ConversationHistory.Add(ToJsonElement(new - { - type = "message", - role = "user", - content = new object[] - { - new { type = "input_text", text = userMessage }, - new { type = "input_image", image_url = $"data:image/png;base64,{initialScreenshot}" } - } - })); - } - else - { - session.ConversationHistory.Add(CreateUserMessage(userMessage)); - } + // Always add the user message as text — session start and initial screenshot + // are deferred until the model emits its first computer_call. 
+ session.ConversationHistory.Add(CreateUserMessage(userMessage)); for (var i = 0; i < _maxIterations; i++) { @@ -177,6 +156,26 @@ public async Task RunAsync( case "computer_call": hasActions = true; _logger.LogInformation("CUA iteration {Iteration}: {Action}", i + 1, Truncate(item.GetRawText(), 200)); + + // Lazy session start — only spin up the VM when the model actually needs the computer + if (!session.SessionStarted) + { + _logger.LogInformation("First computer_call for conversation {ConversationId} — starting session", conversationId); + onStatusUpdate?.Invoke("Starting W365 computing session..."); + session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); + session.SessionStarted = true; + _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); + + // For gpt-5.4+ ("computer" tool type), capture an initial screenshot + if (_toolType == "computer") + { + var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); + var initialName = $"{conversationId[..Math.Min(8, conversationId.Length)]}_{++session.ScreenshotCounter:D3}_initial"; + SaveScreenshotToDisk(initialScreenshot!, initialName); + await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken); + } + } + session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, cancellationToken)); break; @@ -189,6 +188,19 @@ public async Task RunAsync( { return "Task completed successfully."; } + if (funcName == "EndSession") + { + if (session.SessionStarted) + { + _logger.LogInformation("EndSession requested by model for conversation {ConversationId}", conversationId); + onStatusUpdate?.Invoke("Ending session..."); + await EndSessionAsync(w365Tools, _logger, session.W365SessionId, cancellationToken); + session.SessionStarted = false; + 
session.W365SessionId = null; + _sessions.TryRemove(conversationId, out _); + } + return "Session ended. The VM has been released back to the pool."; + } break; } } @@ -199,6 +211,26 @@ public async Task RunAsync( return "The task could not be completed within the allowed number of steps."; } + /// + /// Check if a conversation has an active W365 session. + /// + public bool HasActiveSession(string conversationId) + { + return _sessions.TryGetValue(conversationId, out var session) && session.SessionStarted; + } + + /// + /// End the session for a specific conversation and clean up state. + /// + public async Task EndConversationSessionAsync(string conversationId, IList tools, CancellationToken ct) + { + if (_sessions.TryRemove(conversationId, out var session) && session.SessionStarted) + { + _logger.LogInformation("Ending session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); + await EndSessionAsync(tools, _logger, session.W365SessionId, ct); + } + } + /// /// End the W365 session. Called by the agent on shutdown or explicit end. 
/// @@ -216,6 +248,10 @@ public static async Task EndSessionAsync(IList tools, ILogger logger, st { logger.LogInformation("MCP client already disposed — W365 session will be released by server timeout"); } + catch (HttpRequestException httpEx) when (httpEx.StatusCode == System.Net.HttpStatusCode.NotFound) + { + logger.LogInformation("MCP transport session expired (404) — W365 session will be released by server timeout"); + } catch (Exception ex) { logger.LogWarning(ex, "Failed to end W365 session"); @@ -370,7 +406,15 @@ private async Task HandleComputerCallAsync( if (actionType != "screenshot") { var (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); - await InvokeToolAsync(tools, toolName, args, ct); + var (_, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); + if (sessionLost) + { + onStatus?.Invoke("Session lost — recovering..."); + sessionId = await RecoverSessionAsync(session, tools, _logger, ct); + // Re-map with new sessionId and retry + (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); + await InvokeToolAsync(tools, toolName, args, ct); + } } } } @@ -382,7 +426,14 @@ private async Task HandleComputerCallAsync( if (actionType != "screenshot") { var (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); - await InvokeToolAsync(tools, toolName, args, ct); + var (_, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); + if (sessionLost) + { + onStatus?.Invoke("Session lost — recovering..."); + sessionId = await RecoverSessionAsync(session, tools, _logger, ct); + (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); + await InvokeToolAsync(tools, toolName, args, ct); + } } } @@ -568,6 +619,58 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien return await tool.InvokeAsync(new AIFunctionArguments(args), ct); } + /// + /// Invoke a tool and detect session-not-found errors. Returns (result, isSessionLost). 
+ /// + private static async Task<(object? Result, bool IsSessionLost)> InvokeToolCheckSessionAsync( + IList tools, string name, Dictionary args, CancellationToken ct) + { + var result = await InvokeToolAsync(tools, name, args, ct); + var resultStr = result?.ToString() ?? ""; + if (IsSessionNotFoundError(resultStr)) + return (result, true); + return (result, false); + } + + /// + /// Check if a tool response indicates the session is no longer valid. + /// + private static bool IsSessionNotFoundError(string response) + { + if (string.IsNullOrEmpty(response)) return false; + var lower = response.ToLowerInvariant(); + return lower.Contains("no active session found") || + lower.Contains("session not found") || + lower.Contains("session expired") || + lower.Contains("session has been terminated"); + } + + /// + /// Recover from a lost session: end the stale session (best-effort) and start a new one. + /// + private async Task RecoverSessionAsync( + ConversationSession session, IList tools, ILogger logger, CancellationToken ct) + { + logger.LogWarning("Session lost for W365SessionId={SessionId}. Recovering — ending stale session and starting new one.", session.W365SessionId); + + // Best-effort end the stale session + try + { + await EndSessionAsync(tools, logger, session.W365SessionId, ct); + } + catch (Exception ex) + { + logger.LogWarning(ex, "Best-effort EndSession during recovery failed for {SessionId}", session.W365SessionId); + } + + // Start a fresh session + var newSessionId = await StartSessionAsync(tools, logger, ct); + session.W365SessionId = newSessionId; + session.SessionStarted = true; + logger.LogInformation("Session recovered. 
New W365SessionId={SessionId}", newSessionId); + return newSessionId; + } + private static string[] ExtractKeys(JsonElement action) { if (action.TryGetProperty("keys", out var k)) diff --git a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj index 217f5ab6..e0455440 100644 --- a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj +++ b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj @@ -9,8 +9,8 @@ - - + + diff --git a/dotnet/w365-computer-use/sample-agent/nuget.config b/dotnet/w365-computer-use/sample-agent/nuget.config new file mode 100644 index 00000000..765346e5 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/nuget.config @@ -0,0 +1,7 @@ + + + + + + + From 030efb0d2e55d459d654b5fe89f6b2703ecaf78d Mon Sep 17 00:00:00 2001 From: Mohamed Abdelkader Date: Mon, 6 Apr 2026 23:50:28 -0700 Subject: [PATCH 07/17] add per agent user onedrive creation --- .../sample-agent/Agent/MyAgent.cs | 2 + .../ComputerUse/ComputerUseOrchestrator.cs | 141 +++++++++++++++--- 2 files changed, 122 insertions(+), 21 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index 9e8ef532..e265ec9d 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -229,6 +229,8 @@ await turnContext.SendActivityAsync( mcpClient: mcpClient, graphAccessToken: graphToken, onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status).ConfigureAwait(false), + onFolderLinkReady: async url => await turnContext.SendActivityAsync( + MessageFactory.Text($"📸 Screenshots for this session: [View folder]({url})"), cancellationToken), cancellationToken: cancellationToken); // Send the response diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs 
b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index a2072387..b3721a11 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -114,13 +114,23 @@ public async Task RunAsync( IMcpClient? mcpClient = null, string? graphAccessToken = null, Action? onStatusUpdate = null, + Func? onFolderLinkReady = null, CancellationToken cancellationToken = default) { _logger.LogInformation("Starting CUA loop for conversation {ConversationId}: {Message}", conversationId, Truncate(userMessage, 100)); _cachedTools = w365Tools; if (mcpClient != null) _cachedMcpClient = mcpClient; - var session = _sessions.GetOrAdd(conversationId, _ => new ConversationSession()); + var session = _sessions.GetOrAdd(conversationId, _ => + { + // Build a safe subfolder name from date + truncated conversation ID + var safeId = new string(conversationId.Where(c => char.IsLetterOrDigit(c)).ToArray()); + safeId = safeId.Length > 8 ? 
safeId[..8] : safeId; + return new ConversationSession + { + ScreenshotSubfolder = $"{DateTime.UtcNow:yyyyMMdd}_{safeId}" + }; + }); if (session.SessionStarted) { @@ -162,21 +172,35 @@ public async Task RunAsync( { _logger.LogInformation("First computer_call for conversation {ConversationId} — starting session", conversationId); onStatusUpdate?.Invoke("Starting W365 computing session..."); - session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); - session.SessionStarted = true; - _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); + try + { + session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); + session.SessionStarted = true; + _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); + } + catch (Exception ex) + { + _logger.LogError(ex, "QuickStartSession failed for conversation {ConversationId}", conversationId); + return "Unable to start a W365 Cloud PC session. 
This could mean:\n" + + "- No Cloud PC pools are available for your agent user\n" + + "- All sessions in the pool are currently in use\n" + + "- The agent user doesn't have the required permissions\n\n" + + $"Error: {ex.Message}"; + } // For gpt-5.4+ ("computer" tool type), capture an initial screenshot if (_toolType == "computer") { var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); - var initialName = $"{conversationId[..Math.Min(8, conversationId.Length)]}_{++session.ScreenshotCounter:D3}_initial"; + var initialName = $"{++session.ScreenshotCounter:D3}_initial"; SaveScreenshotToDisk(initialScreenshot!, initialName); - await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken); + var folderUrl = await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken, session.ScreenshotSubfolder, session); + if (folderUrl != null && onFolderLinkReady != null) + await onFolderLinkReady(folderUrl); } } - session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, cancellationToken)); + session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, onFolderLinkReady, cancellationToken)); break; case "function_call": @@ -388,7 +412,7 @@ public async Task EndSessionOnShutdownAsync() /// Translate a computer_call into an MCP tool call, capture screenshot, return computer_call_output. /// private async Task HandleComputerCallAsync( - JsonElement call, IList tools, IMcpClient? mcpClient, ConversationSession session, string? graphAccessToken, Action? onStatus, CancellationToken ct) + JsonElement call, IList tools, IMcpClient? mcpClient, ConversationSession session, string? graphAccessToken, Action? onStatus, Func? 
onFolderLinkReady, CancellationToken ct) { var callId = call.GetProperty("call_id").GetString()!; var sessionId = session.W365SessionId; @@ -442,7 +466,9 @@ private async Task HandleComputerCallAsync( var stepName = $"{++session.ScreenshotCounter:D3}_step"; SaveScreenshotToDisk(screenshot!, stepName); - await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken); + var folderUrl = await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken, session.ScreenshotSubfolder, session); + if (folderUrl != null && onFolderLinkReady != null) + await onFolderLinkReady(folderUrl); var safetyChecks = call.TryGetProperty("pending_safety_checks", out var sc) ? sc : JsonSerializer.Deserialize("[]"); @@ -731,31 +757,31 @@ private void SaveScreenshotToDisk(string base64Data, string name) /// Requires a Graph access token with Files.ReadWrite scope. /// Files are uploaded to /CUA-Sessions/{date}/ folder. /// - private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fileName, string? graphAccessToken) + private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fileName, string? graphAccessToken, string? subfolder, ConversationSession session) { if (string.IsNullOrEmpty(graphAccessToken)) { _logger.LogDebug("OneDrive upload skipped: no Graph token"); - return; + return null; } if (string.IsNullOrEmpty(base64Data)) { _logger.LogDebug("OneDrive upload skipped: no screenshot data"); - return; + return null; } if (string.IsNullOrEmpty(_oneDriveFolder)) { _logger.LogDebug("OneDrive upload skipped: OneDriveFolder not configured"); - return; + return null; } try { - // Use /me/drive for token owner, or /users/{id}/drive for a specific user - var driveBase = string.IsNullOrEmpty(_oneDriveUserId) - ? 
"https://graph.microsoft.com/v1.0/me/drive" - : $"https://graph.microsoft.com/v1.0/users/{_oneDriveUserId}/drive"; - var url = $"{driveBase}/root:/{_oneDriveFolder.TrimStart('/')}/{fileName}:/content"; + // Upload to /CUA-Sessions/{subfolder}/{fileName} — subfolder is per-conversation + var folderPath = string.IsNullOrEmpty(subfolder) + ? _oneDriveFolder.TrimStart('/') + : $"{_oneDriveFolder.TrimStart('/')}/{subfolder}"; + var url = $"https://graph.microsoft.com/v1.0/me/drive/root:/{folderPath}/{fileName}:/content"; using var request = new HttpRequestMessage(HttpMethod.Put, url); request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); @@ -765,17 +791,88 @@ private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fil var response = await _httpClient.SendAsync(request); if (response.IsSuccessStatusCode) { - _logger.LogInformation("Screenshot uploaded to OneDrive: {Folder}/{FileName}", _oneDriveFolder, fileName); + _logger.LogInformation("Screenshot uploaded to OneDrive: {Folder}/{FileName}", folderPath, fileName); + + // On first upload, create an org-scoped sharing link for the folder + if (!session.FolderShared) + { + var shareUrl = await ShareConversationFolderAsync(folderPath, graphAccessToken); + if (shareUrl != null) + { + session.FolderShared = true; + return shareUrl; + } + } } else { - _logger.LogWarning("OneDrive upload failed: {Status}", response.StatusCode); + var content = await response.Content.ReadAsStringAsync(); + _logger.LogWarning("OneDrive upload failed: {Status} {Content}", response.StatusCode, content); } } catch (Exception ex) { _logger.LogWarning(ex, "Failed to upload screenshot to OneDrive"); } + + return null; + } + + /// + /// Create an organization-scoped sharing link for the conversation's screenshot folder. + /// Returns the web URL that anyone in the org can use to view the folder. 
+ /// + private async Task ShareConversationFolderAsync(string folderPath, string graphAccessToken) + { + try + { + // Get the folder's item ID + var folderUrl = $"https://graph.microsoft.com/v1.0/me/drive/root:/{folderPath}"; + using var getRequest = new HttpRequestMessage(HttpMethod.Get, folderUrl); + getRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + var getResponse = await _httpClient.SendAsync(getRequest); + + if (!getResponse.IsSuccessStatusCode) + { + _logger.LogWarning("Failed to get folder item for sharing: {Status}", getResponse.StatusCode); + return null; + } + + var folderJson = await getResponse.Content.ReadAsStringAsync(); + using var doc = JsonDocument.Parse(folderJson); + var folderId = doc.RootElement.GetProperty("id").GetString(); + var webUrl = doc.RootElement.TryGetProperty("webUrl", out var wu) ? wu.GetString() : null; + + // Create an organization-scoped view link + var linkUrl = $"https://graph.microsoft.com/v1.0/me/drive/items/{folderId}/createLink"; + using var linkRequest = new HttpRequestMessage(HttpMethod.Post, linkUrl); + linkRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + linkRequest.Content = new StringContent( + JsonSerializer.Serialize(new { type = "view", scope = "organization" }), + System.Text.Encoding.UTF8, "application/json"); + + var linkResponse = await _httpClient.SendAsync(linkRequest); + if (linkResponse.IsSuccessStatusCode) + { + var linkJson = await linkResponse.Content.ReadAsStringAsync(); + using var linkDoc = JsonDocument.Parse(linkJson); + var shareUrl = linkDoc.RootElement.GetProperty("link").GetProperty("webUrl").GetString(); + _logger.LogInformation("Folder shared with org: {Url}", shareUrl); + return shareUrl; + } + else + { + var errorContent = await linkResponse.Content.ReadAsStringAsync(); + _logger.LogWarning("Failed to create sharing link: {Status} {Content}", linkResponse.StatusCode, errorContent); + // Fall back to 
the folder's webUrl (user may not be able to access without sharing) + return webUrl; + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to share conversation folder"); + return null; + } } /// @@ -786,7 +883,9 @@ private sealed class ConversationSession { public bool SessionStarted { get; set; } public string? W365SessionId { get; set; } - public List ConversationHistory { get; } = []; + public List ConversationHistory { get; } = []; public int ScreenshotCounter { get; set; } + public bool FolderShared { get; set; } + public string? ScreenshotSubfolder { get; set; } } } From 8d4931035d5e8911a988e0056238b5680343cb4e Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Tue, 7 Apr 2026 16:31:58 -0700 Subject: [PATCH 08/17] feat: use previous_response_id to avoid resending screenshots in CUA loop Instead of sending the full conversation history (including all base64 screenshots) on every model call, use the OpenAI Responses API's previous_response_id to let the server reconstruct prior context. Only new items (computer_call_output, function_call_output) are sent per iteration, reducing API payload by ~15x. Between user messages, computer actions and screenshots are pruned from history while text context is preserved for conversational continuity. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../ComputerUse/ComputerUseOrchestrator.cs | 63 ++++++++++++++++--- .../ComputerUse/Models/ComputerUseModels.cs | 3 + 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index b3721a11..2560b00c 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -137,15 +137,27 @@ public async Task RunAsync( _logger.LogInformation("Reusing session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); } - // Always add the user message as text — session start and initial screenshot - // are deferred until the model emits its first computer_call. - session.ConversationHistory.Add(CreateUserMessage(userMessage)); + // Between user messages: keep text context (user messages + model text replies) + // but drop computer actions and screenshots. This preserves conversational memory + // (so follow-ups like "how many?" work) without carrying stale screen state or + // heavy base64 screenshot data from previous tasks. + session.ConversationHistory.RemoveAll(item => + { + var type = item.TryGetProperty("type", out var t) ? 
t.GetString() : null; + return type is "computer_call" or "computer_call_output" or "function_call" or "function_call_output"; + }); + session.NewItems.Clear(); + session.LastResponseId = null; + + var userMsg = CreateUserMessage(userMessage); + session.ConversationHistory.Add(userMsg); + session.NewItems.Add(userMsg); for (var i = 0; i < _maxIterations; i++) { cancellationToken.ThrowIfCancellationRequested(); - var response = await CallModelAsync(session.ConversationHistory, cancellationToken); + var response = await CallModelAsync(session, cancellationToken); if (response?.Output == null || response.Output.Count == 0) break; @@ -157,6 +169,9 @@ public async Task RunAsync( if (type == "reasoning") continue; session.ConversationHistory.Add(item); + // No need to add model output items to NewItems — the API reconstructs + // its own output from previous_response_id. We only need to send new + // user-side items (user messages, computer_call_output, function_call_output). switch (type) { @@ -200,14 +215,18 @@ public async Task RunAsync( } } - session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, onFolderLinkReady, cancellationToken)); + var callOutput = await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, onFolderLinkReady, cancellationToken); + session.ConversationHistory.Add(callOutput); + session.NewItems.Add(callOutput); break; case "function_call": hasActions = true; var funcName = item.GetProperty("name").GetString(); _logger.LogInformation("CUA iteration {Iteration}: function_call {Name}", i + 1, funcName); - session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); + var funcOutput = CreateFunctionOutput(item.GetProperty("call_id").GetString()!); + session.ConversationHistory.Add(funcOutput); + session.NewItems.Add(funcOutput); if (funcName == "OnTaskComplete") { return "Task completed 
successfully."; @@ -393,19 +412,41 @@ public async Task EndSessionOnShutdownAsync() return (_cachedTools, _cachedMcpClient); } - private async Task CallModelAsync(List conversation, CancellationToken ct) + private async Task CallModelAsync(ConversationSession session, CancellationToken ct) { + List input; + string? previousResponseId = null; + + if (session.LastResponseId != null) + { + // Send only the items added since the last model call + input = session.NewItems; + previousResponseId = session.LastResponseId; + } + else + { + // First call — send the full conversation history + input = session.ConversationHistory; + } + var body = JsonSerializer.Serialize(new ComputerUseRequest { Model = _modelProvider.ModelName, Instructions = SystemInstructions, - Input = conversation, + PreviousResponseId = previousResponseId, + Input = input, Tools = _tools, Truncation = "auto" }, new JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull }); var responseJson = await _modelProvider.SendAsync(body, ct); - return JsonSerializer.Deserialize(responseJson); + var response = JsonSerializer.Deserialize(responseJson); + + // Store the response ID for the next call and reset new items + session.LastResponseId = response?.Id; + session.NewItems.Clear(); + + return response; } /// @@ -883,7 +924,9 @@ private sealed class ConversationSession { public bool SessionStarted { get; set; } public string? W365SessionId { get; set; } - public List ConversationHistory { get; } = []; + public List ConversationHistory { get; } = []; + public List NewItems { get; } = []; + public string? LastResponseId { get; set; } public int ScreenshotCounter { get; set; } public bool FolderShared { get; set; } public string? 
ScreenshotSubfolder { get; set; } diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs index 1f774b33..bd974729 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs @@ -41,6 +41,9 @@ public class ComputerUseRequest [JsonPropertyName("instructions")] public string? Instructions { get; set; } + [JsonPropertyName("previous_response_id")] + public string? PreviousResponseId { get; set; } + [JsonPropertyName("input")] public List Input { get; set; } = []; From 9cb82297670e4fadbe6cf02cbea182b836d92558 Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Wed, 8 Apr 2026 14:23:26 -0700 Subject: [PATCH 09/17] fix: keep last screenshot pair between messages + session recovery for screenshots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes: 1. Keep the last computer_call + computer_call_output pair when pruning history between user messages. The API requires a matching computer_call for every computer_call_output (linked by call_id) — dropping one causes BadRequest: "No tool call found for computer call with call_id". This also gives the model visual context for simple follow-ups. 2. Add session recovery to the screenshot capture path, matching the existing pattern used by action tools (click, type, scroll). If CaptureScreenshot returns "no active session", recover the session and retry once — instead of failing outright. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../ComputerUse/ComputerUseOrchestrator.cs | 64 +++++++++++++++++-- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index 2560b00c..83913517 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -138,14 +138,45 @@ public async Task RunAsync( } // Between user messages: keep text context (user messages + model text replies) - // but drop computer actions and screenshots. This preserves conversational memory - // (so follow-ups like "how many?" work) without carrying stale screen state or - // heavy base64 screenshot data from previous tasks. + // and the LAST computer_call + computer_call_output pair so the model has visual + // context for simple follow-ups. Both must be kept together — the API requires + // a matching computer_call for every computer_call_output (linked by call_id). + JsonElement? lastCall = null; + JsonElement? lastCallOutput = null; + for (var i = session.ConversationHistory.Count - 1; i >= 0; i--) + { + var item = session.ConversationHistory[i]; + if (item.TryGetProperty("type", out var t) && t.GetString() == "computer_call_output") + { + lastCallOutput = item; + // Find the matching computer_call by call_id + var callId = item.TryGetProperty("call_id", out var cid) ? 
cid.GetString() : null; + if (callId != null) + { + for (var j = i - 1; j >= 0; j--) + { + var candidate = session.ConversationHistory[j]; + if (candidate.TryGetProperty("type", out var ct) && ct.GetString() == "computer_call" + && candidate.TryGetProperty("call_id", out var ccid) && ccid.GetString() == callId) + { + lastCall = candidate; + break; + } + } + } + break; + } + } session.ConversationHistory.RemoveAll(item => { var type = item.TryGetProperty("type", out var t) ? t.GetString() : null; return type is "computer_call" or "computer_call_output" or "function_call" or "function_call_output"; }); + if (lastCall.HasValue && lastCallOutput.HasValue) + { + session.ConversationHistory.Add(lastCall.Value); + session.ConversationHistory.Add(lastCallOutput.Value); + } session.NewItems.Clear(); session.LastResponseId = null; @@ -206,7 +237,7 @@ public async Task RunAsync( // For gpt-5.4+ ("computer" tool type), capture an initial screenshot if (_toolType == "computer") { - var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); + var initialScreenshot = await CaptureScreenshotCoreAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); var initialName = $"{++session.ScreenshotCounter:D3}_initial"; SaveScreenshotToDisk(initialScreenshot!, initialName); var folderUrl = await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken, session.ScreenshotSubfolder, session); @@ -502,8 +533,18 @@ private async Task HandleComputerCallAsync( } } - // Always capture screenshot after action - var screenshot = await CaptureScreenshotAsync(tools, mcpClient, sessionId, ct); + // Always capture screenshot after action — with session recovery (same pattern as action tools) + string screenshot; + try + { + screenshot = await CaptureScreenshotCoreAsync(tools, mcpClient, sessionId, ct); + } + catch (InvalidOperationException ex) when (IsSessionNotFoundError(ex.Message)) + { + 
onStatus?.Invoke("Session lost — recovering..."); + sessionId = await RecoverSessionAsync(session, tools, _logger, ct); + screenshot = await CaptureScreenshotCoreAsync(tools, mcpClient, sessionId, ct); + } var stepName = $"{++session.ScreenshotCounter:D3}_step"; SaveScreenshotToDisk(screenshot!, stepName); @@ -591,7 +632,7 @@ private static (string ToolName, Dictionary Args) MapActionToMc return (toolName, args); } - private async Task CaptureScreenshotAsync(IList tools, IMcpClient? mcpClient, string? sessionId, CancellationToken ct) + private async Task CaptureScreenshotCoreAsync(IList tools, IMcpClient? mcpClient, string? sessionId, CancellationToken ct) { var screenshotArgs = new Dictionary(); if (!string.IsNullOrEmpty(sessionId)) @@ -617,6 +658,11 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien } } + // Check if the error is a session-not-found before throwing + var contentText = string.Join(" ", result.Content.Select(c => c.Text ?? "")); + if (IsSessionNotFoundError(contentText)) + throw new InvalidOperationException($"Screenshot failed: no active session. Response: {contentText[..Math.Min(300, contentText.Length)]}"); + // Log full content for debugging foreach (var item in result.Content) _logger.LogWarning("Unhandled screenshot block: Type={Type}, Text={Preview}", item.Type, item.Text?[..Math.Min(200, item.Text.Length)]); @@ -631,6 +677,10 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien _logger.LogInformation("Screenshot fallback: result type={Type}, length={Length}, preview={Preview}", aiResult?.GetType().Name ?? "null", str.Length, str[..Math.Min(200, str.Length)]); + // Detect session-not-found early so the retry wrapper can recover the session + if (IsSessionNotFoundError(str)) + throw new InvalidOperationException($"Screenshot failed: no active session. 
Response: {str[..Math.Min(300, str.Length)]}"); + try { using var doc = JsonDocument.Parse(str); From 322daf17fbbf6eb80b657667293e7f41596e4f09 Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Wed, 8 Apr 2026 14:36:36 -0700 Subject: [PATCH 10/17] fix: use W365 session ID for OneDrive screenshot folder names Use the W365 session ID instead of truncated conversation ID for the OneDrive screenshot subfolder. Updated in both initial session start and session recovery paths. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../ComputerUse/ComputerUseOrchestrator.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index 83913517..381a8f9a 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -222,6 +222,11 @@ public async Task RunAsync( { session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); session.SessionStarted = true; + // Update subfolder to use session ID instead of conversation ID + var safeSessionId = session.W365SessionId != null + ? new string(session.W365SessionId.Where(c => char.IsLetterOrDigit(c) || c == '-').ToArray()) + : "unknown"; + session.ScreenshotSubfolder = $"{DateTime.UtcNow:yyyyMMdd}_{safeSessionId}"; _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); } catch (Exception ex) @@ -784,6 +789,11 @@ private static bool IsSessionNotFoundError(string response) var newSessionId = await StartSessionAsync(tools, logger, ct); session.W365SessionId = newSessionId; session.SessionStarted = true; + // Update subfolder to use new session ID + var safeSessionId = newSessionId != null + ? 
new string(newSessionId.Where(c => char.IsLetterOrDigit(c) || c == '-').ToArray()) + : "unknown"; + session.ScreenshotSubfolder = $"{DateTime.UtcNow:yyyyMMdd}_{safeSessionId}"; logger.LogInformation("Session recovered. New W365SessionId={SessionId}", newSessionId); return newSessionId; } From e54839251a05fd9ccd030488fa3e40793216ebea Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Wed, 8 Apr 2026 15:22:01 -0700 Subject: [PATCH 11/17] refactor: improve variable names in history pruning logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback — rename cryptic loop variables (i, j, t, ct, cid, ccid) to descriptive names (histIdx, searchIdx, entryType, earlierType, outputCallId, etc.). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../ComputerUse/ComputerUseOrchestrator.cs | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index 381a8f9a..dbc012d1 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -143,23 +143,23 @@ public async Task RunAsync( // a matching computer_call for every computer_call_output (linked by call_id). JsonElement? lastCall = null; JsonElement? 
lastCallOutput = null; - for (var i = session.ConversationHistory.Count - 1; i >= 0; i--) + for (var histIdx = session.ConversationHistory.Count - 1; histIdx >= 0; histIdx--) { - var item = session.ConversationHistory[i]; - if (item.TryGetProperty("type", out var t) && t.GetString() == "computer_call_output") + var entry = session.ConversationHistory[histIdx]; + if (entry.TryGetProperty("type", out var entryType) && entryType.GetString() == "computer_call_output") { - lastCallOutput = item; + lastCallOutput = entry; // Find the matching computer_call by call_id - var callId = item.TryGetProperty("call_id", out var cid) ? cid.GetString() : null; - if (callId != null) + var outputCallId = entry.TryGetProperty("call_id", out var callIdProp) ? callIdProp.GetString() : null; + if (outputCallId != null) { - for (var j = i - 1; j >= 0; j--) + for (var searchIdx = histIdx - 1; searchIdx >= 0; searchIdx--) { - var candidate = session.ConversationHistory[j]; - if (candidate.TryGetProperty("type", out var ct) && ct.GetString() == "computer_call" - && candidate.TryGetProperty("call_id", out var ccid) && ccid.GetString() == callId) + var earlier = session.ConversationHistory[searchIdx]; + if (earlier.TryGetProperty("type", out var earlierType) && earlierType.GetString() == "computer_call" + && earlier.TryGetProperty("call_id", out var earlierCallId) && earlierCallId.GetString() == outputCallId) { - lastCall = candidate; + lastCall = earlier; break; } } @@ -169,8 +169,8 @@ public async Task RunAsync( } session.ConversationHistory.RemoveAll(item => { - var type = item.TryGetProperty("type", out var t) ? t.GetString() : null; - return type is "computer_call" or "computer_call_output" or "function_call" or "function_call_output"; + var itemType = item.TryGetProperty("type", out var typeProp) ? 
typeProp.GetString() : null; + return itemType is "computer_call" or "computer_call_output" or "function_call" or "function_call_output"; }); if (lastCall.HasValue && lastCallOutput.HasValue) { From 09148bc84bc3643d4947befefd2508b73c2ca1e6 Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Thu, 16 Apr 2026 09:29:07 -0700 Subject: [PATCH 12/17] feat: multi-server MCP support and function tool integration - Support multiple MCP server URLs (W365 + MailTools) with per-server HttpClient - Fix _cachedTools overwrite bug that dropped mail tools on second message - Add function tool instructions to system prompt so model prefers them over CUA - Per-server error handling so one server failure doesn't block others - Fix CancellationTokenSource disposal races in typing indicator - Make CUA session start message ephemeral (informative update) Co-Authored-By: Claude Opus 4.6 --- .../sample-agent/Agent/MyAgent.cs | 96 +++++--- .../ComputerUse/ComputerUseOrchestrator.cs | 212 ++++++++++++++---- 2 files changed, 230 insertions(+), 78 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index b1e61b84..e239996d 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -27,7 +27,7 @@ public class MyAgent : AgentApplication private readonly ILogger _logger; private readonly IMcpToolRegistrationService _toolService; private readonly ComputerUseOrchestrator _orchestrator; - private readonly string? _mcpServerUrl; + private readonly string[] _mcpServerUrls; private readonly string? AgenticAuthHandlerName; private readonly string? 
OboAuthHandlerName; @@ -70,7 +70,15 @@ public MyAgent( _logger = logger; _toolService = toolService; _orchestrator = orchestrator; - _mcpServerUrl = configuration["McpServer:Url"]; + + // Support multiple MCP server URLs; fall back to single McpServer:Url for backward compat + _mcpServerUrls = configuration.GetSection("McpServers").Get() ?? []; + if (_mcpServerUrls.Length == 0) + { + var singleUrl = configuration["McpServer:Url"]; + if (!string.IsNullOrEmpty(singleUrl)) + _mcpServerUrls = [singleUrl]; + } AgenticAuthHandlerName = configuration.GetValue("AgentApplication:AgenticAuthHandlerName"); OboAuthHandlerName = configuration.GetValue("AgentApplication:OboAuthHandlerName"); @@ -168,9 +176,6 @@ await A365OtelWrapper.InvokeObservedAgentOperation( _logger, async () => { - // Immediate acknowledgment - await turnContext.SendActivityAsync(MessageFactory.Text("Got it — working on it…"), cancellationToken).ConfigureAwait(false); - // Typing indicator await turnContext.SendActivityAsync(Activity.CreateTypingActivity(), cancellationToken).ConfigureAwait(false); @@ -186,14 +191,15 @@ await A365OtelWrapper.InvokeObservedAgentOperation( } } catch (OperationCanceledException) { /* expected */ } + catch (ObjectDisposedException) { /* CTS disposed before task finished */ } }, typingCts.Token); try { var userText = turnContext.Activity.Text?.Trim() ?? string.Empty; - // Get W365 MCP tools — direct connection in Dev, SDK in Production - var (w365Tools, mcpClient) = await GetW365ToolsAsync(turnContext, ToolAuthHandlerName); + // Get MCP tools — direct connection in Dev, SDK in Production + var (w365Tools, additionalTools, mcpClient) = await GetToolsAsync(turnContext, ToolAuthHandlerName); try { @@ -205,8 +211,6 @@ await turnContext.SendActivityAsync( return; } - await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Working on your request...").ConfigureAwait(false); - // Get Graph token for OneDrive screenshot upload. 
// In production: acquired via agentic auth (UserAuthorization). // In development: set GRAPH_TOKEN env var with a token that has Files.ReadWrite scope. @@ -226,9 +230,18 @@ await turnContext.SendActivityAsync( conversationId, userText, w365Tools, + additionalTools: additionalTools, mcpClient: mcpClient, graphAccessToken: graphToken, onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status).ConfigureAwait(false), + onCuaStarting: async (isNewSession) => + { + await turnContext.SendActivityAsync(MessageFactory.Text("Got it — working on it…"), cancellationToken).ConfigureAwait(false); + if (isNewSession) + { + await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Starting a session to a Windows 365 Cloud PC…"); + } + }, cancellationToken: cancellationToken); // Send the response @@ -242,20 +255,22 @@ await turnContext.SendActivityAsync( } finally { - typingCts.Cancel(); + try { typingCts.Cancel(); } catch (ObjectDisposedException) { } try { await typingTask.ConfigureAwait(false); } catch (OperationCanceledException) { /* expected */ } - await turnContext.StreamingResponse.EndStreamAsync(cancellationToken).ConfigureAwait(false); + catch (ObjectDisposedException) { /* expected */ } + try { await turnContext.StreamingResponse.EndStreamAsync(cancellationToken).ConfigureAwait(false); } + catch (ObjectDisposedException) { /* stream already disposed */ } } }); } /// - /// Get the W365 MCP tools. In Development mode with a bearer token, connects directly - /// to the MCP server URL from appsettings.json. In Production, uses the A365 SDK - /// to discover servers via the Tooling Gateway. + /// Get MCP tools, separated into W365 (CUA) and additional (function) tools. + /// In Development mode with a bearer token, connects directly to the MCP server URL. + /// In Production, uses the A365 SDK to discover servers via the Tooling Gateway. /// - private async Task<(IList? Tools, IMcpClient? 
Client)> GetW365ToolsAsync(ITurnContext context, string? authHandlerName) + private async Task<(IList? W365Tools, IList? AdditionalTools, IMcpClient? Client)> GetToolsAsync(ITurnContext context, string? authHandlerName) { // Acquire access token string? accessToken = null; @@ -276,37 +291,42 @@ await turnContext.SendActivityAsync( if (string.IsNullOrEmpty(accessToken) || string.IsNullOrEmpty(agentId)) { _logger.LogWarning("No auth token or agent identity available. Cannot connect to MCP."); - return (null, null); + return (null, null, null); } try { + IList? allTools; + IMcpClient? mcpClient = null; + // Development with bearer token: use orchestrator's cached MCP connection if (TryGetBearerTokenForDevelopment(out _) && IsDevelopment()) { - if (string.IsNullOrEmpty(_mcpServerUrl)) - throw new InvalidOperationException("McpServer:Url is required in appsettings.json for Development mode."); - return await _orchestrator.GetOrCreateMcpConnectionAsync(_mcpServerUrl, accessToken!); + if (_mcpServerUrls.Length == 0) + throw new InvalidOperationException("McpServers (or McpServer:Url) is required in appsettings for Development mode."); + (allTools, mcpClient) = await _orchestrator.GetOrCreateMcpConnectionAsync(_mcpServerUrls, accessToken!); } + else + { + // Production: use the A365 SDK's tooling gateway for server discovery + var handlerForMcp = !string.IsNullOrEmpty(authHandlerName) + ? authHandlerName + : OboAuthHandlerName ?? AgenticAuthHandlerName ?? string.Empty; + var tokenOverride = string.IsNullOrEmpty(authHandlerName) ? accessToken : null; - // Production: use the A365 SDK's tooling gateway for server discovery - var handlerForMcp = !string.IsNullOrEmpty(authHandlerName) - ? authHandlerName - : OboAuthHandlerName ?? AgenticAuthHandlerName ?? string.Empty; - var tokenOverride = string.IsNullOrEmpty(authHandlerName) ? 
accessToken : null; - - var allTools = await _toolService.GetMcpToolsAsync(agentId, UserAuthorization, handlerForMcp, context, tokenOverride).ConfigureAwait(false); + allTools = await _toolService.GetMcpToolsAsync(agentId, UserAuthorization, handlerForMcp, context, tokenOverride).ConfigureAwait(false); + } - // Filter to only W365 tools var w365Tools = FilterW365Tools(allTools); - return (w365Tools, null); + var additionalTools = FilterAdditionalTools(allTools); + return (w365Tools, additionalTools, mcpClient); } catch (Exception ex) { if (ShouldSkipToolingOnErrors()) { _logger.LogWarning(ex, "Failed to connect to MCP servers. Continuing without tools (SKIP_TOOLING_ON_ERRORS=true)."); - return (null, null); + return (null, null, null); } _logger.LogError(ex, "Failed to connect to MCP servers."); @@ -341,4 +361,22 @@ private static bool IsDevelopment() return w365Tools; } + + private IList? FilterAdditionalTools(IList? allTools) + { + var additionalTools = allTools?.Where(t => + { + var name = (t as AIFunction)?.Name ?? t.ToString() ?? string.Empty; + return !name.StartsWith("W365_", StringComparison.OrdinalIgnoreCase); + }).ToList(); + + if (additionalTools != null && additionalTools.Count > 0) + { + _logger.LogInformation("Found {ToolCount} additional function tools: {Names}", + additionalTools.Count, + string.Join(", ", additionalTools.Select(t => (t as AIFunction)?.Name ?? 
"?"))); + } + + return additionalTools; + } } diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index 6160816e..52c9c0b9 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -18,6 +18,7 @@ namespace W365ComputerUseSample.ComputerUse; public class ComputerUseOrchestrator { private readonly ICuaModelProvider _modelProvider; + private readonly IHttpClientFactory _httpClientFactory; private readonly HttpClient _httpClient; private readonly ILogger _logger; private readonly int _maxIterations; @@ -34,20 +35,36 @@ public class ComputerUseOrchestrator private readonly ConcurrentDictionary _sessions = new(); /// - /// Shared MCP client — one SSE connection reused across all conversations. + /// Primary MCP client (W365 server) — used for direct screenshot calls. /// private IMcpClient? _cachedMcpClient; /// - /// Shared tool list — same tools for all conversations. + /// All MCP clients — one per connected server, for cleanup on shutdown. + /// + private readonly List _allMcpClients = []; + + /// + /// Shared tool list — merged tools from all connected servers. /// private IList? _cachedTools; private const string SystemInstructions = """ - You are a computer-using agent that can control a Windows desktop computer. - After each action, examine the screenshot to verify it worked. + You are a helpful assistant that can also control a Windows desktop computer. + If the user's message is conversational or doesn't require computer use, respond with a helpful text message. + + ## Function tools (email, calendar, etc.) + You have access to function tools for tasks like sending email, managing calendar, etc. + ALWAYS use function tools when available — they are faster and more reliable than computer actions. 
+ When the user asks you to send an email, search messages, or perform any action that matches a function tool, call that tool directly. + After calling a function tool, respond with a text message describing what you did and the result. + Do NOT call OnTaskComplete after using function tools — just respond with text. + + ## Computer use (desktop control) + Only use computer actions when no function tool can accomplish the task. + When a task requires computer use, perform the actions and examine screenshots to verify they worked. If you see browser setup or sign-in dialogs, dismiss them (Escape, X, or Skip). - Once you have completed the task, call the OnTaskComplete function. + Once you have completed a computer use task, call the OnTaskComplete function. Do NOT continue looping after the task is done. """; @@ -58,6 +75,7 @@ public ComputerUseOrchestrator( ILogger logger) { _modelProvider = modelProvider; + _httpClientFactory = httpClientFactory; _httpClient = httpClientFactory.CreateClient("WebClient"); _logger = logger; _maxIterations = configuration.GetValue("ComputerUse:MaxIterations", 30); @@ -104,33 +122,24 @@ public async Task RunAsync( string conversationId, string userMessage, IList w365Tools, + IList? additionalTools = null, IMcpClient? mcpClient = null, string? graphAccessToken = null, Action? onStatusUpdate = null, + Func? 
onCuaStarting = null, CancellationToken cancellationToken = default) { - _logger.LogInformation("Starting CUA loop for conversation {ConversationId}: {Message}", conversationId, Truncate(userMessage, 100)); - _cachedTools = w365Tools; - if (mcpClient != null) _cachedMcpClient = mcpClient; + _logger.LogInformation("Processing message for conversation {ConversationId}: {Message}", conversationId, Truncate(userMessage, 100)); var session = _sessions.GetOrAdd(conversationId, _ => new ConversationSession()); - // Start session once per conversation - if (!session.SessionStarted) - { - _logger.LogInformation("No active session for conversation {ConversationId} — calling QuickStartSession", conversationId); - onStatusUpdate?.Invoke("Starting W365 computing session..."); - session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); - session.SessionStarted = true; - _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); - } - else + if (session.SessionStarted) { _logger.LogInformation("Reusing session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); } - // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message - if (_toolType == "computer" && session.ConversationHistory.Count == 0) + // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message if session already active + if (_toolType == "computer" && session.ConversationHistory.Count == 0 && session.SessionStarted) { var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); var initialName = $"{conversationId[..8]}_{++session.ScreenshotCounter:D3}_initial"; @@ -152,11 +161,35 @@ public async Task RunAsync( session.ConversationHistory.Add(CreateUserMessage(userMessage)); } + // Build the model's tools list — computer + OnTaskComplete + any 
additional function tools + var modelTools = new List(_tools); + if (additionalTools?.Count > 0) + { + foreach (var tool in additionalTools.OfType()) + { + modelTools.Add(new FunctionToolDefinition + { + Name = tool.Name, + Description = tool.Description ?? string.Empty, + Parameters = tool.JsonSchema + }); + } + + _logger.LogInformation("Added {Count} additional function tools to model", additionalTools.Count); + foreach (var tool in additionalTools.OfType()) + { + var schemaStr = tool.JsonSchema.GetRawText(); + _logger.LogInformation("Function tool: {Name}, Description: {Desc}, Schema: {Schema}", + tool.Name, Truncate(tool.Description ?? "", 80), Truncate(schemaStr, 200)); + } + } + + var cuaAcknowledged = false; for (var i = 0; i < _maxIterations; i++) { cancellationToken.ThrowIfCancellationRequested(); - var response = await CallModelAsync(session.ConversationHistory, cancellationToken); + var response = await CallModelAsync(session.ConversationHistory, modelTools, cancellationToken); if (response?.Output == null || response.Output.Count == 0) break; @@ -176,6 +209,27 @@ public async Task RunAsync( case "computer_call": hasActions = true; + // Lazy session start: only start when CUA is actually needed + if (!cuaAcknowledged) + { + if (!session.SessionStarted) + { + _logger.LogInformation("CUA needed for conversation {ConversationId} — starting session", conversationId); + if (onCuaStarting != null) + await onCuaStarting(true); + onStatusUpdate?.Invoke("Starting W365 computing session..."); + session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); + session.SessionStarted = true; + _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); + } + else if (onCuaStarting != null) + { + await onCuaStarting(false); + } + + cuaAcknowledged = true; + } + _logger.LogInformation("CUA iteration {Iteration}: {Action}", i + 1, Truncate(item.GetRawText(), 
200)); session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, cancellationToken)); break; @@ -184,11 +238,19 @@ public async Task RunAsync( hasActions = true; var funcName = item.GetProperty("name").GetString(); _logger.LogInformation("CUA iteration {Iteration}: function_call {Name}", i + 1, funcName); - session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); if (funcName == "OnTaskComplete") { + session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); return "Task completed successfully."; } + + // Invoke additional MCP function tool + if (additionalTools != null) + { + var callResult = await InvokeFunctionCallAsync(item, additionalTools, cancellationToken); + session.ConversationHistory.Add(callResult); + } + break; } } @@ -245,11 +307,14 @@ public async Task EndSessionOnShutdownAsync() _sessions.Clear(); _cachedTools = null; - if (_cachedMcpClient != null) + foreach (var client in _allMcpClients) { - await _cachedMcpClient.DisposeAsync(); - _cachedMcpClient = null; + try { await client.DisposeAsync(); } + catch (Exception ex) { _logger.LogWarning(ex, "Failed to dispose MCP client"); } } + + _allMcpClients.Clear(); + _cachedMcpClient = null; } /// @@ -299,52 +364,78 @@ public async Task EndSessionOnShutdownAsync() } /// - /// Get or create the MCP client and tool list. Creates the connection once on first call, - /// then returns the cached result on subsequent calls. This ensures the SSE connection - /// stays alive across messages (MyAgent is transient, but this orchestrator is singleton). + /// Get or create MCP clients and merged tool list. Connects to each server URL once on first call, + /// then returns the cached result on subsequent calls. The SSE connections stay alive across + /// messages (MyAgent is transient, but this orchestrator is singleton). 
+ /// The primary MCP client (for W365 screenshot calls) is the one whose tools start with "W365_". /// public async Task<(IList Tools, IMcpClient? Client)> GetOrCreateMcpConnectionAsync( - string mcpUrl, string accessToken) + IList mcpUrls, string accessToken) { if (_cachedTools != null) return (_cachedTools, _cachedMcpClient); - var httpClient = _httpClient; - httpClient.DefaultRequestHeaders.Authorization = - new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", accessToken); + var allTools = new List(); - var transport = new SseClientTransport(new SseClientTransportOptions + foreach (var url in mcpUrls) { - Endpoint = new Uri(mcpUrl), - TransportMode = HttpTransportMode.AutoDetect, - }, httpClient); + try + { + // Each MCP server needs its own HttpClient — the auto-detect transport + // manages internal state that conflicts when shared across connections. + var httpClient = _httpClientFactory.CreateClient("McpConnection"); + httpClient.DefaultRequestHeaders.Authorization = + new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", accessToken); - _cachedMcpClient = await McpClientFactory.CreateAsync(transport); - var allTools = (await _cachedMcpClient.ListToolsAsync()).Cast().ToList(); + var transport = new SseClientTransport(new SseClientTransportOptions + { + Endpoint = new Uri(url), + TransportMode = HttpTransportMode.AutoDetect, + }, httpClient); - // Filter to W365 tools only - _cachedTools = allTools.Where(t => - { - var name = (t as AIFunction)?.Name ?? t.ToString() ?? 
string.Empty; - return name.StartsWith("W365_", StringComparison.OrdinalIgnoreCase); - }).ToList(); + var client = await McpClientFactory.CreateAsync(transport); + var tools = (await client.ListToolsAsync()).Cast().ToList(); + + _allMcpClients.Add(client); + allTools.AddRange(tools); + + // Use the W365 server's client for direct screenshot calls + var hasW365Tools = tools.Any(t => (t as AIFunction)?.Name?.StartsWith("W365_", StringComparison.OrdinalIgnoreCase) == true); + if (hasW365Tools) + _cachedMcpClient = client; + + _logger.LogInformation("Connected to MCP server at {Url}, loaded {Count} tools: {Names}", + url, tools.Count, string.Join(", ", tools.Select(t => (t as AIFunction)?.Name ?? "?"))); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to connect to MCP server at {Url}. Skipping.", url); + } + } + + // Fallback: use first client if no W365 server found + _cachedMcpClient ??= _allMcpClients.FirstOrDefault(); - _logger.LogInformation("Connected to MCP server at {Url}, loaded {Count} W365 tools", mcpUrl, _cachedTools.Count); + _cachedTools = allTools; + _logger.LogInformation("Total tools from {ServerCount} MCP server(s): {ToolCount}", mcpUrls.Count, allTools.Count); return (_cachedTools, _cachedMcpClient); } - private async Task CallModelAsync(List conversation, CancellationToken ct) + private async Task CallModelAsync(List conversation, List tools, CancellationToken ct) { var body = JsonSerializer.Serialize(new ComputerUseRequest { Model = _modelProvider.ModelName, Instructions = SystemInstructions, Input = conversation, - Tools = _tools, + Tools = tools, Truncation = "auto" }, new JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull }); + _logger.LogInformation("Model request (first 2000 chars): {Body}", body[..Math.Min(2000, body.Length)]); + var responseJson = await _modelProvider.SendAsync(body, ct); + _logger.LogInformation("Model response (first 2000 chars): {Response}", 
responseJson[..Math.Min(2000, responseJson.Length)]); return JsonSerializer.Deserialize(responseJson); } @@ -597,11 +688,34 @@ private static JsonElement CreateUserMessage(string text) => ToJsonElement(new content = new[] { new { type = "input_text", text } } }); - private static JsonElement CreateFunctionOutput(string callId) => ToJsonElement(new + private static JsonElement CreateFunctionOutput(string callId, string output = "success") => ToJsonElement(new { - type = "function_call_output", call_id = callId, output = "success" + type = "function_call_output", call_id = callId, output }); + /// + /// Invoke an MCP function tool from a model function_call and return the function_call_output. + /// + private async Task InvokeFunctionCallAsync(JsonElement functionCall, IList tools, CancellationToken ct) + { + var callId = functionCall.GetProperty("call_id").GetString()!; + var name = functionCall.GetProperty("name").GetString()!; + var argsStr = functionCall.GetProperty("arguments").GetString() ?? "{}"; + + try + { + var args = JsonSerializer.Deserialize>(argsStr) ?? []; + var result = await InvokeToolAsync(tools, name, args, ct); + var resultStr = result?.ToString() ?? 
"success"; + return CreateFunctionOutput(callId, resultStr); + } + catch (Exception ex) + { + _logger.LogError(ex, "Function call {Name} failed", name); + return CreateFunctionOutput(callId, $"Error: {ex.Message}"); + } + } + private static JsonElement ToJsonElement(object obj) => JsonSerializer.Deserialize(JsonSerializer.Serialize(obj)); From 5ea71c344411db89cd168425a1d26adc23be3aad Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Fri, 17 Apr 2026 14:17:45 -0700 Subject: [PATCH 13/17] Restore OneDrive folder link, add mail tooling, logging, prompt tweaks - ToolingManifest.json: add mcp_MailTools entry so Mail MCP is discovered in prod - ComputerUseOrchestrator: restore onFolderLinkReady callback, session-ID-based screenshot subfolder, ShareConversationFolderAsync, FolderShared flag, and have UploadScreenshotToOneDriveAsync return the share URL - ComputerUseOrchestrator: log function_call args and returned output for troubleshooting MCP tool invocations - ComputerUseOrchestrator: update system prompt so model tells the user "I can't" when no matching tool exists (instead of silently calling OnTaskComplete) - MyAgent.cs: pass onFolderLinkReady callback that posts a View-folder link Co-Authored-By: Claude Opus 4.7 (1M context) --- .../sample-agent/Agent/MyAgent.cs | 2 + .../ComputerUse/ComputerUseOrchestrator.cs | 135 +++++++++++++++--- .../sample-agent/ToolingManifest.json | 4 + 3 files changed, 125 insertions(+), 16 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index e239996d..cd84dcd4 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -242,6 +242,8 @@ await turnContext.SendActivityAsync( await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Starting a session to a Windows 365 Cloud PC…"); } }, + onFolderLinkReady: async url => await turnContext.SendActivityAsync( + 
MessageFactory.Text($"📸 Screenshots for this session: [View folder]({url})"), cancellationToken), cancellationToken: cancellationToken); // Send the response diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index 52c9c0b9..8d0597da 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -54,12 +54,17 @@ You are a helpful assistant that can also control a Windows desktop computer. If the user's message is conversational or doesn't require computer use, respond with a helpful text message. ## Function tools (email, calendar, etc.) - You have access to function tools for tasks like sending email, managing calendar, etc. - ALWAYS use function tools when available — they are faster and more reliable than computer actions. - When the user asks you to send an email, search messages, or perform any action that matches a function tool, call that tool directly. + You may have access to function tools for tasks like sending email, managing calendar, etc. + Prefer function tools over computer use when a matching one is available — they are faster and more reliable. After calling a function tool, respond with a text message describing what you did and the result. Do NOT call OnTaskComplete after using function tools — just respond with text. + ## When no tool can accomplish the request + If the user asks for something and no function tool matches AND computer use cannot accomplish it either, + respond with a text message explaining clearly that you are unable to perform that task and why + (e.g. "I don't have an email tool available in this environment"). + Do NOT call OnTaskComplete in this case — only call OnTaskComplete when you have actually completed a computer-use task. 
+ ## Computer use (desktop control) Only use computer actions when no function tool can accomplish the task. When a task requires computer use, perform the actions and examine screenshots to verify they worked. @@ -127,11 +132,20 @@ public async Task RunAsync( string? graphAccessToken = null, Action? onStatusUpdate = null, Func? onCuaStarting = null, + Func? onFolderLinkReady = null, CancellationToken cancellationToken = default) { _logger.LogInformation("Processing message for conversation {ConversationId}: {Message}", conversationId, Truncate(userMessage, 100)); - var session = _sessions.GetOrAdd(conversationId, _ => new ConversationSession()); + var session = _sessions.GetOrAdd(conversationId, _ => + { + var safeId = new string(conversationId.Where(c => char.IsLetterOrDigit(c)).ToArray()); + safeId = safeId.Length > 8 ? safeId[..8] : safeId; + return new ConversationSession + { + ScreenshotSubfolder = $"{DateTime.UtcNow:yyyyMMdd}_{safeId}" + }; + }); if (session.SessionStarted) { @@ -144,7 +158,9 @@ public async Task RunAsync( var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); var initialName = $"{conversationId[..8]}_{++session.ScreenshotCounter:D3}_initial"; SaveScreenshotToDisk(initialScreenshot!, initialName); - await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken); + var folderUrlReuse = await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken, session.ScreenshotSubfolder, session); + if (folderUrlReuse != null && onFolderLinkReady != null) + await onFolderLinkReady(folderUrlReuse); session.ConversationHistory.Add(ToJsonElement(new { type = "message", @@ -220,6 +236,11 @@ public async Task RunAsync( onStatusUpdate?.Invoke("Starting W365 computing session..."); session.W365SessionId = await StartSessionAsync(w365Tools, _logger, cancellationToken); session.SessionStarted = true; + // Update subfolder to use 
W365 session ID instead of conversation ID + var safeSessionId = session.W365SessionId != null + ? new string(session.W365SessionId.Where(c => char.IsLetterOrDigit(c) || c == '-').ToArray()) + : "unknown"; + session.ScreenshotSubfolder = $"{DateTime.UtcNow:yyyyMMdd}_{safeSessionId}"; _logger.LogInformation("Session started for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); } else if (onCuaStarting != null) @@ -231,7 +252,7 @@ public async Task RunAsync( } _logger.LogInformation("CUA iteration {Iteration}: {Action}", i + 1, Truncate(item.GetRawText(), 200)); - session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, cancellationToken)); + session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, onFolderLinkReady, cancellationToken)); break; case "function_call": @@ -443,7 +464,7 @@ public async Task EndSessionOnShutdownAsync() /// Translate a computer_call into an MCP tool call, capture screenshot, return computer_call_output. /// private async Task HandleComputerCallAsync( - JsonElement call, IList tools, IMcpClient? mcpClient, ConversationSession session, string? graphAccessToken, Action? onStatus, CancellationToken ct) + JsonElement call, IList tools, IMcpClient? mcpClient, ConversationSession session, string? graphAccessToken, Action? onStatus, Func? 
onFolderLinkReady, CancellationToken ct) { var callId = call.GetProperty("call_id").GetString()!; var sessionId = session.W365SessionId; @@ -482,7 +503,9 @@ private async Task HandleComputerCallAsync( var stepName = $"{++session.ScreenshotCounter:D3}_step"; SaveScreenshotToDisk(screenshot!, stepName); - await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken); + var folderUrl = await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken, session.ScreenshotSubfolder, session); + if (folderUrl != null && onFolderLinkReady != null) + await onFolderLinkReady(folderUrl); var safetyChecks = call.TryGetProperty("pending_safety_checks", out var sc) ? sc : JsonSerializer.Deserialize("[]"); @@ -702,16 +725,21 @@ private async Task InvokeFunctionCallAsync(JsonElement functionCall var name = functionCall.GetProperty("name").GetString()!; var argsStr = functionCall.GetProperty("arguments").GetString() ?? "{}"; + _logger.LogInformation("Function call {Name} invoked. call_id={CallId}, args={Args}", + name, callId, Truncate(argsStr, 1000)); + try { var args = JsonSerializer.Deserialize>(argsStr) ?? []; var result = await InvokeToolAsync(tools, name, args, ct); var resultStr = result?.ToString() ?? "success"; + _logger.LogInformation("Function call {Name} returned ({Length} chars): {Result}", + name, resultStr.Length, Truncate(resultStr, 2000)); return CreateFunctionOutput(callId, resultStr); } catch (Exception ex) { - _logger.LogError(ex, "Function call {Name} failed", name); + _logger.LogError(ex, "Function call {Name} threw. call_id={CallId}", name, callId); return CreateFunctionOutput(callId, $"Error: {ex.Message}"); } } @@ -742,22 +770,22 @@ private void SaveScreenshotToDisk(string base64Data, string name) /// Requires a Graph access token with Files.ReadWrite scope. /// Files are uploaded to /CUA-Sessions/{date}/ folder. 
/// - private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fileName, string? graphAccessToken) + private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fileName, string? graphAccessToken, string? subfolder, ConversationSession session) { if (string.IsNullOrEmpty(graphAccessToken)) { _logger.LogDebug("OneDrive upload skipped: no Graph token"); - return; + return null; } if (string.IsNullOrEmpty(base64Data)) { _logger.LogDebug("OneDrive upload skipped: no screenshot data"); - return; + return null; } if (string.IsNullOrEmpty(_oneDriveFolder)) { _logger.LogDebug("OneDrive upload skipped: OneDriveFolder not configured"); - return; + return null; } try @@ -766,7 +794,10 @@ private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fil var driveBase = string.IsNullOrEmpty(_oneDriveUserId) ? "https://graph.microsoft.com/v1.0/me/drive" : $"https://graph.microsoft.com/v1.0/users/{_oneDriveUserId}/drive"; - var url = $"{driveBase}/root:/{_oneDriveFolder.TrimStart('/')}/{fileName}:/content"; + var folderPath = string.IsNullOrEmpty(subfolder) + ? 
_oneDriveFolder.TrimStart('/') + : $"{_oneDriveFolder.TrimStart('/')}/{subfolder}"; + var url = $"{driveBase}/root:/{folderPath}/{fileName}:/content"; using var request = new HttpRequestMessage(HttpMethod.Put, url); request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); @@ -776,17 +807,87 @@ private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fil var response = await _httpClient.SendAsync(request); if (response.IsSuccessStatusCode) { - _logger.LogInformation("Screenshot uploaded to OneDrive: {Folder}/{FileName}", _oneDriveFolder, fileName); + _logger.LogInformation("Screenshot uploaded to OneDrive: {Folder}/{FileName}", folderPath, fileName); + + // On first upload, create an org-scoped sharing link for the folder + if (!session.FolderShared) + { + var shareUrl = await ShareConversationFolderAsync(folderPath, graphAccessToken); + if (shareUrl != null) + { + session.FolderShared = true; + return shareUrl; + } + } } else { - _logger.LogWarning("OneDrive upload failed: {Status}", response.StatusCode); + var content = await response.Content.ReadAsStringAsync(); + _logger.LogWarning("OneDrive upload failed: {Status} {Content}", response.StatusCode, content); } } catch (Exception ex) { _logger.LogWarning(ex, "Failed to upload screenshot to OneDrive"); } + + return null; + } + + /// + /// Create an organization-scoped sharing link for the conversation's screenshot folder. + /// Returns the web URL that anyone in the org can use to view the folder. + /// + private async Task ShareConversationFolderAsync(string folderPath, string graphAccessToken) + { + try + { + var driveBase = string.IsNullOrEmpty(_oneDriveUserId) + ? 
"https://graph.microsoft.com/v1.0/me/drive" + : $"https://graph.microsoft.com/v1.0/users/{_oneDriveUserId}/drive"; + + using var getRequest = new HttpRequestMessage(HttpMethod.Get, $"{driveBase}/root:/{folderPath}"); + getRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + var getResponse = await _httpClient.SendAsync(getRequest); + + if (!getResponse.IsSuccessStatusCode) + { + _logger.LogWarning("Failed to get folder item for sharing: {Status}", getResponse.StatusCode); + return null; + } + + var folderJson = await getResponse.Content.ReadAsStringAsync(); + using var doc = JsonDocument.Parse(folderJson); + var folderId = doc.RootElement.GetProperty("id").GetString(); + var webUrl = doc.RootElement.TryGetProperty("webUrl", out var wu) ? wu.GetString() : null; + + using var linkRequest = new HttpRequestMessage(HttpMethod.Post, $"{driveBase}/items/{folderId}/createLink"); + linkRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + linkRequest.Content = new StringContent( + JsonSerializer.Serialize(new { type = "view", scope = "organization" }), + System.Text.Encoding.UTF8, "application/json"); + + var linkResponse = await _httpClient.SendAsync(linkRequest); + if (linkResponse.IsSuccessStatusCode) + { + var linkJson = await linkResponse.Content.ReadAsStringAsync(); + using var linkDoc = JsonDocument.Parse(linkJson); + var shareUrl = linkDoc.RootElement.GetProperty("link").GetProperty("webUrl").GetString(); + _logger.LogInformation("Folder shared with org: {Url}", shareUrl); + return shareUrl; + } + else + { + var errorContent = await linkResponse.Content.ReadAsStringAsync(); + _logger.LogWarning("Failed to create sharing link: {Status} {Content}", linkResponse.StatusCode, errorContent); + return webUrl; + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to share conversation folder"); + return null; + } } /// @@ -799,5 +900,7 @@ private sealed class ConversationSession 
public string? W365SessionId { get; set; } public List ConversationHistory { get; } = []; public int ScreenshotCounter { get; set; } + public string? ScreenshotSubfolder { get; set; } + public bool FolderShared { get; set; } } } diff --git a/dotnet/w365-computer-use/sample-agent/ToolingManifest.json b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json index b2c5acbf..35b03e1c 100644 --- a/dotnet/w365-computer-use/sample-agent/ToolingManifest.json +++ b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json @@ -3,6 +3,10 @@ { "mcpServerName": "mcp_W365ComputerUse", "url": "mcp_W365ComputerUse" + }, + { + "mcpServerName": "mcp_MailTools", + "url": "mcp_MailTools" } ] } From 908ef9ec930f1d1ee594272ada64443d2afbd61c Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Fri, 17 Apr 2026 14:21:29 -0700 Subject: [PATCH 14/17] Revert "Merge remote-tracking branch 'myfork/mabdelkader/cuaAgentSample' into multi-mcp-tools" This reverts commit bcd331b094b2e3b00270e3f09e1bb3f9b5cf6f9a, reversing changes made to 5ea71c344411db89cd168425a1d26adc23be3aad. --- .../sample-agent/Agent/MyAgent.cs | 2 +- .../ComputerUse/ComputerUseOrchestrator.cs | 190 ++---------------- .../ComputerUse/Models/ComputerUseModels.cs | 3 - .../sample-agent/W365ComputerUseSample.csproj | 4 +- .../sample-agent/nuget.config | 7 - 5 files changed, 16 insertions(+), 190 deletions(-) delete mode 100644 dotnet/w365-computer-use/sample-agent/nuget.config diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index 517effeb..cd84dcd4 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -197,7 +197,6 @@ await A365OtelWrapper.InvokeObservedAgentOperation( try { var userText = turnContext.Activity.Text?.Trim() ?? string.Empty; - var conversationId = turnContext.Activity.Conversation?.Id ?? 
Guid.NewGuid().ToString(); // Get MCP tools — direct connection in Dev, SDK in Production var (w365Tools, additionalTools, mcpClient) = await GetToolsAsync(turnContext, ToolAuthHandlerName); @@ -226,6 +225,7 @@ await turnContext.SendActivityAsync( } // Run the CUA loop — session is managed per conversation + var conversationId = turnContext.Activity.Conversation?.Id ?? Guid.NewGuid().ToString(); var response = await _orchestrator.RunAsync( conversationId, userText, diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index d3c23ef7..8d0597da 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -71,8 +71,6 @@ Only use computer actions when no function tool can accomplish the task. If you see browser setup or sign-in dialogs, dismiss them (Escape, X, or Skip). Once you have completed a computer use task, call the OnTaskComplete function. Do NOT continue looping after the task is done. - If the user wants to end, quit, or disconnect their session, call the EndSession function. - If the user sends a casual greeting or question that does not require computer use, reply with a helpful text message. """; public ComputerUseOrchestrator( @@ -118,11 +116,6 @@ public ComputerUseOrchestrator( { Name = "OnTaskComplete", Description = "Call this function when the given task has been completed successfully." - }, - new FunctionToolDefinition - { - Name = "EndSession", - Description = "Call this function when the user wants to end, quit, disconnect, or release their computer session." 
} ]; } @@ -170,41 +163,19 @@ public async Task RunAsync( await onFolderLinkReady(folderUrlReuse); session.ConversationHistory.Add(ToJsonElement(new { - lastCallOutput = entry; - // Find the matching computer_call by call_id - var outputCallId = entry.TryGetProperty("call_id", out var callIdProp) ? callIdProp.GetString() : null; - if (outputCallId != null) + type = "message", + role = "user", + content = new object[] { - for (var searchIdx = histIdx - 1; searchIdx >= 0; searchIdx--) - { - var earlier = session.ConversationHistory[searchIdx]; - if (earlier.TryGetProperty("type", out var earlierType) && earlierType.GetString() == "computer_call" - && earlier.TryGetProperty("call_id", out var earlierCallId) && earlierCallId.GetString() == outputCallId) - { - lastCall = earlier; - break; - } - } + new { type = "input_text", text = userMessage }, + new { type = "input_image", image_url = $"data:image/png;base64,{initialScreenshot}" } } - break; - } + })); } - session.ConversationHistory.RemoveAll(item => - { - var itemType = item.TryGetProperty("type", out var typeProp) ? 
typeProp.GetString() : null; - return itemType is "computer_call" or "computer_call_output" or "function_call" or "function_call_output"; - }); - if (lastCall.HasValue && lastCallOutput.HasValue) + else { - session.ConversationHistory.Add(lastCall.Value); - session.ConversationHistory.Add(lastCallOutput.Value); + session.ConversationHistory.Add(CreateUserMessage(userMessage)); } - session.NewItems.Clear(); - session.LastResponseId = null; - - var userMsg = CreateUserMessage(userMessage); - session.ConversationHistory.Add(userMsg); - session.NewItems.Add(userMsg); // Build the model's tools list — computer + OnTaskComplete + any additional function tools var modelTools = new List(_tools); @@ -246,9 +217,6 @@ public async Task RunAsync( if (type == "reasoning") continue; session.ConversationHistory.Add(item); - // No need to add model output items to NewItems — the API reconstructs - // its own output from previous_response_id. We only need to send new - // user-side items (user messages, computer_call_output, function_call_output). switch (type) { @@ -314,26 +282,6 @@ public async Task RunAsync( return "The task could not be completed within the allowed number of steps."; } - /// - /// Check if a conversation has an active W365 session. - /// - public bool HasActiveSession(string conversationId) - { - return _sessions.TryGetValue(conversationId, out var session) && session.SessionStarted; - } - - /// - /// End the session for a specific conversation and clean up state. - /// - public async Task EndConversationSessionAsync(string conversationId, IList tools, CancellationToken ct) - { - if (_sessions.TryRemove(conversationId, out var session) && session.SessionStarted) - { - _logger.LogInformation("Ending session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); - await EndSessionAsync(tools, _logger, session.W365SessionId, ct); - } - } - /// /// End the W365 session. Called by the agent on shutdown or explicit end. 
/// @@ -351,10 +299,6 @@ public static async Task EndSessionAsync(IList tools, ILogger logger, st { logger.LogInformation("MCP client already disposed — W365 session will be released by server timeout"); } - catch (HttpRequestException httpEx) when (httpEx.StatusCode == System.Net.HttpStatusCode.NotFound) - { - logger.LogInformation("MCP transport session expired (404) — W365 session will be released by server timeout"); - } catch (Exception ex) { logger.LogWarning(ex, "Failed to end W365 session"); @@ -500,21 +444,6 @@ public async Task EndSessionOnShutdownAsync() private async Task CallModelAsync(List conversation, List tools, CancellationToken ct) { - List input; - string? previousResponseId = null; - - if (session.LastResponseId != null) - { - // Send only the items added since the last model call - input = session.NewItems; - previousResponseId = session.LastResponseId; - } - else - { - // First call — send the full conversation history - input = session.ConversationHistory; - } - var body = JsonSerializer.Serialize(new ComputerUseRequest { Model = _modelProvider.ModelName, @@ -553,15 +482,7 @@ private async Task HandleComputerCallAsync( if (actionType != "screenshot") { var (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); - var (_, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); - if (sessionLost) - { - onStatus?.Invoke("Session lost — recovering..."); - sessionId = await RecoverSessionAsync(session, tools, _logger, ct); - // Re-map with new sessionId and retry - (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); - await InvokeToolAsync(tools, toolName, args, ct); - } + await InvokeToolAsync(tools, toolName, args, ct); } } } @@ -573,29 +494,12 @@ private async Task HandleComputerCallAsync( if (actionType != "screenshot") { var (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); - var (_, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); - if 
(sessionLost) - { - onStatus?.Invoke("Session lost — recovering..."); - sessionId = await RecoverSessionAsync(session, tools, _logger, ct); - (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); - await InvokeToolAsync(tools, toolName, args, ct); - } + await InvokeToolAsync(tools, toolName, args, ct); } } - // Always capture screenshot after action — with session recovery (same pattern as action tools) - string screenshot; - try - { - screenshot = await CaptureScreenshotCoreAsync(tools, mcpClient, sessionId, ct); - } - catch (InvalidOperationException ex) when (IsSessionNotFoundError(ex.Message)) - { - onStatus?.Invoke("Session lost — recovering..."); - sessionId = await RecoverSessionAsync(session, tools, _logger, ct); - screenshot = await CaptureScreenshotCoreAsync(tools, mcpClient, sessionId, ct); - } + // Always capture screenshot after action + var screenshot = await CaptureScreenshotAsync(tools, mcpClient, sessionId, ct); var stepName = $"{++session.ScreenshotCounter:D3}_step"; SaveScreenshotToDisk(screenshot!, stepName); @@ -683,7 +587,7 @@ private static (string ToolName, Dictionary Args) MapActionToMc return (toolName, args); } - private async Task CaptureScreenshotCoreAsync(IList tools, IMcpClient? mcpClient, string? sessionId, CancellationToken ct) + private async Task CaptureScreenshotAsync(IList tools, IMcpClient? mcpClient, string? sessionId, CancellationToken ct) { var screenshotArgs = new Dictionary(); if (!string.IsNullOrEmpty(sessionId)) @@ -709,11 +613,6 @@ private async Task CaptureScreenshotCoreAsync(IList tools, IMcpC } } - // Check if the error is a session-not-found before throwing - var contentText = string.Join(" ", result.Content.Select(c => c.Text ?? "")); - if (IsSessionNotFoundError(contentText)) - throw new InvalidOperationException($"Screenshot failed: no active session. 
Response: {contentText[..Math.Min(300, contentText.Length)]}"); - // Log full content for debugging foreach (var item in result.Content) _logger.LogWarning("Unhandled screenshot block: Type={Type}, Text={Preview}", item.Type, item.Text?[..Math.Min(200, item.Text.Length)]); @@ -728,10 +627,6 @@ private async Task CaptureScreenshotCoreAsync(IList tools, IMcpC _logger.LogInformation("Screenshot fallback: result type={Type}, length={Length}, preview={Preview}", aiResult?.GetType().Name ?? "null", str.Length, str[..Math.Min(200, str.Length)]); - // Detect session-not-found early so the retry wrapper can recover the session - if (IsSessionNotFoundError(str)) - throw new InvalidOperationException($"Screenshot failed: no active session. Response: {str[..Math.Min(300, str.Length)]}"); - try { using var doc = JsonDocument.Parse(str); @@ -787,63 +682,6 @@ private async Task CaptureScreenshotCoreAsync(IList tools, IMcpC return await tool.InvokeAsync(new AIFunctionArguments(args), ct); } - /// - /// Invoke a tool and detect session-not-found errors. Returns (result, isSessionLost). - /// - private static async Task<(object? Result, bool IsSessionLost)> InvokeToolCheckSessionAsync( - IList tools, string name, Dictionary args, CancellationToken ct) - { - var result = await InvokeToolAsync(tools, name, args, ct); - var resultStr = result?.ToString() ?? ""; - if (IsSessionNotFoundError(resultStr)) - return (result, true); - return (result, false); - } - - /// - /// Check if a tool response indicates the session is no longer valid. - /// - private static bool IsSessionNotFoundError(string response) - { - if (string.IsNullOrEmpty(response)) return false; - var lower = response.ToLowerInvariant(); - return lower.Contains("no active session found") || - lower.Contains("session not found") || - lower.Contains("session expired") || - lower.Contains("session has been terminated"); - } - - /// - /// Recover from a lost session: end the stale session (best-effort) and start a new one. 
- /// - private async Task RecoverSessionAsync( - ConversationSession session, IList tools, ILogger logger, CancellationToken ct) - { - logger.LogWarning("Session lost for W365SessionId={SessionId}. Recovering — ending stale session and starting new one.", session.W365SessionId); - - // Best-effort end the stale session - try - { - await EndSessionAsync(tools, logger, session.W365SessionId, ct); - } - catch (Exception ex) - { - logger.LogWarning(ex, "Best-effort EndSession during recovery failed for {SessionId}", session.W365SessionId); - } - - // Start a fresh session - var newSessionId = await StartSessionAsync(tools, logger, ct); - session.W365SessionId = newSessionId; - session.SessionStarted = true; - // Update subfolder to use new session ID - var safeSessionId = newSessionId != null - ? new string(newSessionId.Where(c => char.IsLetterOrDigit(c) || c == '-').ToArray()) - : "unknown"; - session.ScreenshotSubfolder = $"{DateTime.UtcNow:yyyyMMdd}_{safeSessionId}"; - logger.LogInformation("Session recovered. New W365SessionId={SessionId}", newSessionId); - return newSessionId; - } - private static string[] ExtractKeys(JsonElement action) { if (action.TryGetProperty("keys", out var k)) @@ -1061,8 +899,6 @@ private sealed class ConversationSession public bool SessionStarted { get; set; } public string? W365SessionId { get; set; } public List ConversationHistory { get; } = []; - public List NewItems { get; } = []; - public string? LastResponseId { get; set; } public int ScreenshotCounter { get; set; } public string? 
ScreenshotSubfolder { get; set; } public bool FolderShared { get; set; } diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs index bd974729..1f774b33 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs @@ -41,9 +41,6 @@ public class ComputerUseRequest [JsonPropertyName("instructions")] public string? Instructions { get; set; } - [JsonPropertyName("previous_response_id")] - public string? PreviousResponseId { get; set; } - [JsonPropertyName("input")] public List Input { get; set; } = []; diff --git a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj index e0455440..217f5ab6 100644 --- a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj +++ b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj @@ -9,8 +9,8 @@ - - + + diff --git a/dotnet/w365-computer-use/sample-agent/nuget.config b/dotnet/w365-computer-use/sample-agent/nuget.config deleted file mode 100644 index 765346e5..00000000 --- a/dotnet/w365-computer-use/sample-agent/nuget.config +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - From e5a1895e974aa06351c6f8f847d326f35382e4ce Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Fri, 17 Apr 2026 14:28:47 -0700 Subject: [PATCH 15/17] Port EndSession + session recovery + packaging hygiene from mabdelkader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additive port of 407b43c features that don't conflict with our multi-mcp work: - System prompt: mention EndSession and casual-chat behavior - Add EndSession as a model-callable function tool - Handle EndSession in function_call: tear down W365 session, drop state, return "Session ended..." 
text to the user - EndSessionAsync: add catch for HttpRequestException 404 (MCP transport already expired — no need to warn) - Transparent session recovery during CUA actions: detect session-not-found tool responses, end the stale session, start a fresh one, and retry - Pin A365 SDK package versions to 0.1.72-beta (was beta.*) - Add nuget.config (clear + nuget.org only) Skipped: HasActiveSession / EndConversationSessionAsync public methods (not called anywhere on the target branch). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ComputerUse/ComputerUseOrchestrator.cs | 100 +++++++++++++++++- .../sample-agent/W365ComputerUseSample.csproj | 4 +- .../sample-agent/nuget.config | 7 ++ 3 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 dotnet/w365-computer-use/sample-agent/nuget.config diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs index 8d0597da..88f39f06 100644 --- a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -71,6 +71,8 @@ Only use computer actions when no function tool can accomplish the task. If you see browser setup or sign-in dialogs, dismiss them (Escape, X, or Skip). Once you have completed a computer use task, call the OnTaskComplete function. Do NOT continue looping after the task is done. + If the user wants to end, quit, or disconnect their session, call the EndSession function. + If the user sends a casual greeting or question that does not require computer use, reply with a helpful text message. """; public ComputerUseOrchestrator( @@ -116,6 +118,11 @@ public ComputerUseOrchestrator( { Name = "OnTaskComplete", Description = "Call this function when the given task has been completed successfully." 
+ }, + new FunctionToolDefinition + { + Name = "EndSession", + Description = "Call this function when the user wants to end, quit, disconnect, or release their computer session." } ]; } @@ -264,6 +271,20 @@ public async Task RunAsync( session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); return "Task completed successfully."; } + if (funcName == "EndSession") + { + session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); + if (session.SessionStarted) + { + _logger.LogInformation("EndSession requested by model for conversation {ConversationId}", conversationId); + onStatusUpdate?.Invoke("Ending session..."); + await EndSessionAsync(w365Tools, _logger, session.W365SessionId, cancellationToken); + session.SessionStarted = false; + session.W365SessionId = null; + _sessions.TryRemove(conversationId, out _); + } + return "Session ended. The VM has been released back to the pool."; + } // Invoke additional MCP function tool if (additionalTools != null) @@ -299,6 +320,10 @@ public static async Task EndSessionAsync(IList tools, ILogger logger, st { logger.LogInformation("MCP client already disposed — W365 session will be released by server timeout"); } + catch (HttpRequestException httpEx) when (httpEx.StatusCode == System.Net.HttpStatusCode.NotFound) + { + logger.LogInformation("MCP transport session expired (404) — W365 session will be released by server timeout"); + } catch (Exception ex) { logger.LogWarning(ex, "Failed to end W365 session"); @@ -482,7 +507,14 @@ private async Task HandleComputerCallAsync( if (actionType != "screenshot") { var (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); - await InvokeToolAsync(tools, toolName, args, ct); + var (_, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); + if (sessionLost) + { + onStatus?.Invoke("Session lost — recovering..."); + sessionId = await RecoverSessionAsync(session, tools, _logger, 
ct); + (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); + await InvokeToolAsync(tools, toolName, args, ct); + } } } } @@ -494,7 +526,14 @@ private async Task HandleComputerCallAsync( if (actionType != "screenshot") { var (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); - await InvokeToolAsync(tools, toolName, args, ct); + var (_, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); + if (sessionLost) + { + onStatus?.Invoke("Session lost — recovering..."); + sessionId = await RecoverSessionAsync(session, tools, _logger, ct); + (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); + await InvokeToolAsync(tools, toolName, args, ct); + } } } @@ -640,8 +679,11 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien { foreach (var block in content.EnumerateArray()) { - if (block.TryGetProperty("data", out var blockData) && !string.IsNullOrEmpty(blockData.GetString())) - return blockData.GetString(); + if (block.TryGetProperty("data", out var blockData)) + { + var data = blockData.GetString(); + if (!string.IsNullOrEmpty(data)) return data; + } if (block.TryGetProperty("text", out var blockText)) { var extracted = ExtractBase64FromText(blockText.GetString()); @@ -682,6 +724,56 @@ private async Task CaptureScreenshotAsync(IList tools, IMcpClien return await tool.InvokeAsync(new AIFunctionArguments(args), ct); } + /// + /// Invoke a tool and detect session-not-found errors. Returns (result, isSessionLost). + /// + private static async Task<(object? Result, bool IsSessionLost)> InvokeToolCheckSessionAsync( + IList tools, string name, Dictionary args, CancellationToken ct) + { + var result = await InvokeToolAsync(tools, name, args, ct); + var resultStr = result?.ToString() ?? ""; + if (IsSessionNotFoundError(resultStr)) + return (result, true); + return (result, false); + } + + /// + /// Check if a tool response indicates the session is no longer valid. 
+ /// + private static bool IsSessionNotFoundError(string response) + { + if (string.IsNullOrEmpty(response)) return false; + var lower = response.ToLowerInvariant(); + return lower.Contains("no active session found") || + lower.Contains("session not found") || + lower.Contains("session expired") || + lower.Contains("session has been terminated"); + } + + /// + /// Recover from a lost session: end the stale session (best-effort) and start a new one. + /// + private async Task RecoverSessionAsync( + ConversationSession session, IList tools, ILogger logger, CancellationToken ct) + { + logger.LogWarning("Session lost for W365SessionId={SessionId}. Recovering — ending stale session and starting new one.", session.W365SessionId); + + try + { + await EndSessionAsync(tools, logger, session.W365SessionId, ct); + } + catch (Exception ex) + { + logger.LogWarning(ex, "Best-effort EndSession during recovery failed for {SessionId}", session.W365SessionId); + } + + var newSessionId = await StartSessionAsync(tools, logger, ct); + session.W365SessionId = newSessionId; + session.SessionStarted = true; + logger.LogInformation("Session recovered. 
New W365SessionId={SessionId}", newSessionId); + return newSessionId; + } + private static string[] ExtractKeys(JsonElement action) { if (action.TryGetProperty("keys", out var k)) diff --git a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj index 217f5ab6..e0455440 100644 --- a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj +++ b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj @@ -9,8 +9,8 @@ - - + + diff --git a/dotnet/w365-computer-use/sample-agent/nuget.config b/dotnet/w365-computer-use/sample-agent/nuget.config new file mode 100644 index 00000000..765346e5 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/nuget.config @@ -0,0 +1,7 @@ + + + + + + + From c189f13225d7c6ea6ad69a59f66a694d87f9c552 Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Fri, 17 Apr 2026 16:45:50 -0700 Subject: [PATCH 16/17] Fix intermittent Kestrel "Reading is already in progress" crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two targeted changes to eliminate a post-response crash on the `computer` tool path (gpt-5.4-mini): fail: Kestrel[13] ConnectionAbortedException → InvalidOperationException: Reading is already in progress Unhandled: ObjectDisposedException (HttpRequestPipeReader) 1. MyAgent.cs: drop the background typing-indicator Task.Run loop. The loop fired SendActivityAsync every ~4s concurrently with the main reply path, and the resulting race against StreamingResponse.EndStream triggered the crash. Keep a single initial typing activity; informative updates via onStatusUpdate/onCuaStarting already cover visual feedback. 2. Program.cs: call request.EnableBuffering() at the /api/messages endpoint so observability/tracing middleware can re-read the body without hitting "Reading is not allowed after reader was completed". 
Validated against a long CUA session (10+ iterations), a mixed CUA/email/chat exchange, and parallel requests — no crashes observed in the test window that previously reproduced the bug within minutes. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../sample-agent/Agent/MyAgent.cs | 23 ++++--------------- .../w365-computer-use/sample-agent/Program.cs | 5 ++++ 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs index cd84dcd4..c0a0fc83 100644 --- a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -177,23 +177,12 @@ await A365OtelWrapper.InvokeObservedAgentOperation( async () => { // Typing indicator + // Single typing indicator. A background refresh loop was removed because it + // raced with the main reply path and triggered Kestrel request-body + // "Reading is already in progress" → ObjectDisposedException crashes post-response. + // Informative updates via onStatusUpdate keep the UI feedback flowing. await turnContext.SendActivityAsync(Activity.CreateTypingActivity(), cancellationToken).ConfigureAwait(false); - using var typingCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); - var typingTask = Task.Run(async () => - { - try - { - while (!typingCts.IsCancellationRequested) - { - await Task.Delay(TimeSpan.FromSeconds(4), typingCts.Token).ConfigureAwait(false); - await turnContext.SendActivityAsync(Activity.CreateTypingActivity(), typingCts.Token).ConfigureAwait(false); - } - } - catch (OperationCanceledException) { /* expected */ } - catch (ObjectDisposedException) { /* CTS disposed before task finished */ } - }, typingCts.Token); - try { var userText = turnContext.Activity.Text?.Trim() ?? 
string.Empty; @@ -257,10 +246,6 @@ await turnContext.SendActivityAsync( } finally { - try { typingCts.Cancel(); } catch (ObjectDisposedException) { } - try { await typingTask.ConfigureAwait(false); } - catch (OperationCanceledException) { /* expected */ } - catch (ObjectDisposedException) { /* expected */ } try { await turnContext.StreamingResponse.EndStreamAsync(cancellationToken).ConfigureAwait(false); } catch (ObjectDisposedException) { /* stream already disposed */ } } diff --git a/dotnet/w365-computer-use/sample-agent/Program.cs b/dotnet/w365-computer-use/sample-agent/Program.cs index 56d030a9..40d7537a 100644 --- a/dotnet/w365-computer-use/sample-agent/Program.cs +++ b/dotnet/w365-computer-use/sample-agent/Program.cs @@ -82,6 +82,11 @@ // Map the /api/messages endpoint to the AgentApplication app.MapPost("/api/messages", async (HttpRequest request, HttpResponse response, IAgentHttpAdapter adapter, IAgent agent, CancellationToken cancellationToken) => { + // Allow multiple reads of the request body — tracing/observability middleware may + // re-read it after the adapter, which otherwise triggers + // "Reading is not allowed after reader was completed" on the Kestrel pipe reader. + request.EnableBuffering(); + await AgentMetrics.InvokeObservedHttpOperation("agent.process_message", async () => { await adapter.ProcessAsync(request, response, agent, cancellationToken); From ce58c6f80a375a977e525af4e36838d0fee6bdd6 Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Fri, 17 Apr 2026 16:46:02 -0700 Subject: [PATCH 17/17] Expand ToolingManifest to 10 MCP servers (mail/calendar/teams/odsp/etc.) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register the full stock-MCP set so the A365 SDK gateway surfaces tools from each server at runtime: mcp_W365ComputerUse, mcp_MailTools, mcp_MeServer, mcp_CalendarTools, mcp_TeamsServer, mcp_ODSPRemoteServer, mcp_SharepointListsTools, mcp_AdminTools, mcp_WordServer, mcp_m365copilot Loads ~149 function tools in prod when the blueprint has the matching McpServers.*.All inheritable scopes consented. Note: mcp_SharepointListsTools currently fails to load with an ObjectDisposedException inside the A365 SDK's MCP client factory (CancellationTokenSource). Appears to be an SDK-side issue — the other nine servers load cleanly. Not addressed here. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../sample-agent/ToolingManifest.json | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/dotnet/w365-computer-use/sample-agent/ToolingManifest.json b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json index 35b03e1c..fbe0236b 100644 --- a/dotnet/w365-computer-use/sample-agent/ToolingManifest.json +++ b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json @@ -1,12 +1,14 @@ { "mcpServers": [ - { - "mcpServerName": "mcp_W365ComputerUse", - "url": "mcp_W365ComputerUse" - }, - { - "mcpServerName": "mcp_MailTools", - "url": "mcp_MailTools" - } + { "mcpServerName": "mcp_W365ComputerUse", "url": "mcp_W365ComputerUse" }, + { "mcpServerName": "mcp_MailTools", "url": "mcp_MailTools" }, + { "mcpServerName": "mcp_MeServer", "url": "mcp_MeServer" }, + { "mcpServerName": "mcp_CalendarTools", "url": "mcp_CalendarTools" }, + { "mcpServerName": "mcp_TeamsServer", "url": "mcp_TeamsServer" }, + { "mcpServerName": "mcp_ODSPRemoteServer", "url": "mcp_ODSPRemoteServer" }, + { "mcpServerName": "mcp_SharepointListsTools", "url": "mcp_SharepointListsTools" }, + { "mcpServerName": "mcp_AdminTools", "url": "mcp_AdminTools" }, + { "mcpServerName": "mcp_WordServer", "url": 
"mcp_WordServer" }, + { "mcpServerName": "mcp_m365copilot", "url": "mcp_m365copilot" } ] }