Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
0263947
Add `a365 evaluate` command for MCP tool schema quality evaluation
ashragrawal Apr 10, 2026
8bab9ec
Fix code review findings for `a365 evaluate` command
ashragrawal Apr 13, 2026
e1cde5f
Remove dead code from evaluate pipeline
ashragrawal Apr 16, 2026
2661ab5
Move `a365 evaluate` under `a365 develop-mcp evaluate`
ashragrawal Apr 16, 2026
7dc6d75
Harden coding agent invocation in evaluate pipeline
ashragrawal Apr 16, 2026
ba37732
Merge remote-tracking branch 'origin/main' into users/ashragrawal/eva…
ashragrawal Apr 16, 2026
092782b
Address PR review: fix misleading comment and unknown-category fallback
ashragrawal Apr 16, 2026
6167199
Align SchemaDiscoveryService with project HttpClient convention
ashragrawal Apr 16, 2026
f911f1b
Show positive box in report when at max maturity level
ashragrawal Apr 16, 2026
5de5eef
Polish evaluate command output for better CLI experience
ashragrawal Apr 17, 2026
254c2b7
Address outstanding PR review comments on evaluate pipeline
ashragrawal Apr 17, 2026
dbb0a6a
Surface coding-agent dependency in evaluate help and intro output
ashragrawal Apr 17, 2026
203d285
Remove unused Microsoft.Extensions.Http package reference
ashragrawal Apr 17, 2026
9983355
Make BYOL round-trip work in evaluate — detect and resume from scored…
ashragrawal Apr 17, 2026
a29a82f
Rename schema-quality "smell" terminology to "issue" across evaluate …
ashragrawal Apr 17, 2026
8fcde15
Address PR review: escape inline script XSS, fix docstrings, use Argu…
ashragrawal Apr 17, 2026
ed073b0
Sandbox coding-agent invocations and narrow Copilot tool permissions
ashragrawal Apr 17, 2026
f86e69f
Fix Copilot tool restriction and fall back to manual scoring on agent…
ashragrawal Apr 20, 2026
73f199d
Tell agents their exact tool names in the prompt; add Write to Claude
ashragrawal Apr 20, 2026
a29cff5
Switch agent tool restriction from allowlist to shell+web denylist
ashragrawal Apr 20, 2026
ed4173c
Address PR review: branding, filename sanitization, unused params
ashragrawal Apr 21, 2026
f85d01d
Retry agent up to 3 times when scoring leaves items null
ashragrawal Apr 21, 2026
99eb989
Scale per-tool agent timeout to the number of semantic checks
ashragrawal Apr 21, 2026
8ab681c
Bump per-check timeout from 15s to 20s
ashragrawal Apr 21, 2026
95e9f59
Retry agent through timeouts, not just null-scoring
ashragrawal Apr 21, 2026
1589ac2
Address PR review: drop null guards on non-nullable params, reword sa…
ashragrawal Apr 21, 2026
940c57f
Switch scoring agent from whole-file write to id-unique edit
ashragrawal Apr 21, 2026
7028040
Tolerate duplicate/empty check ids from coding-agent output
ashragrawal Apr 21, 2026
e1bda5e
Address PR review: stamp real engine in report, gate explicit engines…
ashragrawal Apr 21, 2026
b2866b5
Harden evaluate pipeline against adversarial MCP servers
ashragrawal Apr 27, 2026
a29f8f9
Merge remote-tracking branch 'origin/main' into users/ashragrawal/eva…
ashragrawal Apr 27, 2026
d5af4a8
Merge branch 'main' into users/ashragrawal/evaluate
ashragrawal Apr 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Agents provisioned before this release need `Agent365.Observability.OtelWrite` g
**Option B — CLI** (`a365 setup admin`) has been removed in this release. Use Option A above, or copy the PowerShell instructions printed in the `a365 setup all` summary output.

### Added
- `a365 develop-mcp evaluate` command for evaluating MCP server tool schema quality — runs deterministic and semantic checks (via GitHub Copilot or Claude Code CLIs), computes maturity scoring, and generates an interactive HTML report
- `setup requirements` Global Administrator path: when the well-known CLI client app is not found in a new tenant, Global Admins are prompted to create the app and grant admin consent automatically (enter an app ID or type `C` to create).
- `--authmode obo|s2s|both` option on `setup all` — controls how the agent identity service principal receives permissions:
- `obo` (default): principal-scoped delegated grants (`consentType: "Principal"`); no Global Administrator required.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Microsoft.Agents.A365.DevTools.Cli.Helpers;
using Microsoft.Agents.A365.DevTools.Cli.Models;
using Microsoft.Agents.A365.DevTools.Cli.Services;
using Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate;
using Microsoft.Extensions.Logging;
using System.CommandLine;
using static Microsoft.Agents.A365.DevTools.Cli.Helpers.PackageMCPServerHelper;
Expand All @@ -16,11 +17,13 @@ namespace Microsoft.Agents.A365.DevTools.Cli.Commands;
public static class DevelopMcpCommand
{
/// <summary>
/// Creates the develop-mcp command with subcommands for MCP server management in Dataverse
/// Creates the develop-mcp command with subcommands for MCP server management in Dataverse.
/// The evaluate subcommand is included only when <paramref name="evaluationPipelineService"/> is provided.
/// </summary>
public static Command CreateCommand(
ILogger logger,
IAgent365ToolingService toolingService,
IEvaluationPipelineService? evaluationPipelineService = null,
GraphApiService? graphApiService = null)
{
var developMcpCommand = new Command("develop-mcp", "Manage MCP servers in Dataverse environments");
Expand All @@ -42,9 +45,71 @@ public static Command CreateCommand(
developMcpCommand.AddCommand(CreatePackageMCPServerSubCommand(logger, toolingService));
developMcpCommand.AddCommand(CreateRegisterExternalMcpServerSubcommand(logger, toolingService, graphApiService));

if (evaluationPipelineService is not null)
{
developMcpCommand.AddCommand(CreateEvaluateSubcommand(evaluationPipelineService));
}

return developMcpCommand;
}

/// <summary>
/// Builds the <c>evaluate</c> subcommand, which passes the parsed command-line options
/// to the evaluation pipeline for MCP server tool schema quality evaluation.
/// </summary>
/// <param name="pipelineService">Pipeline whose <c>RunAsync</c> is invoked by the command handler.</param>
/// <returns>The configured <c>evaluate</c> command.</returns>
private static Command CreateEvaluateSubcommand(IEvaluationPipelineService pipelineService)
{
    const string helpText =
        "Evaluate MCP server tool schema quality and generate an HTML report. " +
        "Uses a locally installed coding agent (GitHub Copilot or Claude Code) to score semantic checks. " +
        "If no agent is detected, the command stops after writing the checklist so you can score it manually with your own LLM, " +
        "or pass --eval-engine none to skip agent probing entirely.";

    // The server URL is a required named option rather than a positional argument,
    // for consistency with other develop-mcp subcommands and Azure CLI conventions.
    var urlOption = new Option<string>(
        ["--server-url", "-u"],
        "MCP server Streamable HTTP endpoint URL")
    {
        IsRequired = true,
    };

    // Artifacts land in the current directory unless the caller says otherwise.
    var directoryOption = new Option<string>(
        ["--output-dir", "-o"],
        getDefaultValue: () => ".",
        "Output directory for evaluation artifacts");

    // Free-form string; the value is handed to the pipeline as-is.
    var engineOption = new Option<string>(
        "--eval-engine",
        getDefaultValue: () => "auto",
        "Which local coding agent scores semantic checks. " +
        "auto: try github-copilot then claude-code. " +
        "github-copilot or claude-code: use only that engine. " +
        "none: skip automatic scoring and expect the checklist to be pre-scored (bring-your-own-LLM).");

    // Optional: stays null when the flag is omitted.
    var tokenOption = new Option<string?>(
        "--auth-token",
        "Bearer token for MCP server authentication");

    var evaluateCommand = new Command("evaluate", helpText);

    // Registration order is preserved: server URL, output dir, engine, token.
    foreach (var option in new Option[] { urlOption, directoryOption, engineOption, tokenOption })
    {
        evaluateCommand.AddOption(option);
    }

    evaluateCommand.SetHandler(async (System.CommandLine.Invocation.InvocationContext context) =>
    {
        var parsed = context.ParseResult;

        // The invocation's cancellation token is forwarded so the pipeline can observe it.
        await pipelineService.RunAsync(
            parsed.GetValueForOption(urlOption)!,
            parsed.GetValueForOption(directoryOption)!,
            parsed.GetValueForOption(engineOption)!,
            parsed.GetValueForOption(tokenOption),
            context.GetCancellationToken());
    });

    return evaluateCommand;
}

/// <summary>
/// Creates the list-environments subcommand
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,7 @@ public static class ErrorCodes
public const string RetryExhausted = "RETRY_EXHAUSTED";
public const string SetupValidationFailed = "SETUP_VALIDATION_FAILED";
public const string ClientAppValidationFailed = "CLIENT_APP_VALIDATION_FAILED";
public const string EvaluationFailed = "EVALUATION_FAILED";
public const string SchemaDiscoveryFailed = "SCHEMA_DISCOVERY_FAILED";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using Microsoft.Agents.A365.DevTools.Cli.Constants;

namespace Microsoft.Agents.A365.DevTools.Cli.Exceptions;

/// <summary>
/// Exception thrown when MCP server schema evaluation fails.
/// Covers schema discovery errors, checklist generation errors,
/// and report generation errors.
/// </summary>
/// <remarks>
/// Adds no state of its own: every constructor argument is forwarded unchanged to the
/// <see cref="Agent365Exception"/> base constructor, and only <see cref="ExitCode"/> is overridden.
/// </remarks>
public sealed class EvaluationException : Agent365Exception
{
/// <summary>Overrides the base exit code with the constant value 3.</summary>
public override int ExitCode => 3;

/// <summary>
/// Creates an evaluation exception by passing all arguments through to the base constructor.
/// </summary>
/// <param name="errorCode">
/// Error code string. NOTE(review): presumably one of the <c>ErrorCodes</c> constants such as
/// <c>EVALUATION_FAILED</c> or <c>SCHEMA_DISCOVERY_FAILED</c> — confirm against callers.
/// </param>
/// <param name="issueDescription">Description of the issue, forwarded to the base class.</param>
/// <param name="errorDetails">Optional list of error details; null when omitted.</param>
/// <param name="mitigationSteps">Optional list of mitigation steps; null when omitted.</param>
/// <param name="context">Optional string-to-string context map; null when omitted.</param>
/// <param name="innerException">Optional underlying exception; null when omitted.</param>
public EvaluationException(
string errorCode,
string issueDescription,
List<string>? errorDetails = null,
List<string>? mitigationSteps = null,
Dictionary<string, string>? context = null,
Exception? innerException = null)
: base(
errorCode: errorCode,
issueDescription: issueDescription,
errorDetails: errorDetails,
mitigationSteps: mitigationSteps,
context: context,
innerException: innerException)
{
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -71,5 +71,6 @@
<EmbeddedResource Include="Templates\agenticUserTemplateManifest.json" />
<EmbeddedResource Include="Templates\color.png" />
<EmbeddedResource Include="Templates\outline.png" />
<EmbeddedResource Include="Templates\SchemaEvalReport.html" />
</ItemGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System.Text.Json.Serialization;

namespace Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;

/// <summary>
/// Remediation action, carrying a priority, that is produced from a check that failed.
/// Every property is mapped to a snake_case JSON name via <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
public class ActionItem
{
    /// <summary>Name of the tool this action relates to; may be null.</summary>
    [JsonPropertyName("tool_name")]
    public string? ToolName { get; init; }

    /// <summary>Name of the parameter this action relates to; may be null.</summary>
    [JsonPropertyName("param_name")]
    public string? ParamName { get; init; }

    /// <summary>Priority assigned to this action.</summary>
    [JsonPropertyName("priority")]
    public Priority Priority { get; init; }

    /// <summary>Short title; defaults to the empty string.</summary>
    [JsonPropertyName("title")]
    public string Title { get; init; } = "";

    /// <summary>Longer description; defaults to the empty string.</summary>
    [JsonPropertyName("description")]
    public string Description { get; init; } = "";

    /// <summary>Issue identifiers associated with this action; defaults to an empty list.</summary>
    [JsonPropertyName("issue_ids")]
    public List<int> IssueIds { get; init; } = new();

    /// <summary>Impact areas associated with this action; defaults to an empty list.</summary>
    [JsonPropertyName("impact_areas")]
    public List<ImpactArea> ImpactAreas { get; init; } = new();

    /// <summary>Remediation text; defaults to the empty string.</summary>
    [JsonPropertyName("remediation")]
    public string Remediation { get; init; } = "";

    /// <summary>
    /// Score impact value. Unlike the init-only properties of this class, it remains settable
    /// after construction.
    /// </summary>
    [JsonPropertyName("score_impact")]
    public float ScoreImpact { get; set; }

    /// <summary>Entries for the <c>issue_leads_to</c> JSON array; defaults to an empty list.</summary>
    [JsonPropertyName("issue_leads_to")]
    public List<string> IssueLeadsTo { get; init; } = new();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System.Text.Json.Serialization;

namespace Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;

/// <summary>
/// One check entry within the evaluation checklist.
/// <see cref="Score"/> stays null until the check has been evaluated: deterministic checks
/// come pre-filled, while semantic checks begin as null.
/// </summary>
public class ChecklistItem
{
    /// <summary>Identifier of the check; defaults to the empty string.</summary>
    [JsonPropertyName("id")]
    public string Id { get; init; } = "";

    /// <summary>Whether the check is deterministic or semantic.</summary>
    [JsonPropertyName("type")]
    public CheckType Type { get; init; }

    /// <summary>Prompt text for the check; defaults to the empty string.</summary>
    [JsonPropertyName("prompt")]
    public string Prompt { get; init; } = "";

    /// <summary>Evaluation outcome; null while the check has not been scored. Settable after construction.</summary>
    [JsonPropertyName("score")]
    public bool? Score { get; set; }

    /// <summary>Optional reason accompanying the score. Settable after construction.</summary>
    [JsonPropertyName("reason")]
    public string? Reason { get; set; }

    /// <summary>Severity of the check, expressed as a <see cref="Priority"/>.</summary>
    [JsonPropertyName("severity")]
    public Priority Severity { get; init; }

    /// <summary>Category the check belongs to.</summary>
    [JsonPropertyName("category")]
    public CheckCategory Category { get; init; }

    /// <summary>Issue identifiers associated with this check; defaults to an empty list.</summary>
    [JsonPropertyName("issue_ids")]
    public List<int> IssueIds { get; init; } = new();

    /// <summary>Impact areas associated with this check; defaults to an empty list.</summary>
    [JsonPropertyName("impact_areas")]
    public List<ImpactArea> ImpactAreas { get; init; } = new();

    /// <summary>Remediation text; defaults to the empty string.</summary>
    [JsonPropertyName("remediation")]
    public string Remediation { get; init; } = "";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System.Text.Json.Serialization;

namespace Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;

/// <summary>
/// The final JSON payload handed to the HTML template, holding everything the template
/// needs to render the report. Evaluation logic, descriptions, and assertions are all
/// pre-computed in C# code, so the HTML template acts purely as a display layer.
/// </summary>
public class EvalReportData
{
    /// <summary>Evaluation result; defaults to a new, empty <see cref="SchemaEvalResult"/>.</summary>
    [JsonPropertyName("result")]
    public SchemaEvalResult Result { get; init; } = new SchemaEvalResult();

    /// <summary>Impact information keyed by string; defaults to an empty dictionary.</summary>
    [JsonPropertyName("impact_map")]
    public Dictionary<string, IssueImpactInfo> ImpactMap { get; init; } = new();

    /// <summary>Maturity ladder entries; defaults to an empty list.</summary>
    [JsonPropertyName("maturity_ladder")]
    public List<MaturityLadderEntry> MaturityLadder { get; init; } = new();
}

/// <summary>
/// Impact details for one issue, used as the value type of <c>EvalReportData.ImpactMap</c>.
/// All members are plain strings (or a list of strings) with empty defaults.
/// </summary>
public class IssueImpactInfo
{
    /// <summary>Issue name; defaults to the empty string.</summary>
    [JsonPropertyName("name")]
    public string Name { get; init; } = "";

    /// <summary>Issue category, as text; defaults to the empty string.</summary>
    [JsonPropertyName("category")]
    public string Category { get; init; } = "";

    /// <summary>Impact text; defaults to the empty string.</summary>
    [JsonPropertyName("impact")]
    public string Impact { get; init; } = "";

    /// <summary>Area names, as text; defaults to an empty list.</summary>
    [JsonPropertyName("areas")]
    public List<string> Areas { get; init; } = new();
}

/// <summary>
/// One rung of the maturity ladder, used as the element type of
/// <c>EvalReportData.MaturityLadder</c>.
/// </summary>
public class MaturityLadderEntry
{
    /// <summary>Numeric level of this entry.</summary>
    [JsonPropertyName("level")]
    public int Level { get; init; }

    /// <summary>Label text; defaults to the empty string.</summary>
    [JsonPropertyName("label")]
    public string Label { get; init; } = "";

    /// <summary>Description text; defaults to the empty string.</summary>
    [JsonPropertyName("description")]
    public string Description { get; init; } = "";

    /// <summary>Flag marking this entry as the current one; defaults to false.</summary>
    [JsonPropertyName("is_current")]
    public bool IsCurrent { get; init; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System.Text.Json.Serialization;

namespace Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;

/// <summary>
/// Category assigned to a checklist item (the type of <c>ChecklistItem.Category</c>).
/// The attached <see cref="JsonStringEnumConverter"/> makes System.Text.Json write
/// members by name instead of by numeric value.
/// </summary>
/// <remarks>
/// NOTE(review): the member names suggest which part of a schema a check targets
/// (tool or parameter name/description, schema structure, overall toolset) —
/// confirm against the checklist generator.
/// </remarks>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum CheckCategory
{
ToolName,
ToolDescription,
ParamName,
ParamDescription,
SchemaStructure,
ToolsetDesign
}

/// <summary>
/// Priority level, used as the type of <c>ActionItem.Priority</c> and
/// <c>ChecklistItem.Severity</c>. Written to JSON by member name via
/// <see cref="JsonStringEnumConverter"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably P0 is the most urgent and P3 the least — the ordering
/// semantics are not established in this file; confirm against the scoring code.
/// </remarks>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum Priority
{
P0,
P1,
P2,
P3
}

/// <summary>
/// Impact area, used as the element type of the <c>ImpactAreas</c> lists on
/// <c>ChecklistItem</c> and <c>ActionItem</c>. Written to JSON by member name via
/// <see cref="JsonStringEnumConverter"/>.
/// </summary>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum ImpactArea
{
ToolSelection,
ParamAccuracy,
Completeness,
Conciseness
}

/// <summary>
/// Classification of a schema issue. Written to JSON by member name via
/// <see cref="JsonStringEnumConverter"/>.
/// </summary>
/// <remarks>
/// NOTE(review): no usage is visible alongside this declaration; <c>IssueImpactInfo.Category</c>
/// is a plain string and presumably carries one of these names — confirm.
/// </remarks>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum IssueCategory
{
Accuracy,
Functionality,
Completeness,
Conciseness
}

/// <summary>
/// Kind of a checklist item (the type of <c>ChecklistItem.Type</c>). Per the
/// <c>ChecklistItem</c> documentation, deterministic checks have their score pre-filled,
/// while semantic checks start with a null score. Written to JSON by member name via
/// <see cref="JsonStringEnumConverter"/>.
/// </summary>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum CheckType
{
Deterministic,
Semantic
}

/// <summary>
/// Evaluation engine choices. Written to JSON by member name via
/// <see cref="JsonStringEnumConverter"/>, so the serialized spellings (for example
/// <c>GitHubCopilot</c>) are the member names as declared here.
/// </summary>
/// <remarks>
/// NOTE(review): the members appear to mirror the <c>--eval-engine</c> command-line values
/// <c>auto</c>, <c>github-copilot</c>, <c>claude-code</c> and <c>none</c>; the mapping between
/// those strings and this enum is not shown here — confirm where the option value is parsed.
/// </remarks>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum EvalEngine
{
Auto,
GitHubCopilot,
ClaudeCode,
None
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System.Text.Json.Serialization;

namespace Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;

/// <summary>
/// Top-level object of the evaluation checklist JSON. It is an intermediate, auditable
/// artifact that either a coding agent or a person can evaluate.
/// </summary>
public class EvaluationChecklist
{
    /// <summary>Checklist metadata; defaults to a new <see cref="ChecklistMetadata"/>.</summary>
    [JsonPropertyName("metadata")]
    public ChecklistMetadata Metadata { get; init; } = new ChecklistMetadata();

    /// <summary>Per-tool checklists; defaults to an empty list.</summary>
    [JsonPropertyName("tools")]
    public List<ToolChecklist> Tools { get; init; } = new();

    /// <summary>Checks stored under the <c>server_checks</c> JSON key; defaults to an empty list.</summary>
    [JsonPropertyName("server_checks")]
    public List<ChecklistItem> ServerChecks { get; init; } = new();
}

/// <summary>
/// Metadata block of an evaluation checklist (the type of <c>EvaluationChecklist.Metadata</c>).
/// </summary>
public class ChecklistMetadata
{
    /// <summary>Server name; defaults to the empty string.</summary>
    [JsonPropertyName("server_name")]
    public string ServerName { get; init; } = "";

    /// <summary>Server URL; defaults to the empty string.</summary>
    [JsonPropertyName("server_url")]
    public string ServerUrl { get; init; } = "";

    /// <summary>Number of tools; defaults to zero.</summary>
    [JsonPropertyName("tool_count")]
    public int ToolCount { get; init; }

    /// <summary>
    /// Generation timestamp. When not set explicitly it is the UTC time at which the
    /// instance was constructed.
    /// </summary>
    [JsonPropertyName("generated_at")]
    public DateTime GeneratedAt { get; init; } = DateTime.UtcNow;

    /// <summary>Version string of the generator; defaults to the empty string.</summary>
    [JsonPropertyName("generator_version")]
    public string GeneratorVersion { get; init; } = "";
}
Loading
Loading