diff --git a/dotnet/Directory.Packages.props b/dotnet/Directory.Packages.props index 8a22261617..72a563068d 100644 --- a/dotnet/Directory.Packages.props +++ b/dotnet/Directory.Packages.props @@ -63,6 +63,9 @@ + + + diff --git a/dotnet/agent-framework-dotnet.slnx b/dotnet/agent-framework-dotnet.slnx index 3a3c6bba9a..dfb8717c85 100644 --- a/dotnet/agent-framework-dotnet.slnx +++ b/dotnet/agent-framework-dotnet.slnx @@ -176,6 +176,8 @@ + + diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/FoundryAgents_Evaluations_Step01_RedTeaming.csproj b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/FoundryAgents_Evaluations_Step01_RedTeaming.csproj new file mode 100644 index 0000000000..d77c0bb0d3 --- /dev/null +++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/FoundryAgents_Evaluations_Step01_RedTeaming.csproj @@ -0,0 +1,16 @@ + + + + Exe + net10.0 + + enable + enable + + + + + + + + diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/Program.cs b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/Program.cs new file mode 100644 index 0000000000..93a34428c8 --- /dev/null +++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/Program.cs @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft. All rights reserved. + +// This sample demonstrates how to use Azure AI Foundry's Red Teaming service to assess +// the safety and resilience of an AI model against adversarial attacks. +// +// It uses the RedTeam API from Azure.AI.Projects to run automated attack simulations +// with various attack strategies (encoding, obfuscation, jailbreaks) across multiple +// risk categories (Violence, HateUnfairness, Sexual, SelfHarm). 
+//
+// For more details, see:
+// https://learn.microsoft.com/azure/ai-foundry/concepts/ai-red-teaming-agent
+
+using Azure.AI.Projects;
+using Azure.Identity;
+
+string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set.");
+string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "gpt-4o-mini";
+
+Console.WriteLine(new string('=', 80));
+Console.WriteLine("RED TEAMING EVALUATION SAMPLE");
+Console.WriteLine(new string('=', 80));
+Console.WriteLine();
+
+// Create the credential and the Foundry project client used for every service call below.
+// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
+// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
+// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
+DefaultAzureCredential credential = new();
+AIProjectClient aiProjectClient = new(new Uri(endpoint), credential);
+
+// The model deployment that the simulated attacks are aimed at.
+AzureOpenAIModelConfiguration targetConfig = new(deploymentName);
+
+// Describe the run: scenario, number of turns, risk categories to probe and attack strategies to apply.
+RedTeam redTeamConfig = new(targetConfig)
+{
+    DisplayName = "FinancialAdvisor-RedTeam",
+    ApplicationScenario = "A financial advisor assistant that provides general financial advice and information.",
+    NumTurns = 3,
+    RiskCategories =
+    {
+        RiskCategory.Violence,
+        RiskCategory.HateUnfairness,
+        RiskCategory.Sexual,
+        RiskCategory.SelfHarm,
+    },
+    AttackStrategies =
+    {
+        AttackStrategy.Easy,
+        AttackStrategy.Moderate,
+        AttackStrategy.Jailbreak,
+    },
+};
+
+Console.WriteLine($"Target model: {deploymentName}");
+Console.WriteLine("Risk categories: Violence, HateUnfairness, Sexual, SelfHarm");
+Console.WriteLine("Attack strategies: Easy, Moderate, Jailbreak");
+Console.WriteLine($"Simulation turns: {redTeamConfig.NumTurns}");
+Console.WriteLine();
+
+// Create the run; the returned object carries the service-assigned name used for polling.
+Console.WriteLine("Submitting red team run...");
+RedTeam redTeamRun = await aiProjectClient.RedTeams.CreateAsync(redTeamConfig);
+
+Console.WriteLine($"Red team run created: {redTeamRun.Name}");
+Console.WriteLine($"Status: {redTeamRun.Status}");
+Console.WriteLine();
+
+// Re-fetch the run every 15 seconds until it reports Completed, Failed or Canceled.
+Console.WriteLine("Waiting for red team run to complete (this may take several minutes)...");
+while (redTeamRun.Status != "Completed" && redTeamRun.Status != "Failed" && redTeamRun.Status != "Canceled")
+{
+    await Task.Delay(TimeSpan.FromSeconds(15));
+    redTeamRun = await aiProjectClient.RedTeams.GetAsync(redTeamRun.Name);
+    Console.WriteLine($" Status: {redTeamRun.Status}");
+}
+
+Console.WriteLine();
+
+if (redTeamRun.Status != "Completed")
+{
+    Console.WriteLine($"Red team run ended with status: {redTeamRun.Status}");
+}
+else
+{
+    Console.WriteLine("Red team run completed successfully!");
+    Console.WriteLine();
+    Console.WriteLine("Results:");
+    Console.WriteLine(new string('-', 80));
+    Console.WriteLine($" Run name: {redTeamRun.Name}");
+    Console.WriteLine($" Display name: {redTeamRun.DisplayName}");
+    Console.WriteLine($" Status: {redTeamRun.Status}");
+
+    Console.WriteLine();
+    Console.WriteLine("Review the detailed results in the Azure AI Foundry portal:");
+    Console.WriteLine($" {endpoint}");
+}
+
+Console.WriteLine();
+Console.WriteLine(new string('=', 80));
diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/README.md b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/README.md
new file mode 100644
index 0000000000..f46c7af8ef
--- /dev/null
+++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming/README.md
@@ -0,0 +1,101 @@
+# Red Teaming with Azure AI Foundry (Classic)
+
+> [!IMPORTANT]
+> This sample uses the **classic Azure AI Foundry** red
teaming API (`/redTeams/runs`) via `Azure.AI.Projects`. Results are viewable in the classic Foundry portal experience. The **new Foundry** portal's red teaming feature uses a different evaluation-based API that is not yet available in the .NET SDK. + +This sample demonstrates how to use Azure AI Foundry's Red Teaming service to assess the safety and resilience of an AI model against adversarial attacks. + +## What this sample demonstrates + +- Configuring a red team run targeting an Azure OpenAI model deployment +- Using multiple `AttackStrategy` options (Easy, Moderate, Jailbreak) +- Evaluating across `RiskCategory` categories (Violence, HateUnfairness, Sexual, SelfHarm) +- Submitting a red team scan and polling for completion +- Reviewing results in the Azure AI Foundry portal + +## Prerequisites + +Before you begin, ensure you have the following prerequisites: + +- .NET 10 SDK or later +- Azure AI Foundry project (hub and project created) +- Azure OpenAI deployment (e.g., gpt-4o or gpt-4o-mini) +- Azure CLI installed and authenticated (for Azure credential authentication) + +### Regional Requirements + +Red teaming is only available in regions that support risk and safety evaluators: +- **East US 2**, **Sweden Central**, **US North Central**, **France Central**, **Switzerland West** + +### Environment Variables + +Set the following environment variables: + +```powershell +$env:AZURE_FOUNDRY_PROJECT_ENDPOINT="https://your-project.services.ai.azure.com/api/projects/your-project" # Replace with your Azure Foundry project endpoint +$env:AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME="gpt-4o-mini" # Optional, defaults to gpt-4o-mini +``` + +## Run the sample + +Navigate to the sample directory and run: + +```powershell +cd dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step01_RedTeaming +dotnet run +``` + +## Expected behavior + +The sample will: + +1. Configure a `RedTeam` run targeting the specified model deployment +2. 
Define risk categories and attack strategies +3. Submit the scan to Azure AI Foundry's Red Teaming service +4. Poll for completion (this may take several minutes) +5. Display the run status and direct you to the Azure AI Foundry portal for detailed results + +## Understanding Red Teaming + +### Attack Strategies + +| Strategy | Description | +|----------|-------------| +| Easy | Simple encoding/obfuscation attacks (ROT13, Leetspeak, etc.) | +| Moderate | Moderate complexity attacks requiring an LLM for orchestration | +| Jailbreak | Crafted prompts designed to bypass AI safeguards (UPIA) | + +### Risk Categories + +| Category | Description | +|----------|-------------| +| Violence | Content related to violence | +| HateUnfairness | Hate speech or unfair content | +| Sexual | Sexual content | +| SelfHarm | Self-harm related content | + +### Interpreting Results + +- Results are available in the Azure AI Foundry portal (**classic view** — toggle at top-right) under the red teaming section +- Lower Attack Success Rate (ASR) is better — target ASR < 5% for production +- Review individual attack conversations to understand vulnerabilities + +### Current Limitations + +> [!NOTE] +> - The .NET Red Teaming API (`Azure.AI.Projects`) currently supports targeting **model deployments only** via `AzureOpenAIModelConfiguration`. The `AzureAIAgentTarget` type exists in the SDK but is consumed by the **Evaluation Taxonomy** API (`/evaluationtaxonomies`), not by the Red Teaming API (`/redTeams/runs`). +> - Agent-targeted red teaming with agent-specific risk categories (Prohibited actions, Sensitive data leakage, Task adherence) is documented in the [concept docs](https://learn.microsoft.com/azure/ai-foundry/concepts/ai-red-teaming-agent) but is not yet available via the public REST API or .NET SDK. +> - Results from this API appear in the **classic** Azure AI Foundry portal view. The new Foundry portal uses a separate evaluation-based system with `eval_*` identifiers. 
+ +## Related Resources + +- [Azure AI Red Teaming Agent](https://learn.microsoft.com/azure/ai-foundry/concepts/ai-red-teaming-agent) +- [RedTeam .NET API Reference](https://learn.microsoft.com/dotnet/api/azure.ai.projects.redteam?view=azure-dotnet-preview) +- [Risk and Safety Evaluations](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-metrics-built-in#risk-and-safety-evaluators) + +## Next Steps + +After running red teaming: +1. Review attack results and strengthen agent guardrails +2. Explore the Self-Reflection sample (FoundryAgents_Evaluations_Step02_SelfReflection) for quality assessment +3. Set up continuous red teaming in your CI/CD pipeline diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj new file mode 100644 index 0000000000..646cd75532 --- /dev/null +++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj @@ -0,0 +1,25 @@ + + + + Exe + net10.0 + + enable + enable + + + + + + + + + + + + + + + + + diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs new file mode 100644 index 0000000000..3faf740c0a --- /dev/null +++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs @@ -0,0 +1,292 @@ +// Copyright (c) Microsoft. All rights reserved. + +// This sample demonstrates how to use Microsoft.Extensions.AI.Evaluation.Quality to evaluate +// an Agent Framework agent's response quality with a self-reflection loop. 
+//
+// It uses GroundednessEvaluator, RelevanceEvaluator, and CoherenceEvaluator to score responses,
+// then iteratively asks the agent to improve based on evaluation feedback.
+//
+// Based on: Reflexion: Language Agents with Verbal Reinforcement Learning (NeurIPS 2023)
+// Reference: https://arxiv.org/abs/2303.11366
+//
+// For more details, see:
+// https://learn.microsoft.com/dotnet/ai/evaluation/libraries
+
+using Azure.AI.OpenAI;
+using Azure.AI.Projects;
+using Azure.Identity;
+using Microsoft.Agents.AI;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using Microsoft.Extensions.AI.Evaluation.Quality;
+using Microsoft.Extensions.AI.Evaluation.Safety;
+
+using ChatMessage = Microsoft.Extensions.AI.ChatMessage;
+using ChatRole = Microsoft.Extensions.AI.ChatRole;
+
+string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set.");
+string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "gpt-4o-mini";
+string openAiEndpoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new InvalidOperationException("AZURE_OPENAI_ENDPOINT is not set.");
+string evaluatorDeploymentName = Environment.GetEnvironmentVariable("AZURE_OPENAI_DEPLOYMENT_NAME") ?? deploymentName;
+
+Console.WriteLine(new string('=', 80));
+Console.WriteLine("SELF-REFLECTION EVALUATION SAMPLE");
+Console.WriteLine(new string('=', 80));
+Console.WriteLine();
+
+// Initialize Azure credentials and client
+// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
+// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
+// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
+DefaultAzureCredential credential = new();
+AIProjectClient aiProjectClient = new(new Uri(endpoint), credential);
+
+// Set up the LLM-based chat client for quality evaluators (uses the agent's deployment when AZURE_OPENAI_DEPLOYMENT_NAME is unset)
+IChatClient chatClient = new AzureOpenAIClient(new Uri(openAiEndpoint), credential)
+    .GetChatClient(evaluatorDeploymentName)
+    .AsIChatClient();
+
+// Configure evaluation: quality evaluators use the LLM, safety evaluators use Azure AI Foundry
+ContentSafetyServiceConfiguration safetyConfig = new(
+    credential: credential,
+    endpoint: new Uri(endpoint));
+
+ChatConfiguration chatConfiguration = safetyConfig.ToChatConfiguration(
+    originalChatConfiguration: new ChatConfiguration(chatClient));
+
+// Create a test agent
+AIAgent agent = await aiProjectClient.CreateAIAgentAsync(
+    name: "KnowledgeAgent",
+    model: deploymentName,
+    instructions: "You are a helpful assistant. Answer questions accurately based on the provided context.");
+Console.WriteLine($"Created agent: {agent.Name}");
+Console.WriteLine();
+
+// Example question and grounding context
+const string Question = """
+    What are the main benefits of using Azure AI Foundry for building AI applications?
+    """;
+
+const string Context = """
+    Azure AI Foundry is a comprehensive platform for building, deploying, and managing AI applications.
+    Key benefits include:
+    1. Unified development environment with support for multiple AI frameworks and models
+    2. Built-in safety and security features including content filtering and red teaming tools
+    3. Scalable infrastructure that handles deployment and monitoring automatically
+    4. Integration with Azure services like Azure OpenAI, Cognitive Services, and Machine Learning
+    5. Evaluation tools for assessing model quality, safety, and performance
+    6. Support for RAG (Retrieval-Augmented Generation) patterns with vector search
+    7. Enterprise-grade compliance and governance features
+    """;
+
+Console.WriteLine("Question:");
+Console.WriteLine(Question);
+Console.WriteLine();
+
+// Run the three evaluation scenarios; the finally block deletes the agent even if one of them throws
+try
+{
+    await RunSelfReflectionWithGroundedness(agent, Question, Context, chatConfiguration);
+    await RunQualityEvaluation(agent, Question, Context, chatConfiguration);
+    await RunCombinedQualityAndSafetyEvaluation(agent, Question, chatConfiguration);
+}
+finally
+{
+    // Cleanup: delete the agent created above
+    await aiProjectClient.Agents.DeleteAgentAsync(agent.Name);
+    Console.WriteLine();
+    Console.WriteLine("Cleanup: Agent deleted.");
+}
+
+// ============================================================================
+// Implementation Functions
+// ============================================================================
+// Self-reflection loop: scores each answer with GroundednessEvaluator and re-prompts (at most 3 iterations) until the score reaches 4.0.
+static async Task RunSelfReflectionWithGroundedness(
+    AIAgent agent, string question, string context, ChatConfiguration chatConfiguration)
+{
+    Console.WriteLine("Running Self-Reflection with Groundedness Evaluation...");
+    Console.WriteLine();
+
+    GroundednessEvaluator groundednessEvaluator = new();
+    GroundednessEvaluatorContext groundingContext = new(context);
+
+    const int MaxReflections = 3;
+    double bestScore = 0;
+
+    string currentPrompt = $"Context: {context}\n\nQuestion: {question}";
+
+    for (int i = 0; i < MaxReflections; i++)
+    {
+        Console.WriteLine($"Iteration {i + 1}/{MaxReflections}:");
+        Console.WriteLine(new string('-', 40));
+
+        // Create a new session for each reflection iteration so that
+        // conversation context does not carry over between runs. This keeps
+        // each evaluation independent and avoids biasing groundedness scores.
+        AgentSession session = await agent.CreateSessionAsync();
+        AgentResponse agentResponse = await agent.RunAsync(currentPrompt, session);
+        string responseText = agentResponse.Text;
+
+        Console.WriteLine($"Response: {responseText[..Math.Min(150, responseText.Length)]}...");
+
+        List<ChatMessage> messages =
+        [
+            new(ChatRole.User, currentPrompt),
+        ];
+        ChatResponse chatResponse = new(new ChatMessage(ChatRole.Assistant, responseText));
+
+        EvaluationResult result = await groundednessEvaluator.EvaluateAsync(
+            messages,
+            chatResponse,
+            chatConfiguration,
+            additionalContext: [groundingContext]);
+
+        NumericMetric groundedness = result.Get<NumericMetric>(GroundednessEvaluator.GroundednessMetricName);
+        double score = groundedness.Value ?? 0;
+        string rating = groundedness.Interpretation?.Rating.ToString() ?? "N/A";
+
+        Console.WriteLine($"Groundedness score: {score:F1}/5 (Rating: {rating})");
+        Console.WriteLine();
+
+        if (score > bestScore)
+        {
+            bestScore = score;
+        }
+
+        if (score >= 4.0 || i == MaxReflections - 1)
+        {
+            if (score >= 4.0)
+            {
+                Console.WriteLine("Good groundedness achieved!");
+            }
+
+            break;
+        }
+
+        // Ask for improvement in the next iteration, including the previous response
+        // so the LLM knows what to improve on (each iteration uses a new session).
+        currentPrompt = $"""
+            Context: {context}
+
+            Your previous answer scored {score}/5 on groundedness.
+            Your previous answer was:
+            {responseText}
+
+            Please improve your answer to be more grounded in the provided context.
+            Only include information that is directly supported by the context.
+
+            Question: {question}
+            """;
+        Console.WriteLine("Requesting improvement...");
+        Console.WriteLine();
+    }
+
+    Console.WriteLine($"Best groundedness score: {bestScore:F1}/5");
+    Console.WriteLine(new string('=', 80));
+    Console.WriteLine();
+}
+// Scores one context-grounded answer with the Relevance, Coherence and Groundedness evaluators and prints every numeric metric.
+static async Task RunQualityEvaluation(
+    AIAgent agent, string question, string context, ChatConfiguration chatConfiguration)
+{
+    Console.WriteLine("Running Quality Evaluation (Relevance, Coherence, Groundedness)...");
+    Console.WriteLine();
+
+    IEvaluator[] evaluators =
+    [
+        new RelevanceEvaluator(),
+        new CoherenceEvaluator(),
+        new GroundednessEvaluator(),
+    ];
+
+    CompositeEvaluator compositeEvaluator = new(evaluators);
+    GroundednessEvaluatorContext groundingContext = new(context);
+
+    string prompt = $"Context: {context}\n\nQuestion: {question}";
+
+    AgentSession session = await agent.CreateSessionAsync();
+    AgentResponse agentResponse = await agent.RunAsync(prompt, session);
+    string responseText = agentResponse.Text;
+
+    Console.WriteLine($"Response: {responseText[..Math.Min(150, responseText.Length)]}...");
+    Console.WriteLine();
+
+    List<ChatMessage> messages =
+    [
+        new(ChatRole.User, prompt),
+    ];
+    ChatResponse chatResponse = new(new ChatMessage(ChatRole.Assistant, responseText));
+
+    EvaluationResult result = await compositeEvaluator.EvaluateAsync(
+        messages,
+        chatResponse,
+        chatConfiguration,
+        additionalContext: [groundingContext]);
+
+    foreach (EvaluationMetric metric in result.Metrics.Values)
+    {
+        if (metric is NumericMetric n)
+        {
+            string rating = n.Interpretation?.Rating.ToString() ?? "N/A";
+            Console.WriteLine($" {n.Name,-20} Score: {n.Value:F1}/5 Rating: {rating}");
+        }
+    }
+
+    Console.WriteLine(new string('=', 80));
+    Console.WriteLine();
+}
+// Scores an answer to the bare question with quality (Relevance, Coherence) and safety (ContentHarm, ProtectedMaterial) evaluators.
+static async Task RunCombinedQualityAndSafetyEvaluation(
+    AIAgent agent, string question, ChatConfiguration chatConfiguration)
+{
+    Console.WriteLine("Running Combined Quality + Safety Evaluation...");
+    Console.WriteLine();
+
+    IEvaluator[] evaluators =
+    [
+        new RelevanceEvaluator(),
+        new CoherenceEvaluator(),
+        new ContentHarmEvaluator(),
+        new ProtectedMaterialEvaluator(),
+    ];
+
+    CompositeEvaluator compositeEvaluator = new(evaluators);
+
+    AgentSession session = await agent.CreateSessionAsync();
+    AgentResponse agentResponse = await agent.RunAsync(question, session);
+    string responseText = agentResponse.Text;
+
+    Console.WriteLine($"Response: {responseText[..Math.Min(150, responseText.Length)]}...");
+    Console.WriteLine();
+
+    List<ChatMessage> messages =
+    [
+        new(ChatRole.User, question), // No context in this evaluation — testing quality and safety on raw question
+    ];
+    ChatResponse chatResponse = new(new ChatMessage(ChatRole.Assistant, responseText));
+
+    EvaluationResult result = await compositeEvaluator.EvaluateAsync(
+        messages,
+        chatResponse,
+        chatConfiguration);
+
+    Console.WriteLine("Quality Metrics:");
+    foreach (EvaluationMetric metric in result.Metrics.Values)
+    {
+        if (metric is NumericMetric n)
+        {
+            string rating = n.Interpretation?.Rating.ToString() ?? "N/A";
+            bool failed = n.Interpretation?.Failed ?? false;
+            Console.WriteLine($" {n.Name,-25} Score: {n.Value,-6:F1} Rating: {rating,-15} Failed: {failed}");
+        }
+        else if (metric is BooleanMetric b)
+        {
+            string rating = b.Interpretation?.Rating.ToString() ?? "N/A";
+            bool failed = b.Interpretation?.Failed ?? false;
+            Console.WriteLine($" {b.Name,-25} Value: {b.Value,-6} Rating: {rating,-15} Failed: {failed}");
+        }
+    }
+
+    Console.WriteLine(new string('=', 80));
+}
diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/README.md b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/README.md
new file mode 100644
index 0000000000..8dcb22bd3c
--- /dev/null
+++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/README.md
@@ -0,0 +1,118 @@
+# Self-Reflection Evaluation with Groundedness Assessment
+
+This sample demonstrates the self-reflection pattern using Agent Framework with `Microsoft.Extensions.AI.Evaluation.Quality` evaluators. The agent iteratively improves its responses based on real groundedness evaluation scores.
+
+For details on the self-reflection approach, see [Reflexion: Language Agents with Verbal Reinforcement Learning](https://arxiv.org/abs/2303.11366) (NeurIPS 2023).
+
+## What this sample demonstrates
+
+- Self-reflection loop that improves responses using real `GroundednessEvaluator` scores
+- Using `RelevanceEvaluator` and `CoherenceEvaluator` for multi-metric quality assessment
+- Combining quality and safety evaluators with `CompositeEvaluator`
+- Configuring `ContentSafetyServiceConfiguration` for safety evaluators alongside LLM-based quality evaluators
+- Tracking improvement across iterations
+
+## Prerequisites
+
+Before you begin, ensure you have the following prerequisites:
+
+- .NET 10 SDK or later
+- Azure AI Foundry project (hub and project created)
+- Azure OpenAI deployment (e.g., gpt-4o or gpt-4o-mini)
+- Azure CLI installed and authenticated (for Azure credential authentication)
+
+**Note**: This demo uses Azure CLI credentials for authentication. Make sure you're logged in with `az login` and have access to the Azure Foundry resource.
For more information, see the [Azure CLI documentation](https://learn.microsoft.com/cli/azure/authenticate-azure-cli-interactively). + +### Azure Resources Required + +1. **Azure AI Hub and Project**: Create these in the Azure Portal + - Follow: https://learn.microsoft.com/azure/ai-foundry/how-to/create-projects +2. **Azure OpenAI Deployment**: Deploy a model (e.g., gpt-4o or gpt-4o-mini) + - Agent model: Used to generate responses + - Evaluator model: Quality evaluators use an LLM; best results with GPT-4o +3. **Azure CLI**: Install and authenticate with `az login` + +### Environment Variables + +Set the following environment variables: + +```powershell +$env:AZURE_FOUNDRY_PROJECT_ENDPOINT="https://your-project.services.ai.azure.com/api/projects/your-project" # Replace with your Azure Foundry project endpoint +$env:AZURE_OPENAI_ENDPOINT="https://your-openai.openai.azure.com/" # Azure OpenAI endpoint (for quality evaluators) +$env:AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME="gpt-4o-mini" # Optional agent model deployment name, defaults to gpt-4o-mini +``` + +**Note**: For best evaluation results, use GPT-4o or GPT-4o-mini as the evaluator model. The groundedness evaluator has been tested and tuned for these models. The evaluators use the deployment named in the optional `AZURE_OPENAI_DEPLOYMENT_NAME` variable; when it is not set, they fall back to `AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME`. + +## Run the sample + +Navigate to the sample directory and run: + +```powershell +cd dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection +dotnet run +``` + +## Expected behavior + +The sample runs three evaluation scenarios: + +### 1. Self-Reflection with Groundedness +- Asks a question with grounding context +- Evaluates response groundedness using `GroundednessEvaluator` +- If score is below 4/5, asks the agent to improve with feedback +- Repeats up to 3 iterations +- Tracks and reports the best score achieved + +### 2. Quality Evaluation +- Evaluates a single response with multiple quality evaluators: + - `RelevanceEvaluator` — is the response relevant to the question? + - `CoherenceEvaluator` — is the response logically coherent?
+ - `GroundednessEvaluator` — is the response grounded in the provided context? + +### 3. Combined Quality + Safety Evaluation +- Runs both quality and safety evaluators together: + - `RelevanceEvaluator`, `CoherenceEvaluator` (quality) + - `ContentHarmEvaluator` (safety — violence, hate, sexual, self-harm) + - `ProtectedMaterialEvaluator` (safety — copyrighted content detection) + +## Understanding the Evaluation + +### Groundedness Score (1-5 scale) + +The `GroundednessEvaluator` measures how well the agent's response is grounded in the provided context: + +- **5** = Excellent - Response is fully grounded in context +- **4** = Good - Mostly grounded with minor deviations +- **3** = Fair - Partially grounded but includes unsupported claims +- **2** = Poor - Significant amount of ungrounded content +- **1** = Very Poor - Response is largely unsupported by context + +### Self-Reflection Process + +1. **Initial Response**: The agent generates an answer based on the question + context +2. **Evaluation**: `GroundednessEvaluator` scores the response (1-5) +3. **Feedback**: If the score is below 4, the agent receives its previous answer and score and is asked to improve +4. **Iteration**: The process repeats until the score reaches 4 or the maximum of 3 iterations is reached + +## Best Practices + +1. **Provide Complete Context**: Ensure grounding context contains all information needed to answer the question +2. **Clear Instructions**: Give the agent clear instructions about staying grounded in context +3. **Use Quality Models**: GPT-4o recommended for evaluation tasks +4. **Multiple Evaluators**: Use a combination of evaluators (groundedness + relevance + coherence) +5.
**Batch Processing**: For production, process multiple questions in batches + +## Related Resources + +- [Reflexion Paper (NeurIPS 2023)](https://arxiv.org/abs/2303.11366) +- [Microsoft.Extensions.AI.Evaluation Libraries](https://learn.microsoft.com/dotnet/ai/evaluation/libraries) +- [GroundednessEvaluator API Reference](https://learn.microsoft.com/dotnet/api/microsoft.extensions.ai.evaluation.quality.groundednessevaluator) +- [Azure AI Foundry Evaluation Service](https://learn.microsoft.com/azure/ai-foundry/how-to/develop/evaluate-sdk) + +## Next Steps + +After running self-reflection evaluation: +1. Implement similar patterns for other quality metrics (relevance, coherence, fluency) +2. Integrate into CI/CD pipeline for continuous quality assurance +3. Explore the Red Teaming sample (FoundryAgents_Evaluations_Step01_RedTeaming) for adversarial safety assessment diff --git a/dotnet/samples/GettingStarted/FoundryAgents/README.md b/dotnet/samples/GettingStarted/FoundryAgents/README.md index d7bfe4d035..4b297105aa 100644 --- a/dotnet/samples/GettingStarted/FoundryAgents/README.md +++ b/dotnet/samples/GettingStarted/FoundryAgents/README.md @@ -60,6 +60,17 @@ Before you begin, ensure you have the following prerequisites: |[Computer use](./FoundryAgents_Step15_ComputerUse/)|This sample demonstrates how to use computer use capabilities with a Foundry agent| |[Local MCP](./FoundryAgents_Step27_LocalMCP/)|This sample demonstrates how to use a local MCP client with a Foundry agent| +## Evaluation Samples + +Evaluation is critical for building trustworthy and high-quality AI applications. The evaluation samples demonstrate how to assess agent safety, quality, and performance using Azure AI Foundry's evaluation capabilities.
+ +|Sample|Description| +|---|---| +|[Red Team Evaluation](./FoundryAgents_Evaluations_Step01_RedTeaming/)|This sample demonstrates how to use Azure AI Foundry's Red Teaming service to assess model safety against adversarial attacks| +|[Self-Reflection with Groundedness](./FoundryAgents_Evaluations_Step02_SelfReflection/)|This sample demonstrates the self-reflection pattern where agents iteratively improve responses based on groundedness evaluation| + +For details on safety evaluation, see the [Red Team Evaluation README](./FoundryAgents_Evaluations_Step01_RedTeaming/README.md). + ## Running the samples from the console To run the samples, navigate to the desired sample directory, e.g.