From d9075904943f5ef901a6c116914b4daf16e8c685 Mon Sep 17 00:00:00 2001
From: Chetan Toshniwal
Date: Tue, 7 Apr 2026 13:27:31 -0700
Subject: [PATCH 1/3] feat(evals): add ground_truth support for similarity
 evaluator

- Include expected_output as ground_truth in Foundry JSONL dataset rows
- Add ground_truth to item schema and data mapping for similarity evaluator
- Add expected_output parameter to evaluate_workflow
- Add similarity Pattern 3 to evaluate_agent and evaluate_workflow samples
- Add tests for ground_truth in dataset, schema, and evaluate_workflow
---
 .../core/agent_framework/_evaluation.py       |  15 +-
 .../agent_framework_foundry/_foundry_evals.py |  20 ++-
 .../foundry/tests/test_foundry_evals.py       | 148 ++++++++++++++++++
 .../foundry_evals/evaluate_agent_sample.py    |  38 ++++-
 .../foundry_evals/evaluate_workflow_sample.py |  51 +++++-
 5 files changed, 265 insertions(+), 7 deletions(-)

diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 682903d448..43b718f1e0 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -1659,6 +1659,7 @@ async def evaluate_workflow(
     workflow: Workflow,
     workflow_result: WorkflowRunResult | None = None,
     queries: str | Sequence[str] | None = None,
+    expected_output: str | Sequence[str] | None = None,
     evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]],
     eval_name: str | None = None,
     include_overall: bool = True,
@@ -1683,6 +1684,11 @@ async def evaluate_workflow(
         workflow: The workflow instance.
         workflow_result: A completed ``WorkflowRunResult``.
         queries: Test queries to run through the workflow.
+        expected_output: Ground-truth expected output(s), one per query. A
+            single string is wrapped into a one-element list. When provided, it
+            must be the same length as ``queries``. Each value is stamped on
+            the corresponding ``EvalItem.expected_output`` for evaluators
+            that compare against a reference answer (e.g. similarity).
         evaluators: One or more ``Evaluator`` instances.
         eval_name: Display name for the evaluation.
         include_overall: Whether to evaluate the workflow's final output. 
@@ -1720,10 +1726,15 @@ async def evaluate_workflow( # Normalize singular query to list if isinstance(queries, str): queries = [queries] + if isinstance(expected_output, str): + expected_output = [expected_output] if workflow_result is None and queries is None: raise ValueError("Provide either 'workflow_result' or 'queries'.") + if expected_output is not None and queries is not None and len(expected_output) != len(queries): + raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.") + if num_repetitions < 1: raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions}.") @@ -1737,7 +1748,7 @@ async def evaluate_workflow( if queries is not None: results_list: list[WRR] = [] for _rep in range(num_repetitions): - for q in queries: + for qi, q in enumerate(queries): result = await workflow.run(q) if not isinstance(result, WRR): raise TypeError(f"Expected WorkflowRunResult from workflow.run(), got {type(result).__name__}.") @@ -1746,6 +1757,8 @@ async def evaluate_workflow( if include_overall: overall_item = _build_overall_item(q, result) if overall_item: + if expected_output is not None: + overall_item.expected_output = expected_output[qi] overall_items.append(overall_item) else: assert workflow_result is not None # noqa: S101 # nosec B101 diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index a5033a8e87..01ef389db5 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -75,6 +75,11 @@ "builtin.tool_call_success", } +# Evaluators that require a ground_truth / expected_output field. +_GROUND_TRUTH_EVALUATORS: set[str] = { + "builtin.similarity", +} + _BUILTIN_EVALUATORS: dict[str, str] = { # Agent behavior "intent_resolution": "builtin.intent_resolution", @@ -196,6 +201,8 @@ def _build_testing_criteria( } if qualified == "builtin.groundedness": mapping["context"] = "{{item.context}}" + if qualified in _GROUND_TRUTH_EVALUATORS: + mapping["ground_truth"] = "{{item.ground_truth}}" if qualified in _TOOL_EVALUATORS: mapping["tool_definitions"] = "{{item.tool_definitions}}" entry["data_mapping"] = mapping @@ -204,7 +211,9 @@ def _build_testing_criteria( return criteria -def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> dict[str, Any]: +def _build_item_schema( + *, has_context: bool = False, has_tools: bool = False, has_ground_truth: bool = False +) -> dict[str, Any]: """Build the ``item_schema`` for custom JSONL eval definitions.""" properties: dict[str, Any] = { "query": {"type": "string"}, @@ -214,6 +223,8 @@ def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> } if has_context: properties["context"] = {"type": "string"} + if has_ground_truth: + properties["ground_truth"] = {"type": "string"} if has_tools: properties["tool_definitions"] = {"type": "array"} return { @@ -681,16 +692,21 @@ async def _evaluate_via_dataset( ] if item.context: d["context"] = item.context + if item.expected_output: + d["ground_truth"] = item.expected_output dicts.append(d) has_context = any("context" in d for d in dicts) + has_ground_truth = any("ground_truth" in d for d in dicts) has_tools = any("tool_definitions" in d for d in dicts) eval_obj = await self._client.evals.create( name=eval_name, data_source_config={ # type: ignore[arg-type] # pyright: ignore[reportArgumentType] "type": "custom", - "item_schema": 
_build_item_schema(has_context=has_context, has_tools=has_tools), + "item_schema": _build_item_schema( + has_context=has_context, has_ground_truth=has_ground_truth, has_tools=has_tools + ), "include_sample_schema": True, }, testing_criteria=_build_testing_criteria( # type: ignore[arg-type] # pyright: ignore[reportArgumentType] diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index cef890c7af..d11999b76a 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -769,6 +769,10 @@ def test_quality_evaluators_use_strings(self) -> None: assert c["data_mapping"]["query"] == "{{item.query}}", f"{c['name']}" assert c["data_mapping"]["response"] == "{{item.response}}", f"{c['name']}" + def test_similarity_includes_ground_truth(self) -> None: + criteria = _build_testing_criteria(["similarity"], "gpt-4o", include_data_mapping=True) + assert criteria[0]["data_mapping"]["ground_truth"] == "{{item.ground_truth}}" + def test_all_tool_evaluators_include_tool_definitions(self) -> None: tool_evals = [ "tool_call_accuracy", @@ -801,6 +805,10 @@ def test_with_tools(self) -> None: schema = _build_item_schema(has_tools=True) assert "tool_definitions" in schema["properties"] + def test_with_ground_truth(self) -> None: + schema = _build_item_schema(has_ground_truth=True) + assert "ground_truth" in schema["properties"] + def test_with_context_and_tools(self) -> None: schema = _build_item_schema(has_context=True, has_tools=True) assert "context" in schema["properties"] @@ -1015,6 +1023,50 @@ async def test_evaluate_with_tool_items_uses_dataset_path(self) -> None: assert ds["type"] == "jsonl" assert "tool_definitions" in ds["source"]["content"][0]["item"] + async def test_evaluate_ground_truth_in_dataset(self) -> None: + """Items with expected_output include ground_truth in the JSONL payload.""" + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_gt" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_gt" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = _rc(passed=1) + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + items = [ + EvalItem( + conversation=[Message("user", ["What is 2+2?"]), Message("assistant", ["4"])], + expected_output="4", + ), + ] + + fe = FoundryEvals( + client=mock_client, + model="gpt-4o", + evaluators=[FoundryEvals.SIMILARITY], + ) + await fe.evaluate(items) + + # Verify ground_truth appears in JSONL data + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + assert ds["source"]["content"][0]["item"]["ground_truth"] == "4" + + # Verify item_schema includes ground_truth + create_call = mock_client.evals.create.call_args + schema = create_call.kwargs["data_source_config"]["item_schema"] + assert "ground_truth" in schema["properties"] + async def test_evaluate_image_content_in_dataset(self) -> None: """Image content in conversations is preserved in the JSONL payload.""" mock_client = MagicMock() @@ -1988,6 +2040,102 @@ async def test_per_agent_excludes_tool_evaluators_when_no_tools(self) -> None: "researcher has tools — should get tool_call_accuracy" ) + async def 
test_expected_output_stamps_overall_items(self) -> None: + """expected_output is stamped on overall items as ground_truth in the dataset.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent", "Response", ["Query"]) + final_output = [Message("assistant", ["Final answer"])] + + events = [ + WorkflowEvent.executor_invoked("agent", "Test query"), + WorkflowEvent.executor_completed("agent", [aer]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + mock_workflow.run = AsyncMock(return_value=wf_result) + + results = await evaluate_workflow( + workflow=mock_workflow, + queries=["Test query"], + expected_output=["Expected answer"], + evaluators=FoundryEvals( + client=mock_oai, + model="gpt-4o", + evaluators=[FoundryEvals.SIMILARITY], + ), + ) + + assert results[0].status == "completed" + + # Verify overall eval's dataset includes ground_truth + # The overall eval is the last evals.runs.create call + calls = mock_oai.evals.runs.create.call_args_list + overall_call = calls[-1] + ds = overall_call.kwargs["data_source"] + overall_item = ds["source"]["content"][0]["item"] + assert overall_item["ground_truth"] == "Expected answer" + + async def test_expected_output_with_num_repetitions(self) -> None: + """expected_output is correctly stamped on overall items across multiple repetitions.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent", "Response", ["Query"]) + final_output = [Message("assistant", ["Final answer"])] + + events = [ + WorkflowEvent.executor_invoked("agent", "Test query"), + WorkflowEvent.executor_completed("agent", [aer]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + mock_workflow.run = AsyncMock(return_value=wf_result) + + results = await evaluate_workflow( + workflow=mock_workflow, + queries=["Test query"], + expected_output=["Expected answer"], + evaluators=FoundryEvals( + client=mock_oai, + model="gpt-4o", + evaluators=[FoundryEvals.SIMILARITY], + ), + num_repetitions=2, + ) + + assert results[0].status == "completed" + + # workflow.run should be called twice (once per repetition) + assert mock_workflow.run.call_count == 2 + + # Verify all overall items have ground_truth stamped + calls = mock_oai.evals.runs.create.call_args_list + overall_call = calls[-1] + ds = overall_call.kwargs["data_source"] + items = ds["source"]["content"] + assert len(items) == 2 + for item in items: + assert item["item"]["ground_truth"] == "Expected answer" + + async def test_expected_output_length_mismatch_raises(self) -> None: + """Mismatched queries and expected_output lengths raise ValueError.""" + mock_oai = MagicMock() + mock_workflow = MagicMock() + + with pytest.raises(ValueError, match="expected_output"): + await evaluate_workflow( + workflow=mock_workflow, + queries=["q1", "q2"], + expected_output=["e1"], + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + # --------------------------------------------------------------------------- # EvalItemResult and EvalScoreResult diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py index 7c8b306c11..42b67b91e7 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py +++ 
b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py @@ -2,9 +2,10 @@ """Evaluate an agent using Azure AI Foundry's built-in evaluators. -This sample demonstrates two patterns: +This sample demonstrates three patterns: 1. evaluate_agent(responses=...) — Evaluate a response you already have. 2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call. +3. Similarity — Compare agent output against ground-truth reference answers. See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation. @@ -149,6 +150,41 @@ async def main() -> None: else: print(f"[FAIL] {r.failed} failed") + # ========================================================================= + # Pattern 3: Similarity — compare agent output to ground-truth answers + # ========================================================================= + print() + print("=" * 60) + print("Pattern 3: Similarity evaluation with ground truth") + print("=" * 60) + + # Similarity requires expected_output — a reference answer per query + # that the evaluator compares against the agent's actual response. + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "How much does a flight from Seattle to Paris cost?", + ], + expected_output=[ + "62°F, cloudy with a chance of rain", + "Flights from Seattle to Paris: $450 round-trip", + ], + evaluators=FoundryEvals( + client=chat_client, + evaluators=[FoundryEvals.SIMILARITY], + ), + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("[PASS] All passed") + else: + print(f"[FAIL] {r.failed} failed") + if __name__ == "__main__": asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py index e2861324f6..f89d85b5b5 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py @@ -2,11 +2,12 @@ """Evaluate a multi-agent workflow using Azure AI Foundry evaluators. -This sample demonstrates two patterns: +This sample demonstrates three patterns: 1. Post-hoc: Run the workflow, then evaluate the result you already have. 2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you. +3. Similarity: Evaluate an agent's output against ground-truth reference answers. -Both patterns return a list of results (one per provider), each with a per-agent +Patterns 1 & 2 return a list of results (one per provider), each with a per-agent breakdown in sub_results so you can identify which agent is underperforming. Prerequisites: @@ -79,7 +80,6 @@ async def main() -> None: # 4. 
Create the evaluator — provider config goes here, once evals = FoundryEvals(client=client) - # ========================================================================= # Pattern 1: Post-hoc — evaluate a workflow run you already did # ========================================================================= @@ -143,6 +143,43 @@ async def main() -> None: if agent_eval.report_url: print(f" Portal: {agent_eval.report_url}") + # ========================================================================= + # Pattern 3: Similarity — compare workflow output to ground-truth answers + # ========================================================================= + # Build a fresh workflow to avoid stale session state from Pattern 2. + workflow3 = SequentialBuilder(participants=[researcher, planner]).build() + + print() + print("=" * 60) + print("Pattern 3: Similarity evaluation with ground truth") + print("=" * 60) + + # Similarity compares the final workflow output against a reference answer, + # so per-agent breakdown is disabled — individual agents don't have their + # own ground-truth targets. + eval_results = await evaluate_workflow( + workflow=workflow3, + queries=[ + "Plan a trip from Seattle to Paris", + "Plan a trip from London to Tokyo", + ], + expected_output=[ + "Pack layers and an umbrella for Paris. Flights from Seattle are around $450 round-trip.", + "Bring warm clothing for Tokyo in spring. Flights from London are around $500 round-trip.", + ], + evaluators=FoundryEvals( + client=client, + evaluators=[FoundryEvals.SIMILARITY], + ), + include_per_agent=False, + ) + + for r in eval_results: + print(f"\nOverall: {r.status}") + print(f" Passed: {r.passed}/{r.total}") + if r.report_url: + print(f" Portal: {r.report_url}") + if __name__ == "__main__": asyncio.run(main()) @@ -173,4 +210,12 @@ async def main() -> None: Per-agent breakdown: researcher: 2/2 passed planner: 2/2 passed + +============================================================ +Pattern 3: Similarity evaluation with ground truth +============================================================ + +Overall: completed + Passed: 2/2 + Portal: https://ai.azure.com/... 
""" From bab4c9ad760c8d1767adcf3c4b308e430f74bd92 Mon Sep 17 00:00:00 2001 From: chetantoshniwal Date: Mon, 13 Apr 2026 15:20:58 -0700 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- python/packages/core/agent_framework/_evaluation.py | 2 ++ .../packages/foundry/agent_framework_foundry/_foundry_evals.py | 2 +- .../evaluation/foundry_evals/evaluate_workflow_sample.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 43b718f1e0..cc334167eb 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -1732,6 +1732,8 @@ async def evaluate_workflow( if workflow_result is None and queries is None: raise ValueError("Provide either 'workflow_result' or 'queries'.") + if expected_output is not None and queries is None: + raise ValueError("Provide 'queries' when using 'expected_output'; 'expected_output' is not supported with 'workflow_result' only.") if expected_output is not None and queries is not None and len(expected_output) != len(queries): raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.") diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index 01ef389db5..2f68816591 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -692,7 +692,7 @@ async def _evaluate_via_dataset( ] if item.context: d["context"] = item.context - if item.expected_output: + if item.expected_output is not None: d["ground_truth"] = item.expected_output dicts.append(d) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py index f89d85b5b5..b9ffa1f6dd 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py @@ -5,7 +5,7 @@ This sample demonstrates three patterns: 1. Post-hoc: Run the workflow, then evaluate the result you already have. 2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you. -3. Similarity: Evaluate an agent's output against ground-truth reference answers. +3. Similarity: Evaluate the workflow's final output against ground-truth reference answers. Patterns 1 & 2 return a list of results (one per provider), each with a per-agent breakdown in sub_results so you can identify which agent is underperforming. 
From 72c3461668ea92d60d64355e31a6b3bc78effa8d Mon Sep 17 00:00:00 2001 From: Chetan Toshniwal Date: Mon, 13 Apr 2026 18:06:08 -0700 Subject: [PATCH 3/3] fix: wrap long line to satisfy ruff E501 --- python/packages/core/agent_framework/_evaluation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index cc334167eb..64fab0eacb 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -1733,7 +1733,10 @@ async def evaluate_workflow( raise ValueError("Provide either 'workflow_result' or 'queries'.") if expected_output is not None and queries is None: - raise ValueError("Provide 'queries' when using 'expected_output'; 'expected_output' is not supported with 'workflow_result' only.") + raise ValueError( + "Provide 'queries' when using 'expected_output';" + " 'expected_output' is not supported with 'workflow_result' only." + ) if expected_output is not None and queries is not None and len(expected_output) != len(queries): raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.")
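
Usage sketch (supplementary, not part of the patch series above): a minimal
end-to-end call of the new expected_output parameter, assuming the imports,
client setup, and workflow construction shown in evaluate_workflow_sample.py.
The queries and reference answers are the illustrative values from that
sample; drive the coroutine with asyncio.run() as the samples do.

    from agent_framework import evaluate_workflow
    from agent_framework_foundry import FoundryEvals

    async def run_similarity_eval(workflow, client) -> None:
        # One reference answer per query. A single string is wrapped into a
        # one-element list; a length mismatch with `queries`, or passing
        # `expected_output` without `queries`, raises ValueError up front.
        results = await evaluate_workflow(
            workflow=workflow,
            queries=[
                "Plan a trip from Seattle to Paris",
                "Plan a trip from London to Tokyo",
            ],
            expected_output=[
                "Pack layers and an umbrella for Paris. Flights from Seattle"
                " are around $450 round-trip.",
                "Bring warm clothing for Tokyo in spring. Flights from London"
                " are around $500 round-trip.",
            ],
            evaluators=FoundryEvals(
                client=client,
                evaluators=[FoundryEvals.SIMILARITY],
            ),
            # Similarity scores the final workflow output against the
            # reference answer, so the per-agent breakdown is disabled.
            include_per_agent=False,
        )
        for r in results:
            print(f"{r.status}: {r.passed}/{r.total} passed")

Under the hood, each overall EvalItem has expected_output stamped on it; the
Foundry dataset path then emits it as a "ground_truth" field in the JSONL row,
adds a matching string property to the generated item_schema, and binds it to
the similarity criterion via the data mapping "{{item.ground_truth}}".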