From d9075904943f5ef901a6c116914b4daf16e8c685 Mon Sep 17 00:00:00 2001
From: Chetan Toshniwal
Date: Tue, 7 Apr 2026 13:27:31 -0700
Subject: [PATCH 1/3] feat(evals): add ground_truth support for similarity
 evaluator

- Include expected_output as ground_truth in Foundry JSONL dataset rows
- Add ground_truth to item schema and data mapping for similarity evaluator
- Add expected_output parameter to evaluate_workflow
- Add similarity Pattern 3 to evaluate_agent and evaluate_workflow samples
- Add tests for ground_truth in dataset, schema, and evaluate_workflow
---
 .../core/agent_framework/_evaluation.py       |  15 +-
 .../agent_framework_foundry/_foundry_evals.py |  20 ++-
 .../foundry/tests/test_foundry_evals.py       | 148 ++++++++++++++++++
 .../foundry_evals/evaluate_agent_sample.py    |  38 ++++-
 .../foundry_evals/evaluate_workflow_sample.py |  51 +++++-
 5 files changed, 265 insertions(+), 7 deletions(-)

diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 682903d448..43b718f1e0 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -1659,6 +1659,7 @@ async def evaluate_workflow(
     workflow: Workflow,
     workflow_result: WorkflowRunResult | None = None,
     queries: str | Sequence[str] | None = None,
+    expected_output: str | Sequence[str] | None = None,
     evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]],
     eval_name: str | None = None,
     include_overall: bool = True,
@@ -1683,6 +1684,11 @@ async def evaluate_workflow(
         workflow: The workflow instance.
         workflow_result: A completed ``WorkflowRunResult``.
         queries: Test queries to run through the workflow.
+        expected_output: Ground-truth expected output(s), one per query. A
+            single string is wrapped into a one-element list. When provided, it
+            must be the same length as ``queries``. Each value is stamped on
+            the corresponding ``EvalItem.expected_output`` for evaluators
+            that compare against a reference answer (e.g. similarity).
         evaluators: One or more ``Evaluator`` instances.
         eval_name: Display name for the evaluation.
         include_overall: Whether to evaluate the workflow's final output. 
@@ -1720,10 +1726,15 @@ async def evaluate_workflow( # Normalize singular query to list if isinstance(queries, str): queries = [queries] + if isinstance(expected_output, str): + expected_output = [expected_output] if workflow_result is None and queries is None: raise ValueError("Provide either 'workflow_result' or 'queries'.") + if expected_output is not None and queries is not None and len(expected_output) != len(queries): + raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.") + if num_repetitions < 1: raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions}.") @@ -1737,7 +1748,7 @@ async def evaluate_workflow( if queries is not None: results_list: list[WRR] = [] for _rep in range(num_repetitions): - for q in queries: + for qi, q in enumerate(queries): result = await workflow.run(q) if not isinstance(result, WRR): raise TypeError(f"Expected WorkflowRunResult from workflow.run(), got {type(result).__name__}.") @@ -1746,6 +1757,8 @@ async def evaluate_workflow( if include_overall: overall_item = _build_overall_item(q, result) if overall_item: + if expected_output is not None: + overall_item.expected_output = expected_output[qi] overall_items.append(overall_item) else: assert workflow_result is not None # noqa: S101 # nosec B101 diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index a5033a8e87..01ef389db5 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -75,6 +75,11 @@ "builtin.tool_call_success", } +# Evaluators that require a ground_truth / expected_output field. +_GROUND_TRUTH_EVALUATORS: set[str] = { + "builtin.similarity", +} + _BUILTIN_EVALUATORS: dict[str, str] = { # Agent behavior "intent_resolution": "builtin.intent_resolution", @@ -196,6 +201,8 @@ def _build_testing_criteria( } if qualified == "builtin.groundedness": mapping["context"] = "{{item.context}}" + if qualified in _GROUND_TRUTH_EVALUATORS: + mapping["ground_truth"] = "{{item.ground_truth}}" if qualified in _TOOL_EVALUATORS: mapping["tool_definitions"] = "{{item.tool_definitions}}" entry["data_mapping"] = mapping @@ -204,7 +211,9 @@ def _build_testing_criteria( return criteria -def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> dict[str, Any]: +def _build_item_schema( + *, has_context: bool = False, has_tools: bool = False, has_ground_truth: bool = False +) -> dict[str, Any]: """Build the ``item_schema`` for custom JSONL eval definitions.""" properties: dict[str, Any] = { "query": {"type": "string"}, @@ -214,6 +223,8 @@ def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> } if has_context: properties["context"] = {"type": "string"} + if has_ground_truth: + properties["ground_truth"] = {"type": "string"} if has_tools: properties["tool_definitions"] = {"type": "array"} return { @@ -681,16 +692,21 @@ async def _evaluate_via_dataset( ] if item.context: d["context"] = item.context + if item.expected_output: + d["ground_truth"] = item.expected_output dicts.append(d) has_context = any("context" in d for d in dicts) + has_ground_truth = any("ground_truth" in d for d in dicts) has_tools = any("tool_definitions" in d for d in dicts) eval_obj = await self._client.evals.create( name=eval_name, data_source_config={ # type: ignore[arg-type] # pyright: ignore[reportArgumentType] "type": "custom", - "item_schema": 
_build_item_schema(has_context=has_context, has_tools=has_tools), + "item_schema": _build_item_schema( + has_context=has_context, has_ground_truth=has_ground_truth, has_tools=has_tools + ), "include_sample_schema": True, }, testing_criteria=_build_testing_criteria( # type: ignore[arg-type] # pyright: ignore[reportArgumentType] diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index cef890c7af..d11999b76a 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -769,6 +769,10 @@ def test_quality_evaluators_use_strings(self) -> None: assert c["data_mapping"]["query"] == "{{item.query}}", f"{c['name']}" assert c["data_mapping"]["response"] == "{{item.response}}", f"{c['name']}" + def test_similarity_includes_ground_truth(self) -> None: + criteria = _build_testing_criteria(["similarity"], "gpt-4o", include_data_mapping=True) + assert criteria[0]["data_mapping"]["ground_truth"] == "{{item.ground_truth}}" + def test_all_tool_evaluators_include_tool_definitions(self) -> None: tool_evals = [ "tool_call_accuracy", @@ -801,6 +805,10 @@ def test_with_tools(self) -> None: schema = _build_item_schema(has_tools=True) assert "tool_definitions" in schema["properties"] + def test_with_ground_truth(self) -> None: + schema = _build_item_schema(has_ground_truth=True) + assert "ground_truth" in schema["properties"] + def test_with_context_and_tools(self) -> None: schema = _build_item_schema(has_context=True, has_tools=True) assert "context" in schema["properties"] @@ -1015,6 +1023,50 @@ async def test_evaluate_with_tool_items_uses_dataset_path(self) -> None: assert ds["type"] == "jsonl" assert "tool_definitions" in ds["source"]["content"][0]["item"] + async def test_evaluate_ground_truth_in_dataset(self) -> None: + """Items with expected_output include ground_truth in the JSONL payload.""" + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_gt" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_gt" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = _rc(passed=1) + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + items = [ + EvalItem( + conversation=[Message("user", ["What is 2+2?"]), Message("assistant", ["4"])], + expected_output="4", + ), + ] + + fe = FoundryEvals( + client=mock_client, + model="gpt-4o", + evaluators=[FoundryEvals.SIMILARITY], + ) + await fe.evaluate(items) + + # Verify ground_truth appears in JSONL data + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + assert ds["source"]["content"][0]["item"]["ground_truth"] == "4" + + # Verify item_schema includes ground_truth + create_call = mock_client.evals.create.call_args + schema = create_call.kwargs["data_source_config"]["item_schema"] + assert "ground_truth" in schema["properties"] + async def test_evaluate_image_content_in_dataset(self) -> None: """Image content in conversations is preserved in the JSONL payload.""" mock_client = MagicMock() @@ -1988,6 +2040,102 @@ async def test_per_agent_excludes_tool_evaluators_when_no_tools(self) -> None: "researcher has tools — should get tool_call_accuracy" ) + async def 
test_expected_output_stamps_overall_items(self) -> None: + """expected_output is stamped on overall items as ground_truth in the dataset.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent", "Response", ["Query"]) + final_output = [Message("assistant", ["Final answer"])] + + events = [ + WorkflowEvent.executor_invoked("agent", "Test query"), + WorkflowEvent.executor_completed("agent", [aer]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + mock_workflow.run = AsyncMock(return_value=wf_result) + + results = await evaluate_workflow( + workflow=mock_workflow, + queries=["Test query"], + expected_output=["Expected answer"], + evaluators=FoundryEvals( + client=mock_oai, + model="gpt-4o", + evaluators=[FoundryEvals.SIMILARITY], + ), + ) + + assert results[0].status == "completed" + + # Verify overall eval's dataset includes ground_truth + # The overall eval is the last evals.runs.create call + calls = mock_oai.evals.runs.create.call_args_list + overall_call = calls[-1] + ds = overall_call.kwargs["data_source"] + overall_item = ds["source"]["content"][0]["item"] + assert overall_item["ground_truth"] == "Expected answer" + + async def test_expected_output_with_num_repetitions(self) -> None: + """expected_output is correctly stamped on overall items across multiple repetitions.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent", "Response", ["Query"]) + final_output = [Message("assistant", ["Final answer"])] + + events = [ + WorkflowEvent.executor_invoked("agent", "Test query"), + WorkflowEvent.executor_completed("agent", [aer]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + mock_workflow.run = AsyncMock(return_value=wf_result) + + results = await evaluate_workflow( + workflow=mock_workflow, + queries=["Test query"], + expected_output=["Expected answer"], + evaluators=FoundryEvals( + client=mock_oai, + model="gpt-4o", + evaluators=[FoundryEvals.SIMILARITY], + ), + num_repetitions=2, + ) + + assert results[0].status == "completed" + + # workflow.run should be called twice (once per repetition) + assert mock_workflow.run.call_count == 2 + + # Verify all overall items have ground_truth stamped + calls = mock_oai.evals.runs.create.call_args_list + overall_call = calls[-1] + ds = overall_call.kwargs["data_source"] + items = ds["source"]["content"] + assert len(items) == 2 + for item in items: + assert item["item"]["ground_truth"] == "Expected answer" + + async def test_expected_output_length_mismatch_raises(self) -> None: + """Mismatched queries and expected_output lengths raise ValueError.""" + mock_oai = MagicMock() + mock_workflow = MagicMock() + + with pytest.raises(ValueError, match="expected_output"): + await evaluate_workflow( + workflow=mock_workflow, + queries=["q1", "q2"], + expected_output=["e1"], + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + # --------------------------------------------------------------------------- # EvalItemResult and EvalScoreResult diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py index 7c8b306c11..42b67b91e7 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py +++ 
b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py @@ -2,9 +2,10 @@ """Evaluate an agent using Azure AI Foundry's built-in evaluators. -This sample demonstrates two patterns: +This sample demonstrates three patterns: 1. evaluate_agent(responses=...) — Evaluate a response you already have. 2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call. +3. Similarity — Compare agent output against ground-truth reference answers. See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation. @@ -149,6 +150,41 @@ async def main() -> None: else: print(f"[FAIL] {r.failed} failed") + # ========================================================================= + # Pattern 3: Similarity — compare agent output to ground-truth answers + # ========================================================================= + print() + print("=" * 60) + print("Pattern 3: Similarity evaluation with ground truth") + print("=" * 60) + + # Similarity requires expected_output — a reference answer per query + # that the evaluator compares against the agent's actual response. + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "How much does a flight from Seattle to Paris cost?", + ], + expected_output=[ + "62°F, cloudy with a chance of rain", + "Flights from Seattle to Paris: $450 round-trip", + ], + evaluators=FoundryEvals( + client=chat_client, + evaluators=[FoundryEvals.SIMILARITY], + ), + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("[PASS] All passed") + else: + print(f"[FAIL] {r.failed} failed") + if __name__ == "__main__": asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py index e2861324f6..f89d85b5b5 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py @@ -2,11 +2,12 @@ """Evaluate a multi-agent workflow using Azure AI Foundry evaluators. -This sample demonstrates two patterns: +This sample demonstrates three patterns: 1. Post-hoc: Run the workflow, then evaluate the result you already have. 2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you. +3. Similarity: Evaluate an agent's output against ground-truth reference answers. -Both patterns return a list of results (one per provider), each with a per-agent +Patterns 1 & 2 return a list of results (one per provider), each with a per-agent breakdown in sub_results so you can identify which agent is underperforming. Prerequisites: @@ -79,7 +80,6 @@ async def main() -> None: # 4. 
Create the evaluator — provider config goes here, once evals = FoundryEvals(client=client) - # ========================================================================= # Pattern 1: Post-hoc — evaluate a workflow run you already did # ========================================================================= @@ -143,6 +143,43 @@ async def main() -> None: if agent_eval.report_url: print(f" Portal: {agent_eval.report_url}") + # ========================================================================= + # Pattern 3: Similarity — compare workflow output to ground-truth answers + # ========================================================================= + # Build a fresh workflow to avoid stale session state from Pattern 2. + workflow3 = SequentialBuilder(participants=[researcher, planner]).build() + + print() + print("=" * 60) + print("Pattern 3: Similarity evaluation with ground truth") + print("=" * 60) + + # Similarity compares the final workflow output against a reference answer, + # so per-agent breakdown is disabled — individual agents don't have their + # own ground-truth targets. + eval_results = await evaluate_workflow( + workflow=workflow3, + queries=[ + "Plan a trip from Seattle to Paris", + "Plan a trip from London to Tokyo", + ], + expected_output=[ + "Pack layers and an umbrella for Paris. Flights from Seattle are around $450 round-trip.", + "Bring warm clothing for Tokyo in spring. Flights from London are around $500 round-trip.", + ], + evaluators=FoundryEvals( + client=client, + evaluators=[FoundryEvals.SIMILARITY], + ), + include_per_agent=False, + ) + + for r in eval_results: + print(f"\nOverall: {r.status}") + print(f" Passed: {r.passed}/{r.total}") + if r.report_url: + print(f" Portal: {r.report_url}") + if __name__ == "__main__": asyncio.run(main()) @@ -173,4 +210,12 @@ async def main() -> None: Per-agent breakdown: researcher: 2/2 passed planner: 2/2 passed + +============================================================ +Pattern 3: Similarity evaluation with ground truth +============================================================ + +Overall: completed + Passed: 2/2 + Portal: https://ai.azure.com/... 
""" From bab4c9ad760c8d1767adcf3c4b308e430f74bd92 Mon Sep 17 00:00:00 2001 From: chetantoshniwal Date: Mon, 13 Apr 2026 15:20:58 -0700 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- python/packages/core/agent_framework/_evaluation.py | 2 ++ .../packages/foundry/agent_framework_foundry/_foundry_evals.py | 2 +- .../evaluation/foundry_evals/evaluate_workflow_sample.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 43b718f1e0..cc334167eb 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -1732,6 +1732,8 @@ async def evaluate_workflow( if workflow_result is None and queries is None: raise ValueError("Provide either 'workflow_result' or 'queries'.") + if expected_output is not None and queries is None: + raise ValueError("Provide 'queries' when using 'expected_output'; 'expected_output' is not supported with 'workflow_result' only.") if expected_output is not None and queries is not None and len(expected_output) != len(queries): raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.") diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index 01ef389db5..2f68816591 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -692,7 +692,7 @@ async def _evaluate_via_dataset( ] if item.context: d["context"] = item.context - if item.expected_output: + if item.expected_output is not None: d["ground_truth"] = item.expected_output dicts.append(d) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py index f89d85b5b5..b9ffa1f6dd 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py @@ -5,7 +5,7 @@ This sample demonstrates three patterns: 1. Post-hoc: Run the workflow, then evaluate the result you already have. 2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you. -3. Similarity: Evaluate an agent's output against ground-truth reference answers. +3. Similarity: Evaluate the workflow's final output against ground-truth reference answers. Patterns 1 & 2 return a list of results (one per provider), each with a per-agent breakdown in sub_results so you can identify which agent is underperforming. 
From 72c3461668ea92d60d64355e31a6b3bc78effa8d Mon Sep 17 00:00:00 2001 From: Chetan Toshniwal Date: Mon, 13 Apr 2026 18:06:08 -0700 Subject: [PATCH 3/3] fix: wrap long line to satisfy ruff E501 --- python/packages/core/agent_framework/_evaluation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index cc334167eb..64fab0eacb 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -1733,7 +1733,10 @@ async def evaluate_workflow( raise ValueError("Provide either 'workflow_result' or 'queries'.") if expected_output is not None and queries is None: - raise ValueError("Provide 'queries' when using 'expected_output'; 'expected_output' is not supported with 'workflow_result' only.") + raise ValueError( + "Provide 'queries' when using 'expected_output';" + " 'expected_output' is not supported with 'workflow_result' only." + ) if expected_output is not None and queries is not None and len(expected_output) != len(queries): raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.")
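
Usage sketch (supplementary, not part of the patch series above): a minimal
end-to-end call of the new expected_output parameter, assuming the imports,
client setup, and workflow construction shown in evaluate_workflow_sample.py.
The queries and reference answers are the illustrative values from that
sample; drive the coroutine with asyncio.run() as the samples do.

    from agent_framework import evaluate_workflow
    from agent_framework_foundry import FoundryEvals

    async def run_similarity_eval(workflow, client) -> None:
        # One reference answer per query. A single string is wrapped into a
        # one-element list; a length mismatch with `queries`, or passing
        # `expected_output` without `queries`, raises ValueError up front.
        results = await evaluate_workflow(
            workflow=workflow,
            queries=[
                "Plan a trip from Seattle to Paris",
                "Plan a trip from London to Tokyo",
            ],
            expected_output=[
                "Pack layers and an umbrella for Paris. Flights from Seattle"
                " are around $450 round-trip.",
                "Bring warm clothing for Tokyo in spring. Flights from London"
                " are around $500 round-trip.",
            ],
            evaluators=FoundryEvals(
                client=client,
                evaluators=[FoundryEvals.SIMILARITY],
            ),
            # Similarity scores the final workflow output against the
            # reference answer, so the per-agent breakdown is disabled.
            include_per_agent=False,
        )
        for r in results:
            print(f"{r.status}: {r.passed}/{r.total} passed")

Under the hood, each overall EvalItem has expected_output stamped on it; the
Foundry dataset path then emits it as a "ground_truth" field in the JSONL row,
adds a matching string property to the generated item_schema, and binds it to
the similarity criterion via the data mapping "{{item.ground_truth}}".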