From b0523296c8fb4ca634b5a1dfb9bc4534c845687e Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Sun, 7 Dec 2025 14:43:39 +0530 Subject: [PATCH 1/6] Add basic evaluation example script --- examples/basic_evaluation.py | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/basic_evaluation.py diff --git a/examples/basic_evaluation.py b/examples/basic_evaluation.py new file mode 100644 index 0000000..e5c06d7 --- /dev/null +++ b/examples/basic_evaluation.py @@ -0,0 +1,45 @@ +""" +Basic Evaluation Example for AgentUnit +-------------------------------------- + +This script demonstrates how to run a minimal evaluation using +AgentUnit with a FakeAdapter. It is designed for beginners and does +not require any extra dependencies. +""" + +from agentunit.core.evaluator import Evaluator +from agentunit.core.adapters import BaseAdapter + + +class FakeAdapter(BaseAdapter): + """ + A simple mock adapter used only for demonstration. + It returns a predictable output so evaluation is easy to understand. + """ + + def generate(self, prompt: str) -> str: + # Always returns the same answer for simplicity + return "Hello, this is a fake response!" + + +def main(): + # Step 1 — Prepare the adapter + adapter = FakeAdapter() + + # Step 2 — Create the evaluator + evaluator = Evaluator(adapter=adapter) + + # Step 3 — Prepare an example prompt + prompt = "Say hello!" + + # Step 4 — Run the evaluation + result = evaluator.evaluate(prompt) + + # Step 5 — Print the output + print("Prompt:", prompt) + print("Model Output:", result.output) + print("Evaluation Score:", result.score) + + +if __name__ == "__main__": + main() From dd2f4feeed6b9ddf968cd25fccc7cbdea345e0d7 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Mon, 8 Dec 2025 15:37:23 +0530 Subject: [PATCH 2/6] Fix typos and improve clarity in docstrings across core modules --- src/agentunit/core/__init__.py | 4 +++- src/agentunit/core/exceptions.py | 16 ++++++++++++---- src/agentunit/core/replay.py | 8 ++++++-- src/agentunit/core/runner.py | 4 +++- src/agentunit/core/scenario.py | 16 ++++++++++++---- src/agentunit/core/trace.py | 12 +++++++++--- 6 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/agentunit/core/__init__.py b/src/agentunit/core/__init__.py index 952bcb5..757d228 100644 --- a/src/agentunit/core/__init__.py +++ b/src/agentunit/core/__init__.py @@ -1,4 +1,6 @@ -"""Core components for AgentUnit.""" +""" +Core components for AgentUnit. +""" from agentunit.datasets.base import DatasetCase, DatasetSource from agentunit.reporting.results import ScenarioResult diff --git a/src/agentunit/core/exceptions.py b/src/agentunit/core/exceptions.py index 03326f2..2a1d159 100644 --- a/src/agentunit/core/exceptions.py +++ b/src/agentunit/core/exceptions.py @@ -1,15 +1,23 @@ -"""Custom exceptions for AgentUnit.""" +""" +Custom exceptions for AgentUnit. +""" from __future__ import annotations class AgentUnitError(Exception): - """Base class for AgentUnit exceptions.""" + """ + Base class for AgentUnit exceptions. + """ class AdapterNotAvailableError(AgentUnitError): - """Raised when an adapter cannot be initialized due to missing dependencies.""" + """ + Raised when an adapter cannot be initialized due to missing dependencies. + """ class ScenarioExecutionError(AgentUnitError): - """Raised when a scenario fails during execution.""" + """ + Raised when a scenario fails during execution. + """ diff --git a/src/agentunit/core/replay.py b/src/agentunit/core/replay.py index af78380..f0e2b5f 100644 --- a/src/agentunit/core/replay.py +++ b/src/agentunit/core/replay.py @@ -1,4 +1,6 @@ -"""Replay utilities leveraging stored traces.""" +""" +Replay utilities leveraging stored traces. +""" from __future__ import annotations @@ -8,7 +10,9 @@ def load_traces(traces_dir: str | Path) -> list[TraceLog]: - """Load stored traces from disk for deterministic replay or analysis.""" + """ + Load stored traces from disk for deterministic replay or analysis. + """ path = Path(traces_dir) logs: list[TraceLog] = [] diff --git a/src/agentunit/core/runner.py b/src/agentunit/core/runner.py index bcbd66c..143921b 100644 --- a/src/agentunit/core/runner.py +++ b/src/agentunit/core/runner.py @@ -1,4 +1,6 @@ -"""Scenario runner orchestration.""" +""" +Scenario runner orchestration. +""" from __future__ import annotations diff --git a/src/agentunit/core/scenario.py b/src/agentunit/core/scenario.py index f0b0d86..70bba2a 100644 --- a/src/agentunit/core/scenario.py +++ b/src/agentunit/core/scenario.py @@ -1,4 +1,6 @@ -"""Scenario definition API exposed to end users.""" +""" +Scenario definition API exposed to end users. +""" from __future__ import annotations @@ -19,7 +21,9 @@ @dataclass(slots=True) class Scenario: - """Defines a reproducible agent evaluation scenario.""" + """ + Defines a reproducible agent evaluation scenario. + """ name: str adapter: BaseAdapter @@ -75,7 +79,9 @@ def from_crewai( name: str | None = None, **options: object, ) -> Scenario: - """Create scenario from CrewAI crew.""" + """ + Create scenario from CrewAI crew. + """ from agentunit.adapters.crewai import CrewAIAdapter adapter = CrewAIAdapter.from_crew(crew, **options) @@ -91,7 +97,9 @@ def from_autogen( name: str | None = None, **options: object, ) -> Scenario: - """Create scenario from AutoGen orchestrator.""" + """ + Create scenario from AutoGen orchestrator. + """ from agentunit.adapters.autogen import AutoGenAdapter adapter = AutoGenAdapter(orchestrator=orchestrator, **options) diff --git a/src/agentunit/core/trace.py b/src/agentunit/core/trace.py index 52f1afc..59d0978 100644 --- a/src/agentunit/core/trace.py +++ b/src/agentunit/core/trace.py @@ -1,4 +1,6 @@ -"""Tracing utilities shared between adapters and the runner.""" +""" +Tracing utilities shared between adapters and the runner. +""" from __future__ import annotations @@ -11,7 +13,9 @@ @dataclass(slots=True) class TraceEvent: - """Represents a single prompt, tool call, or response in an agent run.""" + """ + Represents a single prompt, tool call, or response in an agent run. + """ type: str payload: dict[str, Any] @@ -20,7 +24,9 @@ class TraceEvent: @dataclass(slots=True) class TraceLog: - """A collection of chronological events for a scenario iteration.""" + """ + A collection of chronological events for a scenario iteration. + """ events: list[TraceEvent] = field(default_factory=list) From 80b0706d4ec6be984702f2b29f1116c04d3508e9 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Sun, 14 Dec 2025 19:59:28 +0530 Subject: [PATCH 3/6] Add Google-style docstrings to BaseAdapter methods --- examples/basic_evaluation.py | 2 +- src/agentunit/adapters/base.py | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/examples/basic_evaluation.py b/examples/basic_evaluation.py index e5c06d7..ac40b84 100644 --- a/examples/basic_evaluation.py +++ b/examples/basic_evaluation.py @@ -7,8 +7,8 @@ not require any extra dependencies. """ -from agentunit.core.evaluator import Evaluator from agentunit.core.adapters import BaseAdapter +from agentunit.core.evaluator import Evaluator class FakeAdapter(BaseAdapter): diff --git a/src/agentunit/adapters/base.py b/src/agentunit/adapters/base.py index f17fb10..5bd975b 100644 --- a/src/agentunit/adapters/base.py +++ b/src/agentunit/adapters/base.py @@ -32,14 +32,42 @@ class BaseAdapter(abc.ABC): @abc.abstractmethod def prepare(self) -> None: - """Perform any lazy setup (loading graphs, flows, etc.).""" + """ + Perform any lazy setup required before execution. + + This may include loading graphs, flows, or other resources. + + Returns: + None + """ + @abc.abstractmethod def execute(self, case: DatasetCase, trace: TraceLog) -> AdapterOutcome: - """Run the agent flow on a single dataset case.""" + """ + Run the agent flow on a single dataset case. + + Args: + case (DatasetCase): The dataset case to be processed. + trace (TraceLog): Trace log used to record execution details. + + Returns: + AdapterOutcome: The outcome produced by executing the adapter. + """ + def cleanup(self) -> None: # pragma: no cover - default no-op - """Hook for cleaning up resources such as temporary files or servers.""" + """ + Clean up resources after execution. + + This hook can be used to release resources such as temporary files + or running servers. + + Returns: + None + """ + def supports_replay(self) -> bool: return True + From 8e7b8c10e8d4a8d5a2d70d0ff650169af3c6f57b Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Mon, 15 Dec 2025 11:08:08 +0530 Subject: [PATCH 4/6] Format base adapter using ruff --- src/agentunit/adapters/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/agentunit/adapters/base.py b/src/agentunit/adapters/base.py index 5bd975b..9a3c0c1 100644 --- a/src/agentunit/adapters/base.py +++ b/src/agentunit/adapters/base.py @@ -41,7 +41,6 @@ def prepare(self) -> None: None """ - @abc.abstractmethod def execute(self, case: DatasetCase, trace: TraceLog) -> AdapterOutcome: """ @@ -55,7 +54,6 @@ def execute(self, case: DatasetCase, trace: TraceLog) -> AdapterOutcome: AdapterOutcome: The outcome produced by executing the adapter. """ - def cleanup(self) -> None: # pragma: no cover - default no-op """ Clean up resources after execution. @@ -67,7 +65,5 @@ def cleanup(self) -> None: # pragma: no cover - default no-op None """ - def supports_replay(self) -> bool: return True - From 0669ffd0f6d513c766e755ae26350e3265b36db4 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Wed, 17 Dec 2025 21:59:52 +0530 Subject: [PATCH 5/6] docs: add instructions for running CI checks locally --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 3b9fa4a..850d485 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,21 @@ poetry run python3 -m pytest tests -v Latest verification (2025-10-24): 144 passed, 10 skipped, 32 warnings. Warnings originate from third-party dependencies (`langchain` pydantic shim deprecations and `datetime.utcnow` usage). Track upstream fixes or pin patched releases as needed. +### Running CI Checks Locally + +Before opening a pull request, you can run the same checks locally that are executed in CI. + +#### Requirements +- Python **3.10 or higher** +- [Poetry](https://python-poetry.org/) installed + +#### Setup +Install dependencies (including dev tools): + +```bash +poetry install --with dev +``` + ## Contributing We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for: From fe9d27bfef283c06bc7c1619e6d21d55a652ca23 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Thu, 18 Dec 2025 20:15:34 +0530 Subject: [PATCH 6/6] Remove example file unrelated to CI documentation --- examples/basic_evaluation.py | 45 ------------------------------------ 1 file changed, 45 deletions(-) delete mode 100644 examples/basic_evaluation.py diff --git a/examples/basic_evaluation.py b/examples/basic_evaluation.py deleted file mode 100644 index ac40b84..0000000 --- a/examples/basic_evaluation.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Basic Evaluation Example for AgentUnit --------------------------------------- - -This script demonstrates how to run a minimal evaluation using -AgentUnit with a FakeAdapter. It is designed for beginners and does -not require any extra dependencies. -""" - -from agentunit.core.adapters import BaseAdapter -from agentunit.core.evaluator import Evaluator - - -class FakeAdapter(BaseAdapter): - """ - A simple mock adapter used only for demonstration. - It returns a predictable output so evaluation is easy to understand. - """ - - def generate(self, prompt: str) -> str: - # Always returns the same answer for simplicity - return "Hello, this is a fake response!" - - -def main(): - # Step 1 — Prepare the adapter - adapter = FakeAdapter() - - # Step 2 — Create the evaluator - evaluator = Evaluator(adapter=adapter) - - # Step 3 — Prepare an example prompt - prompt = "Say hello!" - - # Step 4 — Run the evaluation - result = evaluator.evaluate(prompt) - - # Step 5 — Print the output - print("Prompt:", prompt) - print("Model Output:", result.output) - print("Evaluation Score:", result.score) - - -if __name__ == "__main__": - main()