From 49595e8fd6f984b4a66b90037037b650c31b952c Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Sat, 21 Mar 2026 14:40:58 -0700 Subject: [PATCH 1/2] add a couple of simple evaluators Signed-off-by: Peter Jausovec --- .github/workflows/validate-evaluators.yaml | 4 +- .gitignore | 1 + README.md | 4 +- evaluators/contains/contains.py | 57 +++++++++++++ evaluators/contains/evaluator.yaml | 6 ++ evaluators/equals/equals.py | 58 ++++++++++++++ evaluators/equals/evaluator.yaml | 6 ++ evaluators/is_json/evaluator.yaml | 6 ++ evaluators/is_json/is_json.py | 64 +++++++++++++++ evaluators/levenshtein_ratio/evaluator.yaml | 6 ++ .../levenshtein_ratio/levenshtein_ratio.py | 80 +++++++++++++++++++ evaluators/regex_match/evaluator.yaml | 6 ++ evaluators/regex_match/regex_match.py | 78 ++++++++++++++++++ evaluators/tool_coverage/tool_coverage.py | 2 +- evaluators/tool_sequence_match/evaluator.yaml | 6 ++ .../tool_sequence_match.py | 69 ++++++++++++++++ scripts/test_input.json | 6 +- scripts/validate_evaluator.py | 10 +-- 18 files changed, 456 insertions(+), 13 deletions(-) create mode 100644 .gitignore create mode 100644 evaluators/contains/contains.py create mode 100644 evaluators/contains/evaluator.yaml create mode 100644 evaluators/equals/equals.py create mode 100644 evaluators/equals/evaluator.yaml create mode 100644 evaluators/is_json/evaluator.yaml create mode 100644 evaluators/is_json/is_json.py create mode 100644 evaluators/levenshtein_ratio/evaluator.yaml create mode 100644 evaluators/levenshtein_ratio/levenshtein_ratio.py create mode 100644 evaluators/regex_match/evaluator.yaml create mode 100644 evaluators/regex_match/regex_match.py create mode 100644 evaluators/tool_sequence_match/evaluator.yaml create mode 100644 evaluators/tool_sequence_match/tool_sequence_match.py diff --git a/.github/workflows/validate-evaluators.yaml b/.github/workflows/validate-evaluators.yaml index 565286b..f494cb2 100644 --- a/.github/workflows/validate-evaluators.yaml +++ 
b/.github/workflows/validate-evaluators.yaml @@ -20,9 +20,7 @@ jobs: - name: Install dependencies run: | - pip install pyyaml - # TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI - pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py" + pip install pyyaml agentevals-evaluator-sdk - name: Discover and validate all evaluators run: | diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0cafc1c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.venv/ \ No newline at end of file diff --git a/README.md b/README.md index c94955a..0a7fac5 100644 --- a/README.md +++ b/README.md @@ -107,13 +107,13 @@ author: your-github-username Run the validation script to catch issues before submitting: ```bash -pip install agentevals-grader-sdk pyyaml +pip install pyyaml agentevals-evaluator-sdk python scripts/validate_evaluator.py evaluators/my_evaluator ``` This checks: - **Manifest schema** -- required fields, entrypoint exists, name matches directory -- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator +- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator - **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.) You can also test with a full eval run: diff --git a/evaluators/contains/contains.py b/evaluators/contains/contains.py new file mode 100644 index 0000000..cb840fc --- /dev/null +++ b/evaluators/contains/contains.py @@ -0,0 +1,57 @@ +"""Substring containment evaluator. + +Scores each invocation 1.0 if final_response contains the configured substring, +otherwise 0.0. + +Config: + substring (str): Required for scoring; if omitted, the evaluator is a no-op (1.0). 
+ +Usage in eval_config.yaml: + config: + substring: "expected phrase" +""" + +from __future__ import annotations + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +@evaluator +def contains(input: EvalInput) -> EvalResult: + needle = (input.config.get("substring") or "").strip() + if not needle: + return EvalResult( + score=1.0, + per_invocation_scores=[1.0] * len(input.invocations), + details={"note": "no substring configured; skipping check"}, + ) + + case_insensitive = bool(input.config.get("case_insensitive", False)) + haystack_fn = str.lower if case_insensitive else lambda s: s + needle_cmp = haystack_fn(needle) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + text = (inv.final_response or "") + if case_insensitive: + ok = needle_cmp in haystack_fn(text) + else: + ok = needle in text + if ok: + scores.append(1.0) + else: + scores.append(0.0) + issues.append(f"{inv.invocation_id}: response does not contain {needle!r}") + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + contains.run() diff --git a/evaluators/contains/evaluator.yaml b/evaluators/contains/evaluator.yaml new file mode 100644 index 0000000..368b446 --- /dev/null +++ b/evaluators/contains/evaluator.yaml @@ -0,0 +1,6 @@ +name: contains +description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive) +language: python +entrypoint: contains.py +tags: [string, contains,] +author: agentevals-dev diff --git a/evaluators/equals/equals.py b/evaluators/equals/equals.py new file mode 100644 index 0000000..874fa30 --- /dev/null +++ b/evaluators/equals/equals.py @@ -0,0 +1,58 @@ +"""Exact string match evaluator. + +Config: + expected (str): If omitted, no-op (1.0). 
+ case_insensitive (bool, default True): Compare normalized strings. + strip (bool, default True): Strip whitespace before compare. + +Usage: + config: + expected: "4" +""" + +from __future__ import annotations + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +@evaluator +def equals(input: EvalInput) -> EvalResult: + expected = input.config.get("expected") + if expected is None: + return EvalResult( + score=1.0, + per_invocation_scores=[1.0] * len(input.invocations), + details={"note": "no expected string configured; skipping check"}, + ) + + case_insensitive = bool(input.config.get("case_insensitive", True)) + strip = bool(input.config.get("strip", True)) + + def norm(s: str) -> str: + t = s.strip() if strip else s + return t.lower() if case_insensitive else t + + exp = norm(str(expected)) + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + got = norm(inv.final_response or "") + if got == exp: + scores.append(1.0) + else: + scores.append(0.0) + issues.append( + f"{inv.invocation_id}: expected {expected!r}, got {inv.final_response!r}" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + equals.run() diff --git a/evaluators/equals/evaluator.yaml b/evaluators/equals/evaluator.yaml new file mode 100644 index 0000000..31915ef --- /dev/null +++ b/evaluators/equals/evaluator.yaml @@ -0,0 +1,6 @@ +name: equals +description: Scores whether each final response exactly matches a configured expected string +language: python +entrypoint: equals.py +tags: [string, equals] +author: agentevals-dev diff --git a/evaluators/is_json/evaluator.yaml b/evaluators/is_json/evaluator.yaml new file mode 100644 index 0000000..1236e9e --- /dev/null +++ b/evaluators/is_json/evaluator.yaml @@ -0,0 +1,6 @@ +name: is_json +description: Scores whether each final 
response parses as JSON (optional markdown code fence extraction) +language: python +entrypoint: is_json.py +tags: [json, structured] +author: agentevals-dev diff --git a/evaluators/is_json/is_json.py b/evaluators/is_json/is_json.py new file mode 100644 index 0000000..d0e0430 --- /dev/null +++ b/evaluators/is_json/is_json.py @@ -0,0 +1,64 @@ +"""JSON parse check evaluator. + +Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks. + +Config: + require_json (bool, default False): If False, evaluator is a no-op (1.0). + extract_markdown_fence (bool, default True): Strip ```json fences if present. + +Usage: + config: + require_json: true +""" + +from __future__ import annotations + +import json +import re + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + +_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE) + + +def _parse_json_payload(text: str, extract_fence: bool) -> object: + raw = (text or "").strip() + if extract_fence: + m = _FENCE.match(raw) + if m: + raw = m.group(1).strip() + return json.loads(raw) + + +@evaluator +def is_json(input: EvalInput) -> EvalResult: + if not input.config.get("require_json"): + return EvalResult( + score=1.0, + per_invocation_scores=[1.0] * len(input.invocations), + details={"note": "require_json not set; skipping check"}, + ) + + extract_fence = bool(input.config.get("extract_markdown_fence", True)) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + try: + _parse_json_payload(inv.final_response or "", extract_fence) + scores.append(1.0) + except (json.JSONDecodeError, TypeError, ValueError) as exc: + scores.append(0.0) + issues.append(f"{inv.invocation_id}: not valid JSON ({exc})") + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + 
is_json.run() diff --git a/evaluators/levenshtein_ratio/evaluator.yaml b/evaluators/levenshtein_ratio/evaluator.yaml new file mode 100644 index 0000000..a9263db --- /dev/null +++ b/evaluators/levenshtein_ratio/evaluator.yaml @@ -0,0 +1,6 @@ +name: levenshtein_ratio +description: Scores similarity of each response to a reference string using normalized Levenshtein distance +language: python +entrypoint: levenshtein_ratio.py +tags: [string, levenshtein] +author: agentevals-dev diff --git a/evaluators/levenshtein_ratio/levenshtein_ratio.py b/evaluators/levenshtein_ratio/levenshtein_ratio.py new file mode 100644 index 0000000..dcec4ea --- /dev/null +++ b/evaluators/levenshtein_ratio/levenshtein_ratio.py @@ -0,0 +1,80 @@ +"""Normalized Levenshtein similarity evaluator. + +Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1]. + +Config: + expected (str): If omitted, no-op (1.0). + case_insensitive (bool, default False): Compare lowercased strings. + +Usage: + config: + expected: "reference answer" +""" + +from __future__ import annotations + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _levenshtein(a: str, b: str) -> int: + """Classic O(nm) edit distance.""" + if len(a) < len(b): + a, b = b, a + if not b: + return len(a) + prev = list(range(len(b) + 1)) + for i, ca in enumerate(a): + cur = [i + 1] + for j, cb in enumerate(b): + ins = prev[j + 1] + 1 + delete = cur[j] + 1 + sub = prev[j] + (ca != cb) + cur.append(min(ins, delete, sub)) + prev = cur + return prev[-1] + + +@evaluator +def levenshtein_ratio(input: EvalInput) -> EvalResult: + expected = input.config.get("expected") + if expected is None: + return EvalResult( + score=1.0, + per_invocation_scores=[1.0] * len(input.invocations), + details={"note": "no expected string configured; skipping check"}, + ) + + case_insensitive = bool(input.config.get("case_insensitive", False)) + ref = str(expected) + if case_insensitive: + ref = ref.lower() + 
+ scores: list[float] = [] + details_rows: list[dict] = [] + + for inv in input.invocations: + got = inv.final_response or "" + a, b = (got.lower(), ref) if case_insensitive else (got, ref) + dist = _levenshtein(a, b) + denom = max(len(a), len(b), 1) + sim = 1.0 - (dist / denom) + sim = max(0.0, min(1.0, sim)) + scores.append(sim) + details_rows.append( + { + "invocation_id": inv.invocation_id, + "distance": dist, + "similarity": sim, + } + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"per_invocation": details_rows}, + ) + + +if __name__ == "__main__": + levenshtein_ratio.run() diff --git a/evaluators/regex_match/evaluator.yaml b/evaluators/regex_match/evaluator.yaml new file mode 100644 index 0000000..c630ed0 --- /dev/null +++ b/evaluators/regex_match/evaluator.yaml @@ -0,0 +1,6 @@ +name: regex_match +description: Scores whether each final response matches a configured regular expression +language: python +entrypoint: regex_match.py +tags: [regex] +author: agentevals-dev diff --git a/evaluators/regex_match/regex_match.py b/evaluators/regex_match/regex_match.py new file mode 100644 index 0000000..e08bb5b --- /dev/null +++ b/evaluators/regex_match/regex_match.py @@ -0,0 +1,78 @@ +"""Regex on final response evaluator. + +Config: + pattern (str): If omitted, no-op (1.0). + flags (str, optional): "IGNORECASE" | "MULTILINE" | "DOTALL" — combined with |. 
+ +Usage: + config: + pattern: "^The answer" + flags: IGNORECASE +""" + +from __future__ import annotations + +import re + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + +_FLAG_MAP = { + "IGNORECASE": re.IGNORECASE, + "MULTILINE": re.MULTILINE, + "DOTALL": re.DOTALL, +} + + +@evaluator +def regex_match(input: EvalInput) -> EvalResult: + pattern = input.config.get("pattern") + if not pattern: + return EvalResult( + score=1.0, + per_invocation_scores=[1.0] * len(input.invocations), + details={"note": "no pattern configured; skipping check"}, + ) + + flag_names = input.config.get("flags") + flags = 0 + if isinstance(flag_names, str): + for part in flag_names.replace("|", ",").split(","): + key = part.strip().upper() + if key in _FLAG_MAP: + flags |= _FLAG_MAP[key] + elif isinstance(flag_names, list): + for part in flag_names: + key = str(part).strip().upper() + if key in _FLAG_MAP: + flags |= _FLAG_MAP[key] + + try: + rx = re.compile(str(pattern), flags) + except re.error as exc: + return EvalResult( + score=0.0, + per_invocation_scores=[0.0] * len(input.invocations), + details={"error": f"invalid regex: {exc}"}, + ) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + text = inv.final_response or "" + if rx.search(text): + scores.append(1.0) + else: + scores.append(0.0) + issues.append(f"{inv.invocation_id}: no match for pattern {pattern!r}") + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + regex_match.run() diff --git a/evaluators/tool_coverage/tool_coverage.py b/evaluators/tool_coverage/tool_coverage.py index 3bc7a5b..7a17d38 100644 --- a/evaluators/tool_coverage/tool_coverage.py +++ b/evaluators/tool_coverage/tool_coverage.py @@ -18,7 +18,7 @@ def tool_coverage(input: EvalInput) -> EvalResult: details: list[str] = [] for inv in 
input.invocations: - actual = len(inv.tool_calls) + actual = len(inv.intermediate_steps.tool_calls) if actual >= min_calls: scores.append(1.0) else: diff --git a/evaluators/tool_sequence_match/evaluator.yaml b/evaluators/tool_sequence_match/evaluator.yaml new file mode 100644 index 0000000..b58999c --- /dev/null +++ b/evaluators/tool_sequence_match/evaluator.yaml @@ -0,0 +1,6 @@ +name: tool_sequence_match +description: Scores whether tool calls match an expected list of tool names (order-sensitive or multiset) +language: python +entrypoint: tool_sequence_match.py +tags: [tools, trajectory] +author: agentevals-dev diff --git a/evaluators/tool_sequence_match/tool_sequence_match.py b/evaluators/tool_sequence_match/tool_sequence_match.py new file mode 100644 index 0000000..3e0a800 --- /dev/null +++ b/evaluators/tool_sequence_match/tool_sequence_match.py @@ -0,0 +1,69 @@ +"""Expected tool call sequence evaluator. + +Compares the ordered list of tool names in each invocation to config. + +Config: + expected_tool_names (list[str]): If omitted or empty, no-op (1.0). + require_order (bool, default True): If False, compares multisets (same counts per name). 
+ +Usage: + config: + expected_tool_names: ["search", "calculator"] + require_order: true +""" + +from __future__ import annotations + +from collections import Counter + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +@evaluator +def tool_sequence_match(input: EvalInput) -> EvalResult: + expected = input.config.get("expected_tool_names") + if not expected: + return EvalResult( + score=1.0, + per_invocation_scores=[1.0] * len(input.invocations), + details={"note": "no expected_tool_names configured; skipping check"}, + ) + + want = [str(x) for x in expected] + require_order = bool(input.config.get("require_order", True)) + + scores: list[float] = [] + issues: list[str] = [] + + for inv in input.invocations: + actual = [] + for call in inv.intermediate_steps.tool_calls or []: + if isinstance(call, dict): + n = call.get("name") + if n is not None: + actual.append(str(n)) + + if require_order: + ok = actual == want + else: + ok = Counter(actual) == Counter(want) + + if ok: + scores.append(1.0) + else: + scores.append(0.0) + issues.append( + f"{inv.invocation_id}: expected {want!r}, got {actual!r} " + f"(require_order={require_order})" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"issues": issues} if issues else None, + ) + + +if __name__ == "__main__": + tool_sequence_match.run() diff --git a/scripts/test_input.json b/scripts/test_input.json index 79d41f5..7b5939c 100644 --- a/scripts/test_input.json +++ b/scripts/test_input.json @@ -7,8 +7,10 @@ "invocation_id": "ci-test-001", "user_content": "What is 2+2?", "final_response": "The answer is 4.", - "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}], - "tool_responses": [{"name": "calculator", "output": "4"}] + "intermediate_steps": { + "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}], + "tool_responses": [{"name": "calculator", "output": "4"}] + } } ], 
"expected_invocations": null diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py index e80570e..c1d5fb5 100644 --- a/scripts/validate_evaluator.py +++ b/scripts/validate_evaluator.py @@ -94,14 +94,14 @@ def validate_syntax(evaluator_dir: Path, manifest: dict) -> bool: _ok(f"Python syntax valid ({entry_path})") source = entry_path.read_text() - if "agentevals_grader_sdk" not in source: + if "agentevals_evaluator_sdk" not in source: _fail( - f"{entry_path} does not import agentevals_grader_sdk. " + f"{entry_path} does not import agentevals_evaluator_sdk. " f"Evaluators must use the SDK or implement the stdin/stdout protocol." ) return False - if "@grader" not in source: - _fail(f"{entry_path} does not use the @grader decorator") + if "@evaluator" not in source: + _fail(f"{entry_path} does not use the @evaluator decorator") return False if 'if __name__ == "__main__"' not in source and "if __name__ == '__main__'" not in source: _fail(f"{entry_path} missing 'if __name__ == \"__main__\"' block with .run() call") @@ -211,7 +211,7 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool: # Full Pydantic validation via the SDK if available try: - from agentevals_grader_sdk import EvalResult + from agentevals_evaluator_sdk import EvalResult EvalResult.model_validate(output) _ok("Output validates against EvalResult schema (Pydantic)") except ImportError: From ae67759488d99a06373dae77d557449716bf8c9f Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Sun, 22 Mar 2026 22:49:29 +0100 Subject: [PATCH 2/2] pr feedback Signed-off-by: Peter Jausovec --- evaluators/contains/contains.py | 28 ++++++++++--------- evaluators/contains/evaluator.yaml | 2 +- evaluators/equals/equals.py | 14 ++++++---- evaluators/is_json/is_json.py | 10 +------ .../levenshtein_ratio/levenshtein_ratio.py | 12 ++++---- evaluators/regex_match/regex_match.py | 17 ++++++----- .../tool_sequence_match.py | 19 +++++++++---- scripts/validate_evaluator.py | 9 +++++- 8 files 
changed, 64 insertions(+), 47 deletions(-) diff --git a/evaluators/contains/contains.py b/evaluators/contains/contains.py index cb840fc..cbe6eca 100644 --- a/evaluators/contains/contains.py +++ b/evaluators/contains/contains.py @@ -4,7 +4,7 @@ otherwise 0.0. Config: - substring (str): Required for scoring; if omitted, the evaluator is a no-op (1.0). + substring (str): Required. If omitted, returns NOT_EVALUATED. Usage in eval_config.yaml: config: @@ -13,37 +13,39 @@ from __future__ import annotations -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator @evaluator def contains(input: EvalInput) -> EvalResult: - needle = (input.config.get("substring") or "").strip() - if not needle: + substring = (input.config.get("substring") or "").strip() + n = len(input.invocations) + if not substring: return EvalResult( - score=1.0, - per_invocation_scores=[1.0] * len(input.invocations), - details={"note": "no substring configured; skipping check"}, + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing config: substring"}, ) case_insensitive = bool(input.config.get("case_insensitive", False)) - haystack_fn = str.lower if case_insensitive else lambda s: s - needle_cmp = haystack_fn(needle) + normalize = str.lower if case_insensitive else lambda s: s + substring_cmp = normalize(substring) scores: list[float] = [] issues: list[str] = [] for inv in input.invocations: - text = (inv.final_response or "") + response_text = inv.final_response or "" if case_insensitive: - ok = needle_cmp in haystack_fn(text) + ok = substring_cmp in normalize(response_text) else: - ok = needle in text + ok = substring in response_text if ok: scores.append(1.0) else: scores.append(0.0) - issues.append(f"{inv.invocation_id}: response does not contain {needle!r}") + issues.append(f"{inv.invocation_id}: response does not contain {substring!r}") 
overall = sum(scores) / len(scores) if scores else 0.0 return EvalResult( diff --git a/evaluators/contains/evaluator.yaml b/evaluators/contains/evaluator.yaml index 368b446..7395d68 100644 --- a/evaluators/contains/evaluator.yaml +++ b/evaluators/contains/evaluator.yaml @@ -2,5 +2,5 @@ name: contains description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive) language: python entrypoint: contains.py -tags: [string, contains,] +tags: [string, contains] author: agentevals-dev diff --git a/evaluators/equals/equals.py b/evaluators/equals/equals.py index 874fa30..d85b1e1 100644 --- a/evaluators/equals/equals.py +++ b/evaluators/equals/equals.py @@ -1,7 +1,7 @@ """Exact string match evaluator. Config: - expected (str): If omitted, no-op (1.0). + expected (str): Required. If omitted, returns NOT_EVALUATED. - case_insensitive (bool, default True): Compare normalized strings. + case_insensitive (bool, default False): Compare normalized strings. strip (bool, default True): Strip whitespace before compare. @@ -12,20 +12,22 @@ from __future__ import annotations -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator @evaluator def equals(input: EvalInput) -> EvalResult: expected = input.config.get("expected") if expected is None: + n = len(input.invocations) return EvalResult( - score=1.0, - per_invocation_scores=[1.0] * len(input.invocations), - details={"note": "no expected string configured; skipping check"}, + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing config: expected"}, ) - case_insensitive = bool(input.config.get("case_insensitive", True)) + case_insensitive = bool(input.config.get("case_insensitive", False)) strip = bool(input.config.get("strip", True)) def norm(s: str) -> str: diff --git a/evaluators/is_json/is_json.py b/evaluators/is_json/is_json.py index d0e0430..deb4980 100644 --- a/evaluators/is_json/is_json.py +++ 
b/evaluators/is_json/is_json.py @@ -3,12 +3,11 @@ Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks. Config: - require_json (bool, default False): If False, evaluator is a no-op (1.0). extract_markdown_fence (bool, default True): Strip ```json fences if present. Usage: config: - require_json: true + extract_markdown_fence: true """ from __future__ import annotations @@ -32,13 +31,6 @@ def _parse_json_payload(text: str, extract_fence: bool) -> object: @evaluator def is_json(input: EvalInput) -> EvalResult: - if not input.config.get("require_json"): - return EvalResult( - score=1.0, - per_invocation_scores=[1.0] * len(input.invocations), - details={"note": "require_json not set; skipping check"}, - ) - extract_fence = bool(input.config.get("extract_markdown_fence", True)) scores: list[float] = [] diff --git a/evaluators/levenshtein_ratio/levenshtein_ratio.py b/evaluators/levenshtein_ratio/levenshtein_ratio.py index dcec4ea..730b917 100644 --- a/evaluators/levenshtein_ratio/levenshtein_ratio.py +++ b/evaluators/levenshtein_ratio/levenshtein_ratio.py @@ -3,7 +3,7 @@ Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1]. Config: - expected (str): If omitted, no-op (1.0). + expected (str): Required. If omitted, returns NOT_EVALUATED. case_insensitive (bool, default False): Compare lowercased strings. 
Usage: @@ -13,7 +13,7 @@ from __future__ import annotations -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _levenshtein(a: str, b: str) -> int: @@ -38,10 +38,12 @@ def _levenshtein(a: str, b: str) -> int: def levenshtein_ratio(input: EvalInput) -> EvalResult: expected = input.config.get("expected") if expected is None: + n = len(input.invocations) return EvalResult( - score=1.0, - per_invocation_scores=[1.0] * len(input.invocations), - details={"note": "no expected string configured; skipping check"}, + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing config: expected"}, ) case_insensitive = bool(input.config.get("case_insensitive", False)) diff --git a/evaluators/regex_match/regex_match.py b/evaluators/regex_match/regex_match.py index e08bb5b..271602e 100644 --- a/evaluators/regex_match/regex_match.py +++ b/evaluators/regex_match/regex_match.py @@ -1,7 +1,7 @@ """Regex on final response evaluator. Config: - pattern (str): If omitted, no-op (1.0). + pattern (str): Required. If omitted or invalid, returns NOT_EVALUATED. flags (str, optional): "IGNORECASE" | "MULTILINE" | "DOTALL" — combined with |. 
Usage: @@ -14,7 +14,7 @@ import re -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator _FLAG_MAP = { "IGNORECASE": re.IGNORECASE, @@ -26,11 +26,13 @@ @evaluator def regex_match(input: EvalInput) -> EvalResult: pattern = input.config.get("pattern") + n = len(input.invocations) if not pattern: return EvalResult( - score=1.0, - per_invocation_scores=[1.0] * len(input.invocations), - details={"note": "no pattern configured; skipping check"}, + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing config: pattern"}, ) flag_names = input.config.get("flags") @@ -51,8 +53,9 @@ def regex_match(input: EvalInput) -> EvalResult: except re.error as exc: return EvalResult( score=0.0, - per_invocation_scores=[0.0] * len(input.invocations), - details={"error": f"invalid regex: {exc}"}, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "invalid regex pattern", "error": str(exc)}, ) scores: list[float] = [] diff --git a/evaluators/tool_sequence_match/tool_sequence_match.py b/evaluators/tool_sequence_match/tool_sequence_match.py index 3e0a800..a729707 100644 --- a/evaluators/tool_sequence_match/tool_sequence_match.py +++ b/evaluators/tool_sequence_match/tool_sequence_match.py @@ -3,7 +3,7 @@ Compares the ordered list of tool names in each invocation to config. Config: - expected_tool_names (list[str]): If omitted or empty, no-op (1.0). + expected_tool_names (list[str]): Required non-empty. Otherwise returns NOT_EVALUATED. require_order (bool, default True): If False, compares multisets (same counts per name). 
Usage: @@ -16,17 +16,26 @@ from collections import Counter -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator @evaluator def tool_sequence_match(input: EvalInput) -> EvalResult: expected = input.config.get("expected_tool_names") + n = len(input.invocations) + if expected is None or not isinstance(expected, list): + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or invalid config: expected_tool_names (need a list of names)"}, + ) if not expected: return EvalResult( - score=1.0, - per_invocation_scores=[1.0] * len(input.invocations), - details={"note": "no expected_tool_names configured; skipping check"}, + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or empty config: expected_tool_names"}, ) want = [str(x) for x in expected] diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py index c1d5fb5..13379e3 100644 --- a/scripts/validate_evaluator.py +++ b/scripts/validate_evaluator.py @@ -158,7 +158,7 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool: if not stdout: stderr_preview = result.stderr.strip()[:500] _fail( - f"Evaluator produced no output on stdout" + "Evaluator produced no output on stdout" + (f"\n stderr: {stderr_preview}" if stderr_preview else "") ) return False @@ -208,6 +208,13 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool: f"got {type(per_inv).__name__}" ) return False + for i, x in enumerate(per_inv): + if x is not None and not isinstance(x, (int, float)): + _fail( + f"'per_invocation_scores[{i}]' must be a number or null, " + f"got {type(x).__name__}" + ) + return False # Full Pydantic validation via the SDK if available try: