From 49595e8fd6f984b4a66b90037037b650c31b952c Mon Sep 17 00:00:00 2001
From: Peter Jausovec <peter.jausovec@solo.io>
Date: Sat, 21 Mar 2026 14:40:58 -0700
Subject: [PATCH 1/2] add a couple of simple evaluators

Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
---
 .github/workflows/validate-evaluators.yaml    |  4 +-
 .gitignore                                    |  1 +
 README.md                                     |  4 +-
 evaluators/contains/contains.py               | 57 +++++++++++++
 evaluators/contains/evaluator.yaml            |  6 ++
 evaluators/equals/equals.py                   | 58 ++++++++++++++
 evaluators/equals/evaluator.yaml              |  6 ++
 evaluators/is_json/evaluator.yaml             |  6 ++
 evaluators/is_json/is_json.py                 | 64 +++++++++++++++
 evaluators/levenshtein_ratio/evaluator.yaml   |  6 ++
 .../levenshtein_ratio/levenshtein_ratio.py    | 80 +++++++++++++++++++
 evaluators/regex_match/evaluator.yaml         |  6 ++
 evaluators/regex_match/regex_match.py         | 78 ++++++++++++++++++
 evaluators/tool_coverage/tool_coverage.py     |  2 +-
 evaluators/tool_sequence_match/evaluator.yaml |  6 ++
 .../tool_sequence_match.py                    | 69 ++++++++++++++++
 scripts/test_input.json                       |  6 +-
 scripts/validate_evaluator.py                 | 10 +--
 18 files changed, 456 insertions(+), 13 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 evaluators/contains/contains.py
 create mode 100644 evaluators/contains/evaluator.yaml
 create mode 100644 evaluators/equals/equals.py
 create mode 100644 evaluators/equals/evaluator.yaml
 create mode 100644 evaluators/is_json/evaluator.yaml
 create mode 100644 evaluators/is_json/is_json.py
 create mode 100644 evaluators/levenshtein_ratio/evaluator.yaml
 create mode 100644 evaluators/levenshtein_ratio/levenshtein_ratio.py
 create mode 100644 evaluators/regex_match/evaluator.yaml
 create mode 100644 evaluators/regex_match/regex_match.py
 create mode 100644 evaluators/tool_sequence_match/evaluator.yaml
 create mode 100644 evaluators/tool_sequence_match/tool_sequence_match.py

diff --git a/.github/workflows/validate-evaluators.yaml b/.github/workflows/validate-evaluators.yaml
index 565286b..f494cb2 100644
--- a/.github/workflows/validate-evaluators.yaml
+++ b/.github/workflows/validate-evaluators.yaml
@@ -20,9 +20,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install pyyaml
-          # TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI
-          pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py"
+          pip install pyyaml agentevals-evaluator-sdk
 
       - name: Discover and validate all evaluators
         run: |
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0cafc1c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.venv/
\ No newline at end of file
diff --git a/README.md b/README.md
index c94955a..0a7fac5 100644
--- a/README.md
+++ b/README.md
@@ -107,13 +107,13 @@ author: your-github-username
 Run the validation script to catch issues before submitting:
 
 ```bash
-pip install agentevals-grader-sdk pyyaml
+pip install pyyaml agentevals-evaluator-sdk
 python scripts/validate_evaluator.py evaluators/my_evaluator
 ```
 
 This checks:
 - **Manifest schema** -- required fields, entrypoint exists, name matches directory
-- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
+- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator
 - **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)
 
 You can also test with a full eval run:
diff --git a/evaluators/contains/contains.py b/evaluators/contains/contains.py
new file mode 100644
index 0000000..cb840fc
--- /dev/null
+++ b/evaluators/contains/contains.py
@@ -0,0 +1,57 @@
+"""Substring containment evaluator.
+
+Scores each invocation 1.0 if final_response contains the configured substring,
+otherwise 0.0.
+
+Config:
+  substring (str): Required for scoring; if omitted, the evaluator is a no-op (1.0).
+
+Usage in eval_config.yaml:
+    config:
+      substring: "expected phrase"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+
+@evaluator
+def contains(input: EvalInput) -> EvalResult:
+    needle = (input.config.get("substring") or "").strip()
+    if not needle:
+        return EvalResult(
+            score=1.0,
+            per_invocation_scores=[1.0] * len(input.invocations),
+            details={"note": "no substring configured; skipping check"},
+        )
+
+    case_insensitive = bool(input.config.get("case_insensitive", False))
+    haystack_fn = str.lower if case_insensitive else lambda s: s
+    needle_cmp = haystack_fn(needle)
+
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        text = (inv.final_response or "")
+        if case_insensitive:
+            ok = needle_cmp in haystack_fn(text)
+        else:
+            ok = needle in text
+        if ok:
+            scores.append(1.0)
+        else:
+            scores.append(0.0)
+            issues.append(f"{inv.invocation_id}: response does not contain {needle!r}")
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    contains.run()
diff --git a/evaluators/contains/evaluator.yaml b/evaluators/contains/evaluator.yaml
new file mode 100644
index 0000000..368b446
--- /dev/null
+++ b/evaluators/contains/evaluator.yaml
@@ -0,0 +1,6 @@
+name: contains
+description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive)
+language: python
+entrypoint: contains.py
+tags: [string, contains,]
+author: agentevals-dev
diff --git a/evaluators/equals/equals.py b/evaluators/equals/equals.py
new file mode 100644
index 0000000..874fa30
--- /dev/null
+++ b/evaluators/equals/equals.py
@@ -0,0 +1,58 @@
+"""Exact string match evaluator.
+
+Config:
+  expected (str): If omitted, no-op (1.0).
+  case_insensitive (bool, default True): Compare normalized strings.
+  strip (bool, default True): Strip whitespace before compare.
+
+Usage:
+    config:
+      expected: "4"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+
+@evaluator
+def equals(input: EvalInput) -> EvalResult:
+    expected = input.config.get("expected")
+    if expected is None:
+        return EvalResult(
+            score=1.0,
+            per_invocation_scores=[1.0] * len(input.invocations),
+            details={"note": "no expected string configured; skipping check"},
+        )
+
+    case_insensitive = bool(input.config.get("case_insensitive", True))
+    strip = bool(input.config.get("strip", True))
+
+    def norm(s: str) -> str:
+        t = s.strip() if strip else s
+        return t.lower() if case_insensitive else t
+
+    exp = norm(str(expected))
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        got = norm(inv.final_response or "")
+        if got == exp:
+            scores.append(1.0)
+        else:
+            scores.append(0.0)
+            issues.append(
+                f"{inv.invocation_id}: expected {expected!r}, got {inv.final_response!r}"
+            )
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    equals.run()
diff --git a/evaluators/equals/evaluator.yaml b/evaluators/equals/evaluator.yaml
new file mode 100644
index 0000000..31915ef
--- /dev/null
+++ b/evaluators/equals/evaluator.yaml
@@ -0,0 +1,6 @@
+name: equals
+description: Scores whether each final response exactly matches a configured expected string
+language: python
+entrypoint: equals.py
+tags: [string, equals]
+author: agentevals-dev
diff --git a/evaluators/is_json/evaluator.yaml b/evaluators/is_json/evaluator.yaml
new file mode 100644
index 0000000..1236e9e
--- /dev/null
+++ b/evaluators/is_json/evaluator.yaml
@@ -0,0 +1,6 @@
+name: is_json
+description: Scores whether each final response parses as JSON (optional markdown code fence extraction)
+language: python
+entrypoint: is_json.py
+tags: [json, structured]
+author: agentevals-dev
diff --git a/evaluators/is_json/is_json.py b/evaluators/is_json/is_json.py
new file mode 100644
index 0000000..d0e0430
--- /dev/null
+++ b/evaluators/is_json/is_json.py
@@ -0,0 +1,64 @@
+"""JSON parse check evaluator.
+
+Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks.
+
+Config:
+  require_json (bool, default False): If False, evaluator is a no-op (1.0).
+  extract_markdown_fence (bool, default True): Strip ```json fences if present.
+
+Usage:
+    config:
+      require_json: true
+"""
+
+from __future__ import annotations
+
+import json
+import re
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE)
+
+
+def _parse_json_payload(text: str, extract_fence: bool) -> object:
+    raw = (text or "").strip()
+    if extract_fence:
+        m = _FENCE.match(raw)
+        if m:
+            raw = m.group(1).strip()
+    return json.loads(raw)
+
+
+@evaluator
+def is_json(input: EvalInput) -> EvalResult:
+    if not input.config.get("require_json"):
+        return EvalResult(
+            score=1.0,
+            per_invocation_scores=[1.0] * len(input.invocations),
+            details={"note": "require_json not set; skipping check"},
+        )
+
+    extract_fence = bool(input.config.get("extract_markdown_fence", True))
+
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        try:
+            _parse_json_payload(inv.final_response or "", extract_fence)
+            scores.append(1.0)
+        except (json.JSONDecodeError, TypeError, ValueError) as exc:
+            scores.append(0.0)
+            issues.append(f"{inv.invocation_id}: not valid JSON ({exc})")
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    is_json.run()
diff --git a/evaluators/levenshtein_ratio/evaluator.yaml b/evaluators/levenshtein_ratio/evaluator.yaml
new file mode 100644
index 0000000..a9263db
--- /dev/null
+++ b/evaluators/levenshtein_ratio/evaluator.yaml
@@ -0,0 +1,6 @@
+name: levenshtein_ratio
+description: Scores similarity of each response to a reference string using normalized Levenshtein distance
+language: python
+entrypoint: levenshtein_ratio.py
+tags: [string, levenshtein]
+author: agentevals-dev
diff --git a/evaluators/levenshtein_ratio/levenshtein_ratio.py b/evaluators/levenshtein_ratio/levenshtein_ratio.py
new file mode 100644
index 0000000..dcec4ea
--- /dev/null
+++ b/evaluators/levenshtein_ratio/levenshtein_ratio.py
@@ -0,0 +1,80 @@
+"""Normalized Levenshtein similarity evaluator.
+
+Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1].
+
+Config:
+  expected (str): If omitted, no-op (1.0).
+  case_insensitive (bool, default False): Compare lowercased strings.
+
+Usage:
+    config:
+      expected: "reference answer"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+
+def _levenshtein(a: str, b: str) -> int:
+    """Classic O(nm) edit distance."""
+    if len(a) < len(b):
+        a, b = b, a
+    if not b:
+        return len(a)
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a):
+        cur = [i + 1]
+        for j, cb in enumerate(b):
+            ins = prev[j + 1] + 1
+            delete = cur[j] + 1
+            sub = prev[j] + (ca != cb)
+            cur.append(min(ins, delete, sub))
+        prev = cur
+    return prev[-1]
+
+
+@evaluator
+def levenshtein_ratio(input: EvalInput) -> EvalResult:
+    expected = input.config.get("expected")
+    if expected is None:
+        return EvalResult(
+            score=1.0,
+            per_invocation_scores=[1.0] * len(input.invocations),
+            details={"note": "no expected string configured; skipping check"},
+        )
+
+    case_insensitive = bool(input.config.get("case_insensitive", False))
+    ref = str(expected)
+    if case_insensitive:
+        ref = ref.lower()
+
+    scores: list[float] = []
+    details_rows: list[dict] = []
+
+    for inv in input.invocations:
+        got = inv.final_response or ""
+        a, b = (got.lower(), ref) if case_insensitive else (got, ref)
+        dist = _levenshtein(a, b)
+        denom = max(len(a), len(b), 1)
+        sim = 1.0 - (dist / denom)
+        sim = max(0.0, min(1.0, sim))
+        scores.append(sim)
+        details_rows.append(
+            {
+                "invocation_id": inv.invocation_id,
+                "distance": dist,
+                "similarity": sim,
+            }
+        )
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"per_invocation": details_rows},
+    )
+
+
+if __name__ == "__main__":
+    levenshtein_ratio.run()
diff --git a/evaluators/regex_match/evaluator.yaml b/evaluators/regex_match/evaluator.yaml
new file mode 100644
index 0000000..c630ed0
--- /dev/null
+++ b/evaluators/regex_match/evaluator.yaml
@@ -0,0 +1,6 @@
+name: regex_match
+description: Scores whether each final response matches a configured regular expression
+language: python
+entrypoint: regex_match.py
+tags: [regex]
+author: agentevals-dev
diff --git a/evaluators/regex_match/regex_match.py b/evaluators/regex_match/regex_match.py
new file mode 100644
index 0000000..e08bb5b
--- /dev/null
+++ b/evaluators/regex_match/regex_match.py
@@ -0,0 +1,78 @@
+"""Regex on final response evaluator.
+
+Config:
+  pattern (str): If omitted, no-op (1.0).
+  flags (str, optional): "IGNORECASE" | "MULTILINE" | "DOTALL" — combined with |.
+
+Usage:
+    config:
+      pattern: "^The answer"
+      flags: IGNORECASE
+"""
+
+from __future__ import annotations
+
+import re
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+_FLAG_MAP = {
+    "IGNORECASE": re.IGNORECASE,
+    "MULTILINE": re.MULTILINE,
+    "DOTALL": re.DOTALL,
+}
+
+
+@evaluator
+def regex_match(input: EvalInput) -> EvalResult:
+    pattern = input.config.get("pattern")
+    if not pattern:
+        return EvalResult(
+            score=1.0,
+            per_invocation_scores=[1.0] * len(input.invocations),
+            details={"note": "no pattern configured; skipping check"},
+        )
+
+    flag_names = input.config.get("flags")
+    flags = 0
+    if isinstance(flag_names, str):
+        for part in flag_names.replace("|", ",").split(","):
+            key = part.strip().upper()
+            if key in _FLAG_MAP:
+                flags |= _FLAG_MAP[key]
+    elif isinstance(flag_names, list):
+        for part in flag_names:
+            key = str(part).strip().upper()
+            if key in _FLAG_MAP:
+                flags |= _FLAG_MAP[key]
+
+    try:
+        rx = re.compile(str(pattern), flags)
+    except re.error as exc:
+        return EvalResult(
+            score=0.0,
+            per_invocation_scores=[0.0] * len(input.invocations),
+            details={"error": f"invalid regex: {exc}"},
+        )
+
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        text = inv.final_response or ""
+        if rx.search(text):
+            scores.append(1.0)
+        else:
+            scores.append(0.0)
+            issues.append(f"{inv.invocation_id}: no match for pattern {pattern!r}")
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    regex_match.run()
diff --git a/evaluators/tool_coverage/tool_coverage.py b/evaluators/tool_coverage/tool_coverage.py
index 3bc7a5b..7a17d38 100644
--- a/evaluators/tool_coverage/tool_coverage.py
+++ b/evaluators/tool_coverage/tool_coverage.py
@@ -18,7 +18,7 @@ def tool_coverage(input: EvalInput) -> EvalResult:
     details: list[str] = []
 
     for inv in input.invocations:
-        actual = len(inv.tool_calls)
+        actual = len(inv.intermediate_steps.tool_calls)
         if actual >= min_calls:
             scores.append(1.0)
         else:
diff --git a/evaluators/tool_sequence_match/evaluator.yaml b/evaluators/tool_sequence_match/evaluator.yaml
new file mode 100644
index 0000000..b58999c
--- /dev/null
+++ b/evaluators/tool_sequence_match/evaluator.yaml
@@ -0,0 +1,6 @@
+name: tool_sequence_match
+description: Scores whether tool calls match an expected list of tool names (order-sensitive or multiset)
+language: python
+entrypoint: tool_sequence_match.py
+tags: [tools, trajectory]
+author: agentevals-dev
diff --git a/evaluators/tool_sequence_match/tool_sequence_match.py b/evaluators/tool_sequence_match/tool_sequence_match.py
new file mode 100644
index 0000000..3e0a800
--- /dev/null
+++ b/evaluators/tool_sequence_match/tool_sequence_match.py
@@ -0,0 +1,69 @@
+"""Expected tool call sequence evaluator.
+
+Compares the ordered list of tool names in each invocation to config.
+
+Config:
+  expected_tool_names (list[str]): If omitted or empty, no-op (1.0).
+  require_order (bool, default True): If False, compares multisets (same counts per name).
+
+Usage:
+    config:
+      expected_tool_names: ["search", "calculator"]
+      require_order: true
+"""
+
+from __future__ import annotations
+
+from collections import Counter
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+
+@evaluator
+def tool_sequence_match(input: EvalInput) -> EvalResult:
+    expected = input.config.get("expected_tool_names")
+    if not expected:
+        return EvalResult(
+            score=1.0,
+            per_invocation_scores=[1.0] * len(input.invocations),
+            details={"note": "no expected_tool_names configured; skipping check"},
+        )
+
+    want = [str(x) for x in expected]
+    require_order = bool(input.config.get("require_order", True))
+
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        actual = []
+        for call in inv.intermediate_steps.tool_calls or []:
+            if isinstance(call, dict):
+                n = call.get("name")
+                if n is not None:
+                    actual.append(str(n))
+
+        if require_order:
+            ok = actual == want
+        else:
+            ok = Counter(actual) == Counter(want)
+
+        if ok:
+            scores.append(1.0)
+        else:
+            scores.append(0.0)
+            issues.append(
+                f"{inv.invocation_id}: expected {want!r}, got {actual!r} "
+                f"(require_order={require_order})"
+            )
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    tool_sequence_match.run()
diff --git a/scripts/test_input.json b/scripts/test_input.json
index 79d41f5..7b5939c 100644
--- a/scripts/test_input.json
+++ b/scripts/test_input.json
@@ -7,8 +7,10 @@
       "invocation_id": "ci-test-001",
       "user_content": "What is 2+2?",
       "final_response": "The answer is 4.",
-      "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}],
-      "tool_responses": [{"name": "calculator", "output": "4"}]
+      "intermediate_steps": {
+        "tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}],
+        "tool_responses": [{"name": "calculator", "output": "4"}]
+      }
     }
   ],
   "expected_invocations": null
diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py
index e80570e..c1d5fb5 100644
--- a/scripts/validate_evaluator.py
+++ b/scripts/validate_evaluator.py
@@ -94,14 +94,14 @@ def validate_syntax(evaluator_dir: Path, manifest: dict) -> bool:
         _ok(f"Python syntax valid ({entry_path})")
 
         source = entry_path.read_text()
-        if "agentevals_grader_sdk" not in source:
+        if "agentevals_evaluator_sdk" not in source:
             _fail(
-                f"{entry_path} does not import agentevals_grader_sdk. "
+                f"{entry_path} does not import agentevals_evaluator_sdk. "
                 f"Evaluators must use the SDK or implement the stdin/stdout protocol."
             )
             return False
-        if "@grader" not in source:
-            _fail(f"{entry_path} does not use the @grader decorator")
+        if "@evaluator" not in source:
+            _fail(f"{entry_path} does not use the @evaluator decorator")
             return False
         if 'if __name__ == "__main__"' not in source and "if __name__ == '__main__'" not in source:
             _fail(f"{entry_path} missing 'if __name__ == \"__main__\"' block with .run() call")
@@ -211,7 +211,7 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool:
 
     # Full Pydantic validation via the SDK if available
     try:
-        from agentevals_grader_sdk import EvalResult
+        from agentevals_evaluator_sdk import EvalResult
         EvalResult.model_validate(output)
         _ok("Output validates against EvalResult schema (Pydantic)")
     except ImportError:

From ae67759488d99a06373dae77d557449716bf8c9f Mon Sep 17 00:00:00 2001
From: Peter Jausovec <peter.jausovec@solo.io>
Date: Sun, 22 Mar 2026 22:49:29 +0100
Subject: [PATCH 2/2] pr feedback

Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
---
 evaluators/contains/contains.py               | 28 ++++++++++---------
 evaluators/contains/evaluator.yaml            |  2 +-
 evaluators/equals/equals.py                   | 14 ++++++----
 evaluators/is_json/is_json.py                 | 10 +------
 .../levenshtein_ratio/levenshtein_ratio.py    | 12 ++++----
 evaluators/regex_match/regex_match.py         | 17 ++++++-----
 .../tool_sequence_match.py                    | 19 +++++++++----
 scripts/validate_evaluator.py                 |  9 +++++-
 8 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/evaluators/contains/contains.py b/evaluators/contains/contains.py
index cb840fc..cbe6eca 100644
--- a/evaluators/contains/contains.py
+++ b/evaluators/contains/contains.py
@@ -4,7 +4,7 @@
 otherwise 0.0.
 
 Config:
-  substring (str): Required for scoring; if omitted, the evaluator is a no-op (1.0).
+  substring (str): Required; surrounding whitespace is stripped. If omitted or blank, returns NOT_EVALUATED.
 
 Usage in eval_config.yaml:
     config:
@@ -13,37 +13,39 @@
 
 from __future__ import annotations
 
-from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
 
 
 @evaluator
 def contains(input: EvalInput) -> EvalResult:
-    needle = (input.config.get("substring") or "").strip()
-    if not needle:
+    substring = (input.config.get("substring") or "").strip()
+    n = len(input.invocations)
+    if not substring:
         return EvalResult(
-            score=1.0,
-            per_invocation_scores=[1.0] * len(input.invocations),
-            details={"note": "no substring configured; skipping check"},
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: substring"},
         )
 
     case_insensitive = bool(input.config.get("case_insensitive", False))
-    haystack_fn = str.lower if case_insensitive else lambda s: s
-    needle_cmp = haystack_fn(needle)
+    normalize = str.lower if case_insensitive else lambda s: s
+    substring_cmp = normalize(substring)
 
     scores: list[float] = []
     issues: list[str] = []
 
     for inv in input.invocations:
-        text = (inv.final_response or "")
+        response_text = inv.final_response or ""
         if case_insensitive:
-            ok = needle_cmp in haystack_fn(text)
+            ok = substring_cmp in normalize(response_text)
         else:
-            ok = needle in text
+            ok = substring in response_text
         if ok:
             scores.append(1.0)
         else:
             scores.append(0.0)
-            issues.append(f"{inv.invocation_id}: response does not contain {needle!r}")
+            issues.append(f"{inv.invocation_id}: response does not contain {substring!r}")
 
     overall = sum(scores) / len(scores) if scores else 0.0
     return EvalResult(
diff --git a/evaluators/contains/evaluator.yaml b/evaluators/contains/evaluator.yaml
index 368b446..7395d68 100644
--- a/evaluators/contains/evaluator.yaml
+++ b/evaluators/contains/evaluator.yaml
@@ -2,5 +2,5 @@ name: contains
 description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive)
 language: python
 entrypoint: contains.py
-tags: [string, contains,]
+tags: [string, contains]
 author: agentevals-dev
diff --git a/evaluators/equals/equals.py b/evaluators/equals/equals.py
index 874fa30..d85b1e1 100644
--- a/evaluators/equals/equals.py
+++ b/evaluators/equals/equals.py
@@ -1,7 +1,7 @@
 """Exact string match evaluator.
 
 Config:
-  expected (str): If omitted, no-op (1.0).
+  expected (str): Required. If omitted, returns NOT_EVALUATED.
-  case_insensitive (bool, default True): Compare normalized strings.
+  case_insensitive (bool, default False): Compare lowercased strings.
   strip (bool, default True): Strip whitespace before compare.
 
@@ -12,20 +12,22 @@
 
 from __future__ import annotations
 
-from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
 
 
 @evaluator
 def equals(input: EvalInput) -> EvalResult:
     expected = input.config.get("expected")
     if expected is None:
+        n = len(input.invocations)
         return EvalResult(
-            score=1.0,
-            per_invocation_scores=[1.0] * len(input.invocations),
-            details={"note": "no expected string configured; skipping check"},
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: expected"},
         )
 
-    case_insensitive = bool(input.config.get("case_insensitive", True))
+    case_insensitive = bool(input.config.get("case_insensitive", False))
     strip = bool(input.config.get("strip", True))
 
     def norm(s: str) -> str:
diff --git a/evaluators/is_json/is_json.py b/evaluators/is_json/is_json.py
index d0e0430..deb4980 100644
--- a/evaluators/is_json/is_json.py
+++ b/evaluators/is_json/is_json.py
@@ -3,12 +3,11 @@
 Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks.
 
 Config:
-  require_json (bool, default False): If False, evaluator is a no-op (1.0).
   extract_markdown_fence (bool, default True): Strip ```json fences if present.
 
 Usage:
     config:
-      require_json: true
+      extract_markdown_fence: true
 """
 
 from __future__ import annotations
@@ -32,13 +31,6 @@ def _parse_json_payload(text: str, extract_fence: bool) -> object:
 
 @evaluator
 def is_json(input: EvalInput) -> EvalResult:
-    if not input.config.get("require_json"):
-        return EvalResult(
-            score=1.0,
-            per_invocation_scores=[1.0] * len(input.invocations),
-            details={"note": "require_json not set; skipping check"},
-        )
-
     extract_fence = bool(input.config.get("extract_markdown_fence", True))
 
     scores: list[float] = []
diff --git a/evaluators/levenshtein_ratio/levenshtein_ratio.py b/evaluators/levenshtein_ratio/levenshtein_ratio.py
index dcec4ea..730b917 100644
--- a/evaluators/levenshtein_ratio/levenshtein_ratio.py
+++ b/evaluators/levenshtein_ratio/levenshtein_ratio.py
@@ -3,7 +3,7 @@
 Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1].
 
 Config:
-  expected (str): If omitted, no-op (1.0).
+  expected (str): Required. If omitted, returns NOT_EVALUATED.
   case_insensitive (bool, default False): Compare lowercased strings.
 
 Usage:
@@ -13,7 +13,7 @@
 
 from __future__ import annotations
 
-from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
 
 
 def _levenshtein(a: str, b: str) -> int:
@@ -38,10 +38,12 @@ def _levenshtein(a: str, b: str) -> int:
 def levenshtein_ratio(input: EvalInput) -> EvalResult:
     expected = input.config.get("expected")
     if expected is None:
+        n = len(input.invocations)
         return EvalResult(
-            score=1.0,
-            per_invocation_scores=[1.0] * len(input.invocations),
-            details={"note": "no expected string configured; skipping check"},
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: expected"},
         )
 
     case_insensitive = bool(input.config.get("case_insensitive", False))
diff --git a/evaluators/regex_match/regex_match.py b/evaluators/regex_match/regex_match.py
index e08bb5b..271602e 100644
--- a/evaluators/regex_match/regex_match.py
+++ b/evaluators/regex_match/regex_match.py
@@ -1,7 +1,7 @@
 """Regex on final response evaluator.
 
 Config:
-  pattern (str): If omitted, no-op (1.0).
+  pattern (str): Required. If omitted or invalid, returns NOT_EVALUATED.
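-  flags (str, optional): "IGNORECASE" | "MULTILINE" | "DOTALL" — combined with |.
+  flags (str | list[str], optional): "IGNORECASE", "MULTILINE", "DOTALL" — a list, or one string joined with "|" or ","; unknown names are ignored.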
 
 Usage:
@@ -14,7 +14,7 @@
 
 import re
 
-from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
 
 _FLAG_MAP = {
     "IGNORECASE": re.IGNORECASE,
@@ -26,11 +26,13 @@
 @evaluator
 def regex_match(input: EvalInput) -> EvalResult:
     pattern = input.config.get("pattern")
+    n = len(input.invocations)
     if not pattern:
         return EvalResult(
-            score=1.0,
-            per_invocation_scores=[1.0] * len(input.invocations),
-            details={"note": "no pattern configured; skipping check"},
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: pattern"},
         )
 
     flag_names = input.config.get("flags")
@@ -51,8 +53,9 @@ def regex_match(input: EvalInput) -> EvalResult:
     except re.error as exc:
         return EvalResult(
             score=0.0,
-            per_invocation_scores=[0.0] * len(input.invocations),
-            details={"error": f"invalid regex: {exc}"},
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "invalid regex pattern", "error": str(exc)},
         )
 
     scores: list[float] = []
diff --git a/evaluators/tool_sequence_match/tool_sequence_match.py b/evaluators/tool_sequence_match/tool_sequence_match.py
index 3e0a800..a729707 100644
--- a/evaluators/tool_sequence_match/tool_sequence_match.py
+++ b/evaluators/tool_sequence_match/tool_sequence_match.py
@@ -3,7 +3,7 @@
 Compares the ordered list of tool names in each invocation to config.
 
 Config:
-  expected_tool_names (list[str]): If omitted or empty, no-op (1.0).
+  expected_tool_names (list[str]): Required, non-empty list. Otherwise returns NOT_EVALUATED.
   require_order (bool, default True): If False, compares multisets (same counts per name).
 
 Usage:
@@ -16,17 +16,26 @@
 
 from collections import Counter
 
-from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
 
 
 @evaluator
 def tool_sequence_match(input: EvalInput) -> EvalResult:
     expected = input.config.get("expected_tool_names")
+    n = len(input.invocations)
+    if expected is None or not isinstance(expected, list):
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing or invalid config: expected_tool_names (need a list of names)"},
+        )
     if not expected:
         return EvalResult(
-            score=1.0,
-            per_invocation_scores=[1.0] * len(input.invocations),
-            details={"note": "no expected_tool_names configured; skipping check"},
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing or empty config: expected_tool_names"},
         )
 
     want = [str(x) for x in expected]
diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py
index c1d5fb5..13379e3 100644
--- a/scripts/validate_evaluator.py
+++ b/scripts/validate_evaluator.py
@@ -158,7 +158,7 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool:
     if not stdout:
         stderr_preview = result.stderr.strip()[:500]
         _fail(
-            f"Evaluator produced no output on stdout"
+            "Evaluator produced no output on stdout"
             + (f"\n  stderr: {stderr_preview}" if stderr_preview else "")
         )
         return False
@@ -208,6 +208,13 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool:
                 f"got {type(per_inv).__name__}"
             )
             return False
+        for i, x in enumerate(per_inv):
+            if x is not None and not isinstance(x, (int, float)):
+                _fail(
+                    f"'per_invocation_scores[{i}]' must be a number or null, "
+                    f"got {type(x).__name__}"
+                )
+                return False
 
     # Full Pydantic validation via the SDK if available
     try: