From 854133a67287e249a59bd96d8e0fd6724740dd1a Mon Sep 17 00:00:00 2001 From: wiliyam Date: Tue, 21 Apr 2026 07:41:39 +0000 Subject: [PATCH 1/2] feat: add StringCheckGrader support for OpenAI Evals backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds string_check grader alongside the existing text_similarity grader. string_check evaluates agent responses against a fixed reference string using comparison operations (eq, ne, like, ilike). Unlike text_similarity, it does not require a golden eval set — the reference is specified directly in the grader config. Changes: - config.py: _VALID_STRING_CHECK_OPERATIONS, _SUPPORTED_GRADER_TYPES, grader-aware validator with explicit operation/reference checks - openai_eval_backend.py: _ACTUAL_ONLY_SCHEMA and _get_item_schema for grader-aware item shape, string_check branch in _build_testing_criteria, grader_type param in _build_jsonl_items (excludes expected_response for string_check), grader-relevant detail key in results (operation vs evaluation_metric), gated expected_invocations requirement - docs/custom-evaluators.md: String Check Grader section, threshold inapplicability note, grader-aware How it works description - examples/custom_evaluators/eval_config.yaml: example entries for both grader types - README.md: mentions both grader types in Custom Evaluators section - tests/test_openai_eval_backend.py: unit tests covering config validation, schema selection, testing criteria, JSONL builder, score extraction, and full mocked-client flow for both grader types Addresses review feedback from @krisztianfekete on PR #102. --- README.md | 2 +- docs/custom-evaluators.md | 27 +- examples/custom_evaluators/eval_config.yaml | 15 + src/agentevals/config.py | 44 ++- src/agentevals/openai_eval_backend.py | 80 ++++- tests/test_openai_eval_backend.py | 341 ++++++++++++++++++++ 6 files changed, 482 insertions(+), 27 deletions(-) create mode 100644 tests/test_openai_eval_backend.py diff --git a/README.md b/README.md index b53afa5..8523486 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ evaluators: threshold: 0.7 ``` -Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`). +Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`). Two OpenAI grader types are supported: `text_similarity` for comparing responses against a golden reference, and `string_check` for exact or pattern-based matching against a fixed value. See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK helpers, and how to contribute evaluators. diff --git a/docs/custom-evaluators.md b/docs/custom-evaluators.md index 592dd25..79ac470 100644 --- a/docs/custom-evaluators.md +++ b/docs/custom-evaluators.md @@ -104,7 +104,7 @@ Each evaluator entry in the `evaluators` list uses the following fields. 
The `ty |---|---|---|---| | `name` | yes | | Unique name for the evaluator (used in output) | | `type` | yes | | `openai_eval` for OpenAI Evals API graders | -| `threshold` | no | `0.5` | Maps to `pass_threshold` in the OpenAI grader | +| `threshold` | no | `0.5` | Maps to `pass_threshold` in the OpenAI grader (not applicable for `string_check`) | | `timeout` | no | `120` | Max seconds to wait for the OpenAI eval run | | `grader` | yes | | OpenAI grader config (see [OpenAI Evals Graders](#openai-evals-api-graders)) | @@ -317,9 +317,32 @@ The `grader.evaluation_metric` field selects the similarity algorithm: | `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) | | `rouge_l` | Longest common subsequence overlap (F-measure) | +### String Check Grader + +Checks the agent's response against a fixed reference string using comparison operations. Does **not** require an eval set — the reference value is specified directly in the grader config. The `threshold` field is not applicable to this grader (string_check always returns 0 or 1). + +```yaml +evaluators: + - name: city_name_check + type: openai_eval + grader: + type: string_check + operation: eq + reference: "Paris" +``` + +The `grader.operation` field selects the comparison: + +| Operation | Description | +|---|---| +| `eq` | Exact equality | +| `ne` | Not equal | +| `like` | Pattern match (case-sensitive) | +| `ilike` | Pattern match (case-insensitive) | + ### How it works -Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs. +Under the hood, agentevals creates an ephemeral eval on OpenAI, submits invocations as JSONL items, polls for results, and cleans up. For `text_similarity` graders, each item contains both the actual and expected responses; for `string_check` graders, each item contains only the actual response (the reference is supplied statically in the grader config). Items are placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs. ### Configuring the GitHub source diff --git a/examples/custom_evaluators/eval_config.yaml b/examples/custom_evaluators/eval_config.yaml index d3bd261..c4a2f9b 100644 --- a/examples/custom_evaluators/eval_config.yaml +++ b/examples/custom_evaluators/eval_config.yaml @@ -32,3 +32,18 @@ evaluators: ref: evaluators/random_evaluator/random_evaluator.py threshold: 0.110 executor: local + + # OpenAI Evals API graders (requires OPENAI_API_KEY) + - name: response_similarity + type: openai_eval + threshold: 0.8 + grader: + type: text_similarity + evaluation_metric: fuzzy_match + + - name: city_name_check + type: openai_eval + grader: + type: string_check + operation: eq + reference: "{{ item.expected_response }}" diff --git a/src/agentevals/config.py b/src/agentevals/config.py index 8c7d01a..0a0451d 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -70,6 +70,18 @@ class RemoteEvaluatorDef(BaseEvaluatorDef): } ) +_VALID_STRING_CHECK_OPERATIONS = frozenset( + { + "eq", + "ne", + "like", + "ilike", + } +) + +# All supported grader types — used in error messages and type checks. 
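+# New grader types need an entry here and a matching branch in _validate_grader below.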
+_SUPPORTED_GRADER_TYPES = frozenset({"text_similarity", "string_check"}) + class OpenAIEvalDef(BaseModel): """An evaluator that delegates grading to the OpenAI Evals API.""" @@ -84,13 +96,31 @@ class OpenAIEvalDef(BaseModel): @classmethod def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: grader_type = v.get("type") - if grader_type != "text_similarity": - raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'") - metric = v.get("evaluation_metric") - if not metric: - raise ValueError("'evaluation_metric' is required for text_similarity grader") - if metric not in _VALID_SIMILARITY_METRICS: - raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}") + + if grader_type == "text_similarity": + metric = v.get("evaluation_metric") + if not metric: + raise ValueError("'evaluation_metric' is required for text_similarity grader") + if metric not in _VALID_SIMILARITY_METRICS: + raise ValueError( + f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}" + ) + elif grader_type == "string_check": + operation = v.get("operation") + if not operation: + raise ValueError("'operation' is required for string_check grader") + if operation not in _VALID_STRING_CHECK_OPERATIONS: + raise ValueError( + f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}" + ) + if not v.get("reference"): + raise ValueError("'reference' is required for string_check grader") + else: + raise ValueError( + f"Unsupported grader type '{grader_type}'. " + f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}" + ) + return v diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py index a6e9c00..9e4f24a 100644 --- a/src/agentevals/openai_eval_backend.py +++ b/src/agentevals/openai_eval_backend.py @@ -22,6 +22,7 @@ _POLL_INTERVAL_SECONDS = 2 +# Schema for graders that compare actual vs expected (e.g. text_similarity). _TEXT_PAIR_SCHEMA = { "type": "object", "properties": { @@ -31,6 +32,22 @@ "required": ["actual_response", "expected_response"], } +# Schema for graders that only need the actual response (e.g. string_check). +_ACTUAL_ONLY_SCHEMA = { + "type": "object", + "properties": { + "actual_response": {"type": "string"}, + }, + "required": ["actual_response"], +} + + +def _get_item_schema(grader_type: str) -> dict[str, Any]: + """Return the appropriate item schema for the given grader type.""" + if grader_type == "string_check": + return _ACTUAL_ONLY_SCHEMA + return _TEXT_PAIR_SCHEMA + def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: """Build the OpenAI testing_criteria dict from the evaluator config. @@ -51,28 +68,41 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: "pass_threshold": evaluator_def.threshold, } + if grader_type == "string_check": + return { + "type": "string_check", + "name": evaluator_def.name, + "input": "{{ item.actual_response }}", + "reference": grader["reference"], + "operation": grader["operation"], + } + raise ValueError(f"Unsupported grader type: {grader_type}") def _build_jsonl_items( actual_invocations: list[Invocation], expected_invocations: list[Invocation], + grader_type: str = "", ) -> list[dict[str, Any]]: + """Build JSONL items matching the grader-aware item schema. + + string_check graders use a static reference from config and only need + ``actual_response`` in each item. All other graders (e.g. text_similarity) + also require ``expected_response``. 
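+
+    For example, a string_check item is shaped like
+    ``{"item": {"actual_response": "Paris"}}``, while a text_similarity item
+    additionally carries ``"expected_response"``.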
+ """ + include_expected = grader_type != "string_check" items = [] for i, actual_inv in enumerate(actual_invocations): actual_text = _content_to_text(actual_inv.final_response) - if i < len(expected_invocations): - expected_text = _content_to_text(expected_invocations[i].final_response) - else: - expected_text = "" - items.append( - { - "item": { - "actual_response": actual_text, - "expected_response": expected_text, - } - } - ) + item: dict[str, Any] = {"actual_response": actual_text} + if include_expected: + if i < len(expected_invocations): + expected_text = _content_to_text(expected_invocations[i].final_response) + else: + expected_text = "" + item["expected_response"] = expected_text + items.append({"item": item}) return items @@ -111,13 +141,21 @@ async def evaluate_openai_eval( error="OPENAI_API_KEY environment variable is not set.", ) - if expected_invocations is None: + grader_type = evaluator_def.grader.get("type", "") + + # string_check graders use a static reference from config and don't need + # expected_invocations — only text_similarity requires a golden eval set. + if grader_type != "string_check" and expected_invocations is None: return MetricResult( metric_name=evaluator_def.name, - error="OpenAI text_similarity grader requires expected invocations (golden eval set).", + error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).", ) - items = _build_jsonl_items(actual_invocations, expected_invocations) + items = _build_jsonl_items( + actual_invocations, + expected_invocations if expected_invocations is not None else [], + grader_type=grader_type, + ) if not items: return MetricResult( metric_name=evaluator_def.name, @@ -135,7 +173,7 @@ async def evaluate_openai_eval( name=f"agentevals-{evaluator_def.name}", data_source_config={ "type": "custom", - "item_schema": _TEXT_PAIR_SCHEMA, + "item_schema": _get_item_schema(grader_type), "include_sample_schema": False, }, testing_criteria=[testing_criteria], @@ -225,10 +263,18 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva total = result_counts.total if result_counts else 0 eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED" + grader_type = evaluator_def.grader.get("type", "") + # Include the grader-relevant key depending on type + # (evaluation_metric for text_similarity, operation for string_check) + if grader_type == "string_check": + grader_detail_key = "operation" + else: + grader_detail_key = "evaluation_metric" + details: dict[str, Any] = { "openai_eval_id": eval_id, "openai_run_id": run_id, - "evaluation_metric": evaluator_def.grader.get("evaluation_metric"), + grader_detail_key: evaluator_def.grader.get(grader_detail_key), "result_counts": {"passed": passed, "failed": failed, "total": total}, } per_criteria = getattr(run, "per_testing_criteria_results", None) diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py new file mode 100644 index 0000000..b18e8dd --- /dev/null +++ b/tests/test_openai_eval_backend.py @@ -0,0 +1,341 @@ +"""Unit tests for the OpenAI Evals backend — covers both text_similarity and string_check graders.""" + +import asyncio +from unittest.mock import MagicMock, patch, AsyncMock + +import pytest +from pydantic import ValidationError + +from agentevals.config import OpenAIEvalDef +from agentevals.openai_eval_backend import ( + _build_jsonl_items, + _build_testing_criteria, + _extract_item_score, + _get_item_schema, + evaluate_openai_eval, +) + + +# ── Helpers 
──────────────────────────────────────────────────────────────────── + + +def _make_invocation(text: str): + """Build a minimal Invocation-like object with a final_response.""" + inv = MagicMock() + inv.final_response = text + return inv + + +def _make_text_similarity_def(**overrides) -> OpenAIEvalDef: + defaults = { + "name": "test_similarity", + "threshold": 0.7, + "grader": { + "type": "text_similarity", + "evaluation_metric": "fuzzy_match", + }, + } + defaults.update(overrides) + return OpenAIEvalDef(**defaults) + + +def _make_string_check_def(**overrides) -> OpenAIEvalDef: + defaults = { + "name": "test_check", + "grader": { + "type": "string_check", + "operation": "eq", + "reference": "Paris", + }, + } + defaults.update(overrides) + return OpenAIEvalDef(**defaults) + + +# ── Config validation tests ──────────────────────────────────────────────────── + + +class TestOpenAIEvalDefValidation: + """Verify that the pydantic validator enforces correct grader configs.""" + + def test_text_similarity_requires_evaluation_metric(self): + with pytest.raises(ValidationError, match="evaluation_metric"): + OpenAIEvalDef(name="x", grader={"type": "text_similarity"}) + + def test_text_similarity_rejects_invalid_metric(self): + with pytest.raises(ValidationError, match="Unknown evaluation_metric"): + OpenAIEvalDef( + name="x", + grader={"type": "text_similarity", "evaluation_metric": "bogus"}, + ) + + def test_text_similarity_accepts_valid_metrics(self): + for metric in ("fuzzy_match", "bleu", "cosine", "rouge_l"): + d = OpenAIEvalDef( + name="x", + grader={"type": "text_similarity", "evaluation_metric": metric}, + ) + assert d.grader["evaluation_metric"] == metric + + def test_string_check_requires_operation(self): + with pytest.raises(ValidationError, match="operation"): + OpenAIEvalDef(name="x", grader={"type": "string_check", "reference": "hi"}) + + def test_string_check_requires_reference(self): + with pytest.raises(ValidationError, match="reference"): + OpenAIEvalDef(name="x", grader={"type": "string_check", "operation": "eq"}) + + def test_string_check_rejects_invalid_operation(self): + with pytest.raises(ValidationError, match="Unknown operation"): + OpenAIEvalDef( + name="x", + grader={"type": "string_check", "operation": "contains", "reference": "hi"}, + ) + + def test_string_check_accepts_valid_operations(self): + for op in ("eq", "ne", "like", "ilike"): + d = OpenAIEvalDef( + name="x", + grader={"type": "string_check", "operation": op, "reference": "val"}, + ) + assert d.grader["operation"] == op + + def test_unsupported_grader_type_raises(self): + with pytest.raises(ValidationError, match="Unsupported grader type"): + OpenAIEvalDef(name="x", grader={"type": "model_graded"}) + + +# ── Item schema tests ────────────────────────────────────────────────────────── + + +class TestGetItemSchema: + def test_string_check_schema_has_actual_only(self): + schema = _get_item_schema("string_check") + assert schema["required"] == ["actual_response"] + assert "expected_response" not in schema["properties"] + + def test_text_similarity_schema_has_both(self): + schema = _get_item_schema("text_similarity") + assert "actual_response" in schema["required"] + assert "expected_response" in schema["required"] + + +# ── Testing criteria tests ───────────────────────────────────────────────────── + + +class TestBuildTestingCriteria: + def test_text_similarity_criteria(self): + ev = _make_text_similarity_def(threshold=0.8) + criteria = _build_testing_criteria(ev) + assert criteria["type"] == "text_similarity" + 
assert criteria["evaluation_metric"] == "fuzzy_match" + assert criteria["pass_threshold"] == 0.8 + assert criteria["reference"] == "{{ item.expected_response }}" + + def test_string_check_criteria(self): + ev = _make_string_check_def() + criteria = _build_testing_criteria(ev) + assert criteria["type"] == "string_check" + assert criteria["operation"] == "eq" + assert criteria["reference"] == "Paris" + assert criteria["input"] == "{{ item.actual_response }}" + assert "pass_threshold" not in criteria + + def test_unsupported_grader_raises(self): + ev = _make_text_similarity_def() + # Bypass pydantic validation to test the function directly + ev.grader = {"type": "unknown"} + with pytest.raises(ValueError, match="Unsupported grader type"): + _build_testing_criteria(ev) + + +# ── JSONL item building tests ────────────────────────────────────────────────── + + +class TestBuildJsonlItems: + def test_text_similarity_includes_expected(self): + actual = [_make_invocation("hello")] + expected = [_make_invocation("hi")] + + with patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x): + items = _build_jsonl_items(actual, expected, grader_type="text_similarity") + + assert len(items) == 1 + assert items[0]["item"]["actual_response"] == "hello" + assert items[0]["item"]["expected_response"] == "hi" + + def test_string_check_excludes_expected(self): + actual = [_make_invocation("Paris")] + expected = [_make_invocation("ignored")] + + with patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x): + items = _build_jsonl_items(actual, expected, grader_type="string_check") + + assert len(items) == 1 + assert items[0]["item"]["actual_response"] == "Paris" + assert "expected_response" not in items[0]["item"] + + def test_missing_expected_uses_empty_string(self): + actual = [_make_invocation("a"), _make_invocation("b")] + expected = [_make_invocation("x")] + + with patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x): + items = _build_jsonl_items(actual, expected, grader_type="text_similarity") + + assert items[1]["item"]["expected_response"] == "" + + def test_empty_invocations_returns_empty(self): + with patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x): + items = _build_jsonl_items([], [], grader_type="string_check") + assert items == [] + + +# ── Item score extraction ────────────────────────────────────────────────────── + + +class TestExtractItemScore: + def test_returns_score(self): + item = MagicMock() + result = MagicMock() + result.score = 0.85 + item.results = [result] + assert _extract_item_score(item) == 0.85 + + def test_returns_none_when_no_results(self): + item = MagicMock() + item.results = [] + assert _extract_item_score(item) is None + + def test_returns_none_when_results_attr_missing(self): + item = MagicMock(spec=[]) # no attributes + assert _extract_item_score(item) is None + + +# ── Integration tests (mocked OpenAI client) ─────────────────────────────────── + + +class TestEvaluateOpenAIEval: + def _make_mock_client(self, run_status="completed", scores=None): + """Create a fully mocked OpenAI client.""" + client = MagicMock() + + # evals.create + eval_obj = MagicMock() + eval_obj.id = "eval_123" + client.evals.create.return_value = eval_obj + + # evals.runs.create + run_obj = MagicMock() + run_obj.id = "run_456" + client.evals.runs.create.return_value = run_obj + + # evals.runs.retrieve + completed_run = MagicMock() + completed_run.status = run_status + 
completed_run.result_counts = MagicMock() + completed_run.result_counts.passed = len(scores or []) + completed_run.result_counts.failed = 0 + completed_run.result_counts.total = len(scores or []) + completed_run.per_testing_criteria_results = None + client.evals.runs.retrieve.return_value = completed_run + + # evals.runs.output_items.list + output_items = [] + for s in (scores or []): + item = MagicMock() + result = MagicMock() + result.score = s + item.results = [result] + output_items.append(item) + page = MagicMock() + page.data = output_items + client.evals.runs.output_items.list.return_value = page + + # evals.delete + client.evals.delete.return_value = None + + return client + + @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) + @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x) + @patch("agentevals.openai_eval_backend._get_openai_client") + def test_string_check_success(self, mock_get_client, mock_content): + client = self._make_mock_client(scores=[1.0]) + mock_get_client.return_value = client + + ev = _make_string_check_def() + actual = [_make_invocation("Paris")] + + result = asyncio.run(evaluate_openai_eval(ev, actual, None)) + + assert result.error is None + assert result.score == 1.0 + assert result.eval_status == "PASSED" + assert result.details["operation"] == "eq" + + @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) + @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x) + @patch("agentevals.openai_eval_backend._get_openai_client") + def test_text_similarity_requires_expected(self, mock_get_client, mock_content): + ev = _make_text_similarity_def() + actual = [_make_invocation("hello")] + + result = asyncio.run(evaluate_openai_eval(ev, actual, None)) + + assert result.error is not None + assert "expected invocations" in result.error + + @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) + @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x) + @patch("agentevals.openai_eval_backend._get_openai_client") + def test_text_similarity_success(self, mock_get_client, mock_content): + client = self._make_mock_client(scores=[0.9, 0.8]) + mock_get_client.return_value = client + + ev = _make_text_similarity_def() + actual = [_make_invocation("hello"), _make_invocation("world")] + expected = [_make_invocation("hi"), _make_invocation("earth")] + + result = asyncio.run(evaluate_openai_eval(ev, actual, expected)) + + assert result.error is None + assert result.score == pytest.approx(0.85) + assert result.details["evaluation_metric"] == "fuzzy_match" + + @patch.dict("os.environ", {"OPENAI_API_KEY": ""}) + def test_missing_api_key_returns_error(self): + ev = _make_string_check_def() + actual = [_make_invocation("Paris")] + + result = asyncio.run(evaluate_openai_eval(ev, actual, None)) + + assert result.error is not None + assert "OPENAI_API_KEY" in result.error + + @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) + @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x) + @patch("agentevals.openai_eval_backend._get_openai_client") + def test_string_check_no_expected_needed(self, mock_get_client, mock_content): + """string_check grader should work without expected_invocations (None).""" + client = self._make_mock_client(scores=[1.0]) + mock_get_client.return_value = client + + ev = _make_string_check_def() + actual = [_make_invocation("Paris")] + + result = asyncio.run(evaluate_openai_eval(ev, actual, None)) + + # Verify it didn't short-circuit 
with an error + assert result.error is None + assert result.eval_status == "PASSED" + + @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) + @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x) + @patch("agentevals.openai_eval_backend._get_openai_client") + def test_empty_invocations_returns_error(self, mock_get_client, mock_content): + ev = _make_string_check_def() + + result = asyncio.run(evaluate_openai_eval(ev, [], None)) + + assert result.error is not None + assert "No invocations" in result.error From 12be9dac39ad4ded4c38c0b584fd7a4d7fd40f16 Mon Sep 17 00:00:00 2001 From: wiliyam Date: Wed, 22 Apr 2026 02:31:51 +0000 Subject: [PATCH 2/2] style: apply ruff format to config.py and test file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unwraps a few multi-line f-string ValueError messages that exceed the default line length but fit when collapsed. Pure formatting — no logic change. Fixes the `ruff format --check` CI step on PR #102. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agentevals/config.py | 13 +++---------- tests/test_openai_eval_backend.py | 5 ++--- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/agentevals/config.py b/src/agentevals/config.py index 0a0451d..3dd3e91 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -102,24 +102,17 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: if not metric: raise ValueError("'evaluation_metric' is required for text_similarity grader") if metric not in _VALID_SIMILARITY_METRICS: - raise ValueError( - f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}" - ) + raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}") elif grader_type == "string_check": operation = v.get("operation") if not operation: raise ValueError("'operation' is required for string_check grader") if operation not in _VALID_STRING_CHECK_OPERATIONS: - raise ValueError( - f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}" - ) + raise ValueError(f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}") if not v.get("reference"): raise ValueError("'reference' is required for string_check grader") else: - raise ValueError( - f"Unsupported grader type '{grader_type}'. " - f"Supported: {sorted(_SUPPORTED_GRADER_TYPES)}" - ) + raise ValueError(f"Unsupported grader type '{grader_type}'. Supported: {sorted(_SUPPORTED_GRADER_TYPES)}") return v diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py index b18e8dd..9e8eaa7 100644 --- a/tests/test_openai_eval_backend.py +++ b/tests/test_openai_eval_backend.py @@ -1,7 +1,7 @@ """Unit tests for the OpenAI Evals backend — covers both text_similarity and string_check graders.""" import asyncio -from unittest.mock import MagicMock, patch, AsyncMock +from unittest.mock import AsyncMock, MagicMock, patch import pytest from pydantic import ValidationError @@ -15,7 +15,6 @@ evaluate_openai_eval, ) - # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -241,7 +240,7 @@ def _make_mock_client(self, run_status="completed", scores=None): # evals.runs.output_items.list output_items = [] - for s in (scores or []): + for s in scores or []: item = MagicMock() result = MagicMock() result.score = s