2 changes: 1 addition & 1 deletion README.md
@@ -240,7 +240,7 @@ evaluators:
threshold: 0.7
```

Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`).
Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`). Two OpenAI grader types are supported: `text_similarity` for comparing responses against a golden reference, and `string_check` for exact or pattern-based matching against a fixed value.
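
A minimal configuration sketch showing both grader types (it mirrors the example added in `examples/custom_evaluators/eval_config.yaml`; the evaluator names and values are illustrative):

```yaml
evaluators:
  - name: response_similarity
    type: openai_eval
    threshold: 0.8
    grader:
      type: text_similarity
      evaluation_metric: fuzzy_match

  - name: city_name_check
    type: openai_eval
    grader:
      type: string_check
      operation: eq
      reference: "Paris"
```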

See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK helpers, and how to contribute evaluators.

27 changes: 25 additions & 2 deletions docs/custom-evaluators.md
@@ -104,7 +104,7 @@ Each evaluator entry in the `evaluators` list uses the following fields. The `ty
|---|---|---|---|
| `name` | yes | | Unique name for the evaluator (used in output) |
| `type` | yes | | `openai_eval` for OpenAI Evals API graders |
| `threshold` | no | `0.5` | Maps to `pass_threshold` in the OpenAI grader |
| `threshold` | no | `0.5` | Maps to `pass_threshold` in the OpenAI grader (not applicable for `string_check`) |
| `timeout` | no | `120` | Max seconds to wait for the OpenAI eval run |
| `grader` | yes | | OpenAI grader config (see [OpenAI Evals Graders](#openai-evals-api-graders)) |
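
A sketch of an entry using these fields (the values, including the 180-second timeout, are illustrative; the grader block is described in the sections below):

```yaml
evaluators:
  - name: response_similarity
    type: openai_eval
    threshold: 0.8      # maps to pass_threshold in the OpenAI grader
    timeout: 180        # max seconds to wait for the eval run
    grader:
      type: text_similarity
      evaluation_metric: fuzzy_match
```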

@@ -317,9 +317,32 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
| `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
| `rouge_l` | Longest common subsequence overlap (F-measure) |

### String Check Grader

Checks the agent's response against a fixed reference string using comparison operations. Does **not** require an eval set — the reference value is specified directly in the grader config. The `threshold` field is not applicable to this grader (string_check always returns 0 or 1).

```yaml
evaluators:
  - name: city_name_check
    type: openai_eval
    grader:
      type: string_check
      operation: eq
      reference: "Paris"
```

The `grader.operation` field selects the comparison:

| Operation | Description |
|---|---|
| `eq` | Exact equality |
| `ne` | Not equal |
| `like` | Pattern match (case-sensitive) |
| `ilike` | Pattern match (case-insensitive) |
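
For instance, a case-insensitive check could use `ilike` (a minimal sketch; the evaluator name and reference value are hypothetical):

```yaml
  - name: city_name_check_ci
    type: openai_eval
    grader:
      type: string_check
      operation: ilike
      reference: "paris"
```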

### How it works

Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
Under the hood, agentevals creates an ephemeral eval on OpenAI, submits invocations as JSONL items, polls for results, and cleans up. For `text_similarity` graders, each item contains both the actual and expected responses; for `string_check` graders, each item contains only the actual response (the reference is supplied statically in the grader config). Items are placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
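
As an illustration, the submitted items differ per grader type roughly as follows (shown in YAML for readability; the texts are made up):

```yaml
# text_similarity: each item carries both texts
- item:
    actual_response: "The capital of France is Paris."
    expected_response: "Paris"
# string_check: only the actual response; the reference lives in the grader config
- item:
    actual_response: "The capital of France is Paris."
```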

### Configuring the GitHub source

15 changes: 15 additions & 0 deletions examples/custom_evaluators/eval_config.yaml
@@ -32,3 +32,18 @@ evaluators:
    ref: evaluators/random_evaluator/random_evaluator.py
    threshold: 0.110
    executor: local

  # OpenAI Evals API graders (requires OPENAI_API_KEY)
  - name: response_similarity
    type: openai_eval
    threshold: 0.8
    grader:
      type: text_similarity
      evaluation_metric: fuzzy_match

  - name: city_name_check
    type: openai_eval
    grader:
      type: string_check
      operation: eq
      reference: "{{ item.expected_response }}"
Contributor

This doesn't seem to exist.

37 changes: 30 additions & 7 deletions src/agentevals/config.py
@@ -70,6 +70,18 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
    }
)

_VALID_STRING_CHECK_OPERATIONS = frozenset(
    {
        "eq",
        "ne",
        "like",
        "ilike",
    }
)

# All supported grader types — used in error messages and type checks.
_SUPPORTED_GRADER_TYPES = frozenset({"text_similarity", "string_check"})


class OpenAIEvalDef(BaseModel):
"""An evaluator that delegates grading to the OpenAI Evals API."""
@@ -84,13 +96,24 @@ class OpenAIEvalDef(BaseModel):
    @classmethod
    def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
        grader_type = v.get("type")
        if grader_type != "text_similarity":
            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
        metric = v.get("evaluation_metric")
        if not metric:
            raise ValueError("'evaluation_metric' is required for text_similarity grader")
        if metric not in _VALID_SIMILARITY_METRICS:
            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")

        if grader_type == "text_similarity":
            metric = v.get("evaluation_metric")
            if not metric:
                raise ValueError("'evaluation_metric' is required for text_similarity grader")
            if metric not in _VALID_SIMILARITY_METRICS:
                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
        elif grader_type == "string_check":
            operation = v.get("operation")
            if not operation:
                raise ValueError("'operation' is required for string_check grader")
            if operation not in _VALID_STRING_CHECK_OPERATIONS:
                raise ValueError(f"Unknown operation '{operation}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
            if not v.get("reference"):
                raise ValueError("'reference' is required for string_check grader")
        else:
            raise ValueError(f"Unsupported grader type '{grader_type}'. Supported: {sorted(_SUPPORTED_GRADER_TYPES)}")

        return v


80 changes: 63 additions & 17 deletions src/agentevals/openai_eval_backend.py
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@

_POLL_INTERVAL_SECONDS = 2

# Schema for graders that compare actual vs expected (e.g. text_similarity).
_TEXT_PAIR_SCHEMA = {
"type": "object",
"properties": {
Expand All @@ -31,6 +32,22 @@
"required": ["actual_response", "expected_response"],
}

# Schema for graders that only need the actual response (e.g. string_check).
_ACTUAL_ONLY_SCHEMA = {
"type": "object",
"properties": {
"actual_response": {"type": "string"},
},
"required": ["actual_response"],
}


def _get_item_schema(grader_type: str) -> dict[str, Any]:
"""Return the appropriate item schema for the given grader type."""
if grader_type == "string_check":
return _ACTUAL_ONLY_SCHEMA
return _TEXT_PAIR_SCHEMA


def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"""Build the OpenAI testing_criteria dict from the evaluator config.
Expand All @@ -51,28 +68,41 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"pass_threshold": evaluator_def.threshold,
}

if grader_type == "string_check":
return {
"type": "string_check",
"name": evaluator_def.name,
"input": "{{ item.actual_response }}",
"reference": grader["reference"],
"operation": grader["operation"],
}

raise ValueError(f"Unsupported grader type: {grader_type}")


def _build_jsonl_items(
    actual_invocations: list[Invocation],
    expected_invocations: list[Invocation],
    grader_type: str = "",
) -> list[dict[str, Any]]:
    """Build JSONL items matching the grader-aware item schema.

    string_check graders use a static reference from config and only need
    ``actual_response`` in each item. All other graders (e.g. text_similarity)
    also require ``expected_response``.
    """
    include_expected = grader_type != "string_check"
    items = []
    for i, actual_inv in enumerate(actual_invocations):
        actual_text = _content_to_text(actual_inv.final_response)
        if i < len(expected_invocations):
            expected_text = _content_to_text(expected_invocations[i].final_response)
        else:
            expected_text = ""
        items.append(
            {
                "item": {
                    "actual_response": actual_text,
                    "expected_response": expected_text,
                }
            }
        )
        item: dict[str, Any] = {"actual_response": actual_text}
        if include_expected:
            if i < len(expected_invocations):
                expected_text = _content_to_text(expected_invocations[i].final_response)
            else:
                expected_text = ""
            item["expected_response"] = expected_text
        items.append({"item": item})
    return items


@@ -111,13 +141,21 @@ async def evaluate_openai_eval(
            error="OPENAI_API_KEY environment variable is not set.",
        )

    if expected_invocations is None:
    grader_type = evaluator_def.grader.get("type", "")

    # string_check graders use a static reference from config and don't need
    # expected_invocations — only text_similarity requires a golden eval set.
    if grader_type != "string_check" and expected_invocations is None:
        return MetricResult(
            metric_name=evaluator_def.name,
            error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
            error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
        )

    items = _build_jsonl_items(actual_invocations, expected_invocations)
    items = _build_jsonl_items(
        actual_invocations,
        expected_invocations if expected_invocations is not None else [],
        grader_type=grader_type,
    )
    if not items:
        return MetricResult(
            metric_name=evaluator_def.name,
@@ -135,7 +173,7 @@ async def evaluate_openai_eval(
        name=f"agentevals-{evaluator_def.name}",
        data_source_config={
            "type": "custom",
            "item_schema": _TEXT_PAIR_SCHEMA,
            "item_schema": _get_item_schema(grader_type),
            "include_sample_schema": False,
        },
        testing_criteria=[testing_criteria],
@@ -225,10 +263,18 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
    total = result_counts.total if result_counts else 0
    eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"

    grader_type = evaluator_def.grader.get("type", "")
    # Include the grader-relevant key depending on type
    # (evaluation_metric for text_similarity, operation for string_check)
    if grader_type == "string_check":
        grader_detail_key = "operation"
    else:
        grader_detail_key = "evaluation_metric"

    details: dict[str, Any] = {
        "openai_eval_id": eval_id,
        "openai_run_id": run_id,
        "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
        grader_detail_key: evaluator_def.grader.get(grader_detail_key),
        "result_counts": {"passed": passed, "failed": failed, "total": total},
    }
    per_criteria = getattr(run, "per_testing_criteria_results", None)