From 56a5bcacbe3d9667b67dd9ad9f0d41efeb2031bd Mon Sep 17 00:00:00 2001
From: evalstate <1936278+evalstate@users.noreply.github.com>
Date: Mon, 23 Mar 2026 16:38:58 +0000
Subject: [PATCH 1/2] Add verifier-based eval validation

---
 src/upskill/cli.py               |  10 +-
 src/upskill/evaluate.py          |  48 ++----
 src/upskill/generate.py          |   6 +-
 src/upskill/models.py            |  59 +++++++-
 src/upskill/verifiers.py         | 243 +++++++++++++++++++++++++++++++
 tests/test_execution_backends.py |  69 ++++++++-
 tests/test_verifiers.py          | 118 +++++++++++++++
 7 files changed, 506 insertions(+), 47 deletions(-)
 create mode 100644 src/upskill/verifiers.py
 create mode 100644 tests/test_verifiers.py

diff --git a/src/upskill/cli.py b/src/upskill/cli.py
index d38396a..1fca0a4 100644
--- a/src/upskill/cli.py
+++ b/src/upskill/cli.py
@@ -547,9 +547,11 @@ async def _load_test_cases(
 
 
 def _count_invalid_expected_cases(test_cases: list[TestCase]) -> int:
-    """Count generated or loaded tests missing enough expected strings."""
+    """Count tests with legacy expected-string checks missing enough values."""
     invalid_expected = 0
     for test_case in test_cases:
+        if test_case.expected is None:
+            continue
         expected_values = [value.strip() for value in test_case.expected.contains if value.strip()]
         if len(expected_values) < 2:
             invalid_expected += 1
@@ -1601,7 +1603,11 @@ async def _eval_async(  # noqa: C901
     invalid_expected = _count_invalid_expected_cases(test_cases)
     console.print(f"[dim]Loaded {len(test_cases)} test case(s) from {test_source}[/dim]")
     if invalid_expected:
-        console.print(f"[yellow]{invalid_expected} test case(s) missing expected strings[/yellow]")
+        console.print(
+            "[yellow]"
+            f"{invalid_expected} legacy expected-string test case(s) missing expected strings"
+            "[/yellow]"
+        )
 
     runs_path = Path(runs_dir) if runs_dir else config.runs_dir
     batch_id, batch_folder = create_batch_folder(runs_path)
diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py
index b5412ca..99c63f6 100644
--- a/src/upskill/evaluate.py
+++ b/src/upskill/evaluate.py
@@ -18,13 +18,12 @@
 from upskill.executors.contracts import ExecutionRequest
 from upskill.models import (
     EvalResults,
-    ExpectedSpec,
     Skill,
     TestCase,
     TestResult,
     ValidationResult,
 )
-from upskill.validators import get_validator
+from upskill.verifiers import run_verifiers
 
 logger = logging.getLogger(__name__)
 
@@ -141,39 +140,11 @@ def load_eval_results_from_artifact_root(
 
 def check_expected(
     output: str,
-    expected: ExpectedSpec,
+    test_case: TestCase,
     workspace: Path | None = None,
-    test_case: TestCase | None = None,
-) -> tuple[bool, ValidationResult | None]:
-    """Check if output matches expected conditions.
-
-    Args:
-        output: The agent's output string
-        expected: Expected conditions dict (legacy format with "contains")
-        workspace: Optional workspace directory for file-based validation
-        test_case: Optional test case with custom validator config
-
-    Returns:
-        Tuple of (success, validation_result)
-    """
-    # Handle custom validator if specified
-    if test_case and test_case.validator:
-        validator = get_validator(test_case.validator)
-        if validator and workspace:
-            config = test_case.validator_config or {}
-            result = validator(
-                workspace=workspace,
-                output_file=test_case.output_file or "",
-                **config,
-            )
-            return result.passed, result
-
-    required = expected.contains
-    output_lower = output.lower()
-    if any(item.lower() not in output_lower for item in required):
-        return False, None
-
-    return True, None
+) -> ValidationResult:
+    """Run normalized deterministic verifiers for one test case."""
+    return run_verifiers(test_case, output=output, workspace=workspace)
 
 
 def format_test_prompt(test_case: TestCase) -> str:
@@ -216,7 +187,7 @@ def build_eval_execution_request(
             "instance_name": instance_name,
             "operation": operation,
             "skill_name": skill.name if skill else None,
-            "has_validator": bool(test_case.validator),
+            "has_validator": bool(test_case.effective_verifiers()),
         },
     )
 
@@ -321,15 +292,14 @@ async def _run_test_with_evaluator(
         _write_test_result_summary(normalized_artifact_dir / "test_result.json", result)
         return result
 
-    success, validation_result = check_expected(
+    validation_result = check_expected(
         execution_result.output_text or "",
-        test_case.expected,
-        execution_result.workspace_dir,
         test_case,
+        execution_result.workspace_dir,
     )
     result = TestResult(
         test_case=test_case,
-        success=success,
+        success=validation_result.passed,
         output=execution_result.output_text,
         tokens_used=execution_result.stats.total_tokens,
         turns=execution_result.stats.turns,
diff --git a/src/upskill/generate.py b/src/upskill/generate.py
index 8abc943..83b6dad 100644
--- a/src/upskill/generate.py
+++ b/src/upskill/generate.py
@@ -149,6 +149,8 @@ async def generate_tests(
     cases = result.cases
     invalid_expected = 0
     for tc in cases:
+        if tc.expected is None:
+            continue
         expected_values = [value.strip() for value in tc.expected.contains if value.strip()]
         if len(expected_values) < 2:
             invalid_expected += 1
@@ -160,8 +162,8 @@ async def generate_tests(
     )
     if invalid_expected:
         print(
-            "Warning: some test cases are missing at least two expected strings; "
-            "review generated tests."
+            "Warning: some legacy expected-string test cases are missing at least two expected "
+            "strings; review generated tests."
         )
     return cases
 
diff --git a/src/upskill/models.py b/src/upskill/models.py
index c4de86f..695fa9f 100644
--- a/src/upskill/models.py
+++ b/src/upskill/models.py
@@ -7,7 +7,7 @@
 from datetime import datetime
 from typing import TYPE_CHECKING
 
-from pydantic import BaseModel, ConfigDict, Field, field_validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -34,6 +34,33 @@ class ValidationResult(BaseModel):
     metrics_count: int = 0
     benchmarks_found: list[str] = Field(default_factory=list)
     error_message: str | None = None
+    details: list[str] = Field(default_factory=list)
+
+
+type VerifierConfigValue = str | int | float | bool
+
+
+class VerifierSpec(BaseModel):
+    """Deterministic verifier configuration for a test case."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    type: str
+    name: str | None = None
+    values: list[str] = Field(default_factory=list)
+    path: str | None = None
+    text: str | None = None
+    cmd: str | None = None
+    config: dict[str, VerifierConfigValue] | None = None
+
+    @field_validator("values", mode="before")
+    @classmethod
+    def coerce_values(cls, value: str | list[str] | None) -> list[str]:
+        if value is None:
+            return []
+        if isinstance(value, str):
+            return [value]
+        return value
 
 
 class ExpectedSpec(BaseModel):
@@ -67,12 +94,38 @@ class TestCase(BaseModel):
 
     input: str  # Task/prompt to give the agent
     context: TestCaseContext | None = None  # Files, env vars, etc.
-    expected: ExpectedSpec  # Expected output checks
+    expected: ExpectedSpec | None = None  # Legacy expected output checks
+    verifiers: list[VerifierSpec] = Field(default_factory=list)
 
     # Custom validator support
     output_file: str | None = None  # File to validate instead of agent output
     validator: str | None = None  # Validator name (e.g., "hf_eval_yaml")
-    validator_config: dict[str, str | int | float | bool] | None = None
+    validator_config: dict[str, VerifierConfigValue] | None = None
+
+    @model_validator(mode="after")
+    def validate_expectations(self) -> TestCase:
+        if self.expected is None and not self.verifiers and self.validator is None:
+            raise ValueError("TestCase requires at least one of expected, verifiers, or validator.")
+        return self
+
+    def effective_verifiers(self) -> list[VerifierSpec]:
+        """Return normalized verifier specs including legacy expectation fields."""
+        effective = list(self.verifiers)
+        if self.expected is not None and self.expected.contains:
+            effective.insert(
+                0,
+                VerifierSpec(type="contains", values=self.expected.contains),
+            )
+        if self.validator is not None:
+            effective.append(
+                VerifierSpec(
+                    type="validator",
+                    name=self.validator,
+                    path=self.output_file,
+                    config=self.validator_config,
+                )
+            )
+        return effective
 
 
 class TestCaseSuite(BaseModel):
diff --git a/src/upskill/verifiers.py b/src/upskill/verifiers.py
new file mode 100644
index 0000000..0f56408
--- /dev/null
+++ b/src/upskill/verifiers.py
@@ -0,0 +1,243 @@
+"""Deterministic verifier execution for upskill test cases."""
+
+from __future__ import annotations
+
+import subprocess
+from typing import TYPE_CHECKING
+
+from upskill.models import TestCase, ValidationResult, VerifierSpec
+from upskill.validators import get_validator
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+DEFAULT_COMMAND_TIMEOUT_SECONDS = 60
+MAX_COMMAND_OUTPUT_CHARS = 1200
+
+
+def _build_validation_result(
+    passed: bool,
+    *,
+    error_message: str | None = None,
+    details: list[str] | None = None,
+) -> ValidationResult:
+    return ValidationResult(
+        passed=passed,
+        assertions_passed=1 if passed else 0,
+        assertions_total=1,
+        error_message=error_message,
+        details=details or [],
+    )
+
+
+def _format_command_failure(output: str) -> str:
+    compact = output.strip()
+    if len(compact) > MAX_COMMAND_OUTPUT_CHARS:
+        compact = compact[:MAX_COMMAND_OUTPUT_CHARS].rstrip() + "..."
+    return compact or "command exited with a non-zero status"
+
+
+def _resolve_values(spec: VerifierSpec) -> list[str]:
+    if spec.values:
+        return spec.values
+    if spec.text:
+        return [spec.text]
+    return []
+
+
+def _run_contains_verifier(spec: VerifierSpec, output: str) -> ValidationResult:
+    required = [value for value in _resolve_values(spec) if value.strip()]
+    if not required:
+        return _build_validation_result(False, error_message="contains verifier is missing values")
+
+    output_lower = output.lower()
+    missing = [item for item in required if item.lower() not in output_lower]
+    if missing:
+        return _build_validation_result(
+            False,
+            error_message=f"missing required output text: {missing[0]}",
+            details=[f"missing: {item}" for item in missing],
+        )
+    return _build_validation_result(True)
+
+
+def _require_workspace(spec: VerifierSpec, workspace: Path | None) -> ValidationResult | None:
+    if workspace is not None:
+        return None
+    return _build_validation_result(
+        False,
+        error_message=f"{spec.type} verifier requires a workspace",
+    )
+
+
+def _run_file_exists_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    workspace_error = _require_workspace(spec, workspace)
+    if workspace_error is not None:
+        return workspace_error
+    assert workspace is not None
+    if not spec.path:
+        return _build_validation_result(False, error_message="file_exists verifier is missing path")
+
+    target = workspace / spec.path
+    if target.exists():
+        return _build_validation_result(True)
+    return _build_validation_result(
+        False,
+        error_message=f"expected file does not exist: {spec.path}",
+    )
+
+
+def _run_file_contains_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    workspace_error = _require_workspace(spec, workspace)
+    if workspace_error is not None:
+        return workspace_error
+    assert workspace is not None
+    if not spec.path:
+        return _build_validation_result(
+            False,
+            error_message="file_contains verifier is missing path",
+        )
+
+    target = workspace / spec.path
+    if not target.exists():
+        return _build_validation_result(
+            False,
+            error_message=f"expected file does not exist: {spec.path}",
+        )
+
+    required = [value for value in _resolve_values(spec) if value.strip()]
+    if not required:
+        return _build_validation_result(
+            False,
+            error_message="file_contains verifier is missing text or values",
+        )
+
+    content = target.read_text(encoding="utf-8")
+    content_lower = content.lower()
+    missing = [item for item in required if item.lower() not in content_lower]
+    if missing:
+        return _build_validation_result(
+            False,
+            error_message=f"missing required file text: {missing[0]}",
+            details=[f"missing: {item}" for item in missing],
+        )
+    return _build_validation_result(True)
+
+
+def _run_command_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    workspace_error = _require_workspace(spec, workspace)
+    if workspace_error is not None:
+        return workspace_error
+    assert workspace is not None
+    if not spec.cmd:
+        return _build_validation_result(False, error_message="command verifier is missing cmd")
+
+    timeout_seconds = DEFAULT_COMMAND_TIMEOUT_SECONDS
+    if spec.config and "timeout_seconds" in spec.config:
+        timeout_seconds = int(spec.config["timeout_seconds"])
+
+    completed = subprocess.run(
+        spec.cmd,
+        shell=True,
+        cwd=workspace,
+        text=True,
+        capture_output=True,
+        timeout=timeout_seconds,
+        check=False,
+    )
+    if completed.returncode == 0:
+        return _build_validation_result(True)
+
+    combined_output = "\n".join(part for part in (completed.stdout, completed.stderr) if part)
+    return _build_validation_result(
+        False,
+        error_message=_format_command_failure(combined_output),
+    )
+
+
+def _run_legacy_validator_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    workspace_error = _require_workspace(spec, workspace)
+    if workspace_error is not None:
+        return workspace_error
+    assert workspace is not None
+    if not spec.name:
+        return _build_validation_result(False, error_message="validator verifier is missing name")
+
+    validator = get_validator(spec.name)
+    if validator is None:
+        return _build_validation_result(
+            False,
+            error_message=f"unknown validator: {spec.name}",
+        )
+
+    config = spec.config or {}
+    return validator(
+        workspace=workspace,
+        output_file=spec.path or "",
+        **config,
+    )
+
+
+def run_verifier(
+    spec: VerifierSpec,
+    *,
+    output: str,
+    workspace: Path | None,
+) -> ValidationResult:
+    """Run one verifier against the current output/workspace."""
+
+    if spec.type == "contains":
+        return _run_contains_verifier(spec, output)
+    if spec.type == "file_exists":
+        return _run_file_exists_verifier(spec, workspace)
+    if spec.type == "file_contains":
+        return _run_file_contains_verifier(spec, workspace)
+    if spec.type == "command":
+        return _run_command_verifier(spec, workspace)
+    if spec.type == "validator":
+        return _run_legacy_validator_verifier(spec, workspace)
+
+    return _build_validation_result(
+        False,
+        error_message=f"unsupported verifier type: {spec.type}",
+    )
+
+
+def run_verifiers(
+    test_case: TestCase,
+    *,
+    output: str,
+    workspace: Path | None,
+) -> ValidationResult:
+    """Run all verifiers configured for a test case."""
+
+    specs = test_case.effective_verifiers()
+    if not specs:
+        return ValidationResult(
+            passed=False,
+            assertions_passed=0,
+            assertions_total=0,
+            error_message="no verifiers configured",
+        )
+
+    passed = 0
+    total = 0
+    details: list[str] = []
+    error_messages: list[str] = []
+
+    for spec in specs:
+        result = run_verifier(spec, output=output, workspace=workspace)
+        passed += result.assertions_passed
+        total += result.assertions_total
+        if result.error_message:
+            error_messages.append(result.error_message)
+        if result.details:
+            details.extend(result.details)
+
+    return ValidationResult(
+        passed=passed == total,
+        assertions_passed=passed,
+        assertions_total=total,
+        error_message="; ".join(error_messages) if error_messages else None,
+        details=details,
+    )
diff --git a/tests/test_execution_backends.py b/tests/test_execution_backends.py
index e14f960..bdc7cd7 100644
--- a/tests/test_execution_backends.py
+++ b/tests/test_execution_backends.py
@@ -19,7 +19,14 @@
 from upskill.executors.remote_fast_agent import RemoteFastAgentExecutor
 from upskill.fast_agent_cli import build_fast_agent_command
 from upskill.hf_jobs import JobsConfig, SubmittedJob
-from upskill.models import ConversationStats, ExpectedSpec, Skill, TestCase, TestResult
+from upskill.models import (
+    ConversationStats,
+    ExpectedSpec,
+    Skill,
+    TestCase,
+    TestResult,
+    VerifierSpec,
+)
 from upskill.result_parsing import parse_fast_agent_results
 
 
@@ -513,6 +520,66 @@ async def cancel(self, handle: ExecutionHandle) -> None:
     assert "finished with-skill test 1/1 (ok)" in messages
 
 
+@pytest.mark.asyncio
+async def test_evaluate_skill_supports_verifier_only_test_cases(tmp_path: Path) -> None:
+    skill = Skill(
+        name="write-good-prs",
+        description="Write good pull request descriptions.",
+        body="Use a clear structure.",
+    )
+    test_case = TestCase(
+        input="write a report",
+        verifiers=[
+            VerifierSpec(type="file_exists", path="report.txt"),
+            VerifierSpec(type="file_contains", path="report.txt", text="answer"),
+        ],
+    )
+
+    class FakeExecutor:
+        async def execute(self, request: ExecutionRequest) -> ExecutionHandle:
+            request.artifact_dir.mkdir(parents=True, exist_ok=True)
+            workspace_dir = request.artifact_dir / "workspace"
+            workspace_dir.mkdir(parents=True, exist_ok=True)
+            (workspace_dir / "report.txt").write_text("answer in report", encoding="utf-8")
+            task = asyncio.create_task(
+                asyncio.sleep(
+                    0,
+                    result=ExecutionResult(
+                        output_text="done",
+                        raw_results_path=None,
+                        stdout_path=request.artifact_dir / "stdout.txt",
+                        stderr_path=request.artifact_dir / "stderr.txt",
+                        artifact_dir=request.artifact_dir,
+                        workspace_dir=workspace_dir,
+                        stats=ConversationStats(),
+                    ),
+                )
+            )
+            return ExecutionHandle(request=request, task=task)
+
+        async def collect(self, handle: ExecutionHandle) -> ExecutionResult:
+            return await handle.task
+
+        async def cancel(self, handle: ExecutionHandle) -> None:
+            handle.task.cancel()
+
+    results = await evaluate_skill(
+        skill,
+        [test_case],
+        FakeExecutor(),
+        model="haiku",
+        fastagent_config_path=tmp_path / "fastagent.config.yaml",
+        cards_source_dir=tmp_path,
+        artifact_root=tmp_path / "eval",
+    )
+
+    test_result = results.with_skill_results[0]
+    assert test_result.success is True
+    assert test_result.validation_result is not None
+    assert test_result.validation_result.assertions_passed == 2
+    assert test_result.validation_result.assertions_total == 2
+
+
 @pytest.mark.asyncio
 async def test_evaluate_skill_includes_job_id_in_execution_errors(tmp_path: Path) -> None:
     skill = Skill(
diff --git a/tests/test_verifiers.py b/tests/test_verifiers.py
new file mode 100644
index 0000000..3704833
--- /dev/null
+++ b/tests/test_verifiers.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import sys
+from typing import TYPE_CHECKING
+
+import pytest
+
+from upskill.models import ExpectedSpec, ValidationResult, VerifierSpec
+from upskill.models import TestCase as UpskillTestCase
+from upskill.validators import register_validator
+from upskill.verifiers import run_verifiers
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+@register_validator("test-counting-validator")
+def _test_counting_validator(
+    workspace: Path,
+    output_file: str,
+    **_: object,
+) -> ValidationResult:
+    target = workspace / output_file
+    passed = target.exists()
+    return ValidationResult(
+        passed=passed,
+        assertions_passed=2 if passed else 0,
+        assertions_total=2,
+        error_message=None if passed else f"missing file: {output_file}",
+    )
+
+
+def test_run_verifiers_supports_legacy_expected_contains() -> None:
+    test_case = UpskillTestCase(
+        input="say hello",
+        expected=ExpectedSpec(contains=["hello", "world"]),
+    )
+
+    result = run_verifiers(test_case, output="Hello, world!", workspace=None)
+
+    assert result.passed is True
+    assert result.assertions_passed == 1
+    assert result.assertions_total == 1
+
+
+def test_run_verifiers_supports_file_verifiers(tmp_path: Path) -> None:
+    target = tmp_path / "report.txt"
+    target.write_text("bundle ok", encoding="utf-8")
+    test_case = UpskillTestCase(
+        input="write file",
+        verifiers=[
+            VerifierSpec(type="file_exists", path="report.txt"),
+            VerifierSpec(type="file_contains", path="report.txt", text="bundle ok"),
+        ],
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is True
+    assert result.assertions_passed == 2
+    assert result.assertions_total == 2
+
+
+def test_run_verifiers_supports_command_verifier(tmp_path: Path) -> None:
+    script = tmp_path / "check.py"
+    script.write_text("print('ok')\n", encoding="utf-8")
+    test_case = UpskillTestCase(
+        input="run assertion script",
+        verifiers=[VerifierSpec(type="command", cmd=f"'{sys.executable}' check.py")],
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is True
+    assert result.assertions_passed == 1
+
+
+def test_run_verifiers_translates_legacy_validator(tmp_path: Path) -> None:
+    target = tmp_path / "artifact.txt"
+    target.write_text("ok", encoding="utf-8")
+    test_case = UpskillTestCase(
+        input="validate artifact",
+        validator="test-counting-validator",
+        output_file="artifact.txt",
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is True
+    assert result.assertions_passed == 2
+    assert result.assertions_total == 2
+
+
+def test_run_verifiers_reports_failures(tmp_path: Path) -> None:
+    test_case = UpskillTestCase(
+        input="write report",
+        verifiers=[
+            VerifierSpec(type="file_exists", path="report.txt"),
+            VerifierSpec(
+                type="command",
+                cmd=f"'{sys.executable}' -c 'import sys; sys.exit(1)'",
+            ),
+        ],
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is False
+    assert result.assertions_passed == 0
+    assert result.assertions_total == 2
+    assert result.error_message is not None
+
+
+def test_test_case_rejects_missing_expectation_configuration() -> None:
+    with pytest.raises(
+        ValueError, match="requires at least one of expected, verifiers, or validator"
+    ):
+        UpskillTestCase(input="missing all checks")

From b6ec94002c334bc1814d60744ca54958ab1a0e06 Mon Sep 17 00:00:00 2001
From: evalstate <1936278+evalstate@users.noreply.github.com>
Date: Mon, 23 Mar 2026 16:51:09 +0000
Subject: [PATCH 2/2] Add CI manifest planning helpers

---
 src/upskill/ci.py                             | 214 ++++++++++++++++++
 src/upskill/models.py                         |  84 ++++++-
 .../ci_action_repo/.upskill/evals.yaml        |   5 +
 .../ci_action_repo/evals/example.yaml         |   6 +
 .../skills/example-skill/SKILL.md             |   6 +
 tests/test_ci.py                              | 168 ++++++++++++++
 6 files changed, 482 insertions(+), 1 deletion(-)
 create mode 100644 src/upskill/ci.py
 create mode 100644 tests/fixtures/ci_action_repo/.upskill/evals.yaml
 create mode 100644 tests/fixtures/ci_action_repo/evals/example.yaml
 create mode 100644 tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md
 create mode 100644 tests/test_ci.py

diff --git a/src/upskill/ci.py b/src/upskill/ci.py
new file mode 100644
index 0000000..275096a
--- /dev/null
+++ b/src/upskill/ci.py
@@ -0,0 +1,214 @@
+"""Scenario planning and report helpers for ``upskill ci``."""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+from pathlib import Path
+
+import yaml
+
+from upskill.models import CiReport, EvalManifest, EvalScenario, TestCase
+
+
+def _normalize_relative_path(path: Path, root: Path) -> str:
+    """Return a stable report path relative to ``root`` when possible."""
+    try:
+        return path.resolve().relative_to(root.resolve()).as_posix()
+    except ValueError:
+        return path.resolve().as_posix()
+
+
+def load_eval_manifest(path: Path) -> EvalManifest:
+    """Load a YAML or JSON CI manifest."""
+    with open(path, encoding="utf-8") as handle:
+        if path.suffix.lower() == ".json":
+            payload = json.load(handle)
+        else:
+            payload = yaml.safe_load(handle) or {}
+    return EvalManifest.model_validate(payload)
+
+
+def load_test_cases(path: Path) -> list[TestCase]:
+    """Load test cases from YAML or JSON."""
+    with open(path, encoding="utf-8") as handle:
+        if path.suffix.lower() == ".json":
+            payload = json.load(handle)
+        else:
+            payload = yaml.safe_load(handle) or {}
+
+    cases = payload["cases"] if isinstance(payload, dict) and "cases" in payload else payload
+    return [TestCase.model_validate(item) for item in cases]
+
+
+def plan_ci_suite(
+    manifest_path: Path,
+    *,
+    scope: str = "changed",
+    base_ref: str = "origin/main",
+    working_dir: Path | None = None,
+) -> tuple[CiReport, list[EvalScenario]]:
+    """Resolve scenario selection without executing the suite."""
+    root = (working_dir or Path.cwd()).resolve()
+    manifest = load_eval_manifest(manifest_path)
+
+    changed_files: list[str] = []
+    changed_skills: list[str] = []
+    if scope == "changed":
+        changed_files = resolve_changed_files(base_ref=base_ref, working_dir=root)
+        changed_skills = resolve_changed_skill_dirs(changed_files, working_dir=root)
+
+    selected_scenarios = select_scenarios(
+        manifest,
+        scope=scope,
+        changed_skills=changed_skills,
+    )
+
+    report = CiReport(
+        manifest_path=_normalize_relative_path(manifest_path, root),
+        scope=scope,
+        base_ref=base_ref if scope == "changed" else None,
+        changed_files=changed_files,
+        changed_skills=changed_skills,
+        selected_scenarios=[scenario.id for scenario in selected_scenarios],
+        success=True,
+    )
+    return report, selected_scenarios
+
+
+def resolve_changed_files(*, base_ref: str, working_dir: Path) -> list[str]:
+    """Return changed files for the current checkout."""
+    completed = subprocess.run(
+        ["git", "diff", "--name-only", f"{base_ref}...HEAD"],
+        cwd=working_dir,
+        text=True,
+        capture_output=True,
+        check=False,
+    )
+    if completed.returncode != 0:
+        error = completed.stderr.strip() or completed.stdout.strip() or "git diff failed"
+        raise RuntimeError(error)
+    return [line.strip() for line in completed.stdout.splitlines() if line.strip()]
+
+
+def resolve_changed_skill_dirs(changed_files: list[str], *, working_dir: Path) -> list[str]:
+    """Find skill directories impacted by changed files."""
+    changed_skills: set[str] = set()
+    root = working_dir.resolve()
+
+    for changed_file in changed_files:
+        path = (working_dir / changed_file).resolve()
+        current = path if path.is_dir() else path.parent
+        while current != root and current != current.parent:
+            if (current / "SKILL.md").exists():
+                changed_skills.add(current.relative_to(root).as_posix())
+                break
+            current = current.parent
+
+    return sorted(changed_skills)
+
+
+def select_scenarios(
+    manifest: EvalManifest,
+    *,
+    scope: str,
+    changed_skills: list[str],
+) -> list[EvalScenario]:
+    """Filter manifest scenarios for the requested CI scope."""
+    if scope == "all":
+        return list(manifest.scenarios)
+
+    changed = set(changed_skills)
+    selected = []
+    for scenario in manifest.scenarios:
+        scenario_skills = {Path(skill).as_posix() for skill in scenario.skills}
+        if scenario_skills & changed:
+            selected.append(scenario)
+    return selected
+
+
+def write_ci_report(path: Path, report: CiReport) -> None:
+    """Write the machine-readable CI report."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        json.dumps(report.model_dump(mode="json"), indent=2),
+        encoding="utf-8",
+    )
+
+
+def render_ci_report_markdown(report: CiReport) -> str:
+    """Render a GitHub-friendly markdown summary."""
+    lines = [
+        "# upskill CI",
+        "",
+        f"- Scope: `{report.scope}`",
+        f"- Manifest: `{report.manifest_path}`",
+    ]
+    if report.base_ref:
+        lines.append(f"- Base ref: `{report.base_ref}`")
+    if report.changed_skills:
+        changed = ", ".join(f"`{item}`" for item in report.changed_skills)
+        lines.append(f"- Changed skills: {changed}")
+    if not report.scenarios:
+        lines.extend(["", "No scenarios were selected."])
+        return "\n".join(lines)
+
+    for scenario in report.scenarios:
+        lines.extend(
+            [
+                "",
+                f"## {scenario.scenario_id}",
+                "",
+                "| Variant | Skills | Pass | Assertions | Judge | Tokens |",
+                "| --- | --- | --- | --- | --- | --- |",
+            ]
+        )
+        variants = [scenario.bundle, *scenario.ablations]
+        if scenario.baseline is not None:
+            variants.append(scenario.baseline)
+        for variant in variants:
+            judge_value = f"{variant.judge_score:.2f}" if variant.judge_score is not None else "n/a"
+            lines.append(
+                "| "
+                f"{variant.variant_id} | "
+                f"{', '.join(variant.skills) or '(none)'} | "
+                f"{'PASS' if variant.passed else 'FAIL'} | "
+                f"{variant.assertions_passed}/{variant.assertions_total} | "
+                f"{judge_value} | "
+                f"{variant.total_tokens} |"
+            )
+        if scenario.contributions:
+            lines.extend(
+                [
+                    "",
+                    "| Skill | Hard Delta | Judge Delta | Passed Without Skill |",
+                    "| --- | --- | --- | --- |",
+                ]
+            )
+            for contribution in scenario.contributions:
+                judge_delta = (
+                    f"{contribution.judge_score_delta:+.2f}"
+                    if contribution.judge_score_delta is not None
+                    else "n/a"
+                )
+                lines.append(
+                    "| "
+                    f"{contribution.skill} | "
+                    f"{contribution.hard_score_delta:+.2f} | "
+                    f"{judge_delta} | "
+                    f"{'yes' if contribution.passed_without_skill else 'no'} |"
+                )
+
+    return "\n".join(lines)
+
+
+def write_step_summary(report: CiReport) -> None:
+    """Append the markdown summary to GitHub's step summary file when available."""
+    summary_path = os.getenv("GITHUB_STEP_SUMMARY")
+    if not summary_path:
+        return
+
+    with open(summary_path, "a", encoding="utf-8") as handle:
+        handle.write(render_ci_report_markdown(report))
+        handle.write("\n")
diff --git a/src/upskill/models.py b/src/upskill/models.py
index 695fa9f..990eb32 100644
--- a/src/upskill/models.py
+++ b/src/upskill/models.py
@@ -5,7 +5,7 @@
 import json
 import re
 from datetime import datetime
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
@@ -470,6 +470,88 @@ def is_beneficial(self) -> bool:
         return self.skill_lift > 0.05 or (self.skill_lift >= 0 and self.token_savings > 0.2)
 
 
+class ScenarioJudgeConfig(BaseModel):
+    """Judge configuration for a scenario."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    enabled: bool = False
+    criteria: list[str] | None = None
+
+
+class EvalScenario(BaseModel):
+    """Scenario definition for CI evaluation."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    id: str = Field(..., min_length=1)
+    skills: list[str] = Field(default_factory=list)
+    tests: str
+    judge: ScenarioJudgeConfig | None = None
+    include_baseline: bool = False
+
+
+class EvalManifest(BaseModel):
+    """Top-level CI manifest."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    scenarios: list[EvalScenario] = Field(default_factory=list)
+
+
+class ScenarioVariantResult(BaseModel):
+    """Aggregate result for one scenario variant."""
+
+    variant_id: str
+    variant_type: Literal["bundle", "ablation", "baseline"]
+    skills: list[str] = Field(default_factory=list)
+    omitted_skill: str | None = None
+    passed: bool
+    assertions_passed: int = 0
+    assertions_total: int = 0
+    hard_score: float = 0.0
+    judge_score: float | None = None
+    judge_summary: str | None = None
+    total_tokens: int = 0
+    average_turns: float = 0.0
+    run_folder: str | None = None
+
+
+class ScenarioContribution(BaseModel):
+    """Contribution delta for leaving one skill out of a bundle."""
+
+    skill: str
+    hard_score_delta: float = 0.0
+    judge_score_delta: float | None = None
+    passed_without_skill: bool = False
+
+
+class ScenarioReport(BaseModel):
+    """Report for one selected scenario."""
+
+    scenario_id: str
+    skills: list[str] = Field(default_factory=list)
+    tests_path: str
+    passed: bool
+    bundle: ScenarioVariantResult
+    ablations: list[ScenarioVariantResult] = Field(default_factory=list)
+    baseline: ScenarioVariantResult | None = None
+    contributions: list[ScenarioContribution] = Field(default_factory=list)
+
+
+class CiReport(BaseModel):
+    """Machine-readable report for a CI evaluation run."""
+
+    manifest_path: str
+    scope: str
+    base_ref: str | None = None
+    changed_files: list[str] = Field(default_factory=list)
+    changed_skills: list[str] = Field(default_factory=list)
+    selected_scenarios: list[str] = Field(default_factory=list)
+    success: bool = True
+    scenarios: list[ScenarioReport] = Field(default_factory=list)
+
+
 # Run logging models (similar to skills-test)
 
 
diff --git a/tests/fixtures/ci_action_repo/.upskill/evals.yaml b/tests/fixtures/ci_action_repo/.upskill/evals.yaml
new file mode 100644
index 0000000..52da53d
--- /dev/null
+++ b/tests/fixtures/ci_action_repo/.upskill/evals.yaml
@@ -0,0 +1,5 @@
+scenarios:
+  - id: fixture-scenario
+    skills:
+      - skills/example-skill
+    tests: evals/example.yaml
diff --git a/tests/fixtures/ci_action_repo/evals/example.yaml b/tests/fixtures/ci_action_repo/evals/example.yaml
new file mode 100644
index 0000000..031a2eb
--- /dev/null
+++ b/tests/fixtures/ci_action_repo/evals/example.yaml
@@ -0,0 +1,6 @@
+cases:
+  - input: "Say fixture"
+    expected:
+      contains:
+        - fixture
+        - response
diff --git a/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md
new file mode 100644
index 0000000..70b1902
--- /dev/null
+++ b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md
@@ -0,0 +1,6 @@
+---
+name: example-skill
+description: Example fixture skill for action smoke tests
+---
+
+Respond with the fixture response.
diff --git a/tests/test_ci.py b/tests/test_ci.py
new file mode 100644
index 0000000..8ff3999
--- /dev/null
+++ b/tests/test_ci.py
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+import json
+import shutil
+import subprocess
+from pathlib import Path
+
+from upskill.ci import (
+    load_eval_manifest,
+    load_test_cases,
+    plan_ci_suite,
+    render_ci_report_markdown,
+    resolve_changed_skill_dirs,
+    write_ci_report,
+    write_step_summary,
+)
+from upskill.models import (
+    CiReport,
+    ScenarioContribution,
+    ScenarioReport,
+    ScenarioVariantResult,
+)
+
+FIXTURE_REPO = Path(__file__).parent / "fixtures" / "ci_action_repo"
+
+
+def _run_git(repo: Path, *args: str) -> None:
+    completed = subprocess.run(
+        ["git", *args],
+        cwd=repo,
+        text=True,
+        capture_output=True,
+        check=False,
+    )
+    assert completed.returncode == 0, completed.stderr
+
+
+def _copy_fixture_repo(destination: Path) -> None:
+    shutil.copytree(FIXTURE_REPO, destination, dirs_exist_ok=True)
+
+
+def test_load_eval_manifest_and_test_cases_from_fixture() -> None:
+    manifest = load_eval_manifest(FIXTURE_REPO / ".upskill" / "evals.yaml")
+    cases = load_test_cases(FIXTURE_REPO / "evals" / "example.yaml")
+
+    assert [scenario.id for scenario in manifest.scenarios] == ["fixture-scenario"]
+    assert manifest.scenarios[0].skills == ["skills/example-skill"]
+    assert manifest.scenarios[0].tests == "evals/example.yaml"
+    assert len(cases) == 1
+    assert cases[0].expected is not None
+    assert cases[0].expected.contains == ["fixture", "response"]
+
+
+def test_resolve_changed_skill_dirs_walks_to_nearest_skill(tmp_path: Path) -> None:
+    skill_dir = tmp_path / "skills" / "example-skill"
+    refs_dir = skill_dir / "references"
+    refs_dir.mkdir(parents=True)
+    (skill_dir / "SKILL.md").write_text("example", encoding="utf-8")
+    (refs_dir / "guide.md").write_text("details", encoding="utf-8")
+
+    changed_skills = resolve_changed_skill_dirs(
+        ["skills/example-skill/references/guide.md", "README.md"],
+        working_dir=tmp_path,
+    )
+
+    assert changed_skills == ["skills/example-skill"]
+
+
+def test_plan_ci_suite_selects_changed_scenarios_from_git_history(tmp_path: Path) -> None:
+    repo = tmp_path / "repo"
+    _copy_fixture_repo(repo)
+
+    _run_git(repo, "init")
+    _run_git(repo, "config", "user.name", "Test User")
+    _run_git(repo, "config", "user.email", "test@example.com")
+    _run_git(repo, "add", ".")
+    _run_git(repo, "commit", "-m", "initial")
+
+    skill_path = repo / "skills" / "example-skill" / "SKILL.md"
+    skill_path.write_text(skill_path.read_text(encoding="utf-8") + "\nUpdated.\n", encoding="utf-8")
+    _run_git(repo, "add", "skills/example-skill/SKILL.md")
+    _run_git(repo, "commit", "-m", "update skill")
+
+    report, selected = plan_ci_suite(
+        repo / ".upskill" / "evals.yaml",
+        scope="changed",
+        base_ref="HEAD~1",
+        working_dir=repo,
+    )
+
+    assert report.changed_files == ["skills/example-skill/SKILL.md"]
+    assert report.changed_skills == ["skills/example-skill"]
+    assert report.selected_scenarios == ["fixture-scenario"]
+    assert [scenario.id for scenario in selected] == ["fixture-scenario"]
+
+
+def test_render_ci_report_markdown_and_write_outputs(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    report = CiReport(
+        manifest_path=".upskill/evals.yaml",
+        scope="all",
+        selected_scenarios=["fixture-scenario"],
+        scenarios=[
+            ScenarioReport(
+                scenario_id="fixture-scenario",
+                skills=["skills/example-skill"],
+                tests_path="evals/example.yaml",
+                passed=True,
+                bundle=ScenarioVariantResult(
+                    variant_id="bundle",
+                    variant_type="bundle",
+                    skills=["skills/example-skill"],
+                    passed=True,
+                    assertions_passed=2,
+                    assertions_total=2,
+                    judge_score=0.8,
+                    total_tokens=42,
+                ),
+                ablations=[
+                    ScenarioVariantResult(
+                        variant_id="without-example-skill",
+                        variant_type="ablation",
+                        skills=[],
+                        omitted_skill="skills/example-skill",
+                        passed=False,
+                        assertions_passed=0,
+                        assertions_total=2,
+                        total_tokens=21,
+                    )
+                ],
+                baseline=ScenarioVariantResult(
+                    variant_id="baseline",
+                    variant_type="baseline",
+                    skills=[],
+                    passed=False,
+                    assertions_passed=0,
+                    assertions_total=2,
+                    total_tokens=18,
+                ),
+                contributions=[
+                    ScenarioContribution(
+                        skill="skills/example-skill",
+                        hard_score_delta=1.0,
+                        judge_score_delta=0.3,
+                        passed_without_skill=False,
+                    )
+                ],
+            )
+        ],
+    )
+
+    markdown = render_ci_report_markdown(report)
+    assert "# upskill CI" in markdown
+    assert "fixture-scenario" in markdown
+    assert "without-example-skill" in markdown
+    assert "skills/example-skill" in markdown
+
+    report_path = tmp_path / "report.json"
+    write_ci_report(report_path, report)
+    saved = json.loads(report_path.read_text(encoding="utf-8"))
+    assert saved["selected_scenarios"] == ["fixture-scenario"]
+
+    summary_path = tmp_path / "summary.md"
+    monkeypatch.setenv("GITHUB_STEP_SUMMARY", str(summary_path))
+    write_step_summary(report)
+    assert "fixture-scenario" in summary_path.read_text(encoding="utf-8")