From 56a5bcacbe3d9667b67dd9ad9f0d41efeb2031bd Mon Sep 17 00:00:00 2001 From: evalstate <1936278+evalstate@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:38:58 +0000 Subject: [PATCH 1/2] Add verifier-based eval validation --- src/upskill/cli.py | 10 +- src/upskill/evaluate.py | 48 ++---- src/upskill/generate.py | 6 +- src/upskill/models.py | 59 +++++++- src/upskill/verifiers.py | 243 +++++++++++++++++++++++++++++++ tests/test_execution_backends.py | 69 ++++++++- tests/test_verifiers.py | 118 +++++++++++++++ 7 files changed, 506 insertions(+), 47 deletions(-) create mode 100644 src/upskill/verifiers.py create mode 100644 tests/test_verifiers.py diff --git a/src/upskill/cli.py b/src/upskill/cli.py index d38396a..1fca0a4 100644 --- a/src/upskill/cli.py +++ b/src/upskill/cli.py @@ -547,9 +547,11 @@ async def _load_test_cases( def _count_invalid_expected_cases(test_cases: list[TestCase]) -> int: - """Count generated or loaded tests missing enough expected strings.""" + """Count tests with legacy expected-string checks missing enough values.""" invalid_expected = 0 for test_case in test_cases: + if test_case.expected is None: + continue expected_values = [value.strip() for value in test_case.expected.contains if value.strip()] if len(expected_values) < 2: invalid_expected += 1 @@ -1601,7 +1603,11 @@ async def _eval_async( # noqa: C901 invalid_expected = _count_invalid_expected_cases(test_cases) console.print(f"[dim]Loaded {len(test_cases)} test case(s) from {test_source}[/dim]") if invalid_expected: - console.print(f"[yellow]{invalid_expected} test case(s) missing expected strings[/yellow]") + console.print( + "[yellow]" + f"{invalid_expected} legacy expected-string test case(s) missing expected strings" + "[/yellow]" + ) runs_path = Path(runs_dir) if runs_dir else config.runs_dir batch_id, batch_folder = create_batch_folder(runs_path) diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py index b5412ca..99c63f6 100644 --- a/src/upskill/evaluate.py +++ b/src/upskill/evaluate.py @@ -18,13 +18,12 @@ from upskill.executors.contracts import ExecutionRequest from upskill.models import ( EvalResults, - ExpectedSpec, Skill, TestCase, TestResult, ValidationResult, ) -from upskill.validators import get_validator +from upskill.verifiers import run_verifiers logger = logging.getLogger(__name__) @@ -141,39 +140,11 @@ def load_eval_results_from_artifact_root( def check_expected( output: str, - expected: ExpectedSpec, + test_case: TestCase, workspace: Path | None = None, - test_case: TestCase | None = None, -) -> tuple[bool, ValidationResult | None]: - """Check if output matches expected conditions. - - Args: - output: The agent's output string - expected: Expected conditions dict (legacy format with "contains") - workspace: Optional workspace directory for file-based validation - test_case: Optional test case with custom validator config - - Returns: - Tuple of (success, validation_result) - """ - # Handle custom validator if specified - if test_case and test_case.validator: - validator = get_validator(test_case.validator) - if validator and workspace: - config = test_case.validator_config or {} - result = validator( - workspace=workspace, - output_file=test_case.output_file or "", - **config, - ) - return result.passed, result - - required = expected.contains - output_lower = output.lower() - if any(item.lower() not in output_lower for item in required): - return False, None - - return True, None +) -> ValidationResult: + """Run normalized deterministic verifiers for one test case.""" + return run_verifiers(test_case, output=output, workspace=workspace) def format_test_prompt(test_case: TestCase) -> str: @@ -216,7 +187,7 @@ def build_eval_execution_request( "instance_name": instance_name, "operation": operation, "skill_name": skill.name if skill else None, - "has_validator": bool(test_case.validator), + "has_validator": bool(test_case.effective_verifiers()), }, ) @@ -321,15 +292,14 @@ async def _run_test_with_evaluator( _write_test_result_summary(normalized_artifact_dir / "test_result.json", result) return result - success, validation_result = check_expected( + validation_result = check_expected( execution_result.output_text or "", - test_case.expected, - execution_result.workspace_dir, test_case, + execution_result.workspace_dir, ) result = TestResult( test_case=test_case, - success=success, + success=validation_result.passed, output=execution_result.output_text, tokens_used=execution_result.stats.total_tokens, turns=execution_result.stats.turns, diff --git a/src/upskill/generate.py b/src/upskill/generate.py index 8abc943..83b6dad 100644 --- a/src/upskill/generate.py +++ b/src/upskill/generate.py @@ -149,6 +149,8 @@ async def generate_tests( cases = result.cases invalid_expected = 0 for tc in cases: + if tc.expected is None: + continue expected_values = [value.strip() for value in tc.expected.contains if value.strip()] if len(expected_values) < 2: invalid_expected += 1 @@ -160,8 +162,8 @@ async def generate_tests( ) if invalid_expected: print( - "Warning: some test cases are missing at least two expected strings; " - "review generated tests." + "Warning: some legacy expected-string test cases are missing at least two expected " + "strings; review generated tests." ) return cases diff --git a/src/upskill/models.py b/src/upskill/models.py index c4de86f..695fa9f 100644 --- a/src/upskill/models.py +++ b/src/upskill/models.py @@ -7,7 +7,7 @@ from datetime import datetime from typing import TYPE_CHECKING -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator if TYPE_CHECKING: from pathlib import Path @@ -34,6 +34,33 @@ class ValidationResult(BaseModel): metrics_count: int = 0 benchmarks_found: list[str] = Field(default_factory=list) error_message: str | None = None + details: list[str] = Field(default_factory=list) + + +type VerifierConfigValue = str | int | float | bool + + +class VerifierSpec(BaseModel): + """Deterministic verifier configuration for a test case.""" + + model_config = ConfigDict(extra="forbid") + + type: str + name: str | None = None + values: list[str] = Field(default_factory=list) + path: str | None = None + text: str | None = None + cmd: str | None = None + config: dict[str, VerifierConfigValue] | None = None + + @field_validator("values", mode="before") + @classmethod + def coerce_values(cls, value: str | list[str] | None) -> list[str]: + if value is None: + return [] + if isinstance(value, str): + return [value] + return value class ExpectedSpec(BaseModel): @@ -67,12 +94,38 @@ class TestCase(BaseModel): input: str # Task/prompt to give the agent context: TestCaseContext | None = None # Files, env vars, etc. - expected: ExpectedSpec # Expected output checks + expected: ExpectedSpec | None = None # Legacy expected output checks + verifiers: list[VerifierSpec] = Field(default_factory=list) # Custom validator support output_file: str | None = None # File to validate instead of agent output validator: str | None = None # Validator name (e.g., "hf_eval_yaml") - validator_config: dict[str, str | int | float | bool] | None = None + validator_config: dict[str, VerifierConfigValue] | None = None + + @model_validator(mode="after") + def validate_expectations(self) -> TestCase: + if self.expected is None and not self.verifiers and self.validator is None: + raise ValueError("TestCase requires at least one of expected, verifiers, or validator.") + return self + + def effective_verifiers(self) -> list[VerifierSpec]: + """Return normalized verifier specs including legacy expectation fields.""" + effective = list(self.verifiers) + if self.expected is not None and self.expected.contains: + effective.insert( + 0, + VerifierSpec(type="contains", values=self.expected.contains), + ) + if self.validator is not None: + effective.append( + VerifierSpec( + type="validator", + name=self.validator, + path=self.output_file, + config=self.validator_config, + ) + ) + return effective class TestCaseSuite(BaseModel): diff --git a/src/upskill/verifiers.py b/src/upskill/verifiers.py new file mode 100644 index 0000000..0f56408 --- /dev/null +++ b/src/upskill/verifiers.py @@ -0,0 +1,243 @@ +"""Deterministic verifier execution for upskill test cases.""" + +from __future__ import annotations + +import subprocess +from typing import TYPE_CHECKING + +from upskill.models import TestCase, ValidationResult, VerifierSpec +from upskill.validators import get_validator + +if TYPE_CHECKING: + from pathlib import Path + +DEFAULT_COMMAND_TIMEOUT_SECONDS = 60 +MAX_COMMAND_OUTPUT_CHARS = 1200 + + +def _build_validation_result( + passed: bool, + *, + error_message: str | None = None, + details: list[str] | None = None, +) -> ValidationResult: + return ValidationResult( + passed=passed, + assertions_passed=1 if passed else 0, + assertions_total=1, + error_message=error_message, + details=details or [], + ) + + +def _format_command_failure(output: str) -> str: + compact = output.strip() + if len(compact) > MAX_COMMAND_OUTPUT_CHARS: + compact = compact[:MAX_COMMAND_OUTPUT_CHARS].rstrip() + "..." + return compact or "command exited with a non-zero status" + + +def _resolve_values(spec: VerifierSpec) -> list[str]: + if spec.values: + return spec.values + if spec.text: + return [spec.text] + return [] + + +def _run_contains_verifier(spec: VerifierSpec, output: str) -> ValidationResult: + required = [value for value in _resolve_values(spec) if value.strip()] + if not required: + return _build_validation_result(False, error_message="contains verifier is missing values") + + output_lower = output.lower() + missing = [item for item in required if item.lower() not in output_lower] + if missing: + return _build_validation_result( + False, + error_message=f"missing required output text: {missing[0]}", + details=[f"missing: {item}" for item in missing], + ) + return _build_validation_result(True) + + +def _require_workspace(spec: VerifierSpec, workspace: Path | None) -> ValidationResult | None: + if workspace is not None: + return None + return _build_validation_result( + False, + error_message=f"{spec.type} verifier requires a workspace", + ) + + +def _run_file_exists_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + assert workspace is not None + if not spec.path: + return _build_validation_result(False, error_message="file_exists verifier is missing path") + + target = workspace / spec.path + if target.exists(): + return _build_validation_result(True) + return _build_validation_result( + False, + error_message=f"expected file does not exist: {spec.path}", + ) + + +def _run_file_contains_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + assert workspace is not None + if not spec.path: + return _build_validation_result( + False, + error_message="file_contains verifier is missing path", + ) + + target = workspace / spec.path + if not target.exists(): + return _build_validation_result( + False, + error_message=f"expected file does not exist: {spec.path}", + ) + + required = [value for value in _resolve_values(spec) if value.strip()] + if not required: + return _build_validation_result( + False, + error_message="file_contains verifier is missing text or values", + ) + + content = target.read_text(encoding="utf-8") + content_lower = content.lower() + missing = [item for item in required if item.lower() not in content_lower] + if missing: + return _build_validation_result( + False, + error_message=f"missing required file text: {missing[0]}", + details=[f"missing: {item}" for item in missing], + ) + return _build_validation_result(True) + + +def _run_command_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + assert workspace is not None + if not spec.cmd: + return _build_validation_result(False, error_message="command verifier is missing cmd") + + timeout_seconds = DEFAULT_COMMAND_TIMEOUT_SECONDS + if spec.config and "timeout_seconds" in spec.config: + timeout_seconds = int(spec.config["timeout_seconds"]) + + completed = subprocess.run( + spec.cmd, + shell=True, + cwd=workspace, + text=True, + capture_output=True, + timeout=timeout_seconds, + check=False, + ) + if completed.returncode == 0: + return _build_validation_result(True) + + combined_output = "\n".join(part for part in (completed.stdout, completed.stderr) if part) + return _build_validation_result( + False, + error_message=_format_command_failure(combined_output), + ) + + +def _run_legacy_validator_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + assert workspace is not None + if not spec.name: + return _build_validation_result(False, error_message="validator verifier is missing name") + + validator = get_validator(spec.name) + if validator is None: + return _build_validation_result( + False, + error_message=f"unknown validator: {spec.name}", + ) + + config = spec.config or {} + return validator( + workspace=workspace, + output_file=spec.path or "", + **config, + ) + + +def run_verifier( + spec: VerifierSpec, + *, + output: str, + workspace: Path | None, +) -> ValidationResult: + """Run one verifier against the current output/workspace.""" + + if spec.type == "contains": + return _run_contains_verifier(spec, output) + if spec.type == "file_exists": + return _run_file_exists_verifier(spec, workspace) + if spec.type == "file_contains": + return _run_file_contains_verifier(spec, workspace) + if spec.type == "command": + return _run_command_verifier(spec, workspace) + if spec.type == "validator": + return _run_legacy_validator_verifier(spec, workspace) + + return _build_validation_result( + False, + error_message=f"unsupported verifier type: {spec.type}", + ) + + +def run_verifiers( + test_case: TestCase, + *, + output: str, + workspace: Path | None, +) -> ValidationResult: + """Run all verifiers configured for a test case.""" + + specs = test_case.effective_verifiers() + if not specs: + return ValidationResult( + passed=False, + assertions_passed=0, + assertions_total=0, + error_message="no verifiers configured", + ) + + passed = 0 + total = 0 + details: list[str] = [] + error_messages: list[str] = [] + + for spec in specs: + result = run_verifier(spec, output=output, workspace=workspace) + passed += result.assertions_passed + total += result.assertions_total + if result.error_message: + error_messages.append(result.error_message) + if result.details: + details.extend(result.details) + + return ValidationResult( + passed=passed == total, + assertions_passed=passed, + assertions_total=total, + error_message="; ".join(error_messages) if error_messages else None, + details=details, + ) diff --git a/tests/test_execution_backends.py b/tests/test_execution_backends.py index e14f960..bdc7cd7 100644 --- a/tests/test_execution_backends.py +++ b/tests/test_execution_backends.py @@ -19,7 +19,14 @@ from upskill.executors.remote_fast_agent import RemoteFastAgentExecutor from upskill.fast_agent_cli import build_fast_agent_command from upskill.hf_jobs import JobsConfig, SubmittedJob -from upskill.models import ConversationStats, ExpectedSpec, Skill, TestCase, TestResult +from upskill.models import ( + ConversationStats, + ExpectedSpec, + Skill, + TestCase, + TestResult, + VerifierSpec, +) from upskill.result_parsing import parse_fast_agent_results @@ -513,6 +520,66 @@ async def cancel(self, handle: ExecutionHandle) -> None: assert "finished with-skill test 1/1 (ok)" in messages +@pytest.mark.asyncio +async def test_evaluate_skill_supports_verifier_only_test_cases(tmp_path: Path) -> None: + skill = Skill( + name="write-good-prs", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + test_case = TestCase( + input="write a report", + verifiers=[ + VerifierSpec(type="file_exists", path="report.txt"), + VerifierSpec(type="file_contains", path="report.txt", text="answer"), + ], + ) + + class FakeExecutor: + async def execute(self, request: ExecutionRequest) -> ExecutionHandle: + request.artifact_dir.mkdir(parents=True, exist_ok=True) + workspace_dir = request.artifact_dir / "workspace" + workspace_dir.mkdir(parents=True, exist_ok=True) + (workspace_dir / "report.txt").write_text("answer in report", encoding="utf-8") + task = asyncio.create_task( + asyncio.sleep( + 0, + result=ExecutionResult( + output_text="done", + raw_results_path=None, + stdout_path=request.artifact_dir / "stdout.txt", + stderr_path=request.artifact_dir / "stderr.txt", + artifact_dir=request.artifact_dir, + workspace_dir=workspace_dir, + stats=ConversationStats(), + ), + ) + ) + return ExecutionHandle(request=request, task=task) + + async def collect(self, handle: ExecutionHandle) -> ExecutionResult: + return await handle.task + + async def cancel(self, handle: ExecutionHandle) -> None: + handle.task.cancel() + + results = await evaluate_skill( + skill, + [test_case], + FakeExecutor(), + model="haiku", + fastagent_config_path=tmp_path / "fastagent.config.yaml", + cards_source_dir=tmp_path, + artifact_root=tmp_path / "eval", + ) + + test_result = results.with_skill_results[0] + assert test_result.success is True + assert test_result.validation_result is not None + assert test_result.validation_result.assertions_passed == 2 + assert test_result.validation_result.assertions_total == 2 + + @pytest.mark.asyncio async def test_evaluate_skill_includes_job_id_in_execution_errors(tmp_path: Path) -> None: skill = Skill( diff --git a/tests/test_verifiers.py b/tests/test_verifiers.py new file mode 100644 index 0000000..3704833 --- /dev/null +++ b/tests/test_verifiers.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING + +import pytest + +from upskill.models import ExpectedSpec, ValidationResult, VerifierSpec +from upskill.models import TestCase as UpskillTestCase +from upskill.validators import register_validator +from upskill.verifiers import run_verifiers + +if TYPE_CHECKING: + from pathlib import Path + + +@register_validator("test-counting-validator") +def _test_counting_validator( + workspace: Path, + output_file: str, + **_: object, +) -> ValidationResult: + target = workspace / output_file + passed = target.exists() + return ValidationResult( + passed=passed, + assertions_passed=2 if passed else 0, + assertions_total=2, + error_message=None if passed else f"missing file: {output_file}", + ) + + +def test_run_verifiers_supports_legacy_expected_contains() -> None: + test_case = UpskillTestCase( + input="say hello", + expected=ExpectedSpec(contains=["hello", "world"]), + ) + + result = run_verifiers(test_case, output="Hello, world!", workspace=None) + + assert result.passed is True + assert result.assertions_passed == 1 + assert result.assertions_total == 1 + + +def test_run_verifiers_supports_file_verifiers(tmp_path: Path) -> None: + target = tmp_path / "report.txt" + target.write_text("bundle ok", encoding="utf-8") + test_case = UpskillTestCase( + input="write file", + verifiers=[ + VerifierSpec(type="file_exists", path="report.txt"), + VerifierSpec(type="file_contains", path="report.txt", text="bundle ok"), + ], + ) + + result = run_verifiers(test_case, output="", workspace=tmp_path) + + assert result.passed is True + assert result.assertions_passed == 2 + assert result.assertions_total == 2 + + +def test_run_verifiers_supports_command_verifier(tmp_path: Path) -> None: + script = tmp_path / "check.py" + script.write_text("print('ok')\n", encoding="utf-8") + test_case = UpskillTestCase( + input="run assertion script", + verifiers=[VerifierSpec(type="command", cmd=f"'{sys.executable}' check.py")], + ) + + result = run_verifiers(test_case, output="", workspace=tmp_path) + + assert result.passed is True + assert result.assertions_passed == 1 + + +def test_run_verifiers_translates_legacy_validator(tmp_path: Path) -> None: + target = tmp_path / "artifact.txt" + target.write_text("ok", encoding="utf-8") + test_case = UpskillTestCase( + input="validate artifact", + validator="test-counting-validator", + output_file="artifact.txt", + ) + + result = run_verifiers(test_case, output="", workspace=tmp_path) + + assert result.passed is True + assert result.assertions_passed == 2 + assert result.assertions_total == 2 + + +def test_run_verifiers_reports_failures(tmp_path: Path) -> None: + test_case = UpskillTestCase( + input="write report", + verifiers=[ + VerifierSpec(type="file_exists", path="report.txt"), + VerifierSpec( + type="command", + cmd=f"'{sys.executable}' -c 'import sys; sys.exit(1)'", + ), + ], + ) + + result = run_verifiers(test_case, output="", workspace=tmp_path) + + assert result.passed is False + assert result.assertions_passed == 0 + assert result.assertions_total == 2 + assert result.error_message is not None + + +def test_test_case_rejects_missing_expectation_configuration() -> None: + with pytest.raises( + ValueError, match="requires at least one of expected, verifiers, or validator" + ): + UpskillTestCase(input="missing all checks") From b6ec94002c334bc1814d60744ca54958ab1a0e06 Mon Sep 17 00:00:00 2001 From: evalstate <1936278+evalstate@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:51:09 +0000 Subject: [PATCH 2/2] Add CI manifest planning helpers --- src/upskill/ci.py | 214 ++++++++++++++++++ src/upskill/models.py | 84 ++++++- .../ci_action_repo/.upskill/evals.yaml | 5 + .../ci_action_repo/evals/example.yaml | 6 + .../skills/example-skill/SKILL.md | 6 + tests/test_ci.py | 168 ++++++++++++++ 6 files changed, 482 insertions(+), 1 deletion(-) create mode 100644 src/upskill/ci.py create mode 100644 tests/fixtures/ci_action_repo/.upskill/evals.yaml create mode 100644 tests/fixtures/ci_action_repo/evals/example.yaml create mode 100644 tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md create mode 100644 tests/test_ci.py diff --git a/src/upskill/ci.py b/src/upskill/ci.py new file mode 100644 index 0000000..275096a --- /dev/null +++ b/src/upskill/ci.py @@ -0,0 +1,214 @@ +"""Scenario planning and report helpers for ``upskill ci``.""" + +from __future__ import annotations + +import json +import os +import subprocess +from pathlib import Path + +import yaml + +from upskill.models import CiReport, EvalManifest, EvalScenario, TestCase + + +def _normalize_relative_path(path: Path, root: Path) -> str: + """Return a stable report path relative to ``root`` when possible.""" + try: + return path.resolve().relative_to(root.resolve()).as_posix() + except ValueError: + return path.resolve().as_posix() + + +def load_eval_manifest(path: Path) -> EvalManifest: + """Load a YAML or JSON CI manifest.""" + with open(path, encoding="utf-8") as handle: + if path.suffix.lower() == ".json": + payload = json.load(handle) + else: + payload = yaml.safe_load(handle) or {} + return EvalManifest.model_validate(payload) + + +def load_test_cases(path: Path) -> list[TestCase]: + """Load test cases from YAML or JSON.""" + with open(path, encoding="utf-8") as handle: + if path.suffix.lower() == ".json": + payload = json.load(handle) + else: + payload = yaml.safe_load(handle) or {} + + cases = payload["cases"] if isinstance(payload, dict) and "cases" in payload else payload + return [TestCase.model_validate(item) for item in cases] + + +def plan_ci_suite( + manifest_path: Path, + *, + scope: str = "changed", + base_ref: str = "origin/main", + working_dir: Path | None = None, +) -> tuple[CiReport, list[EvalScenario]]: + """Resolve scenario selection without executing the suite.""" + root = (working_dir or Path.cwd()).resolve() + manifest = load_eval_manifest(manifest_path) + + changed_files: list[str] = [] + changed_skills: list[str] = [] + if scope == "changed": + changed_files = resolve_changed_files(base_ref=base_ref, working_dir=root) + changed_skills = resolve_changed_skill_dirs(changed_files, working_dir=root) + + selected_scenarios = select_scenarios( + manifest, + scope=scope, + changed_skills=changed_skills, + ) + + report = CiReport( + manifest_path=_normalize_relative_path(manifest_path, root), + scope=scope, + base_ref=base_ref if scope == "changed" else None, + changed_files=changed_files, + changed_skills=changed_skills, + selected_scenarios=[scenario.id for scenario in selected_scenarios], + success=True, + ) + return report, selected_scenarios + + +def resolve_changed_files(*, base_ref: str, working_dir: Path) -> list[str]: + """Return changed files for the current checkout.""" + completed = subprocess.run( + ["git", "diff", "--name-only", f"{base_ref}...HEAD"], + cwd=working_dir, + text=True, + capture_output=True, + check=False, + ) + if completed.returncode != 0: + error = completed.stderr.strip() or completed.stdout.strip() or "git diff failed" + raise RuntimeError(error) + return [line.strip() for line in completed.stdout.splitlines() if line.strip()] + + +def resolve_changed_skill_dirs(changed_files: list[str], *, working_dir: Path) -> list[str]: + """Find skill directories impacted by changed files.""" + changed_skills: set[str] = set() + root = working_dir.resolve() + + for changed_file in changed_files: + path = (working_dir / changed_file).resolve() + current = path if path.is_dir() else path.parent + while current != root and current != current.parent: + if (current / "SKILL.md").exists(): + changed_skills.add(current.relative_to(root).as_posix()) + break + current = current.parent + + return sorted(changed_skills) + + +def select_scenarios( + manifest: EvalManifest, + *, + scope: str, + changed_skills: list[str], +) -> list[EvalScenario]: + """Filter manifest scenarios for the requested CI scope.""" + if scope == "all": + return list(manifest.scenarios) + + changed = set(changed_skills) + selected = [] + for scenario in manifest.scenarios: + scenario_skills = {Path(skill).as_posix() for skill in scenario.skills} + if scenario_skills & changed: + selected.append(scenario) + return selected + + +def write_ci_report(path: Path, report: CiReport) -> None: + """Write the machine-readable CI report.""" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + json.dumps(report.model_dump(mode="json"), indent=2), + encoding="utf-8", + ) + + +def render_ci_report_markdown(report: CiReport) -> str: + """Render a GitHub-friendly markdown summary.""" + lines = [ + "# upskill CI", + "", + f"- Scope: `{report.scope}`", + f"- Manifest: `{report.manifest_path}`", + ] + if report.base_ref: + lines.append(f"- Base ref: `{report.base_ref}`") + if report.changed_skills: + changed = ", ".join(f"`{item}`" for item in report.changed_skills) + lines.append(f"- Changed skills: {changed}") + if not report.scenarios: + lines.extend(["", "No scenarios were selected."]) + return "\n".join(lines) + + for scenario in report.scenarios: + lines.extend( + [ + "", + f"## {scenario.scenario_id}", + "", + "| Variant | Skills | Pass | Assertions | Judge | Tokens |", + "| --- | --- | --- | --- | --- | --- |", + ] + ) + variants = [scenario.bundle, *scenario.ablations] + if scenario.baseline is not None: + variants.append(scenario.baseline) + for variant in variants: + judge_value = f"{variant.judge_score:.2f}" if variant.judge_score is not None else "n/a" + lines.append( + "| " + f"{variant.variant_id} | " + f"{', '.join(variant.skills) or '(none)'} | " + f"{'PASS' if variant.passed else 'FAIL'} | " + f"{variant.assertions_passed}/{variant.assertions_total} | " + f"{judge_value} | " + f"{variant.total_tokens} |" + ) + if scenario.contributions: + lines.extend( + [ + "", + "| Skill | Hard Delta | Judge Delta | Passed Without Skill |", + "| --- | --- | --- | --- |", + ] + ) + for contribution in scenario.contributions: + judge_delta = ( + f"{contribution.judge_score_delta:+.2f}" + if contribution.judge_score_delta is not None + else "n/a" + ) + lines.append( + "| " + f"{contribution.skill} | " + f"{contribution.hard_score_delta:+.2f} | " + f"{judge_delta} | " + f"{'yes' if contribution.passed_without_skill else 'no'} |" + ) + + return "\n".join(lines) + + +def write_step_summary(report: CiReport) -> None: + """Append the markdown summary to GitHub's step summary file when available.""" + summary_path = os.getenv("GITHUB_STEP_SUMMARY") + if not summary_path: + return + + with open(summary_path, "a", encoding="utf-8") as handle: + handle.write(render_ci_report_markdown(report)) + handle.write("\n") diff --git a/src/upskill/models.py b/src/upskill/models.py index 695fa9f..990eb32 100644 --- a/src/upskill/models.py +++ b/src/upskill/models.py @@ -5,7 +5,7 @@ import json import re from datetime import datetime -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator @@ -470,6 +470,88 @@ def is_beneficial(self) -> bool: return self.skill_lift > 0.05 or (self.skill_lift >= 0 and self.token_savings > 0.2) +class ScenarioJudgeConfig(BaseModel): + """Judge configuration for a scenario.""" + + model_config = ConfigDict(extra="forbid") + + enabled: bool = False + criteria: list[str] | None = None + + +class EvalScenario(BaseModel): + """Scenario definition for CI evaluation.""" + + model_config = ConfigDict(extra="forbid") + + id: str = Field(..., min_length=1) + skills: list[str] = Field(default_factory=list) + tests: str + judge: ScenarioJudgeConfig | None = None + include_baseline: bool = False + + +class EvalManifest(BaseModel): + """Top-level CI manifest.""" + + model_config = ConfigDict(extra="forbid") + + scenarios: list[EvalScenario] = Field(default_factory=list) + + +class ScenarioVariantResult(BaseModel): + """Aggregate result for one scenario variant.""" + + variant_id: str + variant_type: Literal["bundle", "ablation", "baseline"] + skills: list[str] = Field(default_factory=list) + omitted_skill: str | None = None + passed: bool + assertions_passed: int = 0 + assertions_total: int = 0 + hard_score: float = 0.0 + judge_score: float | None = None + judge_summary: str | None = None + total_tokens: int = 0 + average_turns: float = 0.0 + run_folder: str | None = None + + +class ScenarioContribution(BaseModel): + """Contribution delta for leaving one skill out of a bundle.""" + + skill: str + hard_score_delta: float = 0.0 + judge_score_delta: float | None = None + passed_without_skill: bool = False + + +class ScenarioReport(BaseModel): + """Report for one selected scenario.""" + + scenario_id: str + skills: list[str] = Field(default_factory=list) + tests_path: str + passed: bool + bundle: ScenarioVariantResult + ablations: list[ScenarioVariantResult] = Field(default_factory=list) + baseline: ScenarioVariantResult | None = None + contributions: list[ScenarioContribution] = Field(default_factory=list) + + +class CiReport(BaseModel): + """Machine-readable report for a CI evaluation run.""" + + manifest_path: str + scope: str + base_ref: str | None = None + changed_files: list[str] = Field(default_factory=list) + changed_skills: list[str] = Field(default_factory=list) + selected_scenarios: list[str] = Field(default_factory=list) + success: bool = True + scenarios: list[ScenarioReport] = Field(default_factory=list) + + # Run logging models (similar to skills-test) diff --git a/tests/fixtures/ci_action_repo/.upskill/evals.yaml b/tests/fixtures/ci_action_repo/.upskill/evals.yaml new file mode 100644 index 0000000..52da53d --- /dev/null +++ b/tests/fixtures/ci_action_repo/.upskill/evals.yaml @@ -0,0 +1,5 @@ +scenarios: + - id: fixture-scenario + skills: + - skills/example-skill + tests: evals/example.yaml diff --git a/tests/fixtures/ci_action_repo/evals/example.yaml b/tests/fixtures/ci_action_repo/evals/example.yaml new file mode 100644 index 0000000..031a2eb --- /dev/null +++ b/tests/fixtures/ci_action_repo/evals/example.yaml @@ -0,0 +1,6 @@ +cases: + - input: "Say fixture" + expected: + contains: + - fixture + - response diff --git a/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md new file mode 100644 index 0000000..70b1902 --- /dev/null +++ b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md @@ -0,0 +1,6 @@ +--- +name: example-skill +description: Example fixture skill for action smoke tests +--- + +Respond with the fixture response. diff --git a/tests/test_ci.py b/tests/test_ci.py new file mode 100644 index 0000000..8ff3999 --- /dev/null +++ b/tests/test_ci.py @@ -0,0 +1,168 @@ +from __future__ import annotations + +import json +import shutil +import subprocess +from pathlib import Path + +from upskill.ci import ( + load_eval_manifest, + load_test_cases, + plan_ci_suite, + render_ci_report_markdown, + resolve_changed_skill_dirs, + write_ci_report, + write_step_summary, +) +from upskill.models import ( + CiReport, + ScenarioContribution, + ScenarioReport, + ScenarioVariantResult, +) + +FIXTURE_REPO = Path(__file__).parent / "fixtures" / "ci_action_repo" + + +def _run_git(repo: Path, *args: str) -> None: + completed = subprocess.run( + ["git", *args], + cwd=repo, + text=True, + capture_output=True, + check=False, + ) + assert completed.returncode == 0, completed.stderr + + +def _copy_fixture_repo(destination: Path) -> None: + shutil.copytree(FIXTURE_REPO, destination, dirs_exist_ok=True) + + +def test_load_eval_manifest_and_test_cases_from_fixture() -> None: + manifest = load_eval_manifest(FIXTURE_REPO / ".upskill" / "evals.yaml") + cases = load_test_cases(FIXTURE_REPO / "evals" / "example.yaml") + + assert [scenario.id for scenario in manifest.scenarios] == ["fixture-scenario"] + assert manifest.scenarios[0].skills == ["skills/example-skill"] + assert manifest.scenarios[0].tests == "evals/example.yaml" + assert len(cases) == 1 + assert cases[0].expected is not None + assert cases[0].expected.contains == ["fixture", "response"] + + +def test_resolve_changed_skill_dirs_walks_to_nearest_skill(tmp_path: Path) -> None: + skill_dir = tmp_path / "skills" / "example-skill" + refs_dir = skill_dir / "references" + refs_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("example", encoding="utf-8") + (refs_dir / "guide.md").write_text("details", encoding="utf-8") + + changed_skills = resolve_changed_skill_dirs( + ["skills/example-skill/references/guide.md", "README.md"], + working_dir=tmp_path, + ) + + assert changed_skills == ["skills/example-skill"] + + +def test_plan_ci_suite_selects_changed_scenarios_from_git_history(tmp_path: Path) -> None: + repo = tmp_path / "repo" + _copy_fixture_repo(repo) + + _run_git(repo, "init") + _run_git(repo, "config", "user.name", "Test User") + _run_git(repo, "config", "user.email", "test@example.com") + _run_git(repo, "add", ".") + _run_git(repo, "commit", "-m", "initial") + + skill_path = repo / "skills" / "example-skill" / "SKILL.md" + skill_path.write_text(skill_path.read_text(encoding="utf-8") + "\nUpdated.\n", encoding="utf-8") + _run_git(repo, "add", "skills/example-skill/SKILL.md") + _run_git(repo, "commit", "-m", "update skill") + + report, selected = plan_ci_suite( + repo / ".upskill" / "evals.yaml", + scope="changed", + base_ref="HEAD~1", + working_dir=repo, + ) + + assert report.changed_files == ["skills/example-skill/SKILL.md"] + assert report.changed_skills == ["skills/example-skill"] + assert report.selected_scenarios == ["fixture-scenario"] + assert [scenario.id for scenario in selected] == ["fixture-scenario"] + + +def test_render_ci_report_markdown_and_write_outputs( + tmp_path: Path, + monkeypatch, +) -> None: + report = CiReport( + manifest_path=".upskill/evals.yaml", + scope="all", + selected_scenarios=["fixture-scenario"], + scenarios=[ + ScenarioReport( + scenario_id="fixture-scenario", + skills=["skills/example-skill"], + tests_path="evals/example.yaml", + passed=True, + bundle=ScenarioVariantResult( + variant_id="bundle", + variant_type="bundle", + skills=["skills/example-skill"], + passed=True, + assertions_passed=2, + assertions_total=2, + judge_score=0.8, + total_tokens=42, + ), + ablations=[ + ScenarioVariantResult( + variant_id="without-example-skill", + variant_type="ablation", + skills=[], + omitted_skill="skills/example-skill", + passed=False, + assertions_passed=0, + assertions_total=2, + total_tokens=21, + ) + ], + baseline=ScenarioVariantResult( + variant_id="baseline", + variant_type="baseline", + skills=[], + passed=False, + assertions_passed=0, + assertions_total=2, + total_tokens=18, + ), + contributions=[ + ScenarioContribution( + skill="skills/example-skill", + hard_score_delta=1.0, + judge_score_delta=0.3, + passed_without_skill=False, + ) + ], + ) + ], + ) + + markdown = render_ci_report_markdown(report) + assert "# upskill CI" in markdown + assert "fixture-scenario" in markdown + assert "without-example-skill" in markdown + assert "skills/example-skill" in markdown + + report_path = tmp_path / "report.json" + write_ci_report(report_path, report) + saved = json.loads(report_path.read_text(encoding="utf-8")) + assert saved["selected_scenarios"] == ["fixture-scenario"] + + summary_path = tmp_path / "summary.md" + monkeypatch.setenv("GITHUB_STEP_SUMMARY", str(summary_path)) + write_step_summary(report) + assert "fixture-scenario" in summary_path.read_text(encoding="utf-8")