diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81945b4..c8bcadd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,3 +40,20 @@ jobs: - name: Run tests run: uv run pytest -v + + action-smoke: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Run local upskill action against fixture repo + uses: ./ + with: + working-directory: tests/fixtures/ci_action_repo + scope: changed + base-ref: HEAD + summary-json: upskill-report.json + fail-on-no-scenarios: "false" + + - name: Verify smoke report exists + run: test -f tests/fixtures/ci_action_repo/upskill-report.json diff --git a/README.md b/README.md index 24e9088..3529ed6 100644 --- a/README.md +++ b/README.md @@ -216,6 +216,71 @@ sonnet ] ``` +### `upskill ci` + +Run scenario-based CI evaluation for changed or declared skill bundles. + +```bash +upskill ci [OPTIONS] +``` + +**Options:** +- `--manifest PATH` - Scenario manifest (default: `./.upskill/evals.yaml`) +- `--scope [changed|all]` - Run only impacted scenarios or the full suite +- `--base-ref REF` - Base ref for changed-skill selection (default: `origin/main`) +- `--eval-model MODEL` - Evaluator model override +- `--judge-model MODEL` - Judge model override +- `--summary-json PATH` - Output path for the machine-readable report +- `--runs-dir PATH` - Directory for run artifacts +- `--fail-on-no-scenarios / --no-fail-on-no-scenarios` - Control empty-selection behavior + +**Scenario manifest example:** + +```yaml +scenarios: + - id: hf-model-card-readme + skills: + - skills/hugging-face-evaluation-manager + - skills/hf-cli + tests: evals/hf-model-card-readme.yaml + judge: + enabled: true +``` + +**Test suite example:** + +```yaml +cases: + - input: "Read README and write olmo_7b_evaluations.yaml" + output_file: olmo_7b_evaluations.yaml + verifiers: + - type: file_exists + path: olmo_7b_evaluations.yaml + - type: command + cmd: python test_eval_assertions.py +``` + +The CI command runs the full 
# Composite action that installs upskill from the action source and runs
# `upskill ci` against the calling repository.
#
# Security note: action inputs are never expanded directly inside `run:`
# scripts. `${{ ... }}` is substituted into the script text before bash parses
# it, so an input containing quotes or `$(...)` would be executed as code.
# Every input is instead passed through `env:` and read as a quoted shell
# variable.
name: upskill-ci
description: Run upskill scenario-based CI evaluation for changed or declared skill bundles.

inputs:
  manifest-path:
    description: Path to the upskill scenario manifest, relative to the working directory.
    required: false
    default: .upskill/evals.yaml
  scope:
    description: Run only changed scenarios or the entire manifest.
    required: false
    default: changed
  base-ref:
    description: Git base ref used when scope is changed.
    required: false
    default: origin/main
  eval-model:
    description: Model used for evaluator execution.
    required: false
  judge-model:
    description: Model used for advisory judge scoring.
    required: false
  working-directory:
    description: Repository directory where the manifest and skills live.
    required: false
    default: .
  runs-dir:
    description: Directory, relative to the working directory, for run artifacts.
    required: false
    default: runs
  summary-json:
    description: JSON report path, relative to the working directory.
    required: false
    default: upskill-report.json
  fail-on-no-scenarios:
    description: Exit non-zero when no scenarios are selected.
    required: false
    default: "false"
  upload-artifacts:
    description: Upload the JSON report and run artifacts.
    required: false
    default: "true"

outputs:
  summary-json:
    description: Path to the generated JSON report.
    value: ${{ steps.paths.outputs.summary_json }}
  runs-dir:
    description: Path to the generated run artifacts directory.
    value: ${{ steps.paths.outputs.runs_dir }}

runs:
  using: composite
  steps:
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.13"

    - name: Install uv
      uses: astral-sh/setup-uv@v5

    # Report and run-artifact paths are resolved relative to the workspace so
    # the upload steps (which do not run inside working-directory) can find them.
    - name: Resolve output paths
      id: paths
      shell: bash
      env:
        UPSKILL_WORKING_DIRECTORY: ${{ inputs.working-directory }}
        UPSKILL_SUMMARY_JSON: ${{ inputs.summary-json }}
        UPSKILL_RUNS_DIR: ${{ inputs.runs-dir }}
      run: |
        set -euo pipefail
        summary_json="${UPSKILL_WORKING_DIRECTORY}/${UPSKILL_SUMMARY_JSON}"
        runs_dir="${UPSKILL_WORKING_DIRECTORY}/${UPSKILL_RUNS_DIR}"
        echo "summary_json=${summary_json}" >> "$GITHUB_OUTPUT"
        echo "runs_dir=${runs_dir}" >> "$GITHUB_OUTPUT"

    - name: Install upskill from action source
      shell: bash
      env:
        UPSKILL_ACTION_PATH: ${{ github.action_path }}
      run: |
        set -euo pipefail
        uv pip install --system "${UPSKILL_ACTION_PATH}"

    - name: Run upskill ci
      shell: bash
      working-directory: ${{ inputs.working-directory }}
      env:
        UPSKILL_MANIFEST_PATH: ${{ inputs.manifest-path }}
        UPSKILL_SCOPE: ${{ inputs.scope }}
        UPSKILL_BASE_REF: ${{ inputs.base-ref }}
        UPSKILL_RUNS_DIR: ${{ inputs.runs-dir }}
        UPSKILL_SUMMARY_JSON: ${{ inputs.summary-json }}
        UPSKILL_EVAL_MODEL: ${{ inputs.eval-model }}
        UPSKILL_JUDGE_MODEL: ${{ inputs.judge-model }}
        UPSKILL_FAIL_ON_NO_SCENARIOS: ${{ inputs.fail-on-no-scenarios }}
      run: |
        set -euo pipefail
        args=(
          --manifest "${UPSKILL_MANIFEST_PATH}"
          --scope "${UPSKILL_SCOPE}"
          --base-ref "${UPSKILL_BASE_REF}"
          --runs-dir "${UPSKILL_RUNS_DIR}"
          --summary-json "${UPSKILL_SUMMARY_JSON}"
        )
        # Optional model overrides are only forwarded when the caller set them.
        if [[ -n "${UPSKILL_EVAL_MODEL}" ]]; then
          args+=(--eval-model "${UPSKILL_EVAL_MODEL}")
        fi
        if [[ -n "${UPSKILL_JUDGE_MODEL}" ]]; then
          args+=(--judge-model "${UPSKILL_JUDGE_MODEL}")
        fi
        if [[ "${UPSKILL_FAIL_ON_NO_SCENARIOS}" == "true" ]]; then
          args+=(--fail-on-no-scenarios)
        else
          args+=(--no-fail-on-no-scenarios)
        fi
        upskill ci "${args[@]}"

    # always() keeps the uploads running after a failed evaluation so the
    # report and run artifacts are still available for debugging.
    - name: Upload CI report
      if: ${{ always() && inputs.upload-artifacts == 'true' }}
      uses: actions/upload-artifact@v4
      with:
        name: upskill-report
        path: ${{ steps.paths.outputs.summary_json }}
        if-no-files-found: error

    - name: Upload run artifacts
      if: ${{ always() && inputs.upload-artifacts == 'true' }}
      uses: actions/upload-artifact@v4
      with:
        name: upskill-runs
        path: ${{ steps.paths.outputs.runs_dir }}
        if-no-files-found: warn
def load_test_cases(path: Path) -> list[TestCase]:
    """Load test cases from a YAML or JSON suite file.

    The file may contain either a mapping with a ``cases`` list or a bare
    top-level list of cases. Files ending in ``.json`` (case-insensitive) are
    parsed as JSON; everything else is parsed with ``yaml.safe_load``.

    Args:
        path: Location of the test suite file.

    Returns:
        One validated ``TestCase`` per entry. An empty document (or an empty
        mapping/list) yields an empty list.

    Raises:
        ValueError: If the document is a non-empty mapping without a ``cases``
            key, or if the cases are not a list. Previously such a mapping was
            iterated key by key, which surfaced as a confusing per-string
            validation error.
    """
    with open(path, encoding="utf-8") as handle:
        if path.suffix.lower() == ".json":
            payload = json.load(handle)
        else:
            payload = yaml.safe_load(handle)

    # Empty documents (None, {}, []) simply declare no cases.
    if not payload:
        return []

    if isinstance(payload, dict):
        if "cases" not in payload:
            raise ValueError(f"Test suite {path} must define a top-level 'cases' list.")
        cases = payload["cases"]
    else:
        cases = payload

    # `cases:` with no value parses as None; treat it like an empty suite.
    if cases is None:
        return []
    if not isinstance(cases, list):
        raise ValueError(
            f"Test suite {path} must contain a list of cases, got {type(cases).__name__}."
        )
    return [TestCase.model_validate(item) for item in cases]
def resolve_changed_files(*, base_ref: str, working_dir: Path) -> list[str]:
    """Return the files changed between ``base_ref`` and ``HEAD``.

    Runs ``git diff --name-only --relative -z <base_ref>...HEAD --`` with
    ``working_dir`` as the current directory.

    Args:
        base_ref: Git ref to diff against. It must not start with ``-`` so it
            cannot be interpreted by git as a command-line option.
        working_dir: Directory in which git is executed.

    Returns:
        Changed paths relative to ``working_dir``, in the order git reports
        them. Changes outside ``working_dir`` are not included.

    Raises:
        RuntimeError: If ``base_ref`` looks like an option, git cannot be
            started (missing binary or missing ``working_dir``), or git exits
            with a non-zero status.
    """
    # Guard against option injection: the ref is placed in argv before `--`,
    # so a value such as `--output=...` would otherwise be parsed as a flag.
    if base_ref.startswith("-"):
        raise RuntimeError(f"Invalid base ref: {base_ref!r}")

    command = [
        "git",
        "diff",
        "--name-only",
        # Report paths relative to working_dir (callers join them onto it)
        # instead of relative to the repository top-level.
        "--relative",
        # NUL-terminated output disables git's C-style quoting of unusual
        # (e.g. non-ASCII) path names.
        "-z",
        f"{base_ref}...HEAD",
        # Everything before `--` is a revision, never a path.
        "--",
    ]
    try:
        completed = subprocess.run(
            command,
            cwd=working_dir,
            text=True,
            capture_output=True,
            check=False,
        )
    except OSError as exc:
        # Missing git executable or missing working directory.
        raise RuntimeError(f"Unable to run git diff: {exc}") from exc

    if completed.returncode != 0:
        error = completed.stderr.strip() or completed.stdout.strip() or "git diff failed"
        raise RuntimeError(error)
    return [entry for entry in completed.stdout.split("\0") if entry]
def select_scenarios(
    manifest: EvalManifest,
    *,
    scope: str,
    changed_skills: list[str],
) -> list[EvalScenario]:
    """Filter manifest scenarios for the requested CI scope.

    Args:
        manifest: Manifest whose ``scenarios`` are filtered.
        scope: ``"all"`` returns every scenario; ``"changed"`` returns only
            scenarios that declare at least one skill in ``changed_skills``.
        changed_skills: Skill directory paths considered changed.

    Returns:
        The selected scenarios, in manifest order.

    Raises:
        ValueError: If ``scope`` is neither ``"all"`` nor ``"changed"``.
            Unknown values used to fall through to the ``"changed"`` branch,
            so a typo silently selected nothing.
    """
    if scope == "all":
        return list(manifest.scenarios)
    if scope != "changed":
        raise ValueError(f"Unsupported CI scope {scope!r}; expected 'changed' or 'all'.")

    # Normalise both sides so "./skills/foo/" and "skills/foo" compare equal.
    changed = {Path(skill).as_posix() for skill in changed_skills}
    return [
        scenario
        for scenario in manifest.scenarios
        if any(Path(skill).as_posix() in changed for skill in scenario.skills)
    ]
async def _evaluate_variant(
    *,
    scenario: EvalScenario,
    variant_id: str,
    variant_type: str,
    skills: list[tuple[str, Path, Skill]],
    omitted_skill: str | None,
    test_cases: list[TestCase],
    evaluator,
    judge,
    eval_model: str | None,
    judge_model: str | None,
    working_dir: Path,
    judge_enabled: bool,
    judge_criteria: list[str] | None,
    batch_id: str,
    batch_folder: Path | None,
    run_number: int,
) -> tuple[ScenarioVariantResult, RunResult]:
    """Run every test case for one scenario variant and summarise the outcome.

    Args:
        scenario: Scenario being evaluated; its ``id`` labels runs and results.
        variant_id: Identifier of this variant (used in instance names and results).
        variant_type: Variant kind; ``"baseline"`` is recorded with run type
            ``"baseline"``, anything else as ``"with_skill"``.
        skills: ``(label, path, skill)`` triples mounted for this variant.
        omitted_skill: Label of the skill left out of this variant, if any.
        test_cases: Test cases executed in order.
        evaluator: Agent passed to ``run_test_with_skills``.
        judge: Agent passed to ``judge_test_result``; ``None`` disables judging.
        eval_model: Model override for test execution.
        judge_model: Model override for judging.
        working_dir: Directory passed as ``seed_dir`` for each test run.
        judge_enabled: Whether the scenario requested judge scoring.
        judge_criteria: Optional rubric criteria forwarded to the judge.
        batch_id: Batch identifier stored in the run metadata.
        batch_folder: When set, a run folder is created and the run is written to it.
        run_number: Sequence number of this run within the batch.

    Returns:
        The variant summary for the CI report and the ``RunResult`` for the
        batch summary.
    """
    mounted_skills = [(path, skill) for _, path, skill in skills]
    bundle_skills = [skill for _, _, skill in skills]
    skill_labels = [label for label, _, _ in skills]

    test_results = []
    for test_index, test_case in enumerate(test_cases, start=1):
        test_results.append(
            await run_test_with_skills(
                test_case,
                evaluator,
                bundle_skills,
                model=eval_model,
                instance_name=f"ci ({scenario.id} {variant_id} test {test_index})",
                seed_dir=working_dir,
                mounted_skills=mounted_skills,
            )
        )

    assertions_passed, assertions_total, _, avg_turns = summarize_test_results(test_results)
    # NOTE(review): with zero test cases `all()` is vacuously True, so the
    # variant is reported as passed with 0/0 assertions — confirm this is intended.
    passed = all(result.success for result in test_results)
    hard_score = assertions_passed / assertions_total if assertions_total else 0.0

    # Judge scoring is advisory and only runs for variants that already passed
    # the deterministic checks.
    judge_score = None
    judge_summary = None
    if judge is not None and judge_enabled and passed:
        judge_results = []
        for test_index, test_result in enumerate(test_results, start=1):
            judge_results.append(
                await judge_test_result(
                    scenario.id,
                    bundle_skills,
                    test_result,
                    judge,
                    judge_model=judge_model,
                    criteria=judge_criteria,
                    instance_name=f"judge ({scenario.id} {variant_id} test {test_index})",
                )
            )
        if judge_results:
            judge_score = sum(item.normalized_score for item in judge_results) / len(judge_results)
            # Only the first test's judge summary is surfaced in the report.
            judge_summary = judge_results[0].summary

    aggregated_stats = aggregate_conversation_stats(test_results)

    run_folder_path: Path | None = None
    if batch_folder is not None:
        run_folder_path = create_run_folder(batch_folder, run_number)

    # The run record is identical whether or not it is persisted, so build it once.
    run_result = RunResult(
        metadata=RunMetadata(
            model=eval_model or "",
            task=scenario.id,
            batch_id=batch_id,
            run_number=run_number,
        ),
        stats=aggregated_stats,
        passed=passed,
        assertions_passed=assertions_passed,
        assertions_total=assertions_total,
        run_type="baseline" if variant_type == "baseline" else "with_skill",
        skill_name=scenario.id,
        judge_model=judge_model,
        judge_score=judge_score,
        judge_summary=judge_summary,
        scenario_id=scenario.id,
        variant_id=variant_id,
        variant_type=variant_type,
        skills=skill_labels,
        omitted_skill=omitted_skill,
    )

    if batch_folder is not None:
        write_run_metadata(run_folder_path, run_result.metadata)
        write_run_result(run_folder_path, run_result)

    variant_result = ScenarioVariantResult(
        variant_id=variant_id,
        variant_type=variant_type,  # type: ignore[arg-type]
        skills=skill_labels,
        omitted_skill=omitted_skill,
        passed=passed,
        assertions_passed=assertions_passed,
        assertions_total=assertions_total,
        hard_score=hard_score,
        judge_score=judge_score,
        judge_summary=judge_summary,
        total_tokens=aggregated_stats.total_tokens,
        average_turns=avg_turns,
        run_folder=str(run_folder_path) if run_folder_path is not None else None,
    )
    return variant_result, run_result
Skill.load(absolute_skill_path), + ) + ) + + judge_enabled = bool(scenario.judge and scenario.judge.enabled) + judge_criteria = scenario.judge.criteria if scenario.judge else None + + run_number += 1 + bundle_result, bundle_run = await _evaluate_variant( + scenario=scenario, + variant_id="bundle", + variant_type="bundle", + skills=loaded_skills, + omitted_skill=None, + test_cases=test_cases, + evaluator=evaluator, + judge=judge, + eval_model=eval_model, + judge_model=judge_model, + working_dir=root, + judge_enabled=judge_enabled, + judge_criteria=judge_criteria, + batch_id=batch_id, + batch_folder=batch_folder, + run_number=run_number, + ) + all_run_results.append(bundle_run) + + ablation_results: list[ScenarioVariantResult] = [] + contributions: list[ScenarioContribution] = [] + for skill_label, _, _ in loaded_skills: + remaining = [item for item in loaded_skills if item[0] != skill_label] + run_number += 1 + ablation_result, ablation_run = await _evaluate_variant( + scenario=scenario, + variant_id=f"without-{Path(skill_label).name}", + variant_type="ablation", + skills=remaining, + omitted_skill=skill_label, + test_cases=test_cases, + evaluator=evaluator, + judge=judge, + eval_model=eval_model, + judge_model=judge_model, + working_dir=root, + judge_enabled=judge_enabled, + judge_criteria=judge_criteria, + batch_id=batch_id, + batch_folder=batch_folder, + run_number=run_number, + ) + all_run_results.append(ablation_run) + ablation_results.append(ablation_result) + contributions.append( + ScenarioContribution( + skill=skill_label, + hard_score_delta=bundle_result.hard_score - ablation_result.hard_score, + judge_score_delta=( + None + if bundle_result.judge_score is None or ablation_result.judge_score is None + else bundle_result.judge_score - ablation_result.judge_score + ), + passed_without_skill=ablation_result.passed, + ) + ) + + baseline_result = None + if scenario.include_baseline: + run_number += 1 + baseline_result, baseline_run = await _evaluate_variant( + 
scenario=scenario, + variant_id="baseline", + variant_type="baseline", + skills=[], + omitted_skill=None, + test_cases=test_cases, + evaluator=evaluator, + judge=judge, + eval_model=eval_model, + judge_model=judge_model, + working_dir=root, + judge_enabled=judge_enabled, + judge_criteria=judge_criteria, + batch_id=batch_id, + batch_folder=batch_folder, + run_number=run_number, + ) + all_run_results.append(baseline_run) + + report.scenarios.append( + ScenarioReport( + scenario_id=scenario.id, + skills=[label for label, _, _ in loaded_skills], + tests_path=_normalize_relative_path(tests_path, root), + passed=bundle_result.passed, + bundle=bundle_result, + ablations=ablation_results, + baseline=baseline_result, + contributions=contributions, + ) + ) + + report.success = all(item.passed for item in report.scenarios) + + if batch_folder is not None: + summary = BatchSummary( + batch_id=batch_id, + model=eval_model or "", + task="upskill-ci", + total_runs=len(all_run_results), + passed_runs=sum(1 for item in all_run_results if item.passed), + results=all_run_results, + ) + write_batch_summary(batch_folder, summary) + + return report diff --git a/src/upskill/cli.py b/src/upskill/cli.py index 1a9d63c..cb4ac18 100644 --- a/src/upskill/cli.py +++ b/src/upskill/cli.py @@ -13,15 +13,32 @@ import click from dotenv import load_dotenv -from fast_agent import FastAgent from rich.console import Console from rich.panel import Panel from rich.table import Table from rich.tree import Tree +try: + from fast_agent import FastAgent +except ModuleNotFoundError: # pragma: no cover - enables unit tests without fast-agent + FastAgent = None # type: ignore[assignment] + +from upskill.ci import ( + plan_ci_suite, + render_ci_report_markdown, + run_ci_suite, + write_ci_report, + write_step_summary, +) from upskill.config import Config, resolve_upskill_config_path -from upskill.evaluate import evaluate_skill, get_failure_descriptions -from upskill.generate import generate_skill, generate_tests, 
improve_skill, refine_skill +from upskill.evaluate import evaluate_skill, evaluate_skill_candidates, get_failure_descriptions +from upskill.generate import ( + generate_skill, + generate_skill_candidates, + generate_tests, + improve_skill, + refine_skill, +) from upskill.logging import ( aggregate_conversation_stats, create_batch_folder, @@ -30,12 +47,14 @@ load_run_result, summarize_runs_to_csv, write_batch_summary, + write_ranking_summary, write_run_metadata, write_run_result, ) from upskill.model_resolution import ResolvedModels, resolve_models from upskill.models import ( BatchSummary, + RankedSkillBatch, RunMetadata, RunResult, Skill, @@ -51,6 +70,8 @@ @asynccontextmanager async def _fast_agent_context(config: Config | None = None) -> AsyncIterator[object]: config = config or Config.load() + if FastAgent is None: + raise RuntimeError("fast-agent-mcp is required to run upskill commands.") fast = FastAgent( "upskill", config_path=str(config.effective_fastagent_config), @@ -136,6 +157,36 @@ def _render_bar(value: float, width: int = 20) -> str: return "█" * filled + "░" * empty +def _render_ranking_summary(ranking: RankedSkillBatch) -> None: + """Render top candidate ranking details.""" + if not ranking.ranked_results: + return + + top_results = ranking.ranked_results[:3] + console.print() + console.print(f" candidates ranked: {ranking.candidate_count}") + for ranked in top_results: + candidate = ranked.candidate + margin = "" + if ranked.score_margin_from_next is not None: + margin = f" margin {ranked.score_margin_from_next:+.3f}" + console.print( + f" #{ranked.rank} {candidate.candidate_id} " + f"hard {candidate.hard_score:.0%} " + f"judge {candidate.judge_score:.0%} " + f"tokens {candidate.token_efficiency_score:.0%} " + f"composite {candidate.composite_score:.3f}{margin}" + ) + + winner = ranking.winner + if winner is not None: + console.print() + console.print( + f" winner: [bold]{winner.candidate.candidate_id}[/bold]" + f" ({winner.candidate.skill.name})" + 
) + + class EvalPlotResult(TypedDict): """Structured plot data for eval runs.""" @@ -258,6 +309,22 @@ def main(): @click.option("-o", "--output", type=click.Path(), help="Output directory for skill") @click.option("--no-eval", is_flag=True, help="Skip eval and refinement") @click.option("--eval-model", help="Optional extra cross-model eval pass after generation") +@click.option( + "--candidates", + type=int, + help="Number of candidate skills to generate before ranking", +) +@click.option("--judge-model", help="Model to use for LLM-as-a-judge ranking") +@click.option( + "--rank-with-judge/--no-rank-with-judge", + default=None, + help="Enable judge-based candidate ranking (defaults on when candidates > 1)", +) +@click.option( + "--judge-strategy", + type=click.Choice(["pointwise", "pairwise"]), + help="Judge ranking strategy", +) @click.option("--runs-dir", type=click.Path(), help="Directory for run logs (default: ./runs)") @click.option("--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)") def generate( @@ -270,6 +337,10 @@ def generate( output: str | None, no_eval: bool, eval_model: str | None, + candidates: int | None, + judge_model: str | None, + rank_with_judge: bool | None, + judge_strategy: str | None, runs_dir: str | None, log_runs: bool, ): @@ -324,6 +395,10 @@ def generate( output, no_eval, eval_model, + candidates, + judge_model, + rank_with_judge, + judge_strategy, runs_dir, log_runs, ) @@ -340,10 +415,17 @@ async def _generate_async( output: str | None, no_eval: bool, eval_model: str | None, + candidates: int | None, + judge_model: str | None, + rank_with_judge: bool | None, + judge_strategy: str | None, runs_dir: str | None, log_runs: bool, ): """Async implementation of generate command.""" + if candidates is not None and candidates < 1: + raise click.ClickException("--candidates must be at least 1.") + config = Config.load() resolved = resolve_models( "generate", @@ -363,6 +445,14 @@ async def _generate_async( 
command="generate", ) extra_eval_model = resolved.extra_eval_model + candidate_count = candidates or config.default_candidate_count + resolved_judge_model = judge_model or config.effective_judge_model + resolved_judge_strategy = judge_strategy or config.judge_strategy + resolved_rank_with_judge = rank_with_judge + if resolved_rank_with_judge is None: + resolved_rank_with_judge = candidate_count > 1 + if resolved_judge_strategy != "pointwise" and resolved_rank_with_judge: + raise click.ClickException("Only --judge-strategy pointwise is supported in v1.") _print_model_plan("generate", resolved) @@ -377,6 +467,8 @@ async def _generate_async( console.print(f"Logging runs to: {batch_folder}", style="dim") async with _fast_agent_context(config) as agent: + generation_task = task + # Generate from trace file if from_trace: console.print(f"Generating skill from trace: {from_trace}", style="dim") @@ -395,234 +487,342 @@ async def _generate_async( # Plain text, markdown, etc. trace_context = trace_content[:4000] - task = f"{task}\n\nBased on this agent trace:\n\n{trace_context}" - console.print(f"Generating skill with {skill_gen_model}...", style="dim") - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await generate_skill( - task=task, - examples=examples, - generator=agent.skill_gen, - model=skill_gen_model, - ) + generation_task = f"{task}\n\nBased on this agent trace:\n\n{trace_context}" # Improve existing skill elif from_skill: existing_skill = Skill.load(Path(from_skill)) - console.print( - f"Improving [bold]{existing_skill.name}[/bold] with {skill_gen_model}...", - style="dim", - ) - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await improve_skill( - existing_skill, - instructions=task, - generator=agent.skill_gen, - model=skill_gen_model, - ) else: - console.print(f"Generating skill with {skill_gen_model}...", style="dim") - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await generate_skill( - task=task, - 
examples=examples, - generator=agent.skill_gen, - model=skill_gen_model, - ) - if no_eval: - _save_and_display(skill, output, config) - return - - console.print("Generating test cases...", style="dim") - await _set_agent_model(agent.test_gen, test_gen_model) - test_cases = await generate_tests(task, generator=agent.test_gen, model=test_gen_model) + existing_skill = None - # Eval loop with refinement (on skill generation model) - prev_success_rate = 0.0 + await _set_agent_model(agent.skill_gen, skill_gen_model) + ranking: RankedSkillBatch | None = None + skill: Skill | None = None results = None - attempts = max(1, config.max_refine_attempts) - for attempt in range(attempts): + eval_results = None + + if candidate_count > 1: console.print( - f"Evaluating on {skill_gen_model}... (attempt {attempt + 1})", + f"Generating {candidate_count} candidate skills with {skill_gen_model}...", style="dim", ) - - # Create run folder for logging (2 folders per attempt: baseline + with_skill) - run_folder = None - if log_runs and batch_folder: - baseline_run_num = attempt * 2 + 1 - run_folder = create_run_folder(batch_folder, baseline_run_num) - write_run_metadata( - run_folder, - RunMetadata( + if existing_skill is not None: + candidates_list: list[Skill] = [] + for index in range(candidate_count): + candidate_skill = await improve_skill( + existing_skill, + instructions=task, + generator=agent.skill_gen, model=skill_gen_model, - task=task, - batch_id=batch_id or "", - run_number=baseline_run_num, - ), + ) + candidate_skill.metadata.candidate_id = f"candidate-{index + 1}" + candidates_list.append(candidate_skill) + else: + candidates_list = await generate_skill_candidates( + task=generation_task, + examples=examples, + generator=agent.skill_gen, + count=candidate_count, + model=skill_gen_model, ) - console.print("[dim]Starting evaluation run...[/dim]") + if no_eval: + skill = candidates_list[0] + _save_and_display(skill, output, config) + return - results = await evaluate_skill( - 
skill, - test_cases=test_cases, - evaluator=agent.evaluator, - model=skill_gen_model, - show_baseline_progress=False, + console.print("Generating shared test cases...", style="dim") + await _set_agent_model(agent.test_gen, test_gen_model) + test_cases = await generate_tests( + generation_task, + generator=agent.test_gen, + model=test_gen_model, ) - # Log run results (both baseline and with-skill for plot command) - if log_runs and run_folder: - # Log baseline result - baseline_result = RunResult( - metadata=RunMetadata( - model=skill_gen_model, - task=task, - batch_id=batch_id or "", - run_number=baseline_run_num, - ), - stats=aggregate_conversation_stats(results.baseline_results), - passed=results.baseline_success_rate > 0.5, - assertions_passed=int(results.baseline_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="baseline", - skill_name=skill.name, - ) - write_run_result(run_folder, baseline_result) - run_results.append(baseline_result) + judge_agent = agent.judge if resolved_rank_with_judge else None + ranking = await evaluate_skill_candidates( + generation_task, + candidates_list, + test_cases, + evaluator=agent.evaluator, + judge=judge_agent, + skill_generation_model=skill_gen_model, + evaluation_model=skill_gen_model, + judge_model=resolved_judge_model if resolved_rank_with_judge else None, + judge_strategy=resolved_judge_strategy, + judge_weight=config.judge_weight, + ) + winner = ranking.winner + if winner is None: + raise click.ClickException("Candidate ranking produced no winner.") + skill = winner.candidate.skill + skill.metadata.test_pass_rate = winner.candidate.hard_score - # Log with-skill result (in a separate folder) - with_skill_folder = create_run_folder(batch_folder, attempt * 2 + 2) - with_skill_result = RunResult( - metadata=RunMetadata( - model=skill_gen_model, - task=task, - batch_id=batch_id or "", - run_number=attempt * 2 + 2, - ), - stats=aggregate_conversation_stats(results.with_skill_results), - 
passed=results.is_beneficial, - assertions_passed=int(results.with_skill_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="with_skill", - skill_name=skill.name, - ) - write_run_metadata(with_skill_folder, with_skill_result.metadata) - write_run_result(with_skill_folder, with_skill_result) - run_results.append(with_skill_result) + if log_runs and batch_folder and batch_id: + for ranked in ranking.ranked_results: + candidate_run_folder = create_run_folder(batch_folder, len(run_results) + 1) + candidate = ranked.candidate + candidate_run = RunResult( + metadata=RunMetadata( + model=skill_gen_model, + task=generation_task, + batch_id=batch_id, + run_number=len(run_results) + 1, + ), + stats=aggregate_conversation_stats(candidate.test_results), + passed=not candidate.hard_gate_failed, + assertions_passed=candidate.assertions_passed, + assertions_total=candidate.assertions_total, + run_type="with_skill", + skill_name=candidate.skill.name, + judge_model=ranked.judge_model, + judge_score=candidate.judge_score, + judge_summary=ranked.judge_summary, + candidate_id=candidate.candidate_id, + rank=ranked.rank, + ) + write_run_metadata(candidate_run_folder, candidate_run.metadata) + write_run_result(candidate_run_folder, candidate_run) + run_results.append(candidate_run) - lift = results.skill_lift - lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" + write_ranking_summary(batch_folder, ranking) - if results.is_beneficial: + if extra_eval_model: + console.print(f"Evaluating winning candidate on {extra_eval_model}...", style="dim") + eval_results = await evaluate_skill( + skill, + test_cases, + evaluator=agent.evaluator, + model=extra_eval_model, + show_baseline_progress=False, + ) + else: + if from_trace: + console.print(f"Generating skill with {skill_gen_model}...", style="dim") + skill = await generate_skill( + task=generation_task, + examples=examples, + generator=agent.skill_gen, + model=skill_gen_model, + ) + elif existing_skill is not 
None: console.print( - f" {results.baseline_success_rate:.0%} -> " - f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]" + f"Improving [bold]{existing_skill.name}[/bold] with {skill_gen_model}...", + style="dim", + ) + skill = await improve_skill( + existing_skill, + instructions=task, + generator=agent.skill_gen, + model=skill_gen_model, + ) + else: + console.print(f"Generating skill with {skill_gen_model}...", style="dim") + skill = await generate_skill( + task=generation_task, + examples=examples, + generator=agent.skill_gen, + model=skill_gen_model, ) - break + if no_eval: + _save_and_display(skill, output, config) + return - console.print( - f" {results.baseline_success_rate:.0%} -> " - f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough" + console.print("Generating test cases...", style="dim") + await _set_agent_model(agent.test_gen, test_gen_model) + test_cases = await generate_tests( + generation_task, + generator=agent.test_gen, + model=test_gen_model, ) - if abs(results.with_skill_success_rate - prev_success_rate) < 0.05: - console.print(" [yellow]Plateaued, stopping[/yellow]") - break + # Eval loop with refinement (on skill generation model) + prev_success_rate = 0.0 + attempts = max(1, config.max_refine_attempts) + for attempt in range(attempts): + console.print( + f"Evaluating on {skill_gen_model}... 
(attempt {attempt + 1})", + style="dim", + ) - prev_success_rate = results.with_skill_success_rate + # Create run folder for logging (2 folders per attempt: baseline + with_skill) + run_folder = None + if log_runs and batch_folder: + baseline_run_num = attempt * 2 + 1 + run_folder = create_run_folder(batch_folder, baseline_run_num) + write_run_metadata( + run_folder, + RunMetadata( + model=skill_gen_model, + task=generation_task, + batch_id=batch_id or "", + run_number=baseline_run_num, + ), + ) + + console.print("[dim]Starting evaluation run...[/dim]") - if attempt < attempts - 1: - console.print("Refining...", style="dim") - failures = get_failure_descriptions(results) - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await refine_skill( + results = await evaluate_skill( skill, - failures, - generator=agent.skill_gen, + test_cases=test_cases, + evaluator=agent.evaluator, model=skill_gen_model, + show_baseline_progress=False, ) - # If eval_model specified, also eval on that model - eval_results = None - if extra_eval_model: - console.print(f"Evaluating on {extra_eval_model}...", style="dim") + # Log run results (both baseline and with-skill for plot command) + if log_runs and run_folder: + # Log baseline result + baseline_result = RunResult( + metadata=RunMetadata( + model=skill_gen_model, + task=generation_task, + batch_id=batch_id or "", + run_number=baseline_run_num, + ), + stats=aggregate_conversation_stats(results.baseline_results), + passed=results.baseline_success_rate > 0.5, + assertions_passed=int(results.baseline_success_rate * len(test_cases)), + assertions_total=len(test_cases), + run_type="baseline", + skill_name=skill.name, + ) + write_run_result(run_folder, baseline_result) + run_results.append(baseline_result) - # Create run folder for eval model - run_folder = None - if log_runs and batch_folder: - run_number = len(run_results) + 1 - run_folder = create_run_folder(batch_folder, run_number) - write_run_metadata( - run_folder, - 
RunMetadata( - model=extra_eval_model, - task=task, - batch_id=batch_id or "", - run_number=run_number, - ), - ) + # Log with-skill result (in a separate folder) + with_skill_folder = create_run_folder(batch_folder, attempt * 2 + 2) + with_skill_result = RunResult( + metadata=RunMetadata( + model=skill_gen_model, + task=generation_task, + batch_id=batch_id or "", + run_number=attempt * 2 + 2, + ), + stats=aggregate_conversation_stats(results.with_skill_results), + passed=results.is_beneficial, + assertions_passed=int(results.with_skill_success_rate * len(test_cases)), + assertions_total=len(test_cases), + run_type="with_skill", + skill_name=skill.name, + ) + write_run_metadata(with_skill_folder, with_skill_result.metadata) + write_run_result(with_skill_folder, with_skill_result) + run_results.append(with_skill_result) - eval_results = await evaluate_skill( - skill, - test_cases, - evaluator=agent.evaluator, - model=extra_eval_model, - show_baseline_progress=False, - ) + lift = results.skill_lift + lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" - # Log eval run results (both baseline and with-skill) - if log_runs and run_folder: - # Log baseline result - baseline_result = RunResult( - metadata=RunMetadata( - model=extra_eval_model, - task=task, - batch_id=batch_id or "", - run_number=run_number, - ), - stats=aggregate_conversation_stats(eval_results.baseline_results), - passed=eval_results.baseline_success_rate > 0.5, - assertions_passed=int(eval_results.baseline_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="baseline", - skill_name=skill.name, + if results.is_beneficial: + console.print( + f" {results.baseline_success_rate:.0%} -> " + f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]" + ) + break + + console.print( + f" {results.baseline_success_rate:.0%} -> " + f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough" ) - write_run_result(run_folder, baseline_result) - 
run_results.append(baseline_result) - # Log with-skill result - with_skill_folder = create_run_folder(batch_folder, run_number + 1) - with_skill_result = RunResult( - metadata=RunMetadata( - model=extra_eval_model, - task=task, - batch_id=batch_id or "", - run_number=run_number + 1, - ), - stats=aggregate_conversation_stats(eval_results.with_skill_results), - passed=eval_results.is_beneficial, - assertions_passed=int(eval_results.with_skill_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="with_skill", - skill_name=skill.name, + if abs(results.with_skill_success_rate - prev_success_rate) < 0.05: + console.print(" [yellow]Plateaued, stopping[/yellow]") + break + + prev_success_rate = results.with_skill_success_rate + + if attempt < attempts - 1: + console.print("Refining...", style="dim") + failures = get_failure_descriptions(results) + await _set_agent_model(agent.skill_gen, skill_gen_model) + skill = await refine_skill( + skill, + failures, + generator=agent.skill_gen, + model=skill_gen_model, + ) + + # If eval_model specified, also eval on that model + if extra_eval_model: + console.print(f"Evaluating on {extra_eval_model}...", style="dim") + + # Create run folder for eval model + run_folder = None + if log_runs and batch_folder: + run_number = len(run_results) + 1 + run_folder = create_run_folder(batch_folder, run_number) + write_run_metadata( + run_folder, + RunMetadata( + model=extra_eval_model, + task=generation_task, + batch_id=batch_id or "", + run_number=run_number, + ), + ) + + eval_results = await evaluate_skill( + skill, + test_cases, + evaluator=agent.evaluator, + model=extra_eval_model, + show_baseline_progress=False, ) - write_run_metadata(with_skill_folder, with_skill_result.metadata) - write_run_result(with_skill_folder, with_skill_result) - run_results.append(with_skill_result) - lift = eval_results.skill_lift - lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" - console.print( - f" 
{eval_results.baseline_success_rate:.0%} -> " - f"{eval_results.with_skill_success_rate:.0%} ({lift_str})" - ) + # Log eval run results (both baseline and with-skill) + if log_runs and run_folder: + # Log baseline result + baseline_result = RunResult( + metadata=RunMetadata( + model=extra_eval_model, + task=generation_task, + batch_id=batch_id or "", + run_number=run_number, + ), + stats=aggregate_conversation_stats(eval_results.baseline_results), + passed=eval_results.baseline_success_rate > 0.5, + assertions_passed=int(eval_results.baseline_success_rate * len(test_cases)), + assertions_total=len(test_cases), + run_type="baseline", + skill_name=skill.name, + ) + write_run_result(run_folder, baseline_result) + run_results.append(baseline_result) + + # Log with-skill result + with_skill_folder = create_run_folder(batch_folder, run_number + 1) + with_skill_result = RunResult( + metadata=RunMetadata( + model=extra_eval_model, + task=generation_task, + batch_id=batch_id or "", + run_number=run_number + 1, + ), + stats=aggregate_conversation_stats(eval_results.with_skill_results), + passed=eval_results.is_beneficial, + assertions_passed=int( + eval_results.with_skill_success_rate * len(test_cases) + ), + assertions_total=len(test_cases), + run_type="with_skill", + skill_name=skill.name, + ) + write_run_metadata(with_skill_folder, with_skill_result.metadata) + write_run_result(with_skill_folder, with_skill_result) + run_results.append(with_skill_result) + + lift = eval_results.skill_lift + lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" + console.print( + f" {eval_results.baseline_success_rate:.0%} -> " + f"{eval_results.with_skill_success_rate:.0%} ({lift_str})" + ) # Write batch summary if log_runs and batch_folder and batch_id: summary = BatchSummary( batch_id=batch_id, model=skill_gen_model, - task=task, + task=generation_task, total_runs=len(run_results), passed_runs=sum(1 for r in run_results if r.passed), results=run_results, @@ -632,6 +832,8 @@ async def 
_generate_async( if not no_eval and skill is not None: if results: skill.metadata.test_pass_rate = results.with_skill_success_rate + elif ranking and ranking.winner: + skill.metadata.test_pass_rate = ranking.winner.candidate.hard_score else: console.print( "[yellow]No evaluation results available; skipping report output.[/yellow]" @@ -645,6 +847,7 @@ async def _generate_async( eval_results, skill_gen_model, extra_eval_model, + ranking, ) @@ -659,6 +862,7 @@ def _save_and_display( eval_results=None, skill_gen_model: str | None = None, eval_model: str | None = None, + ranking: RankedSkillBatch | None = None, ): """Save skill and display summary.""" if output: @@ -683,6 +887,9 @@ def _save_and_display( for name in skill.scripts: console.print(f" scripts/{name} (exec only)") + if ranking: + _render_ranking_summary(ranking) + # Show results with horizontal bars if results and eval_results: # Multiple models - show each with bars @@ -1178,6 +1385,137 @@ async def _eval_async( console.print("\n[yellow]Recommendation: skill may not be beneficial[/yellow]") +@main.command("ci") +@click.option( + "--manifest", + "manifest_path", + type=click.Path(exists=True), + default=".upskill/evals.yaml", + show_default=True, + help="Path to scenario manifest", +) +@click.option( + "--scope", + type=click.Choice(["changed", "all"]), + default="changed", + show_default=True, + help="Whether to run only impacted scenarios or the full suite", +) +@click.option( + "--base-ref", + default="origin/main", + show_default=True, + help="Base ref used for changed-skill selection", +) +@click.option("--eval-model", help="Model to use for scenario execution") +@click.option("--judge-model", help="Model to use for advisory judge scoring") +@click.option( + "--summary-json", + type=click.Path(), + help="Path for machine-readable CI report JSON", +) +@click.option("--runs-dir", type=click.Path(), help="Directory for run logs") +@click.option( + "--fail-on-no-scenarios/--no-fail-on-no-scenarios", + 
default=False, + help="Exit with an error when no scenarios are selected", +) +def ci_cmd( + manifest_path: str, + scope: str, + base_ref: str, + eval_model: str | None, + judge_model: str | None, + summary_json: str | None, + runs_dir: str | None, + fail_on_no_scenarios: bool, +): + """Run scenario-based CI evaluation for impacted skills.""" + asyncio.run( + _ci_async( + manifest_path=manifest_path, + scope=scope, + base_ref=base_ref, + eval_model=eval_model, + judge_model=judge_model, + summary_json=summary_json, + runs_dir=runs_dir, + fail_on_no_scenarios=fail_on_no_scenarios, + ) + ) + + +async def _ci_async( + *, + manifest_path: str, + scope: str, + base_ref: str, + eval_model: str | None, + judge_model: str | None, + summary_json: str | None, + runs_dir: str | None, + fail_on_no_scenarios: bool, +) -> None: + """Async implementation of the CI command.""" + config = Config.load() + manifest = Path(manifest_path).resolve() + runs_path = (Path(runs_dir) if runs_dir else config.runs_dir).resolve() + summary_path = ( + Path(summary_json).resolve() if summary_json else (Path.cwd() / "upskill-report.json") + ) + + resolved_eval_model = eval_model or config.effective_eval_model + resolved_judge_model = judge_model or config.effective_judge_model + + console.print("[dim]CI model plan:[/dim]") + console.print(f" Evaluation Model: {resolved_eval_model}") + console.print(f" Judge Model: {resolved_judge_model}") + console.print(f" Scope: {scope}") + if scope == "changed": + console.print(f" Base Ref: {base_ref}") + + preview_report, selected_scenarios = plan_ci_suite( + manifest, + scope=scope, + base_ref=base_ref, + working_dir=Path.cwd(), + ) + if not selected_scenarios: + write_ci_report(summary_path, preview_report) + write_step_summary(preview_report) + console.print() + console.print("[yellow]No scenarios selected.[/yellow]") + console.print(f"[dim]Report written to {summary_path}[/dim]") + if fail_on_no_scenarios: + sys.exit(1) + return + + async with 
_fast_agent_context(config) as agent: + report = await run_ci_suite( + manifest, + evaluator=agent.evaluator, + judge=agent.judge, + scope=scope, + base_ref=base_ref, + eval_model=resolved_eval_model, + judge_model=resolved_judge_model, + working_dir=Path.cwd(), + runs_dir=runs_path, + ) + + write_ci_report(summary_path, report) + write_step_summary(report) + + console.print() + console.print(render_ci_report_markdown(report)) + console.print() + console.print(f"[dim]Report written to {summary_path}[/dim]") + console.print(f"[dim]Runs written under {runs_path}[/dim]") + + if not report.success: + sys.exit(1) + + @main.command("list") @click.option("-d", "--dir", "skills_dir", type=click.Path(), help="Skills directory to list") @click.option("-v", "--verbose", is_flag=True, help="Show detailed skill structure") diff --git a/src/upskill/config.py b/src/upskill/config.py index 01f1f5c..32950c9 100644 --- a/src/upskill/config.py +++ b/src/upskill/config.py @@ -120,6 +120,13 @@ class Config(BaseModel): default=None, description="Model for test generation (defaults to skill generation model)", ) + judge_model: str | None = Field( + default=None, + description=( + "Model for LLM-as-a-judge ranking " + "(defaults to eval_model or skill generation model)" + ), + ) # Directory settings skills_dir: Path = Field( @@ -132,6 +139,12 @@ class Config(BaseModel): # Generation settings auto_eval: bool = Field(default=True, description="Run eval after generation") max_refine_attempts: int = Field(default=2, description="Max refinement iterations") + default_candidate_count: int = Field( + default=1, + description="Default number of candidate skills to generate per task", + ) + judge_strategy: str = Field(default="pointwise", description="Judge ranking strategy") + judge_weight: float = Field(default=0.3, description="Weight for judge score in ranking") # FastAgent settings fastagent_config: Path | None = Field(default=None, description="Path to fastagent.config.yaml") @@ -176,6 
+189,11 @@ def effective_eval_model(self) -> str: """Get the model to use for evaluation.""" return self.eval_model or self.skill_generation_model + @property + def effective_judge_model(self) -> str: + """Get the model to use for judge-based ranking.""" + return self.judge_model or self.effective_eval_model + @property def model(self) -> str: """Backward-compatible alias for ``skill_generation_model``.""" diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py index fdc07c4..212da22 100644 --- a/src/upskill/evaluate.py +++ b/src/upskill/evaluate.py @@ -9,9 +9,16 @@ from collections.abc import Generator from contextlib import contextmanager, nullcontext from pathlib import Path +from typing import Any -from fast_agent import ConversationSummary -from fast_agent.agents.llm_agent import LlmAgent +try: + from fast_agent import ConversationSummary + from fast_agent.agents.llm_agent import LlmAgent +except ModuleNotFoundError: # pragma: no cover - enables unit tests without fast-agent + ConversationSummary = Any + + class LlmAgent: # type: ignore[no-redef] + pass try: from fast_agent.ui.rich_progress import progress_display @@ -20,18 +27,23 @@ from upskill.fastagent_integration import ( compose_instruction, + compose_instruction_bundle, ) from upskill.logging import extract_stats_from_summary from upskill.models import ( + CandidateEvalResult, + CapturedArtifact, ConversationStats, EvalResults, - ExpectedSpec, + JudgeCriterionScore, + JudgeEvaluation, + RankedSkillBatch, + RankedSkillResult, Skill, TestCase, TestResult, - ValidationResult, ) -from upskill.validators import get_validator +from upskill.verifiers import run_verifiers def _hide_progress_task(task_name: str | None) -> None: @@ -54,9 +66,31 @@ def _hide_progress_task(task_name: str | None) -> None: "You need to evaluate the skill on the test case and return a score." 
) +JUDGE_CRITERIA = ( + "instruction_quality", + "helpfulness", + "robustness", + "concision", + "generalizability", +) + +MAX_CAPTURE_CHARS = 4000 +WORKSPACE_IGNORE_NAMES = { + ".git", + ".venv", + ".mypy_cache", + ".pytest_cache", + "__pycache__", + "runs", +} + @contextmanager -def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Generator[Path]: +def isolated_workspace( + base_dir: Path | None = None, + cleanup: bool = True, + seed_dir: Path | None = None, +) -> Generator[Path]: """Create an isolated workspace for a test run. Args: @@ -69,6 +103,8 @@ def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Ge workspace = tempfile.mkdtemp(dir=base_dir, prefix="upskill_run_") workspace_path = Path(workspace) try: + if seed_dir is not None: + _seed_workspace_from_directory(seed_dir, workspace_path) yield workspace_path finally: if cleanup: @@ -78,41 +114,71 @@ def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Ge pass # Ignore cleanup errors -def check_expected( - output: str, - expected: ExpectedSpec, - workspace: Path | None = None, - test_case: TestCase | None = None, -) -> tuple[bool, ValidationResult | None]: - """Check if output matches expected conditions. 
+def _seed_workspace_from_directory(seed_dir: Path, workspace: Path) -> None: + """Copy a seed checkout into a temporary workspace.""" + for source in seed_dir.iterdir(): + if source.name in WORKSPACE_IGNORE_NAMES: + continue + destination = workspace / source.name + if source.is_dir(): + shutil.copytree( + source, + destination, + dirs_exist_ok=True, + ignore=shutil.ignore_patterns(*WORKSPACE_IGNORE_NAMES), + ) + else: + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, destination) - Args: - output: The agent's output string - expected: Expected conditions dict (legacy format with "contains") - workspace: Optional workspace directory for file-based validation - test_case: Optional test case with custom validator config - Returns: - Tuple of (success, validation_result) - """ - # Handle custom validator if specified - if test_case and test_case.validator: - validator = get_validator(test_case.validator) - if validator and workspace: - config = test_case.validator_config or {} - result = validator( - workspace=workspace, - output_file=test_case.output_file or "", - **config, +def _copy_skill_directory(source_dir: Path, destination_dir: Path) -> None: + destination_dir.parent.mkdir(parents=True, exist_ok=True) + shutil.copytree(source_dir, destination_dir, dirs_exist_ok=True) + + +def _capture_artifacts(test_case: TestCase, workspace: Path | None) -> list[CapturedArtifact]: + if workspace is None or not test_case.output_file: + return [] + + target = workspace / test_case.output_file + if not target.exists() or not target.is_file(): + return [] + + try: + content = target.read_text(encoding="utf-8") + except UnicodeDecodeError: + return [ + CapturedArtifact( + path=test_case.output_file, + content="", + truncated=False, ) - return result.passed, result + ] + + truncated = len(content) > MAX_CAPTURE_CHARS + if truncated: + content = content[:MAX_CAPTURE_CHARS].rstrip() + "\n..." 
+ return [ + CapturedArtifact( + path=test_case.output_file, + content=content, + truncated=truncated, + ) + ] - required = expected.contains - output_lower = output.lower() - if any(item.lower() not in output_lower for item in required): - return False, None - return True, None +def _workspace_required_for_test( + test_case: TestCase, + *, + seed_dir: Path | None, + mounted_skills: list[tuple[Path, Skill]] | None, +) -> bool: + if test_case.output_file or test_case.validator or test_case.verifiers: + return True + if seed_dir is not None: + return True + return bool(mounted_skills) async def _run_test_with_evaluator( @@ -122,6 +188,8 @@ async def _run_test_with_evaluator( *, use_workspace: bool | None = None, instance_name: str | None = None, + seed_dir: Path | None = None, + mounted_skills: list[tuple[Path, Skill]] | None = None, ) -> TestResult: """Run a single test case using a provided evaluator agent.""" user_content = test_case.input @@ -130,11 +198,25 @@ async def _run_test_with_evaluator( user_content += f"\n\n```{filename}\n{content}\n```" # Determine if we need workspace isolation - needs_workspace = use_workspace if use_workspace is not None else bool(test_case.validator) + needs_workspace = use_workspace if use_workspace is not None else _workspace_required_for_test( + test_case, + seed_dir=seed_dir, + mounted_skills=mounted_skills, + ) async def _run_in_workspace(workspace: Path | None) -> TestResult: clone: LlmAgent | None = None try: + if workspace is not None and mounted_skills: + for source_dir, skill in mounted_skills: + try: + relative_path = source_dir.relative_to(seed_dir) if seed_dir else None + except ValueError: + relative_path = None + if relative_path is None: + relative_path = Path(".upskill") / "skills" / skill.name + _copy_skill_directory(source_dir, workspace / relative_path) + clone = await evaluator.spawn_detached_instance(name=instance_name) if workspace is not None: enable_shell = getattr(clone, "enable_shell", None) @@ -157,21 
+239,13 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult: except Exception as exc: logger.exception("Failed to extract stats from evaluator history", exc_info=exc) - # Check expected with custom validator support - if workspace and test_case.validator: - success, validation_result = check_expected( - output or "", - test_case.expected, - workspace, - test_case, - ) - else: - success, validation_result = check_expected( - output or "", - test_case.expected, - None, - test_case, - ) + artifacts = _capture_artifacts(test_case, workspace) + validation_result = run_verifiers( + test_case, + output=output or "", + workspace=workspace, + ) + success = validation_result.passed return TestResult( test_case=test_case, @@ -181,6 +255,7 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult: turns=stats.turns, stats=stats, validation_result=validation_result, + artifacts=artifacts, ) except Exception as exc: return TestResult(test_case=test_case, success=False, error=str(exc)) @@ -193,11 +268,62 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult: _hide_progress_task(instance_name) if needs_workspace: - with isolated_workspace() as workspace: + with isolated_workspace(seed_dir=seed_dir) as workspace: return await _run_in_workspace(workspace) return await _run_in_workspace(None) +async def run_test_with_skills( + test_case: TestCase, + evaluator: LlmAgent, + skills: list[Skill] | None = None, + *, + use_workspace: bool | None = None, + model: str | None = None, + instance_name: str | None = None, + seed_dir: Path | None = None, + mounted_skills: list[tuple[Path, Skill]] | None = None, +) -> TestResult: + """Run a single test case with a bundle of injected skills.""" + + bundle = skills or [] + try: + if model is not None: + await evaluator.set_model(model) + + mounted_paths: dict[str, str] = {} + if mounted_skills: + for source_dir, skill in mounted_skills: + try: + relative_path = source_dir.relative_to(seed_dir) if seed_dir 
else None + except ValueError: + relative_path = None + if relative_path is None: + relative_path = Path(".upskill") / "skills" / skill.name + mounted_paths[skill.name] = relative_path.as_posix() + + instruction = ( + compose_instruction_bundle( + evaluator.instruction, + bundle, + mounted_paths=mounted_paths or None, + ) + if bundle + else None + ) + return await _run_test_with_evaluator( + test_case, + evaluator, + instruction, + use_workspace=use_workspace, + instance_name=instance_name, + seed_dir=seed_dir, + mounted_skills=mounted_skills, + ) + except Exception as exc: + return TestResult(test_case=test_case, success=False, error=str(exc)) + + async def run_test( test_case: TestCase, evaluator: LlmAgent, @@ -218,14 +344,13 @@ async def run_test( """ try: - if model is not None: - await evaluator.set_model(model) - instruction = compose_instruction(evaluator.instruction, skill) if skill else None - return await _run_test_with_evaluator( + bundle = [skill] if skill else [] + return await run_test_with_skills( test_case, evaluator, - instruction, + bundle, use_workspace=use_workspace, + model=model, instance_name=instance_name, ) except Exception as exc: @@ -319,6 +444,296 @@ async def _run_batch( return results +def summarize_test_results(test_results: list[TestResult]) -> tuple[int, int, float, float]: + """Return assertions passed/total plus average tokens and turns.""" + assertions_passed = 0 + assertions_total = 0 + total_tokens = 0 + total_turns = 0 + + for result in test_results: + total_tokens += result.stats.total_tokens + total_turns += result.stats.turns + if result.validation_result: + assertions_passed += result.validation_result.assertions_passed + assertions_total += result.validation_result.assertions_total + else: + assertions_total += 1 + if result.success: + assertions_passed += 1 + + count = len(test_results) + avg_tokens = total_tokens / count if count else 0.0 + avg_turns = total_turns / count if count else 0.0 + return assertions_passed, 
def build_default_judge_evaluation(
    summary: str,
    criteria: tuple[str, ...] | list[str] | None = None,
) -> JudgeEvaluation:
    """Build a neutral fallback judge result.

    Args:
        summary: Text stored as the evaluation summary (typically the reason
            the real judge output is unavailable).
        criteria: Rubric criterion names to score. When omitted or empty the
            module-level ``JUDGE_CRITERIA`` is used, which keeps the previous
            single-argument behavior.

    Returns:
        A ``JudgeEvaluation`` with a score of 3 for every criterion.
    """
    return JudgeEvaluation(
        summary=summary,
        criteria=[
            JudgeCriterionScore(
                criterion=criterion,
                score=3,
                rationale="Judge output unavailable; using neutral fallback score.",
            )
            for criterion in (criteria or JUDGE_CRITERIA)
        ],
    )


async def judge_test_result(
    task: str,
    skill: Skill | list[Skill],
    test_result: TestResult,
    judge: LlmAgent,
    *,
    judge_model: str | None = None,
    criteria: tuple[str, ...] | list[str] | None = None,
    instance_name: str | None = None,
) -> JudgeEvaluation:
    """Run LLM-as-a-judge for one executed candidate/test result.

    Args:
        task: Original task description shown to the judge.
        skill: A single skill or the full bundle that was evaluated.
        test_result: Executed test result (output, artifacts, validation).
        judge: Parent judge agent; a detached clone is spawned per call.
        judge_model: Optional model override applied to the clone.
        criteria: Optional rubric; defaults to ``JUDGE_CRITERIA``.
        instance_name: Name given to the detached clone.

    Returns:
        The judge's structured evaluation, or a neutral fallback (built for
        the same rubric) when the judge returns nothing or raises.
    """

    # Resolve the rubric up front so the fallback paths score the same
    # criteria the prompt asks for, not the module default.
    rubric = tuple(criteria or JUDGE_CRITERIA)

    clone: LlmAgent | None = None
    try:
        clone = await judge.spawn_detached_instance(name=instance_name)
        if judge_model is not None:
            await clone.set_model(judge_model)

        skill_bundle = skill if isinstance(skill, list) else [skill]
        artifact_sections = []
        for artifact in test_result.artifacts:
            artifact_sections.append(
                f"Artifact path: {artifact.path}\n"
                f"Artifact content:\n{artifact.content}"
            )
        artifact_block = "\n\n".join(artifact_sections) if artifact_sections else "none"
        skill_sections = []
        for item in skill_bundle:
            skill_sections.append(
                f"Skill name: {item.name}\n"
                f"Skill description: {item.description}\n"
                f"Skill body:\n{item.body}"
            )
        verifier_payload = [
            spec.model_dump(mode="json") for spec in test_result.test_case.effective_verifiers()
        ]
        validation_payload = (
            test_result.validation_result.model_dump(mode="json")
            if test_result.validation_result
            else "none"
        )

        prompt = (
            f"Original task:\n{task}\n\n"
            f"Candidate skill bundle:\n{chr(10).join(skill_sections)}\n\n"
            f"Test case input:\n{test_result.test_case.input}\n\n"
            f"Verifiers:\n{verifier_payload}\n\n"
            f"Agent output:\n{test_result.output or ''}\n\n"
            f"Captured artifacts:\n{artifact_block}\n\n"
            f"Execution success: {test_result.success}\n"
            f"Execution error: {test_result.error or ''}\n"
            f"Validation result: {validation_payload}\n\n"
            "Score this executed candidate against the rubric. "
            "Return structured data with exactly these criteria: "
            f"{', '.join(rubric)}."
        )
        result, _ = await clone.structured(prompt, JudgeEvaluation)
        if result is None:
            return build_default_judge_evaluation(
                "Judge returned no structured result.",
                rubric,
            )
        return result
    except Exception as exc:
        # Judge scoring is best-effort: log and fall back instead of failing the run.
        logger.exception("Judge evaluation failed", exc_info=exc)
        return build_default_judge_evaluation(f"Judge evaluation failed: {exc}", rubric)
    finally:
        if clone is not None:
            try:
                await clone.shutdown()
            except Exception as exc:
                logger.exception("Failed to shutdown judge clone", exc_info=exc)
        # NOTE(review): presumably clears the progress row for this clone;
        # _hide_progress_task is defined outside this view - confirm.
        _hide_progress_task(instance_name)


def rank_candidate_results(
    task: str,
    candidate_results: list[CandidateEvalResult],
    *,
    skill_generation_model: str,
    evaluation_model: str,
    judge_model: str | None,
    judge_strategy: str,
    tests: list[TestCase],
    judge_weight: float = 0.3,
) -> RankedSkillBatch:
    """Rank evaluated candidates using hard score, judge score, and token efficiency.

    Each entry of ``candidate_results`` is mutated in place: its
    ``token_efficiency_score``, ``composite_score`` and ``hard_gate_failed``
    fields are filled in, and ``skill.metadata.candidate_id`` is set.

    Args:
        task: Task description recorded on the batch.
        candidate_results: Per-candidate evaluation data to rank.
        skill_generation_model: Model name recorded on the batch.
        evaluation_model: Model name recorded on the batch.
        judge_model: Judge model name recorded on the batch and each result.
        judge_strategy: Judge strategy recorded on the batch.
        tests: Test cases recorded on the batch.
        judge_weight: Weight of the judge score in the composite score.

    Returns:
        A ``RankedSkillBatch``; empty of results when no candidates are given.
    """

    if not candidate_results:
        return RankedSkillBatch(
            task=task,
            skill_generation_model=skill_generation_model,
            evaluation_model=evaluation_model,
            judge_model=judge_model,
            judge_strategy=judge_strategy,
            candidate_count=0,
            tests=tests,
        )

    min_avg_tokens = min(result.average_tokens for result in candidate_results)
    max_avg_tokens = max(result.average_tokens for result in candidate_results)
    token_range = max_avg_tokens - min_avg_tokens

    for result in candidate_results:
        # Min-max scale token usage: cheapest candidate -> 1.0, most expensive -> 0.0.
        # When all candidates use the same number of tokens, everyone gets 1.0.
        if token_range <= 0:
            result.token_efficiency_score = 1.0
        else:
            result.token_efficiency_score = 1 - (
                (result.average_tokens - min_avg_tokens) / token_range
            )

        hard_score = result.hard_score
        judge_score = result.judge_score
        token_score = result.token_efficiency_score
        result.composite_score = (
            0.6 * hard_score
            + judge_weight * judge_score
            + 0.1 * token_score
        )

    # Hard gate: anything more than 0.2 below the best hard score is flagged
    # and, because the flag is the first sort key, ordered after all others.
    best_hard_score = max(result.hard_score for result in candidate_results)
    hard_gate_threshold = max(0.0, best_hard_score - 0.2)
    for result in candidate_results:
        result.hard_gate_failed = result.hard_score < hard_gate_threshold

    ordered = sorted(
        candidate_results,
        key=lambda result: (
            result.hard_gate_failed,
            -result.hard_score,
            -result.judge_score,
            -result.token_efficiency_score,
            -result.composite_score,
            result.candidate_id,
        ),
    )

    ranked_results: list[RankedSkillResult] = []
    for index, result in enumerate(ordered, start=1):
        result.skill.metadata.candidate_id = result.candidate_id
        summary = None
        if result.judge_evaluations:
            summaries = [item.summary for item in result.judge_evaluations if item.summary]
            summary = summaries[0] if summaries else None
        margin = None
        # ``index`` is 1-based, so ``ordered[index]`` is the next-ranked candidate.
        if index < len(ordered):
            margin = result.composite_score - ordered[index].composite_score
        ranked_results.append(
            RankedSkillResult(
                rank=index,
                candidate=result,
                judge_model=judge_model,
                judge_summary=summary,
                score_margin_from_next=margin,
            )
        )

    return RankedSkillBatch(
        task=task,
        skill_generation_model=skill_generation_model,
        evaluation_model=evaluation_model,
        judge_model=judge_model,
        judge_strategy=judge_strategy,
        candidate_count=len(candidate_results),
        ranked_results=ranked_results,
        tests=tests,
    )


async def evaluate_skill_candidates(
    task: str,
    candidates: list[Skill],
    test_cases: list[TestCase],
    evaluator: LlmAgent,
    judge: LlmAgent | None,
    *,
    skill_generation_model: str | None = None,
    evaluation_model: str,
    judge_model: str | None,
    judge_strategy: str = "pointwise",
    judge_weight: float = 0.3,
) -> RankedSkillBatch:
    """Evaluate and rank multiple candidate skills.

    Args:
        task: Task description passed to the judge and recorded on the batch.
        candidates: Candidate skills, evaluated sequentially.
        test_cases: Test cases run against every candidate.
        evaluator: Agent used to execute the test cases.
        judge: Optional judge agent; when ``None`` every judge score is 0.0.
        skill_generation_model: Recorded on the batch; falls back to
            ``evaluation_model`` when not provided.
        evaluation_model: Model used for evaluation.
        judge_model: Optional model override for the judge.
        judge_strategy: Only ``"pointwise"`` is accepted.
        judge_weight: Weight of the judge score in the composite score.

    Returns:
        The ranked batch produced by ``rank_candidate_results``.

    Raises:
        ValueError: If ``judge_strategy`` is not ``"pointwise"``.
    """

    if judge_strategy != "pointwise":
        raise ValueError("Only pointwise judge strategy is supported in v1.")

    candidate_results: list[CandidateEvalResult] = []
    for index, skill in enumerate(candidates, start=1):
        eval_results = await evaluate_skill(
            skill,
            test_cases=test_cases,
            evaluator=evaluator,
            model=evaluation_model,
            run_baseline=False,
            show_baseline_progress=False,
        )
        assertions_passed, assertions_total, avg_tokens, avg_turns = summarize_test_results(
            eval_results.with_skill_results
        )
        hard_score = (
            assertions_passed / assertions_total if assertions_total else 0.0
        )

        judge_evaluations: list[JudgeEvaluation] = []
        if judge is not None:
            for test_index, test_result in enumerate(eval_results.with_skill_results, start=1):
                judge_evaluations.append(
                    await judge_test_result(
                        task,
                        skill,
                        test_result,
                        judge,
                        judge_model=judge_model,
                        instance_name=(
                            f"judge ({skill.metadata.candidate_id or index} test {test_index})"
                        ),
                    )
                )

        judge_score = (
            sum(item.normalized_score for item in judge_evaluations) / len(judge_evaluations)
            if judge_evaluations
            else 0.0
        )
        candidate_id = skill.metadata.candidate_id or f"candidate-{index}"
        candidate_results.append(
            CandidateEvalResult(
                candidate_id=candidate_id,
                skill=skill,
                test_results=eval_results.with_skill_results,
                judge_evaluations=judge_evaluations,
                assertions_passed=assertions_passed,
                assertions_total=assertions_total,
                hard_score=hard_score,
                judge_score=judge_score,
                average_tokens=avg_tokens,
                average_turns=avg_turns,
            )
        )

    return rank_candidate_results(
        task,
        candidate_results,
        skill_generation_model=skill_generation_model or evaluation_model,
        evaluation_model=evaluation_model,
        judge_model=judge_model,
        judge_strategy=judge_strategy,
        tests=test_cases,
        judge_weight=judge_weight,
    )
def compose_instruction(instruction: str, skill: Skill | None) -> str:
    """Append a single skill to ``instruction``; return it untouched when no skill is given."""
    return compose_instruction_bundle(instruction, [skill]) if skill else instruction


def compose_instruction_bundle(
    instruction: str,
    skills: list[Skill],
    *,
    mounted_paths: dict[str, str] | None = None,
) -> str:
    """Build the evaluator instruction for a bundle of skills.

    The result is the base instruction, an optional "Mounted Skills" list
    (only for skills whose name appears in ``mounted_paths``), and one
    "Skill" section per skill, separated by blank lines. Empty parts are
    dropped. With no skills the instruction is returned unchanged.
    """
    if not skills:
        return instruction

    parts = [instruction]

    mounted_lines = (
        [
            f"- {item.name}: {mounted_paths[item.name]}"
            for item in skills
            if item.name in mounted_paths
        ]
        if mounted_paths
        else []
    )
    if mounted_lines:
        parts.append("## Mounted Skills\n" + "\n".join(mounted_lines))

    parts.append(
        "\n\n".join(f"## Skill: {item.name}\n\n{item.body}" for item in skills)
    )
    return "\n\n".join(filter(None, parts))
generate_skill_candidates( + task: str, + generator: AgentProtocol, + count: int, + examples: list[str] | None = None, + model: str | None = None, +) -> list[Skill]: + """Generate multiple candidate skills for the same task.""" + + candidates: list[Skill] = [] + total = max(1, count) + for index in range(total): + variant_task = ( + f"{task}\n\n" + f"Candidate variant {index + 1} of {total}. Produce a distinct but valid skill " + "that teaches the same task. Vary structure, examples, and phrasing while keeping " + "the behavior correct and practical." + ) + skill = await generate_skill( + task=variant_task, + examples=examples, + generator=generator, + model=model, + ) + skill.metadata.candidate_id = f"candidate-{index + 1}" + candidates.append(skill) + return candidates + + async def generate_tests( task: str, generator: AgentProtocol, diff --git a/src/upskill/logging.py b/src/upskill/logging.py index 59c723e..cbea94d 100644 --- a/src/upskill/logging.py +++ b/src/upskill/logging.py @@ -6,12 +6,28 @@ import json from datetime import datetime from pathlib import Path - -from fast_agent import ConversationSummary -from fast_agent.constants import FAST_AGENT_TIMING, FAST_AGENT_USAGE -from fast_agent.mcp.helpers.content_helpers import get_text - -from upskill.models import BatchSummary, ConversationStats, RunMetadata, RunResult, TestResult +from typing import Any + +try: + from fast_agent import ConversationSummary + from fast_agent.constants import FAST_AGENT_TIMING, FAST_AGENT_USAGE + from fast_agent.mcp.helpers.content_helpers import get_text +except ModuleNotFoundError: # pragma: no cover - enables unit tests without fast-agent + ConversationSummary = Any + FAST_AGENT_TIMING = "fast-agent-timing" + FAST_AGENT_USAGE = "fast-agent-usage" + + def get_text(content: object) -> str | None: + return getattr(content, "text", None) + +from upskill.models import ( + BatchSummary, + ConversationStats, + RankedSkillBatch, + RunMetadata, + RunResult, + TestResult, +) # CSV 
def write_ranking_summary(batch_folder: Path, ranking: RankedSkillBatch) -> None:
    """Write candidate ranking output to ``ranking_summary.json`` in ``batch_folder``.

    Args:
        batch_folder: Directory that receives the summary file.
        ranking: Ranking batch, serialized via ``model_dump(mode="json")``.
    """
    path = batch_folder / "ranking_summary.json"
    path.write_text(
        json.dumps(ranking.model_dump(mode="json"), indent=2),
        encoding="utf-8",
    )


def load_ranking_summary(batch_folder: Path) -> RankedSkillBatch | None:
    """Load candidate ranking output from ``ranking_summary.json``.

    Args:
        batch_folder: Directory expected to contain the summary file.

    Returns:
        The parsed ``RankedSkillBatch``, or ``None`` when the file is missing,
        is not valid JSON, or does not match the model schema.
    """
    path = batch_folder / "ranking_summary.json"
    if not path.exists():
        return None
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        # model_validate rejects a non-object JSON root (e.g. a list) with a
        # validation error; ``RankedSkillBatch(**data)`` would raise an
        # uncaught TypeError for that input instead.
        return RankedSkillBatch.model_validate(data)
    except (json.JSONDecodeError, ValueError):
        return None
return SkillRegistry.parse_manifest_text(manifest_text, path=path) diff --git a/src/upskill/models.py b/src/upskill/models.py index cf0a6ea..5e772cc 100644 --- a/src/upskill/models.py +++ b/src/upskill/models.py @@ -6,8 +6,9 @@ import re from datetime import datetime from pathlib import Path +from typing import Literal -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator class SkillMetadata(BaseModel): @@ -20,6 +21,7 @@ class SkillMetadata(BaseModel): test_pass_rate: float | None = None license: str | None = None compatibility: str | None = None + candidate_id: str | None = None class ValidationResult(BaseModel): @@ -31,6 +33,38 @@ class ValidationResult(BaseModel): metrics_count: int = 0 benchmarks_found: list[str] = Field(default_factory=list) error_message: str | None = None + details: list[str] = Field(default_factory=list) + + +class VerifierSpec(BaseModel): + """Deterministic verifier configuration for a test case.""" + + model_config = ConfigDict(extra="forbid") + + type: str + name: str | None = None + values: list[str] = Field(default_factory=list) + path: str | None = None + text: str | None = None + cmd: str | None = None + config: dict[str, str | int | float | bool] | None = None + + @field_validator("values", mode="before") + @classmethod + def coerce_values(cls, value: str | list[str] | None) -> list[str]: + if value is None: + return [] + if isinstance(value, str): + return [value] + return value + + +class CapturedArtifact(BaseModel): + """Text artifact captured from a test workspace.""" + + path: str + content: str + truncated: bool = False class ExpectedSpec(BaseModel): @@ -63,13 +97,41 @@ class TestCase(BaseModel): input: str # Task/prompt to give the agent context: TestCaseContext | None = None # Files, env vars, etc. 
- expected: ExpectedSpec # Expected output checks + expected: ExpectedSpec | None = None # Legacy expected output checks + verifiers: list[VerifierSpec] = Field(default_factory=list) # Custom validator support output_file: str | None = None # File to validate instead of agent output validator: str | None = None # Validator name (e.g., "hf_eval_yaml") validator_config: dict[str, str | int | float | bool] | None = None + @model_validator(mode="after") + def validate_expectations(self) -> TestCase: + if self.expected is None and not self.verifiers and self.validator is None: + raise ValueError( + "TestCase requires at least one of expected, verifiers, or validator." + ) + return self + + def effective_verifiers(self) -> list[VerifierSpec]: + """Return normalized verifier specs including legacy fields.""" + effective = list(self.verifiers) + if self.expected is not None and self.expected.contains: + effective.insert( + 0, + VerifierSpec(type="contains", values=self.expected.contains), + ) + if self.validator is not None: + effective.append( + VerifierSpec( + type="validator", + name=self.validator, + path=self.output_file, + config=self.validator_config, + ) + ) + return effective + @@ -324,6 +386,40 @@ class TestResult(BaseModel): # Detailed validation results (for custom validators) validation_result: ValidationResult | None = None + artifacts: list[CapturedArtifact] = Field(default_factory=list) + + +class JudgeCriterionScore(BaseModel): + """Score for a single judge rubric criterion.""" + + criterion: str + score: int = Field(..., ge=1, le=5) + rationale: str + + +class JudgeEvaluation(BaseModel): + """Structured LLM-as-a-judge evaluation for one executed test.""" + + summary: str + criteria: list[JudgeCriterionScore] = Field(default_factory=list) + + @property + def total_score(self) -> int: + """Return the summed rubric score.""" + return sum(item.score for item in self.criteria) + + @property + def max_score(self) -> int: + """Return the maximum possible 
score.""" + return len(self.criteria) * 5 + + @property + def normalized_score(self) -> float: + """Return the judge score normalized to 0-1.""" + max_score = self.max_score + if max_score == 0: + return 0.0 + return self.total_score / max_score class EvalResults(BaseModel): @@ -363,6 +459,136 @@ def is_beneficial(self) -> bool: return self.skill_lift > 0.05 or (self.skill_lift >= 0 and self.token_savings > 0.2) +class CandidateEvalResult(BaseModel): + """Evaluation data for one candidate skill.""" + + candidate_id: str + skill: Skill + test_results: list[TestResult] = Field(default_factory=list) + judge_evaluations: list[JudgeEvaluation] = Field(default_factory=list) + assertions_passed: int = 0 + assertions_total: int = 0 + hard_score: float = 0.0 + judge_score: float = 0.0 + token_efficiency_score: float = 0.0 + composite_score: float = 0.0 + hard_gate_failed: bool = False + average_tokens: float = 0.0 + average_turns: float = 0.0 + + +class RankedSkillResult(BaseModel): + """Ranked wrapper around one candidate result.""" + + rank: int + candidate: CandidateEvalResult + judge_model: str | None = None + judge_summary: str | None = None + score_margin_from_next: float | None = None + + +class RankedSkillBatch(BaseModel): + """Full ranking output for one candidate generation batch.""" + + task: str + skill_generation_model: str + evaluation_model: str + judge_model: str | None = None + judge_strategy: str = "pointwise" + candidate_count: int = 0 + ranked_results: list[RankedSkillResult] = Field(default_factory=list) + tests: list[TestCase] = Field(default_factory=list) + + @property + def winner(self) -> RankedSkillResult | None: + """Return the highest-ranked candidate.""" + if not self.ranked_results: + return None + return self.ranked_results[0] + + +class ScenarioJudgeConfig(BaseModel): + """Judge configuration for a scenario.""" + + model_config = ConfigDict(extra="forbid") + + enabled: bool = False + criteria: list[str] | None = None + + +class 
EvalScenario(BaseModel): + """Scenario definition for CI evaluation.""" + + model_config = ConfigDict(extra="forbid") + + id: str = Field(..., min_length=1) + skills: list[str] = Field(default_factory=list) + tests: str + judge: ScenarioJudgeConfig | None = None + include_baseline: bool = False + + +class EvalManifest(BaseModel): + """Top-level CI manifest.""" + + model_config = ConfigDict(extra="forbid") + + scenarios: list[EvalScenario] = Field(default_factory=list) + + +class ScenarioVariantResult(BaseModel): + """Aggregate result for one scenario variant.""" + + variant_id: str + variant_type: Literal["bundle", "ablation", "baseline"] + skills: list[str] = Field(default_factory=list) + omitted_skill: str | None = None + passed: bool + assertions_passed: int = 0 + assertions_total: int = 0 + hard_score: float = 0.0 + judge_score: float | None = None + judge_summary: str | None = None + total_tokens: int = 0 + average_turns: float = 0.0 + run_folder: str | None = None + + +class ScenarioContribution(BaseModel): + """Contribution delta for leaving one skill out of a bundle.""" + + skill: str + hard_score_delta: float = 0.0 + judge_score_delta: float | None = None + passed_without_skill: bool = False + + +class ScenarioReport(BaseModel): + """Report for one selected scenario.""" + + scenario_id: str + skills: list[str] = Field(default_factory=list) + tests_path: str + passed: bool + bundle: ScenarioVariantResult + ablations: list[ScenarioVariantResult] = Field(default_factory=list) + baseline: ScenarioVariantResult | None = None + contributions: list[ScenarioContribution] = Field(default_factory=list) + + +class CiReport(BaseModel): + """Machine-readable report for a CI evaluation run.""" + + manifest_path: str + scope: str + base_ref: str | None = None + changed_files: list[str] = Field(default_factory=list) + changed_skills: list[str] = Field(default_factory=list) + selected_scenarios: list[str] = Field(default_factory=list) + success: bool = True + scenarios: 
list[ScenarioReport] = Field(default_factory=list) + + # Run logging models (similar to skills-test) @@ -391,6 +617,16 @@ class RunResult(BaseModel): # For plot command: distinguish baseline vs with-skill runs run_type: str = "with_skill" # "with_skill" | "baseline" skill_name: str | None = None # Name of the skill being evaluated + judge_model: str | None = None + judge_score: float | None = None + judge_summary: str | None = None + candidate_id: str | None = None + rank: int | None = None + scenario_id: str | None = None + variant_id: str | None = None + variant_type: str | None = None + skills: list[str] = Field(default_factory=list) + omitted_skill: str | None = None class BatchSummary(BaseModel): diff --git a/src/upskill/verifiers.py b/src/upskill/verifiers.py new file mode 100644 index 0000000..ef49528 --- /dev/null +++ b/src/upskill/verifiers.py @@ -0,0 +1,238 @@ +"""Deterministic verifier execution for upskill test cases.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +from upskill.models import TestCase, ValidationResult, VerifierSpec +from upskill.validators import get_validator + +DEFAULT_COMMAND_TIMEOUT_SECONDS = 60 +MAX_COMMAND_OUTPUT_CHARS = 1200 + + +def _build_validation_result( + passed: bool, + *, + error_message: str | None = None, + details: list[str] | None = None, +) -> ValidationResult: + return ValidationResult( + passed=passed, + assertions_passed=1 if passed else 0, + assertions_total=1, + error_message=error_message, + details=details or [], + ) + + +def _format_command_failure(output: str) -> str: + compact = output.strip() + if len(compact) > MAX_COMMAND_OUTPUT_CHARS: + compact = compact[:MAX_COMMAND_OUTPUT_CHARS].rstrip() + "..." 
+ return compact or "command exited with a non-zero status" + + +def _resolve_values(spec: VerifierSpec) -> list[str]: + if spec.values: + return spec.values + if spec.text: + return [spec.text] + return [] + + +def _run_contains_verifier(spec: VerifierSpec, output: str) -> ValidationResult: + required = [value for value in _resolve_values(spec) if value.strip()] + if not required: + return _build_validation_result(False, error_message="contains verifier is missing values") + + output_lower = output.lower() + missing = [item for item in required if item.lower() not in output_lower] + if missing: + return _build_validation_result( + False, + error_message=f"missing required output text: {missing[0]}", + details=[f"missing: {item}" for item in missing], + ) + return _build_validation_result(True) + + +def _require_workspace(spec: VerifierSpec, workspace: Path | None) -> ValidationResult | None: + if workspace is not None: + return None + return _build_validation_result( + False, + error_message=f"{spec.type} verifier requires a workspace", + ) + + +def _run_file_exists_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + if not spec.path: + return _build_validation_result(False, error_message="file_exists verifier is missing path") + + target = workspace / spec.path + if target.exists(): + return _build_validation_result(True) + return _build_validation_result( + False, + error_message=f"expected file does not exist: {spec.path}", + ) + + +def _run_file_contains_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + if not spec.path: + return _build_validation_result( + False, + error_message="file_contains verifier is missing path", + ) + + target = workspace / spec.path + if not target.exists(): 
+ return _build_validation_result( + False, + error_message=f"expected file does not exist: {spec.path}", + ) + + required = [value for value in _resolve_values(spec) if value.strip()] + if not required: + return _build_validation_result( + False, + error_message="file_contains verifier is missing text or values", + ) + + content = target.read_text(encoding="utf-8") + content_lower = content.lower() + missing = [item for item in required if item.lower() not in content_lower] + if missing: + return _build_validation_result( + False, + error_message=f"missing required file text: {missing[0]}", + details=[f"missing: {item}" for item in missing], + ) + return _build_validation_result(True) + + +def _run_command_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + if not spec.cmd: + return _build_validation_result(False, error_message="command verifier is missing cmd") + + timeout_seconds = DEFAULT_COMMAND_TIMEOUT_SECONDS + if spec.config and "timeout_seconds" in spec.config: + timeout_seconds = int(spec.config["timeout_seconds"]) + + completed = subprocess.run( + spec.cmd, + shell=True, + cwd=workspace, + text=True, + capture_output=True, + timeout=timeout_seconds, + check=False, + ) + if completed.returncode == 0: + return _build_validation_result(True) + + combined_output = "\n".join( + part for part in (completed.stdout, completed.stderr) if part + ) + return _build_validation_result( + False, + error_message=_format_command_failure(combined_output), + ) + + +def _run_legacy_validator_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult: + workspace_error = _require_workspace(spec, workspace) + if workspace_error is not None: + return workspace_error + if not spec.name: + return _build_validation_result(False, error_message="validator verifier is missing name") + + validator = get_validator(spec.name) + if 
validator is None: + return _build_validation_result( + False, + error_message=f"unknown validator: {spec.name}", + ) + + config = spec.config or {} + return validator( + workspace=workspace, + output_file=spec.path or "", + **config, + ) + + +def run_verifier( + spec: VerifierSpec, + *, + output: str, + workspace: Path | None, +) -> ValidationResult: + """Run one verifier against the current output/workspace.""" + + if spec.type == "contains": + return _run_contains_verifier(spec, output) + if spec.type == "file_exists": + return _run_file_exists_verifier(spec, workspace) + if spec.type == "file_contains": + return _run_file_contains_verifier(spec, workspace) + if spec.type == "command": + return _run_command_verifier(spec, workspace) + if spec.type == "validator": + return _run_legacy_validator_verifier(spec, workspace) + + return _build_validation_result( + False, + error_message=f"unsupported verifier type: {spec.type}", + ) + + +def run_verifiers( + test_case: TestCase, + *, + output: str, + workspace: Path | None, +) -> ValidationResult: + """Run all verifiers configured for a test case.""" + + specs = test_case.effective_verifiers() + if not specs: + return ValidationResult( + passed=False, + assertions_passed=0, + assertions_total=0, + error_message="no verifiers configured", + ) + + passed = 0 + total = 0 + details: list[str] = [] + error_messages: list[str] = [] + + for spec in specs: + result = run_verifier(spec, output=output, workspace=workspace) + passed += result.assertions_passed + total += result.assertions_total + if result.error_message: + error_messages.append(result.error_message) + if result.details: + details.extend(result.details) + + return ValidationResult( + passed=passed == total, + assertions_passed=passed, + assertions_total=total, + error_message="; ".join(error_messages) if error_messages else None, + details=details, + ) diff --git a/tests/fixtures/ci_action_repo/.upskill/evals.yaml 
b/tests/fixtures/ci_action_repo/.upskill/evals.yaml new file mode 100644 index 0000000..52da53d --- /dev/null +++ b/tests/fixtures/ci_action_repo/.upskill/evals.yaml @@ -0,0 +1,5 @@ +scenarios: + - id: fixture-scenario + skills: + - skills/example-skill + tests: evals/example.yaml diff --git a/tests/fixtures/ci_action_repo/evals/example.yaml b/tests/fixtures/ci_action_repo/evals/example.yaml new file mode 100644 index 0000000..031a2eb --- /dev/null +++ b/tests/fixtures/ci_action_repo/evals/example.yaml @@ -0,0 +1,6 @@ +cases: + - input: "Say fixture" + expected: + contains: + - fixture + - response diff --git a/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md new file mode 100644 index 0000000..70b1902 --- /dev/null +++ b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md @@ -0,0 +1,6 @@ +--- +name: example-skill +description: Example fixture skill for action smoke tests +--- + +Respond with the fixture response. 
diff --git a/tests/test_ci.py b/tests/test_ci.py new file mode 100644 index 0000000..35f9e20 --- /dev/null +++ b/tests/test_ci.py @@ -0,0 +1,298 @@ +from __future__ import annotations + +import asyncio +from pathlib import Path + +from click.testing import CliRunner + +from upskill.ci import ( + load_eval_manifest, + render_ci_report_markdown, + run_ci_suite, + select_scenarios, + write_ci_report, +) +from upskill.cli import main +from upskill.models import JudgeCriterionScore, JudgeEvaluation + + +def _write_skill(path: Path, name: str, description: str, body: str) -> None: + path.mkdir(parents=True, exist_ok=True) + (path / "SKILL.md").write_text( + "\n".join( + [ + "---", + f"name: {name}", + f"description: {description}", + "---", + "", + body, + "", + ] + ), + encoding="utf-8", + ) + + +def _write_fixture_repo(root: Path) -> Path: + skills_dir = root / "skills" + _write_skill( + skills_dir / "alpha-skill", + "alpha-skill", + "alpha helper", + "Alpha bundle helper.", + ) + _write_skill( + skills_dir / "beta-skill", + "beta-skill", + "beta helper", + "Beta bundle helper.", + ) + + scripts_dir = root / "scripts" + scripts_dir.mkdir(parents=True, exist_ok=True) + (scripts_dir / "assert_report.py").write_text( + "\n".join( + [ + "from __future__ import annotations", + "", + "from pathlib import Path", + "import sys", + "", + "target = Path(sys.argv[1])", + "content = target.read_text(encoding='utf-8').strip()", + "if content != 'bundle ok':", + " raise SystemExit(f'unexpected content: {content}')", + ] + ), + encoding="utf-8", + ) + + evals_dir = root / "evals" + evals_dir.mkdir(parents=True, exist_ok=True) + (evals_dir / "bundle.yaml").write_text( + "\n".join( + [ + "cases:", + " - input: write the report", + " output_file: report.txt", + " verifiers:", + " - type: file_exists", + " path: report.txt", + " - type: command", + " cmd: python scripts/assert_report.py report.txt", + ] + ), + encoding="utf-8", + ) + + manifest_dir = root / ".upskill" + 
manifest_dir.mkdir(parents=True, exist_ok=True) + manifest_path = manifest_dir / "evals.yaml" + manifest_path.write_text( + "\n".join( + [ + "scenarios:", + " - id: bundle-scenario", + " skills:", + " - skills/alpha-skill", + " - skills/beta-skill", + " tests: evals/bundle.yaml", + " judge:", + " enabled: true", + ] + ), + encoding="utf-8", + ) + return manifest_path + + +class _FakeEvaluatorClone: + shell_runtime_enabled = True + + def __init__(self, parent: _FakeEvaluatorAgent) -> None: + self._parent = parent + self.instruction = "" + self.message_history: list[object] = [] + self.workspace: Path | None = None + + def enable_shell(self, working_directory: Path) -> None: + self.workspace = Path(working_directory) + + def set_instruction(self, instruction: str) -> None: + self.instruction = instruction + + async def set_model(self, model: str) -> None: + self._parent.model = model + + async def send(self, user_content: str) -> str: + assert self.workspace is not None + alpha_path = self.workspace / "skills" / "alpha-skill" / "SKILL.md" + beta_path = self.workspace / "skills" / "beta-skill" / "SKILL.md" + assert alpha_path.exists() + assert beta_path.exists() + + has_alpha = "Alpha bundle helper." in self.instruction + has_beta = "Beta bundle helper." 
in self.instruction
+        if has_alpha and has_beta:
+            content = "bundle ok"
+        elif has_alpha:
+            content = "alpha only"
+        elif has_beta:
+            content = "beta only"
+        else:
+            content = "baseline"
+
+        (self.workspace / "report.txt").write_text(content, encoding="utf-8")
+        return f"{user_content}\n{content}"
+
+    async def shutdown(self) -> None:
+        return None
+
+
+class _FakeEvaluatorAgent:
+    def __init__(self) -> None:
+        self.instruction = "Base evaluator instruction"
+        self.model: str | None = None
+
+    async def set_model(self, model: str) -> None:
+        self.model = model
+
+    async def spawn_detached_instance(self, name: str | None = None) -> _FakeEvaluatorClone:
+        return _FakeEvaluatorClone(self)  # `name` is accepted but ignored
+
+
+class _FakeJudgeClone:
+    def __init__(self, parent: _FakeJudgeAgent) -> None:
+        self._parent = parent
+
+    async def set_model(self, model: str) -> None:
+        self._parent.model = model  # stored on the parent agent, not on the clone
+
+    async def structured(self, prompt: str, schema: type[JudgeEvaluation]):
+        score = 5 if "bundle ok" in prompt else 2  # high score only for "bundle ok"
+        result = schema(
+            summary="strong" if score == 5 else "weak",
+            criteria=[
+                JudgeCriterionScore(
+                    criterion=criterion,
+                    score=score,
+                    rationale="test rationale",
+                )
+                for criterion in (
+                    "instruction_quality",
+                    "helpfulness",
+                    "robustness",
+                    "concision",
+                    "generalizability",
+                )
+            ],
+        )
+        return result, None  # second tuple element is always None
+
+    async def shutdown(self) -> None:
+        return None
+
+
+class _FakeJudgeAgent:
+    def __init__(self) -> None:
+        self.model: str | None = None
+
+    async def spawn_detached_instance(self, name: str | None = None) -> _FakeJudgeClone:
+        return _FakeJudgeClone(self)  # `name` is accepted but ignored
+
+
+def test_manifest_selection_uses_changed_skills(tmp_path) -> None:
+    manifest_path = _write_fixture_repo(tmp_path)
+    manifest = load_eval_manifest(manifest_path)
+
+    selected = select_scenarios(
+        manifest,
+        scope="changed",
+        changed_skills=["skills/beta-skill"],
+    )
+
+    assert [scenario.id for scenario in selected] == ["bundle-scenario"]
+
+
+def test_run_ci_suite_executes_bundle_and_ablations(tmp_path) -> None:
+    manifest_path = _write_fixture_repo(tmp_path)
+    report = asyncio.run(
+        run_ci_suite(
+            manifest_path,
+            evaluator=_FakeEvaluatorAgent(),
+            judge=_FakeJudgeAgent(),
+            scope="all",
+            eval_model="haiku",
+            judge_model="judge-mini",
+            working_dir=tmp_path,
+            runs_dir=tmp_path / "runs",
+        )
+    )
+
+    assert report.success is True
+    assert report.selected_scenarios == ["bundle-scenario"]
+    assert len(report.scenarios) == 1
+
+    scenario = report.scenarios[0]
+    assert scenario.bundle.passed is True
+    assert scenario.bundle.judge_score is not None
+    assert len(scenario.ablations) == 2  # presumably one ablation per bundled skill
+    assert all(item.passed is False for item in scenario.ablations)
+    assert {item.skill for item in scenario.contributions} == {
+        "skills/alpha-skill",
+        "skills/beta-skill",
+    }
+    assert all(item.hard_score_delta > 0 for item in scenario.contributions)
+
+    markdown = render_ci_report_markdown(report)
+    assert "bundle-scenario" in markdown
+    assert "without-alpha-skill" in markdown
+
+    report_path = tmp_path / "report.json"
+    write_ci_report(report_path, report)
+    assert report_path.exists()
+
+
+def test_ci_cli_forwards_options(monkeypatch, tmp_path) -> None:
+    captured: dict[str, object] = {}
+    manifest_path = tmp_path / "evals.yaml"
+    manifest_path.write_text("scenarios: []\n", encoding="utf-8")
+
+    async def _fake_ci_async(**kwargs):
+        captured.update(kwargs)  # record the keyword arguments the CLI forwards
+
+    monkeypatch.setattr("upskill.cli._ci_async", _fake_ci_async)
+
+    runner = CliRunner()
+    result = runner.invoke(
+        main,
+        [
+            "ci",
+            "--manifest",
+            str(manifest_path),
+            "--scope",
+            "all",
+            "--base-ref",
+            "origin/release",
+            "--eval-model",
+            "haiku",
+            "--judge-model",
+            "judge-mini",
+            "--summary-json",
+            "report.json",
+            "--runs-dir",
+            "runs",
+            "--fail-on-no-scenarios",  # asserted as True below
+        ],
+    )
+
+    assert result.exit_code == 0, result.output
+    assert captured["manifest_path"] == str(manifest_path)
+    assert captured["scope"] == "all"
+    assert captured["base_ref"] == "origin/release"
+    assert captured["eval_model"] == "haiku"
+    assert captured["judge_model"] == "judge-mini"
+    assert captured["summary_json"] == "report.json"
+    assert captured["runs_dir"] == "runs"
+    assert captured["fail_on_no_scenarios"] is True
diff --git a/tests/test_config.py b/tests/test_config.py
index 2ec6e1f..b8c53fb 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -46,3 +46,23 @@ def test_config_save_uses_env_override_path_when_file_is_missing(tmp_path, monke
         saved = yaml.safe_load(f) or {}
 
     assert saved["skill_generation_model"] == "haiku"
+
+
+def test_effective_judge_model_falls_back_to_eval_then_generation_model() -> None:
+    config = Config(skill_generation_model="sonnet", eval_model="haiku")  # judge_model not set
+
+    assert config.effective_judge_model == "haiku"
+
+    config = Config(skill_generation_model="sonnet", eval_model=None, judge_model=None)
+
+    assert config.effective_judge_model == "sonnet"
+
+
+def test_effective_judge_model_prefers_explicit_judge_model() -> None:
+    config = Config(
+        skill_generation_model="sonnet",
+        eval_model="haiku",
+        judge_model="opus",
+    )
+
+    assert config.effective_judge_model == "opus"
diff --git a/tests/test_ranking.py b/tests/test_ranking.py
new file mode 100644
index 0000000..32af450
--- /dev/null
+++ b/tests/test_ranking.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+from click.testing import CliRunner
+
+from upskill.cli import main
+from upskill.evaluate import build_default_judge_evaluation, rank_candidate_results
+from upskill.logging import load_ranking_summary, write_ranking_summary
+from upskill.models import (
+    CandidateEvalResult,
+    JudgeCriterionScore,
+    JudgeEvaluation,
+    RankedSkillBatch,
+    Skill,
+)
+
+
+def _make_skill(name: str) -> Skill:
+    return Skill(name=name, description=f"{name} desc", body=f"# {name}")
+
+
+def _judge_eval(score: int, summary: str) -> JudgeEvaluation:
+    return JudgeEvaluation(
+        summary=summary,
+        criteria=[
+            JudgeCriterionScore(
+                criterion=criterion,
+                score=score,  # the same score is applied to every criterion
+                rationale=f"{criterion} rationale",
+            )
+            for criterion in (
+                "instruction_quality",
+                "helpfulness",
+                "robustness",
+                "concision",
+                "generalizability",
+            )
+        ],
+    )
+
+
+def test_rank_candidate_results_prefers_hard_score_over_judge_score() -> None:
+    strong = CandidateEvalResult(
+        candidate_id="candidate-1",
+        skill=_make_skill("strong-skill"),
+        assertions_passed=9,
+        assertions_total=10,
+        hard_score=0.9,
+        judge_score=0.4,
+        average_tokens=200,
+    )
+    flashy = CandidateEvalResult(
+        candidate_id="candidate-2",
+        skill=_make_skill("flashy-skill"),
+        assertions_passed=6,
+        assertions_total=10,
+        hard_score=0.6,
+        judge_score=1.0,
+        average_tokens=100,
+    )
+
+    ranking = rank_candidate_results(
+        "task",
+        [flashy, strong],  # the lower hard_score candidate is passed first
+        skill_generation_model="sonnet",
+        evaluation_model="sonnet",
+        judge_model="haiku",
+        judge_strategy="pointwise",
+        tests=[],
+    )
+
+    assert ranking.winner is not None
+    assert ranking.winner.candidate.candidate_id == "candidate-1"
+    assert ranking.ranked_results[1].candidate.hard_gate_failed is True  # second-ranked entry
+
+
+def test_build_default_judge_evaluation_is_neutral() -> None:
+    result = build_default_judge_evaluation("fallback")
+
+    assert result.summary == "fallback"
+    assert len(result.criteria) == 5
+    assert result.total_score == 15  # presumably 3 points on each of the 5 criteria
+    assert result.normalized_score == 0.6  # 15 / 25 if each criterion is out of 5; TODO confirm
+
+
+def test_ranking_summary_round_trip(tmp_path) -> None:
+    ranking = RankedSkillBatch(
+        task="task",
+        skill_generation_model="sonnet",
+        evaluation_model="haiku",
+        judge_model="haiku",
+        candidate_count=1,
+    )
+
+    write_ranking_summary(tmp_path, ranking)
+    loaded = load_ranking_summary(tmp_path)
+
+    assert loaded is not None
+    assert loaded.model_dump(mode="json") == ranking.model_dump(mode="json")
+
+
+def test_generate_cli_forwards_judge_options(monkeypatch, tmp_path) -> None:
+    captured: dict[str, object] = {}
+
+    async def _fake_generate_async(*args):  # unpacking below needs exactly 15 positionals
+        (
+            task,
+            examples,
+            from_skill,
+            from_trace,
+            model,
+            test_gen_model,
+            output,
+            no_eval,
+            eval_model,
+            candidates,
+            judge_model,
+            rank_with_judge,
+            judge_strategy,
+            runs_dir,
+            log_runs,
+        ) = args
+        captured.update(
+            {
+                "task": task,
+                "examples": examples,
+                "from_skill": from_skill,
+                "from_trace": from_trace,
+                "model": model,
+                "test_gen_model": test_gen_model,
+                "output": output,
+                "no_eval": no_eval,
+                "eval_model": eval_model,
+                "candidates": candidates,
+                "judge_model": judge_model,
+                "rank_with_judge": rank_with_judge,
+                "judge_strategy": judge_strategy,
+                "runs_dir": runs_dir,
+                "log_runs": log_runs,
+            }
+        )
+
+    monkeypatch.setattr("upskill.cli._generate_async", _fake_generate_async)
+
+    runner = CliRunner()
+    result = runner.invoke(
+        main,
+        [
+            "generate",
+            "rank skills",
+            "--candidates",
+            "3",
+            "--judge-model",
+            "haiku",
+            "--rank-with-judge",
+            "--judge-strategy",
+            "pointwise",
+            "--runs-dir",
+            str(tmp_path),
+            "--no-log-runs",  # asserted as log_runs is False below
+        ],
+    )
+
+    assert result.exit_code == 0, result.output
+    assert captured["task"] == "rank skills"
+    assert captured["candidates"] == 3
+    assert captured["judge_model"] == "haiku"
+    assert captured["rank_with_judge"] is True
+    assert captured["judge_strategy"] == "pointwise"
+    assert captured["log_runs"] is False
diff --git a/tests/test_verifiers.py b/tests/test_verifiers.py
new file mode 100644
index 0000000..78fc7f6
--- /dev/null
+++ b/tests/test_verifiers.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from upskill.models import TestCase as UpskillTestCase
+from upskill.models import ValidationResult
+from upskill.validators import register_validator
+from upskill.verifiers import run_verifiers
+
+
+@register_validator("test-counting-validator")
+def _test_counting_validator(
+    workspace: Path,
+    output_file: str,
+    **_: object,  # extra keyword arguments are accepted and ignored
+) -> ValidationResult:
+    target = workspace / output_file
+    passed = target.exists()
+    return ValidationResult(
+        passed=passed,
+        assertions_passed=2 if passed else 0,  # all-or-nothing: 2 of 2 or 0 of 2
+        assertions_total=2,
+        error_message=None if passed else f"missing file: {output_file}",
+    )
+
+
+def test_run_verifiers_supports_legacy_expected_contains() -> None:
+    test_case = UpskillTestCase(
+        input="say hello",
+        expected={"contains": ["hello", "world"]},  # two substrings, one assertion (see below)
+    )
+
+    result = run_verifiers(test_case, output="Hello, world!", workspace=None)
+
+    assert result.passed is True
+    assert result.assertions_passed == 1
+    assert result.assertions_total == 1
+
+
+def test_run_verifiers_supports_file_verifiers(tmp_path) -> None:
+    target = tmp_path / "report.txt"
+    target.write_text("bundle ok", encoding="utf-8")
+    test_case = UpskillTestCase(
+        input="write file",
+        verifiers=[
+            {"type": "file_exists", "path": "report.txt"},
+            {"type": "file_contains", "path": "report.txt", "text": "bundle ok"},
+        ],
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is True
+    assert result.assertions_passed == 2
+    assert result.assertions_total == 2
+
+
+def test_run_verifiers_supports_command_verifier(tmp_path) -> None:
+    script = tmp_path / "check.py"
+    script.write_text("print('ok')\n", encoding="utf-8")
+    test_case = UpskillTestCase(
+        input="run assertion script",
+        verifiers=[{"type": "command", "cmd": "python check.py"}],  # assumes `python` on PATH
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is True
+    assert result.assertions_passed == 1
+
+
+def test_run_verifiers_translates_legacy_validator(tmp_path) -> None:
+    target = tmp_path / "artifact.txt"
+    target.write_text("ok", encoding="utf-8")
+    test_case = UpskillTestCase(
+        input="validate artifact",
+        validator="test-counting-validator",  # name registered by the decorator above
+        output_file="artifact.txt",
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is True
+    assert result.assertions_passed == 2
+    assert result.assertions_total == 2
+
+
+def test_run_verifiers_reports_failures(tmp_path) -> None:
+    test_case = UpskillTestCase(
+        input="write report",
+        verifiers=[
+            {"type": "file_exists", "path": "report.txt"},  # never created in this test
+            {"type": "command", "cmd": "python -c 'import sys; sys.exit(1)'"},  # exits 1
+        ],
+    )
+
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is False
+    assert result.assertions_passed == 0
+    assert result.assertions_total == 2
+    assert result.error_message is not None