diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 81945b4..c8bcadd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,3 +40,20 @@ jobs:
 
       - name: Run tests
         run: uv run pytest -v
+
+  action-smoke:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run local upskill action against fixture repo
+        uses: ./  # the composite action defined by action.yml at the repository root
+        with:
+          working-directory: tests/fixtures/ci_action_repo
+          scope: changed
+          base-ref: HEAD  # HEAD...HEAD is an empty diff, so no scenario is selected or executed
+          summary-json: upskill-report.json
+          fail-on-no-scenarios: "false"
+
+      - name: Verify smoke report exists
+        run: test -f tests/fixtures/ci_action_repo/upskill-report.json
diff --git a/README.md b/README.md
index 24e9088..3529ed6 100644
--- a/README.md
+++ b/README.md
@@ -216,6 +216,71 @@ sonnet
 ]
 ```
 
+### `upskill ci`
+
+Run scenario-based CI evaluation for changed or declared skill bundles.
+
+```bash
+upskill ci [OPTIONS]
+```
+
+**Options:**
+- `--manifest PATH` - Scenario manifest (default: `./.upskill/evals.yaml`)
+- `--scope [changed|all]` - Run only impacted scenarios or the full suite
+- `--base-ref REF` - Base ref for changed-skill selection (default: `origin/main`)
+- `--eval-model MODEL` - Evaluator model override
+- `--judge-model MODEL` - Judge model override
+- `--summary-json PATH` - Output path for the machine-readable report
+- `--runs-dir PATH` - Directory for run artifacts
+- `--fail-on-no-scenarios / --no-fail-on-no-scenarios` - Whether to exit non-zero when no scenarios are selected
+
+**Scenario manifest example:**
+
+```yaml
+scenarios:
+  - id: hf-model-card-readme
+    skills:
+      - skills/hugging-face-evaluation-manager
+      - skills/hf-cli
+    tests: evals/hf-model-card-readme.yaml
+    judge:
+      enabled: true
+```
+
+**Test suite example:**
+
+```yaml
+cases:
+  - input: "Read README and write olmo_7b_evaluations.yaml"
+    output_file: olmo_7b_evaluations.yaml
+    verifiers:
+      - type: file_exists
+        path: olmo_7b_evaluations.yaml
+      - type: command
+        cmd: python test_eval_assertions.py
+```
+
+The CI command runs the full declared bundle, then leave-one-out ablations for each
+contributing skill. Deterministic verifiers gate pass/fail; judge scoring is advisory.
+
+## GitHub Action
+
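+Use the reusable action from another repository after `actions/checkout`. With `scope: changed`, the checkout must contain `base-ref` and enough history to diff against it (for example `fetch-depth: 0`):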
+
+```yaml
+- uses: huggingface/upskill@vX
+  with:
+    working-directory: .
+    manifest-path: .upskill/evals.yaml
+    scope: changed
+    base-ref: origin/main
+    eval-model: haiku
+    judge-model: openai.gpt-4.1-mini
+```
+
+The action installs `upskill` from the tagged action source, writes `upskill-report.json`
+by default, and uploads the JSON report plus run artifacts.
+
 ### `upskill list`
 
 List all generated skills in a tree view.
diff --git a/action.yml b/action.yml
new file mode 100644
index 0000000..fc846a9
--- /dev/null
+++ b/action.yml
@@ -0,0 +1,119 @@
+name: upskill-ci
+description: Run upskill scenario-based CI evaluation for changed or declared skill bundles.
+
+inputs:
+  manifest-path:
+    description: Path to the upskill scenario manifest, relative to the working directory.
+    required: false
+    default: .upskill/evals.yaml
+  scope:
+    description: Run only changed scenarios or the entire manifest.
+    required: false
+    default: changed  # the CLI documents the accepted values as changed|all
+  base-ref:
+    description: Git base ref used when scope is changed.
+    required: false
+    default: origin/main
+  eval-model:
+    description: Model used for evaluator execution.
+    required: false  # no default: --eval-model is only passed when this is non-empty
+  judge-model:
+    description: Model used for advisory judge scoring.
+    required: false  # no default: --judge-model is only passed when this is non-empty
+  working-directory:
+    description: Repository directory where the manifest and skills live.
+    required: false
+    default: .
+  runs-dir:
+    description: Directory, relative to the working directory, for run artifacts.
+    required: false
+    default: runs
+  summary-json:
+    description: JSON report path, relative to the working directory.
+    required: false
+    default: upskill-report.json
+  fail-on-no-scenarios:
+    description: Exit non-zero when no scenarios are selected.
+    required: false
+    default: "false"  # compared as a string: only the exact value "true" turns the failure on
+  upload-artifacts:
+    description: Upload the JSON report and run artifacts.
+    required: false
+    default: "true"  # compared as a string: only the exact value 'true' turns the uploads on
+
+outputs:
+  summary-json:  # the working-directory input joined with the summary-json input
+    description: Path to the generated JSON report.
+    value: ${{ steps.paths.outputs.summary_json }}
+  runs-dir:  # the working-directory input joined with the runs-dir input
+    description: Path to the generated run artifacts directory.
+    value: ${{ steps.paths.outputs.runs_dir }}
+
+runs:
+  using: composite
+  steps:
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.13"
+
+    - name: Install uv
+      uses: astral-sh/setup-uv@v5
+
+    - name: Resolve output paths  # publishes workdir-prefixed paths as step outputs
+      id: paths
+      shell: bash  # NOTE(review): inputs are substituted into the script text; consider env vars
+      run: |
+        set -euo pipefail
+        workdir="${{ inputs.working-directory }}"
+        summary_json="${workdir}/${{ inputs.summary-json }}"
+        runs_dir="${workdir}/${{ inputs.runs-dir }}"
+        echo "summary_json=${summary_json}" >> "$GITHUB_OUTPUT"
+        echo "runs_dir=${runs_dir}" >> "$GITHUB_OUTPUT"
+
+    - name: Install upskill from action source  # installs from the action's own checkout path
+      shell: bash
+      run: |
+        set -euo pipefail
+        uv pip install --system "${{ github.action_path }}"
+
+    - name: Run upskill ci
+      shell: bash  # NOTE(review): inputs are substituted into the script text; consider env vars
+      working-directory: ${{ inputs.working-directory }}
+      run: |
+        set -euo pipefail
+        args=(
+          --manifest "${{ inputs.manifest-path }}"
+          --scope "${{ inputs.scope }}"
+          --base-ref "${{ inputs.base-ref }}"
+          --runs-dir "${{ inputs.runs-dir }}"
+          --summary-json "${{ inputs.summary-json }}"
+        )
+        if [[ -n "${{ inputs.eval-model }}" ]]; then
+          args+=(--eval-model "${{ inputs.eval-model }}")
+        fi
+        if [[ -n "${{ inputs.judge-model }}" ]]; then
+          args+=(--judge-model "${{ inputs.judge-model }}")
+        fi
+        if [[ "${{ inputs.fail-on-no-scenarios }}" == "true" ]]; then
+          args+=(--fail-on-no-scenarios)
+        else
+          args+=(--no-fail-on-no-scenarios)
+        fi
+        upskill ci "${args[@]}"
+
+    - name: Upload CI report  # always() below: attempted even when an earlier step failed
+      if: ${{ always() && inputs.upload-artifacts == 'true' }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: upskill-report
+        path: ${{ steps.paths.outputs.summary_json }}
+        if-no-files-found: error  # a missing report fails this step
+
+    - name: Upload run artifacts  # always() below: attempted even when an earlier step failed
+      if: ${{ always() && inputs.upload-artifacts == 'true' }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: upskill-runs
+        path: ${{ steps.paths.outputs.runs_dir }}
+        if-no-files-found: warn  # a missing runs directory only produces a warning
diff --git a/pyproject.toml b/pyproject.toml
index 160bcec..e10e254 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,3 +40,6 @@ target-version = "py313"
 
 [tool.ruff.lint]
 select = ["E", "F", "I", "UP"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/src/upskill/__init__.py b/src/upskill/__init__.py
index 40b0c50..4b9030b 100644
--- a/src/upskill/__init__.py
+++ b/src/upskill/__init__.py
@@ -2,6 +2,7 @@
 
 __version__ = "0.2.0"
 
+from upskill.ci import load_eval_manifest, run_ci_suite
 from upskill.config import Config
 from upskill.evaluate import evaluate_skill
 from upskill.generate import generate_skill, generate_tests, refine_skill
@@ -16,14 +17,18 @@
 )
 from upskill.models import (
     BatchSummary,
+    CiReport,
     ConversationStats,
+    EvalManifest,
     EvalResults,
+    EvalScenario,
     RunMetadata,
     RunResult,
     Skill,
     SkillMetadata,
     TestCase,
     TestResult,
+    VerifierSpec,
 )
 
 __all__ = [
@@ -39,12 +44,18 @@
     "RunResult",
     "ConversationStats",
     "BatchSummary",
+    "VerifierSpec",
+    "EvalScenario",
+    "EvalManifest",
+    "CiReport",
     # Generation
     "generate_skill",
     "generate_tests",
     "refine_skill",
     # Evaluation
     "evaluate_skill",
+    "run_ci_suite",
+    "load_eval_manifest",
     # Logging
     "create_batch_folder",
     "create_run_folder",
diff --git a/src/upskill/agent_cards/judge.md b/src/upskill/agent_cards/judge.md
new file mode 100644
index 0000000..65766b5
--- /dev/null
+++ b/src/upskill/agent_cards/judge.md
@@ -0,0 +1,13 @@
+---
+type: agent
+description: Judge executed skill candidates with a structured rubric.
+---
+You are an expert judge for AI agent skills.
+
+Score the executed skill candidate using the provided rubric only.
+
+Return structured output with:
+- a short summary
+- one entry for each criterion
+- integer scores from 1 to 5
+- concise rationales grounded in the provided test case, output, and validation result
diff --git a/src/upskill/ci.py b/src/upskill/ci.py
new file mode 100644
index 0000000..24e5d95
--- /dev/null
+++ b/src/upskill/ci.py
@@ -0,0 +1,534 @@
+"""Scenario-based CI evaluation for upskill."""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+from pathlib import Path
+
+import yaml
+
+from upskill.evaluate import judge_test_result, run_test_with_skills, summarize_test_results
+from upskill.logging import (
+    aggregate_conversation_stats,
+    create_batch_folder,
+    create_run_folder,
+    write_batch_summary,
+    write_run_metadata,
+    write_run_result,
+)
+from upskill.models import (
+    BatchSummary,
+    CiReport,
+    EvalManifest,
+    EvalScenario,
+    RunMetadata,
+    RunResult,
+    ScenarioContribution,
+    ScenarioReport,
+    ScenarioVariantResult,
+    Skill,
+    TestCase,
+)
+
+
+def _normalize_relative_path(path: Path, root: Path) -> str:  # POSIX-style path string
+    try:
+        return path.resolve().relative_to(root.resolve()).as_posix()
+    except ValueError:  # path is not under root: fall back to the absolute path
+        return path.resolve().as_posix()
+
+
+def load_eval_manifest(path: Path) -> EvalManifest:
+    """Load a CI manifest (JSON when the suffix is .json, YAML otherwise) and validate it."""
+    with open(path, encoding="utf-8") as handle:
+        if path.suffix.lower() == ".json":
+            payload = json.load(handle)
+        else:
+            payload = yaml.safe_load(handle) or {}  # falsy result (empty file -> None) becomes {}
+    return EvalManifest.model_validate(payload)
+
+
+def load_test_cases(path: Path) -> list[TestCase]:
+    """Load test cases from a JSON (.json suffix) or YAML file and validate each one."""
+    with open(path, encoding="utf-8") as handle:
+        if path.suffix.lower() == ".json":
+            payload = json.load(handle)
+        else:
+            payload = yaml.safe_load(handle) or {}
+    # A mapping with a "cases" key supplies the list; any other payload is iterated as-is.
+    cases = payload["cases"] if isinstance(payload, dict) and "cases" in payload else payload
+    return [TestCase.model_validate(item) for item in cases]
+
+
+def plan_ci_suite(
+    manifest_path: Path,
+    *,
+    scope: str = "changed",
+    base_ref: str = "origin/main",
+    working_dir: Path | None = None,
+) -> tuple[CiReport, list[EvalScenario]]:
+    """Resolve which scenarios would run and build the initial report; executes nothing."""
+    root = (working_dir or Path.cwd()).resolve()  # defaults to the current directory
+    manifest = load_eval_manifest(manifest_path)
+
+    changed_files: list[str] = []
+    changed_skills: list[str] = []
+    if scope == "changed":  # git is only consulted for changed-scope runs
+        changed_files = resolve_changed_files(base_ref=base_ref, working_dir=root)
+        changed_skills = resolve_changed_skill_dirs(changed_files, working_dir=root)
+
+    selected_scenarios = select_scenarios(
+        manifest,
+        scope=scope,
+        changed_skills=changed_skills,
+    )
+
+    return (
+        CiReport(
+            manifest_path=_normalize_relative_path(manifest_path, root),
+            scope=scope,
+            base_ref=base_ref if scope == "changed" else None,  # irrelevant for other scopes
+            changed_files=changed_files,
+            changed_skills=changed_skills,
+            selected_scenarios=[scenario.id for scenario in selected_scenarios],
+            success=True,  # provisional; run_ci_suite recomputes it once scenarios have run
+        ),
+        selected_scenarios,
+    )
+
+
+def resolve_changed_files(*, base_ref: str, working_dir: Path) -> list[str]:
+    """Return files changed on HEAD since base_ref, as paths relative to working_dir."""
+    completed = subprocess.run(
+        ["git", "diff", "--name-only", "--relative", f"{base_ref}...HEAD"],
+        cwd=working_dir,  # with --relative, paths are reported relative to this directory
+        text=True,
+        capture_output=True,
+        check=False,  # the exit status is inspected below so git's message can be surfaced
+    )
+    if completed.returncode != 0:
+        error = completed.stderr.strip() or completed.stdout.strip() or "git diff failed"
+        raise RuntimeError(error)  # carries git's own error text when it produced any
+    return [line.strip() for line in completed.stdout.splitlines() if line.strip()]
+
+
+def resolve_changed_skill_dirs(changed_files: list[str], *, working_dir: Path) -> list[str]:
+    """Map changed files to their nearest ancestor directory containing SKILL.md (sorted)."""
+    changed_skills: set[str] = set()
+    root = working_dir.resolve()
+
+    for changed_file in changed_files:
+        path = (working_dir / changed_file).resolve()
+        current = path if path.is_dir() else path.parent  # start from the containing directory
+        while current != root and current != current.parent:  # root itself is never checked
+            if (current / "SKILL.md").exists():
+                changed_skills.add(current.relative_to(root).as_posix())
+                break  # nearest match wins; stop climbing
+            current = current.parent
+
+    return sorted(changed_skills)
+
+
+def select_scenarios(
+    manifest: EvalManifest,
+    *,
+    scope: str,
+    changed_skills: list[str],
+) -> list[EvalScenario]:
+    """Return every scenario for scope "all", else those listing at least one changed skill."""
+    if scope == "all":
+        return list(manifest.scenarios)  # a copy, in manifest order
+
+    changed = set(changed_skills)
+    selected = []
+    for scenario in manifest.scenarios:
+        scenario_skills = set(Path(skill).as_posix() for skill in scenario.skills)
+        if scenario_skills & changed:  # any overlap selects the scenario
+            selected.append(scenario)
+    return selected
+
+
+def write_ci_report(path: Path, report: CiReport) -> None:
+    """Serialize the report as indented JSON at path, creating parent directories first."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        json.dumps(report.model_dump(mode="json"), indent=2),
+        encoding="utf-8",
+    )
+
+
+def render_ci_report_markdown(report: CiReport) -> str:
+    """Render the report as markdown: header bullets, then variant and contribution tables."""
+    lines = [
+        "# upskill CI",
+        "",
+        f"- Scope: `{report.scope}`",
+        f"- Manifest: `{report.manifest_path}`",
+    ]
+    if report.base_ref:
+        lines.append(f"- Base ref: `{report.base_ref}`")
+    if report.changed_skills:
+        changed = ", ".join(f"`{item}`" for item in report.changed_skills)
+        lines.append(f"- Changed skills: {changed}")
+    if not report.scenarios:  # no per-scenario sections to render
+        lines.extend(["", "No scenarios were selected."])
+        return "\n".join(lines)
+
+    for scenario in report.scenarios:
+        lines.extend(
+            [
+                "",
+                f"## {scenario.scenario_id}",
+                "",
+                "| Variant | Skills | Pass | Assertions | Judge | Tokens |",
+                "| --- | --- | --- | --- | --- | --- |",
+            ]
+        )
+        variants = [scenario.bundle, *scenario.ablations]  # bundle first, then ablations
+        if scenario.baseline is not None:
+            variants.append(scenario.baseline)  # baseline row, when present, comes last
+        for variant in variants:
+            judge_value = f"{variant.judge_score:.2f}" if variant.judge_score is not None else "n/a"
+            lines.append(
+                "| "
+                f"{variant.variant_id} | "
+                f"{', '.join(variant.skills) or '(none)'} | "
+                f"{'PASS' if variant.passed else 'FAIL'} | "
+                f"{variant.assertions_passed}/{variant.assertions_total} | "
+                f"{judge_value} | "
+                f"{variant.total_tokens} |"
+            )
+        if scenario.contributions:  # second table: one row per skill contribution
+            lines.extend(
+                [
+                    "",
+                    "| Skill | Hard Delta | Judge Delta | Passed Without Skill |",
+                    "| --- | --- | --- | --- |",
+                ]
+            )
+            for contribution in scenario.contributions:
+                judge_delta = (
+                    f"{contribution.judge_score_delta:+.2f}"
+                    if contribution.judge_score_delta is not None
+                    else "n/a"
+                )
+                lines.append(
+                    "| "
+                    f"{contribution.skill} | "
+                    f"{contribution.hard_score_delta:+.2f} | "
+                    f"{judge_delta} | "
+                    f"{'yes' if contribution.passed_without_skill else 'no'} |"
+                )
+
+    return "\n".join(lines)
+
+
+def write_step_summary(report: CiReport) -> None:
+    """Append the markdown summary to the file named by GITHUB_STEP_SUMMARY, if it is set."""
+    summary_path = os.getenv("GITHUB_STEP_SUMMARY")
+    if not summary_path:  # variable unset or empty: nothing to write
+        return
+    with open(summary_path, "a", encoding="utf-8") as handle:  # append, never truncate
+        handle.write(render_ci_report_markdown(report))
+        handle.write("\n")
+
+
+async def _evaluate_variant(
+    *,
+    scenario: EvalScenario,
+    variant_id: str,
+    variant_type: str,
+    skills: list[tuple[str, Path, Skill]],  # (label, path, loaded skill) per entry
+    omitted_skill: str | None,
+    test_cases: list[TestCase],
+    evaluator,
+    judge,
+    eval_model: str | None,
+    judge_model: str | None,
+    working_dir: Path,
+    judge_enabled: bool,
+    judge_criteria: list[str] | None,
+    batch_id: str,
+    batch_folder: Path | None,
+    run_number: int,
+) -> tuple[ScenarioVariantResult, RunResult]:  # (report entry, run-log record)
+    test_results = []
+    mounted_skills = [(path, skill) for _, path, skill in skills]  # label dropped
+    bundle_skills = [skill for _, _, skill in skills]
+    skill_labels = [label for label, _, _ in skills]
+
+    for test_index, test_case in enumerate(test_cases, start=1):  # awaited one at a time
+        test_results.append(
+            await run_test_with_skills(
+                test_case,
+                evaluator,
+                bundle_skills,
+                model=eval_model,
+                instance_name=(
+                    f"ci ({scenario.id} {variant_id} test {test_index})"
+                ),
+                seed_dir=working_dir,
+                mounted_skills=mounted_skills,
+            )
+        )
+
+    assertions_passed, assertions_total, avg_tokens, avg_turns = summarize_test_results(
+        test_results
+    )
+    passed = all(result.success for result in test_results)  # True when there are no tests
+    hard_score = assertions_passed / assertions_total if assertions_total else 0.0
+
+    judge_score = None
+    judge_summary = None
+    if judge is not None and judge_enabled and passed:  # judge only variants that passed
+        judge_results = []
+        for test_index, test_result in enumerate(test_results, start=1):
+            judge_results.append(
+                await judge_test_result(
+                    scenario.id,
+                    bundle_skills,
+                    test_result,
+                    judge,
+                    judge_model=judge_model,
+                    criteria=judge_criteria,
+                    instance_name=(
+                        f"judge ({scenario.id} {variant_id} test {test_index})"
+                    ),
+                )
+            )
+        if judge_results:
+            judge_score = sum(item.normalized_score for item in judge_results) / len(judge_results)
+            judge_summary = judge_results[0].summary  # only the first test's summary is kept
+
+    aggregated_stats = aggregate_conversation_stats(test_results)
+
+    run_folder_path: Path | None = None
+    if batch_folder is not None:  # logging enabled: build the record and write it to disk
+        run_folder_path = create_run_folder(batch_folder, run_number)
+        run_result = RunResult(
+            metadata=RunMetadata(
+                model=eval_model or "",
+                task=scenario.id,
+                batch_id=batch_id,
+                run_number=run_number,
+            ),
+            stats=aggregated_stats,
+            passed=passed,
+            assertions_passed=assertions_passed,
+            assertions_total=assertions_total,
+            run_type="baseline" if variant_type == "baseline" else "with_skill",
+            skill_name=scenario.id,
+            judge_model=judge_model,
+            judge_score=judge_score,
+            judge_summary=judge_summary,
+            scenario_id=scenario.id,
+            variant_id=variant_id,
+            variant_type=variant_type,
+            skills=skill_labels,
+            omitted_skill=omitted_skill,
+        )
+        write_run_metadata(run_folder_path, run_result.metadata)
+        write_run_result(run_folder_path, run_result)
+    else:  # same record as above, but nothing is written to disk
+        run_result = RunResult(
+            metadata=RunMetadata(
+                model=eval_model or "",
+                task=scenario.id,
+                batch_id=batch_id,
+                run_number=run_number,
+            ),
+            stats=aggregated_stats,
+            passed=passed,
+            assertions_passed=assertions_passed,
+            assertions_total=assertions_total,
+            run_type="baseline" if variant_type == "baseline" else "with_skill",
+            skill_name=scenario.id,
+            judge_model=judge_model,
+            judge_score=judge_score,
+            judge_summary=judge_summary,
+            scenario_id=scenario.id,
+            variant_id=variant_id,
+            variant_type=variant_type,
+            skills=skill_labels,
+            omitted_skill=omitted_skill,
+        )
+
+    return (
+        ScenarioVariantResult(
+            variant_id=variant_id,
+            variant_type=variant_type,  # type: ignore[arg-type]
+            skills=skill_labels,
+            omitted_skill=omitted_skill,
+            passed=passed,
+            assertions_passed=assertions_passed,
+            assertions_total=assertions_total,
+            hard_score=hard_score,
+            judge_score=judge_score,
+            judge_summary=judge_summary,
+            total_tokens=aggregated_stats.total_tokens,
+            average_turns=avg_turns,
+            run_folder=str(run_folder_path) if run_folder_path is not None else None,
+        ),
+        run_result,
+    )
+
+
+async def run_ci_suite(
+    manifest_path: Path,
+    *,
+    evaluator,
+    judge=None,
+    scope: str = "changed",
+    base_ref: str = "origin/main",
+    eval_model: str | None = None,
+    judge_model: str | None = None,
+    working_dir: Path | None = None,
+    runs_dir: Path | None = None,
+) -> CiReport:
+    """Run bundle, leave-one-out and optional baseline variants for each selected scenario."""
+    root = (working_dir or Path.cwd()).resolve()
+    report, selected_scenarios = plan_ci_suite(
+        manifest_path,
+        scope=scope,
+        base_ref=base_ref,
+        working_dir=root,
+    )
+
+    if not selected_scenarios:  # nothing selected: return the plan-only report unchanged
+        return report
+
+    batch_id = ""
+    batch_folder: Path | None = None
+    all_run_results: list[RunResult] = []
+    if runs_dir is not None:  # run artifacts are only written when a runs directory is given
+        batch_id, batch_folder = create_batch_folder(runs_dir)
+
+    run_number = 0  # incremented before every variant, across all scenarios
+    for scenario in selected_scenarios:
+        tests_path = (root / scenario.tests).resolve()
+        test_cases = load_test_cases(tests_path)
+        loaded_skills = []  # (posix label, absolute path, loaded Skill) per declared skill
+        for skill_path in scenario.skills:
+            absolute_skill_path = (root / skill_path).resolve()
+            loaded_skills.append(
+                (
+                    Path(skill_path).as_posix(),
+                    absolute_skill_path,
+                    Skill.load(absolute_skill_path),
+                )
+            )
+
+        judge_enabled = bool(scenario.judge and scenario.judge.enabled)
+        judge_criteria = scenario.judge.criteria if scenario.judge else None
+
+        run_number += 1
+        bundle_result, bundle_run = await _evaluate_variant(
+            scenario=scenario,
+            variant_id="bundle",
+            variant_type="bundle",
+            skills=loaded_skills,
+            omitted_skill=None,
+            test_cases=test_cases,
+            evaluator=evaluator,
+            judge=judge,
+            eval_model=eval_model,
+            judge_model=judge_model,
+            working_dir=root,
+            judge_enabled=judge_enabled,
+            judge_criteria=judge_criteria,
+            batch_id=batch_id,
+            batch_folder=batch_folder,
+            run_number=run_number,
+        )
+        all_run_results.append(bundle_run)
+
+        ablation_results: list[ScenarioVariantResult] = []
+        contributions: list[ScenarioContribution] = []
+        for skill_label, _, _ in loaded_skills:  # one ablation run per declared skill
+            remaining = [item for item in loaded_skills if item[0] != skill_label]
+            run_number += 1
+            ablation_result, ablation_run = await _evaluate_variant(
+                scenario=scenario,
+                variant_id=f"without-{Path(skill_label).name}",
+                variant_type="ablation",
+                skills=remaining,
+                omitted_skill=skill_label,
+                test_cases=test_cases,
+                evaluator=evaluator,
+                judge=judge,
+                eval_model=eval_model,
+                judge_model=judge_model,
+                working_dir=root,
+                judge_enabled=judge_enabled,
+                judge_criteria=judge_criteria,
+                batch_id=batch_id,
+                batch_folder=batch_folder,
+                run_number=run_number,
+            )
+            all_run_results.append(ablation_run)
+            ablation_results.append(ablation_result)
+            contributions.append(
+                ScenarioContribution(
+                    skill=skill_label,
+                    hard_score_delta=bundle_result.hard_score - ablation_result.hard_score,
+                    judge_score_delta=(
+                        None
+                        if bundle_result.judge_score is None or ablation_result.judge_score is None
+                        else bundle_result.judge_score - ablation_result.judge_score
+                    ),
+                    passed_without_skill=ablation_result.passed,
+                )
+            )
+
+        baseline_result = None
+        if scenario.include_baseline:  # optional run with no skills at all
+            run_number += 1
+            baseline_result, baseline_run = await _evaluate_variant(
+                scenario=scenario,
+                variant_id="baseline",
+                variant_type="baseline",
+                skills=[],
+                omitted_skill=None,
+                test_cases=test_cases,
+                evaluator=evaluator,
+                judge=judge,
+                eval_model=eval_model,
+                judge_model=judge_model,
+                working_dir=root,
+                judge_enabled=judge_enabled,
+                judge_criteria=judge_criteria,
+                batch_id=batch_id,
+                batch_folder=batch_folder,
+                run_number=run_number,
+            )
+            all_run_results.append(baseline_run)
+
+        report.scenarios.append(
+            ScenarioReport(
+                scenario_id=scenario.id,
+                skills=[label for label, _, _ in loaded_skills],
+                tests_path=_normalize_relative_path(tests_path, root),
+                passed=bundle_result.passed,  # only the full bundle decides the scenario verdict
+                bundle=bundle_result,
+                ablations=ablation_results,
+                baseline=baseline_result,
+                contributions=contributions,
+            )
+        )
+
+    report.success = all(item.passed for item in report.scenarios)
+
+    if batch_folder is not None:
+        summary = BatchSummary(
+            batch_id=batch_id,
+            model=eval_model or "",
+            task="upskill-ci",
+            total_runs=len(all_run_results),  # counts bundle, ablation and baseline runs
+            passed_runs=sum(1 for item in all_run_results if item.passed),
+            results=all_run_results,
+        )
+        write_batch_summary(batch_folder, summary)
+
+    return report
diff --git a/src/upskill/cli.py b/src/upskill/cli.py
index 1a9d63c..cb4ac18 100644
--- a/src/upskill/cli.py
+++ b/src/upskill/cli.py
@@ -13,15 +13,32 @@
 
 import click
 from dotenv import load_dotenv
-from fast_agent import FastAgent
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
 from rich.tree import Tree
 
+try:
+    from fast_agent import FastAgent
+except ModuleNotFoundError:  # pragma: no cover - enables unit tests without fast-agent
+    FastAgent = None  # type: ignore[assignment]
+
+from upskill.ci import (
+    plan_ci_suite,
+    render_ci_report_markdown,
+    run_ci_suite,
+    write_ci_report,
+    write_step_summary,
+)
 from upskill.config import Config, resolve_upskill_config_path
-from upskill.evaluate import evaluate_skill, get_failure_descriptions
-from upskill.generate import generate_skill, generate_tests, improve_skill, refine_skill
+from upskill.evaluate import evaluate_skill, evaluate_skill_candidates, get_failure_descriptions
+from upskill.generate import (
+    generate_skill,
+    generate_skill_candidates,
+    generate_tests,
+    improve_skill,
+    refine_skill,
+)
 from upskill.logging import (
     aggregate_conversation_stats,
     create_batch_folder,
@@ -30,12 +47,14 @@
     load_run_result,
     summarize_runs_to_csv,
     write_batch_summary,
+    write_ranking_summary,
     write_run_metadata,
     write_run_result,
 )
 from upskill.model_resolution import ResolvedModels, resolve_models
 from upskill.models import (
     BatchSummary,
+    RankedSkillBatch,
     RunMetadata,
     RunResult,
     Skill,
@@ -51,6 +70,8 @@
 @asynccontextmanager
 async def _fast_agent_context(config: Config | None = None) -> AsyncIterator[object]:
     config = config or Config.load()
+    if FastAgent is None:
+        raise RuntimeError("fast-agent-mcp is required to run upskill commands.")
     fast = FastAgent(
         "upskill",
         config_path=str(config.effective_fastagent_config),
@@ -136,6 +157,36 @@ def _render_bar(value: float, width: int = 20) -> str:
     return "█" * filled + "░" * empty
 
 
+def _render_ranking_summary(ranking: RankedSkillBatch) -> None:
+    """Print the candidate count, up to three ranked candidates, and the winner if any."""
+    if not ranking.ranked_results:  # nothing ranked: print nothing
+        return
+
+    top_results = ranking.ranked_results[:3]  # at most the first three entries
+    console.print()
+    console.print(f"  candidates ranked: {ranking.candidate_count}")
+    for ranked in top_results:
+        candidate = ranked.candidate
+        margin = ""
+        if ranked.score_margin_from_next is not None:
+            margin = f"  margin {ranked.score_margin_from_next:+.3f}"
+        console.print(  # NOTE(review): the format specs assume every score is a number, not None
+            f"  #{ranked.rank} {candidate.candidate_id}  "
+            f"hard {candidate.hard_score:.0%}  "
+            f"judge {candidate.judge_score:.0%}  "
+            f"tokens {candidate.token_efficiency_score:.0%}  "
+            f"composite {candidate.composite_score:.3f}{margin}"
+        )
+
+    winner = ranking.winner
+    if winner is not None:
+        console.print()
+        console.print(
+            f"  winner: [bold]{winner.candidate.candidate_id}[/bold]"
+            f" ({winner.candidate.skill.name})"
+        )
+
+
 class EvalPlotResult(TypedDict):
     """Structured plot data for eval runs."""
 
@@ -258,6 +309,22 @@ def main():
 @click.option("-o", "--output", type=click.Path(), help="Output directory for skill")
 @click.option("--no-eval", is_flag=True, help="Skip eval and refinement")
 @click.option("--eval-model", help="Optional extra cross-model eval pass after generation")
+@click.option(
+    "--candidates",
+    type=int,
+    help="Number of candidate skills to generate before ranking",
+)
+@click.option("--judge-model", help="Model to use for LLM-as-a-judge ranking")
+@click.option(
+    "--rank-with-judge/--no-rank-with-judge",
+    default=None,
+    help="Enable judge-based candidate ranking (defaults on when candidates > 1)",
+)
+@click.option(
+    "--judge-strategy",
+    type=click.Choice(["pointwise", "pairwise"]),
+    help="Judge ranking strategy",
+)
 @click.option("--runs-dir", type=click.Path(), help="Directory for run logs (default: ./runs)")
 @click.option("--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)")
 def generate(
@@ -270,6 +337,10 @@ def generate(
     output: str | None,
     no_eval: bool,
     eval_model: str | None,
+    candidates: int | None,
+    judge_model: str | None,
+    rank_with_judge: bool | None,
+    judge_strategy: str | None,
     runs_dir: str | None,
     log_runs: bool,
 ):
@@ -324,6 +395,10 @@ def generate(
             output,
             no_eval,
             eval_model,
+            candidates,
+            judge_model,
+            rank_with_judge,
+            judge_strategy,
             runs_dir,
             log_runs,
         )
@@ -340,10 +415,17 @@ async def _generate_async(
     output: str | None,
     no_eval: bool,
     eval_model: str | None,
+    candidates: int | None,
+    judge_model: str | None,
+    rank_with_judge: bool | None,
+    judge_strategy: str | None,
     runs_dir: str | None,
     log_runs: bool,
 ):
     """Async implementation of generate command."""
+    if candidates is not None and candidates < 1:
+        raise click.ClickException("--candidates must be at least 1.")
+
     config = Config.load()
     resolved = resolve_models(
         "generate",
@@ -363,6 +445,14 @@ async def _generate_async(
         command="generate",
     )
     extra_eval_model = resolved.extra_eval_model
+    candidate_count = candidates or config.default_candidate_count
+    resolved_judge_model = judge_model or config.effective_judge_model
+    resolved_judge_strategy = judge_strategy or config.judge_strategy
+    resolved_rank_with_judge = rank_with_judge
+    if resolved_rank_with_judge is None:
+        resolved_rank_with_judge = candidate_count > 1
+    if resolved_judge_strategy != "pointwise" and resolved_rank_with_judge:
+        raise click.ClickException("Only --judge-strategy pointwise is supported in v1.")
 
     _print_model_plan("generate", resolved)
 
@@ -377,6 +467,8 @@ async def _generate_async(
         console.print(f"Logging runs to: {batch_folder}", style="dim")
 
     async with _fast_agent_context(config) as agent:
+        generation_task = task
+        existing_skill: Skill | None = None  # the from_trace branch never assigns it
         # Generate from trace file
         if from_trace:
             console.print(f"Generating skill from trace: {from_trace}", style="dim")
@@ -395,234 +487,342 @@ async def _generate_async(
                 # Plain text, markdown, etc.
                 trace_context = trace_content[:4000]
 
-            task = f"{task}\n\nBased on this agent trace:\n\n{trace_context}"
-            console.print(f"Generating skill with {skill_gen_model}...", style="dim")
-            await _set_agent_model(agent.skill_gen, skill_gen_model)
-            skill = await generate_skill(
-                task=task,
-                examples=examples,
-                generator=agent.skill_gen,
-                model=skill_gen_model,
-            )
+            generation_task = f"{task}\n\nBased on this agent trace:\n\n{trace_context}"
         # Improve existing skill
         elif from_skill:
             existing_skill = Skill.load(Path(from_skill))
-            console.print(
-                f"Improving [bold]{existing_skill.name}[/bold] with {skill_gen_model}...",
-                style="dim",
-            )
-            await _set_agent_model(agent.skill_gen, skill_gen_model)
-            skill = await improve_skill(
-                existing_skill,
-                instructions=task,
-                generator=agent.skill_gen,
-                model=skill_gen_model,
-            )
         else:
-            console.print(f"Generating skill with {skill_gen_model}...", style="dim")
-            await _set_agent_model(agent.skill_gen, skill_gen_model)
-            skill = await generate_skill(
-                task=task,
-                examples=examples,
-                generator=agent.skill_gen,
-                model=skill_gen_model,
-            )
-        if no_eval:
-            _save_and_display(skill, output, config)
-            return
-
-        console.print("Generating test cases...", style="dim")
-        await _set_agent_model(agent.test_gen, test_gen_model)
-        test_cases = await generate_tests(task, generator=agent.test_gen, model=test_gen_model)
+            existing_skill = None
 
-        # Eval loop with refinement (on skill generation model)
-        prev_success_rate = 0.0
+        await _set_agent_model(agent.skill_gen, skill_gen_model)
+        ranking: RankedSkillBatch | None = None
+        skill: Skill | None = None
         results = None
-        attempts = max(1, config.max_refine_attempts)
-        for attempt in range(attempts):
+        eval_results = None
+
+        if candidate_count > 1:
             console.print(
-                f"Evaluating on {skill_gen_model}... (attempt {attempt + 1})",
+                f"Generating {candidate_count} candidate skills with {skill_gen_model}...",
                 style="dim",
             )
-
-            # Create run folder for logging (2 folders per attempt: baseline + with_skill)
-            run_folder = None
-            if log_runs and batch_folder:
-                baseline_run_num = attempt * 2 + 1
-                run_folder = create_run_folder(batch_folder, baseline_run_num)
-                write_run_metadata(
-                    run_folder,
-                    RunMetadata(
+            if existing_skill is not None:
+                candidates_list: list[Skill] = []
+                for index in range(candidate_count):
+                    candidate_skill = await improve_skill(
+                        existing_skill,
+                        instructions=task,
+                        generator=agent.skill_gen,
                         model=skill_gen_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=baseline_run_num,
-                    ),
+                    )
+                    candidate_skill.metadata.candidate_id = f"candidate-{index + 1}"
+                    candidates_list.append(candidate_skill)
+            else:
+                candidates_list = await generate_skill_candidates(
+                    task=generation_task,
+                    examples=examples,
+                    generator=agent.skill_gen,
+                    count=candidate_count,
+                    model=skill_gen_model,
                 )
 
-            console.print("[dim]Starting evaluation run...[/dim]")
+            if no_eval:
+                skill = candidates_list[0]
+                _save_and_display(skill, output, config)
+                return
 
-            results = await evaluate_skill(
-                skill,
-                test_cases=test_cases,
-                evaluator=agent.evaluator,
-                model=skill_gen_model,
-                show_baseline_progress=False,
+            console.print("Generating shared test cases...", style="dim")
+            await _set_agent_model(agent.test_gen, test_gen_model)
+            test_cases = await generate_tests(
+                generation_task,
+                generator=agent.test_gen,
+                model=test_gen_model,
             )
 
-            # Log run results (both baseline and with-skill for plot command)
-            if log_runs and run_folder:
-                # Log baseline result
-                baseline_result = RunResult(
-                    metadata=RunMetadata(
-                        model=skill_gen_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=baseline_run_num,
-                    ),
-                    stats=aggregate_conversation_stats(results.baseline_results),
-                    passed=results.baseline_success_rate > 0.5,
-                    assertions_passed=int(results.baseline_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="baseline",
-                    skill_name=skill.name,
-                )
-                write_run_result(run_folder, baseline_result)
-                run_results.append(baseline_result)
+            judge_agent = agent.judge if resolved_rank_with_judge else None
+            ranking = await evaluate_skill_candidates(
+                generation_task,
+                candidates_list,
+                test_cases,
+                evaluator=agent.evaluator,
+                judge=judge_agent,
+                skill_generation_model=skill_gen_model,
+                evaluation_model=skill_gen_model,
+                judge_model=resolved_judge_model if resolved_rank_with_judge else None,
+                judge_strategy=resolved_judge_strategy,
+                judge_weight=config.judge_weight,
+            )
+            winner = ranking.winner
+            if winner is None:
+                raise click.ClickException("Candidate ranking produced no winner.")
+            skill = winner.candidate.skill
+            skill.metadata.test_pass_rate = winner.candidate.hard_score
 
-                # Log with-skill result (in a separate folder)
-                with_skill_folder = create_run_folder(batch_folder, attempt * 2 + 2)
-                with_skill_result = RunResult(
-                    metadata=RunMetadata(
-                        model=skill_gen_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=attempt * 2 + 2,
-                    ),
-                    stats=aggregate_conversation_stats(results.with_skill_results),
-                    passed=results.is_beneficial,
-                    assertions_passed=int(results.with_skill_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="with_skill",
-                    skill_name=skill.name,
-                )
-                write_run_metadata(with_skill_folder, with_skill_result.metadata)
-                write_run_result(with_skill_folder, with_skill_result)
-                run_results.append(with_skill_result)
+            if log_runs and batch_folder and batch_id:
+                for ranked in ranking.ranked_results:
+                    candidate_run_folder = create_run_folder(batch_folder, len(run_results) + 1)
+                    candidate = ranked.candidate
+                    candidate_run = RunResult(
+                        metadata=RunMetadata(
+                            model=skill_gen_model,
+                            task=generation_task,
+                            batch_id=batch_id,
+                            run_number=len(run_results) + 1,
+                        ),
+                        stats=aggregate_conversation_stats(candidate.test_results),
+                        passed=not candidate.hard_gate_failed,
+                        assertions_passed=candidate.assertions_passed,
+                        assertions_total=candidate.assertions_total,
+                        run_type="with_skill",
+                        skill_name=candidate.skill.name,
+                        judge_model=ranked.judge_model,
+                        judge_score=candidate.judge_score,
+                        judge_summary=ranked.judge_summary,
+                        candidate_id=candidate.candidate_id,
+                        rank=ranked.rank,
+                    )
+                    write_run_metadata(candidate_run_folder, candidate_run.metadata)
+                    write_run_result(candidate_run_folder, candidate_run)
+                    run_results.append(candidate_run)
 
-            lift = results.skill_lift
-            lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
+                write_ranking_summary(batch_folder, ranking)
 
-            if results.is_beneficial:
+            if extra_eval_model:
+                console.print(f"Evaluating winning candidate on {extra_eval_model}...", style="dim")
+                eval_results = await evaluate_skill(
+                    skill,
+                    test_cases,
+                    evaluator=agent.evaluator,
+                    model=extra_eval_model,
+                    show_baseline_progress=False,
+                )
+        else:
+            if from_trace:
+                console.print(f"Generating skill with {skill_gen_model}...", style="dim")
+                skill = await generate_skill(
+                    task=generation_task,
+                    examples=examples,
+                    generator=agent.skill_gen,
+                    model=skill_gen_model,
+                )
+            elif existing_skill is not None:
                 console.print(
-                    f"  {results.baseline_success_rate:.0%} -> "
-                    f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]"
+                    f"Improving [bold]{existing_skill.name}[/bold] with {skill_gen_model}...",
+                    style="dim",
+                )
+                skill = await improve_skill(
+                    existing_skill,
+                    instructions=task,
+                    generator=agent.skill_gen,
+                    model=skill_gen_model,
+                )
+            else:
+                console.print(f"Generating skill with {skill_gen_model}...", style="dim")
+                skill = await generate_skill(
+                    task=generation_task,
+                    examples=examples,
+                    generator=agent.skill_gen,
+                    model=skill_gen_model,
                 )
-                break
+            if no_eval:
+                _save_and_display(skill, output, config)
+                return
 
-            console.print(
-                f"  {results.baseline_success_rate:.0%} -> "
-                f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough"
+            console.print("Generating test cases...", style="dim")
+            await _set_agent_model(agent.test_gen, test_gen_model)
+            test_cases = await generate_tests(
+                generation_task,
+                generator=agent.test_gen,
+                model=test_gen_model,
             )
 
-            if abs(results.with_skill_success_rate - prev_success_rate) < 0.05:
-                console.print("  [yellow]Plateaued, stopping[/yellow]")
-                break
+            # Eval loop with refinement (on skill generation model)
+            prev_success_rate = 0.0
+            attempts = max(1, config.max_refine_attempts)
+            for attempt in range(attempts):
+                console.print(
+                    f"Evaluating on {skill_gen_model}... (attempt {attempt + 1})",
+                    style="dim",
+                )
 
-            prev_success_rate = results.with_skill_success_rate
+                # Create run folder for logging (2 folders per attempt: baseline + with_skill)
+                run_folder = None
+                if log_runs and batch_folder:
+                    baseline_run_num = attempt * 2 + 1
+                    run_folder = create_run_folder(batch_folder, baseline_run_num)
+                    write_run_metadata(
+                        run_folder,
+                        RunMetadata(
+                            model=skill_gen_model,
+                            task=generation_task,
+                            batch_id=batch_id or "",
+                            run_number=baseline_run_num,
+                        ),
+                    )
+
+                console.print("[dim]Starting evaluation run...[/dim]")
 
-            if attempt < attempts - 1:
-                console.print("Refining...", style="dim")
-                failures = get_failure_descriptions(results)
-                await _set_agent_model(agent.skill_gen, skill_gen_model)
-                skill = await refine_skill(
+                results = await evaluate_skill(
                     skill,
-                    failures,
-                    generator=agent.skill_gen,
+                    test_cases=test_cases,
+                    evaluator=agent.evaluator,
                     model=skill_gen_model,
+                    show_baseline_progress=False,
                 )
 
-        # If eval_model specified, also eval on that model
-        eval_results = None
-        if extra_eval_model:
-            console.print(f"Evaluating on {extra_eval_model}...", style="dim")
+                # Log run results (both baseline and with-skill for plot command)
+                if log_runs and run_folder:
+                    # Log baseline result
+                    baseline_result = RunResult(
+                        metadata=RunMetadata(
+                            model=skill_gen_model,
+                            task=generation_task,
+                            batch_id=batch_id or "",
+                            run_number=baseline_run_num,
+                        ),
+                        stats=aggregate_conversation_stats(results.baseline_results),
+                        passed=results.baseline_success_rate > 0.5,
+                        assertions_passed=int(results.baseline_success_rate * len(test_cases)),
+                        assertions_total=len(test_cases),
+                        run_type="baseline",
+                        skill_name=skill.name,
+                    )
+                    write_run_result(run_folder, baseline_result)
+                    run_results.append(baseline_result)
 
-            # Create run folder for eval model
-            run_folder = None
-            if log_runs and batch_folder:
-                run_number = len(run_results) + 1
-                run_folder = create_run_folder(batch_folder, run_number)
-                write_run_metadata(
-                    run_folder,
-                    RunMetadata(
-                        model=extra_eval_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=run_number,
-                    ),
-                )
+                    # Log with-skill result (in a separate folder)
+                    with_skill_folder = create_run_folder(batch_folder, attempt * 2 + 2)
+                    with_skill_result = RunResult(
+                        metadata=RunMetadata(
+                            model=skill_gen_model,
+                            task=generation_task,
+                            batch_id=batch_id or "",
+                            run_number=attempt * 2 + 2,
+                        ),
+                        stats=aggregate_conversation_stats(results.with_skill_results),
+                        passed=results.is_beneficial,
+                        assertions_passed=int(results.with_skill_success_rate * len(test_cases)),
+                        assertions_total=len(test_cases),
+                        run_type="with_skill",
+                        skill_name=skill.name,
+                    )
+                    write_run_metadata(with_skill_folder, with_skill_result.metadata)
+                    write_run_result(with_skill_folder, with_skill_result)
+                    run_results.append(with_skill_result)
 
-            eval_results = await evaluate_skill(
-                skill,
-                test_cases,
-                evaluator=agent.evaluator,
-                model=extra_eval_model,
-                show_baseline_progress=False,
-            )
+                lift = results.skill_lift
+                lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
 
-            # Log eval run results (both baseline and with-skill)
-            if log_runs and run_folder:
-                # Log baseline result
-                baseline_result = RunResult(
-                    metadata=RunMetadata(
-                        model=extra_eval_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=run_number,
-                    ),
-                    stats=aggregate_conversation_stats(eval_results.baseline_results),
-                    passed=eval_results.baseline_success_rate > 0.5,
-                    assertions_passed=int(eval_results.baseline_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="baseline",
-                    skill_name=skill.name,
+                if results.is_beneficial:
+                    console.print(
+                        f"  {results.baseline_success_rate:.0%} -> "
+                        f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]"
+                    )
+                    break
+
+                console.print(
+                    f"  {results.baseline_success_rate:.0%} -> "
+                    f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough"
                 )
-                write_run_result(run_folder, baseline_result)
-                run_results.append(baseline_result)
 
-                # Log with-skill result
-                with_skill_folder = create_run_folder(batch_folder, run_number + 1)
-                with_skill_result = RunResult(
-                    metadata=RunMetadata(
-                        model=extra_eval_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=run_number + 1,
-                    ),
-                    stats=aggregate_conversation_stats(eval_results.with_skill_results),
-                    passed=eval_results.is_beneficial,
-                    assertions_passed=int(eval_results.with_skill_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="with_skill",
-                    skill_name=skill.name,
+                if abs(results.with_skill_success_rate - prev_success_rate) < 0.05:
+                    console.print("  [yellow]Plateaued, stopping[/yellow]")
+                    break
+
+                prev_success_rate = results.with_skill_success_rate
+
+                if attempt < attempts - 1:
+                    console.print("Refining...", style="dim")
+                    failures = get_failure_descriptions(results)
+                    await _set_agent_model(agent.skill_gen, skill_gen_model)
+                    skill = await refine_skill(
+                        skill,
+                        failures,
+                        generator=agent.skill_gen,
+                        model=skill_gen_model,
+                    )
+
+            # If eval_model specified, also eval on that model
+            if extra_eval_model:
+                console.print(f"Evaluating on {extra_eval_model}...", style="dim")
+
+                # Create run folder for eval model
+                run_folder = None
+                if log_runs and batch_folder:
+                    run_number = len(run_results) + 1
+                    run_folder = create_run_folder(batch_folder, run_number)
+                    write_run_metadata(
+                        run_folder,
+                        RunMetadata(
+                            model=extra_eval_model,
+                            task=generation_task,
+                            batch_id=batch_id or "",
+                            run_number=run_number,
+                        ),
+                    )
+
+                eval_results = await evaluate_skill(
+                    skill,
+                    test_cases,
+                    evaluator=agent.evaluator,
+                    model=extra_eval_model,
+                    show_baseline_progress=False,
                 )
-                write_run_metadata(with_skill_folder, with_skill_result.metadata)
-                write_run_result(with_skill_folder, with_skill_result)
-                run_results.append(with_skill_result)
 
-            lift = eval_results.skill_lift
-            lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
-            console.print(
-                f"  {eval_results.baseline_success_rate:.0%} -> "
-                f"{eval_results.with_skill_success_rate:.0%} ({lift_str})"
-            )
+                # Log eval run results (both baseline and with-skill)
+                if log_runs and run_folder:
+                    # Log baseline result
+                    baseline_result = RunResult(
+                        metadata=RunMetadata(
+                            model=extra_eval_model,
+                            task=generation_task,
+                            batch_id=batch_id or "",
+                            run_number=run_number,
+                        ),
+                        stats=aggregate_conversation_stats(eval_results.baseline_results),
+                        passed=eval_results.baseline_success_rate > 0.5,
+                        assertions_passed=int(eval_results.baseline_success_rate * len(test_cases)),
+                        assertions_total=len(test_cases),
+                        run_type="baseline",
+                        skill_name=skill.name,
+                    )
+                    write_run_result(run_folder, baseline_result)
+                    run_results.append(baseline_result)
+
+                    # Log with-skill result
+                    with_skill_folder = create_run_folder(batch_folder, run_number + 1)
+                    with_skill_result = RunResult(
+                        metadata=RunMetadata(
+                            model=extra_eval_model,
+                            task=generation_task,
+                            batch_id=batch_id or "",
+                            run_number=run_number + 1,
+                        ),
+                        stats=aggregate_conversation_stats(eval_results.with_skill_results),
+                        passed=eval_results.is_beneficial,
+                        assertions_passed=int(
+                            eval_results.with_skill_success_rate * len(test_cases)
+                        ),
+                        assertions_total=len(test_cases),
+                        run_type="with_skill",
+                        skill_name=skill.name,
+                    )
+                    write_run_metadata(with_skill_folder, with_skill_result.metadata)
+                    write_run_result(with_skill_folder, with_skill_result)
+                    run_results.append(with_skill_result)
+
+                lift = eval_results.skill_lift
+                lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
+                console.print(
+                    f"  {eval_results.baseline_success_rate:.0%} -> "
+                    f"{eval_results.with_skill_success_rate:.0%} ({lift_str})"
+                )
 
         # Write batch summary
         if log_runs and batch_folder and batch_id:
             summary = BatchSummary(
                 batch_id=batch_id,
                 model=skill_gen_model,
-                task=task,
+                task=generation_task,
                 total_runs=len(run_results),
                 passed_runs=sum(1 for r in run_results if r.passed),
                 results=run_results,
@@ -632,6 +832,8 @@ async def _generate_async(
     if not no_eval and skill is not None:
         if results:
             skill.metadata.test_pass_rate = results.with_skill_success_rate
+        elif ranking and ranking.winner:
+            skill.metadata.test_pass_rate = ranking.winner.candidate.hard_score
         else:
             console.print(
                 "[yellow]No evaluation results available; skipping report output.[/yellow]"
@@ -645,6 +847,7 @@ async def _generate_async(
             eval_results,
             skill_gen_model,
             extra_eval_model,
+            ranking,
         )
 
 
@@ -659,6 +862,7 @@ def _save_and_display(
     eval_results=None,
     skill_gen_model: str | None = None,
     eval_model: str | None = None,
+    ranking: RankedSkillBatch | None = None,
 ):
     """Save skill and display summary."""
     if output:
@@ -683,6 +887,9 @@ def _save_and_display(
     for name in skill.scripts:
         console.print(f"  scripts/{name}     (exec only)")
 
+    if ranking:
+        _render_ranking_summary(ranking)
+
     # Show results with horizontal bars
     if results and eval_results:
         # Multiple models - show each with bars
@@ -1178,6 +1385,137 @@ async def _eval_async(
                     console.print("\n[yellow]Recommendation: skill may not be beneficial[/yellow]")
 
 
+@main.command("ci")
+@click.option(
+    "--manifest",
+    "manifest_path",  # explicit parameter name handed to ci_cmd for --manifest
+    type=click.Path(exists=True),  # the manifest path must already exist
+    default=".upskill/evals.yaml",
+    show_default=True,
+    help="Path to scenario manifest",
+)
+@click.option(
+    "--scope",
+    type=click.Choice(["changed", "all"]),
+    default="changed",
+    show_default=True,
+    help="Whether to run only impacted scenarios or the full suite",
+)
+@click.option(
+    "--base-ref",
+    default="origin/main",
+    show_default=True,
+    help="Base ref used for changed-skill selection",
+)
+@click.option("--eval-model", help="Model to use for scenario execution")
+@click.option("--judge-model", help="Model to use for advisory judge scoring")
+@click.option(
+    "--summary-json",
+    type=click.Path(),  # no existence check: _ci_async writes the report to this path
+    help="Path for machine-readable CI report JSON",
+)
+@click.option("--runs-dir", type=click.Path(), help="Directory for run logs")
+@click.option(
+    "--fail-on-no-scenarios/--no-fail-on-no-scenarios",
+    default=False,  # an empty selection is not an error unless the flag is passed
+    help="Exit with an error when no scenarios are selected",
+)
+def ci_cmd(
+    manifest_path: str,
+    scope: str,
+    base_ref: str,
+    eval_model: str | None,
+    judge_model: str | None,
+    summary_json: str | None,
+    runs_dir: str | None,
+    fail_on_no_scenarios: bool,
+):
+    """Run scenario-based CI evaluation for impacted skills."""
+    asyncio.run(  # synchronous click entry point; all work happens in _ci_async
+        _ci_async(
+            manifest_path=manifest_path,
+            scope=scope,
+            base_ref=base_ref,
+            eval_model=eval_model,
+            judge_model=judge_model,
+            summary_json=summary_json,
+            runs_dir=runs_dir,
+            fail_on_no_scenarios=fail_on_no_scenarios,
+        )
+    )
+
+
+async def _ci_async(
+    *,
+    manifest_path: str,
+    scope: str,
+    base_ref: str,
+    eval_model: str | None,
+    judge_model: str | None,
+    summary_json: str | None,
+    runs_dir: str | None,
+    fail_on_no_scenarios: bool,
+) -> None:
+    """Plan and run the CI scenario suite, write the report, and exit 1 on failure."""
+    config = Config.load()
+    manifest = Path(manifest_path).resolve()
+    runs_path = (Path(runs_dir) if runs_dir else config.runs_dir).resolve()  # CLI value wins
+    summary_path = (  # falls back to upskill-report.json in the current directory
+        Path(summary_json).resolve() if summary_json else (Path.cwd() / "upskill-report.json")
+    )
+
+    resolved_eval_model = eval_model or config.effective_eval_model  # CLI override first
+    resolved_judge_model = judge_model or config.effective_judge_model  # CLI override first
+
+    console.print("[dim]CI model plan:[/dim]")
+    console.print(f"  Evaluation Model: {resolved_eval_model}")
+    console.print(f"  Judge Model: {resolved_judge_model}")
+    console.print(f"  Scope: {scope}")
+    if scope == "changed":  # the base ref is only reported for changed-scope runs
+        console.print(f"  Base Ref: {base_ref}")
+
+    preview_report, selected_scenarios = plan_ci_suite(  # runs before any agent context opens
+        manifest,
+        scope=scope,
+        base_ref=base_ref,
+        working_dir=Path.cwd(),
+    )
+    if not selected_scenarios:  # nothing to run: still write the preview report, then stop
+        write_ci_report(summary_path, preview_report)
+        write_step_summary(preview_report)
+        console.print()
+        console.print("[yellow]No scenarios selected.[/yellow]")
+        console.print(f"[dim]Report written to {summary_path}[/dim]")
+        if fail_on_no_scenarios:
+            sys.exit(1)  # empty selection counts as a failure only when requested
+        return
+
+    async with _fast_agent_context(config) as agent:  # entered only when scenarios were selected
+        report = await run_ci_suite(
+            manifest,
+            evaluator=agent.evaluator,
+            judge=agent.judge,
+            scope=scope,
+            base_ref=base_ref,
+            eval_model=resolved_eval_model,
+            judge_model=resolved_judge_model,
+            working_dir=Path.cwd(),
+            runs_dir=runs_path,
+        )
+
+    write_ci_report(summary_path, report)
+    write_step_summary(report)
+
+    console.print()
+    console.print(render_ci_report_markdown(report))
+    console.print()
+    console.print(f"[dim]Report written to {summary_path}[/dim]")
+    console.print(f"[dim]Runs written under {runs_path}[/dim]")
+
+    if not report.success:
+        sys.exit(1)  # non-zero exit status when the report is not successful
+
+
 @main.command("list")
 @click.option("-d", "--dir", "skills_dir", type=click.Path(), help="Skills directory to list")
 @click.option("-v", "--verbose", is_flag=True, help="Show detailed skill structure")
diff --git a/src/upskill/config.py b/src/upskill/config.py
index 01f1f5c..32950c9 100644
--- a/src/upskill/config.py
+++ b/src/upskill/config.py
@@ -120,6 +120,13 @@ class Config(BaseModel):
         default=None,
         description="Model for test generation (defaults to skill generation model)",
     )
+    judge_model: str | None = Field(
+        default=None,
+        description=(
+            "Model for LLM-as-a-judge ranking "
+            "(defaults to eval_model or skill generation model)"
+        ),
+    )
 
     # Directory settings
     skills_dir: Path = Field(
@@ -132,6 +139,12 @@ class Config(BaseModel):
     # Generation settings
     auto_eval: bool = Field(default=True, description="Run eval after generation")
     max_refine_attempts: int = Field(default=2, description="Max refinement iterations")
+    default_candidate_count: int = Field(
+        default=1,
+        description="Default number of candidate skills to generate per task",
+    )
+    judge_strategy: str = Field(default="pointwise", description="Judge ranking strategy")
+    judge_weight: float = Field(default=0.3, description="Weight for judge score in ranking")
 
     # FastAgent settings
     fastagent_config: Path | None = Field(default=None, description="Path to fastagent.config.yaml")
@@ -176,6 +189,11 @@ def effective_eval_model(self) -> str:
         """Get the model to use for evaluation."""
         return self.eval_model or self.skill_generation_model
 
+    @property
+    def effective_judge_model(self) -> str:
+        """Return ``judge_model`` when set, otherwise fall back to ``effective_eval_model``."""
+        return self.judge_model or self.effective_eval_model
+
     @property
     def model(self) -> str:
         """Backward-compatible alias for ``skill_generation_model``."""
diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py
index fdc07c4..212da22 100644
--- a/src/upskill/evaluate.py
+++ b/src/upskill/evaluate.py
@@ -9,9 +9,16 @@
 from collections.abc import Generator
 from contextlib import contextmanager, nullcontext
 from pathlib import Path
+from typing import Any
 
-from fast_agent import ConversationSummary
-from fast_agent.agents.llm_agent import LlmAgent
+try:
+    from fast_agent import ConversationSummary
+    from fast_agent.agents.llm_agent import LlmAgent
+except ModuleNotFoundError:  # pragma: no cover - enables unit tests without fast-agent
+    ConversationSummary = Any
+
+    class LlmAgent:  # type: ignore[no-redef]
+        pass
 
 try:
     from fast_agent.ui.rich_progress import progress_display
@@ -20,18 +27,23 @@
 
 from upskill.fastagent_integration import (
     compose_instruction,
+    compose_instruction_bundle,
 )
 from upskill.logging import extract_stats_from_summary
 from upskill.models import (
+    CandidateEvalResult,
+    CapturedArtifact,
     ConversationStats,
     EvalResults,
-    ExpectedSpec,
+    JudgeCriterionScore,
+    JudgeEvaluation,
+    RankedSkillBatch,
+    RankedSkillResult,
     Skill,
     TestCase,
     TestResult,
-    ValidationResult,
 )
-from upskill.validators import get_validator
+from upskill.verifiers import run_verifiers
 
 
 def _hide_progress_task(task_name: str | None) -> None:
@@ -54,9 +66,31 @@ def _hide_progress_task(task_name: str | None) -> None:
     "You need to evaluate the skill on the test case and return a score."
 )
 
+JUDGE_CRITERIA = (  # default rubric names used when no criteria are supplied
+    "instruction_quality",
+    "helpfulness",
+    "robustness",
+    "concision",
+    "generalizability",
+)
+
+MAX_CAPTURE_CHARS = 4000  # captured artifact text longer than this is truncated
+WORKSPACE_IGNORE_NAMES = {  # entry names skipped when a seed directory is copied
+    ".git",
+    ".venv",
+    ".mypy_cache",
+    ".pytest_cache",
+    "__pycache__",
+    "runs",
+}
+
 
 @contextmanager
-def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Generator[Path]:
+def isolated_workspace(
+    base_dir: Path | None = None,
+    cleanup: bool = True,
+    seed_dir: Path | None = None,
+) -> Generator[Path]:
     """Create an isolated workspace for a test run.
 
     Args:
@@ -69,6 +103,8 @@ def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Ge
     workspace = tempfile.mkdtemp(dir=base_dir, prefix="upskill_run_")
     workspace_path = Path(workspace)
     try:
+        if seed_dir is not None:
+            _seed_workspace_from_directory(seed_dir, workspace_path)
         yield workspace_path
     finally:
         if cleanup:
@@ -78,41 +114,71 @@ def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Ge
                 pass  # Ignore cleanup errors
 
 
-def check_expected(
-    output: str,
-    expected: ExpectedSpec,
-    workspace: Path | None = None,
-    test_case: TestCase | None = None,
-) -> tuple[bool, ValidationResult | None]:
-    """Check if output matches expected conditions.
+def _seed_workspace_from_directory(seed_dir: Path, workspace: Path) -> None:
+    """Populate ``workspace`` with the contents of ``seed_dir``.
+
+    Entries named in ``WORKSPACE_IGNORE_NAMES`` are skipped at the top level and,
+    through ``shutil.ignore_patterns``, at every depth inside copied directories.
+    """
+    skip_ignored = shutil.ignore_patterns(*WORKSPACE_IGNORE_NAMES)
+    for entry in seed_dir.iterdir():
+        if entry.name in WORKSPACE_IGNORE_NAMES:
+            continue
+        target = workspace / entry.name
+        if not entry.is_dir():
+            target.parent.mkdir(parents=True, exist_ok=True)
+            shutil.copy2(entry, target)
+            continue
+        shutil.copytree(entry, target, dirs_exist_ok=True, ignore=skip_ignored)
 
-    Args:
-        output: The agent's output string
-        expected: Expected conditions dict (legacy format with "contains")
-        workspace: Optional workspace directory for file-based validation
-        test_case: Optional test case with custom validator config
 
-    Returns:
-        Tuple of (success, validation_result)
-    """
-    # Handle custom validator if specified
-    if test_case and test_case.validator:
-        validator = get_validator(test_case.validator)
-        if validator and workspace:
-            config = test_case.validator_config or {}
-            result = validator(
-                workspace=workspace,
-                output_file=test_case.output_file or "",
-                **config,
+def _copy_skill_directory(source_dir: Path, destination_dir: Path) -> None:
+    """Copy a skill tree to ``destination_dir``; ``copytree`` creates parents and merges."""
+    shutil.copytree(source_dir, destination_dir, dirs_exist_ok=True)
+
+
+def _capture_artifacts(test_case: TestCase, workspace: Path | None) -> list[CapturedArtifact]:
+    """Capture ``test_case.output_file`` from the workspace as a single text artifact.
+
+    Returns ``[]`` if either is unset or the file is missing or outside the workspace.
+    """
+    if workspace is None or not test_case.output_file:
+        return []
+
+    target = workspace / test_case.output_file
+    # Absolute or ``..`` paths would expose files outside the workspace to the judge prompt.
+    if not target.resolve().is_relative_to(workspace.resolve()) or not target.is_file():
+        return []
+
+    truncated = False
+    try:
+        content = target.read_text(encoding="utf-8")
+    except UnicodeDecodeError:
+        content = "<binary artifact omitted>"
+    else:
+        truncated = len(content) > MAX_CAPTURE_CHARS
+        if truncated:
+            content = content[:MAX_CAPTURE_CHARS].rstrip() + "\n..."
+    return [
+        CapturedArtifact(
+            path=test_case.output_file,
+            content=content,
+            truncated=truncated,
+        )
-            return result.passed, result
+    ]
 
-    required = expected.contains
-    output_lower = output.lower()
-    if any(item.lower() not in output_lower for item in required):
-        return False, None
 
-    return True, None
+def _workspace_required_for_test(
+    test_case: TestCase,
+    *,
+    seed_dir: Path | None,
+    mounted_skills: list[tuple[Path, Skill]] | None,
+) -> bool:
+    """Return True when the run needs an isolated workspace directory."""
+    checks_files = bool(
+        test_case.output_file or test_case.validator or test_case.verifiers
+    )
+    return checks_files or seed_dir is not None or bool(mounted_skills)
 
 
 async def _run_test_with_evaluator(
@@ -122,6 +188,8 @@ async def _run_test_with_evaluator(
     *,
     use_workspace: bool | None = None,
     instance_name: str | None = None,
+    seed_dir: Path | None = None,
+    mounted_skills: list[tuple[Path, Skill]] | None = None,
 ) -> TestResult:
     """Run a single test case using a provided evaluator agent."""
     user_content = test_case.input
@@ -130,11 +198,25 @@ async def _run_test_with_evaluator(
             user_content += f"\n\n```{filename}\n{content}\n```"
 
     # Determine if we need workspace isolation
-    needs_workspace = use_workspace if use_workspace is not None else bool(test_case.validator)
+    needs_workspace = use_workspace if use_workspace is not None else _workspace_required_for_test(
+        test_case,
+        seed_dir=seed_dir,
+        mounted_skills=mounted_skills,
+    )
 
     async def _run_in_workspace(workspace: Path | None) -> TestResult:
         clone: LlmAgent | None = None
         try:
+            if workspace is not None and mounted_skills:
+                for source_dir, skill in mounted_skills:
+                    try:
+                        relative_path = source_dir.relative_to(seed_dir) if seed_dir else None
+                    except ValueError:
+                        relative_path = None
+                    if relative_path is None:
+                        relative_path = Path(".upskill") / "skills" / skill.name
+                    _copy_skill_directory(source_dir, workspace / relative_path)
+
             clone = await evaluator.spawn_detached_instance(name=instance_name)
             if workspace is not None:
                 enable_shell = getattr(clone, "enable_shell", None)
@@ -157,21 +239,13 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult:
             except Exception as exc:
                 logger.exception("Failed to extract stats from evaluator history", exc_info=exc)
 
-            # Check expected with custom validator support
-            if workspace and test_case.validator:
-                success, validation_result = check_expected(
-                    output or "",
-                    test_case.expected,
-                    workspace,
-                    test_case,
-                )
-            else:
-                success, validation_result = check_expected(
-                    output or "",
-                    test_case.expected,
-                    None,
-                    test_case,
-                )
+            artifacts = _capture_artifacts(test_case, workspace)
+            validation_result = run_verifiers(
+                test_case,
+                output=output or "",
+                workspace=workspace,
+            )
+            success = validation_result.passed
 
             return TestResult(
                 test_case=test_case,
@@ -181,6 +255,7 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult:
                 turns=stats.turns,
                 stats=stats,
                 validation_result=validation_result,
+                artifacts=artifacts,
             )
         except Exception as exc:
             return TestResult(test_case=test_case, success=False, error=str(exc))
@@ -193,11 +268,62 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult:
             _hide_progress_task(instance_name)
 
     if needs_workspace:
-        with isolated_workspace() as workspace:
+        with isolated_workspace(seed_dir=seed_dir) as workspace:
             return await _run_in_workspace(workspace)
     return await _run_in_workspace(None)
 
 
+async def run_test_with_skills(
+    test_case: TestCase,
+    evaluator: LlmAgent,
+    skills: list[Skill] | None = None,
+    *,
+    use_workspace: bool | None = None,
+    model: str | None = None,
+    instance_name: str | None = None,
+    seed_dir: Path | None = None,
+    mounted_skills: list[tuple[Path, Skill]] | None = None,
+) -> TestResult:
+    """Run one test case with every skill in ``skills`` injected into the evaluator.
+
+    Mounted skills are advertised at their path relative to ``seed_dir``, falling back to
+    ``.upskill/skills/<name>``. Any exception is returned as a failed ``TestResult``.
+    """
+    try:
+        if model is not None:
+            await evaluator.set_model(model)
+
+        active_skills = skills or []
+        path_by_skill: dict[str, str] = {}
+        for skill_dir, mounted in mounted_skills or []:
+            try:
+                location = skill_dir.relative_to(seed_dir) if seed_dir else None
+            except ValueError:
+                location = None
+            if location is None:
+                location = Path(".upskill") / "skills" / mounted.name
+            path_by_skill[mounted.name] = location.as_posix()
+
+        instruction = None
+        if active_skills:
+            instruction = compose_instruction_bundle(
+                evaluator.instruction,
+                active_skills,
+                mounted_paths=path_by_skill or None,
+            )
+        return await _run_test_with_evaluator(
+            test_case,
+            evaluator,
+            instruction,
+            use_workspace=use_workspace,
+            instance_name=instance_name,
+            seed_dir=seed_dir,
+            mounted_skills=mounted_skills,
+        )
+    except Exception as exc:
+        return TestResult(test_case=test_case, success=False, error=str(exc))
+
+
 async def run_test(
     test_case: TestCase,
     evaluator: LlmAgent,
@@ -218,14 +344,13 @@ async def run_test(
     """
 
     try:
-        if model is not None:
-            await evaluator.set_model(model)
-        instruction = compose_instruction(evaluator.instruction, skill) if skill else None
-        return await _run_test_with_evaluator(
+        bundle = [skill] if skill else []
+        return await run_test_with_skills(
             test_case,
             evaluator,
-            instruction,
+            bundle,
             use_workspace=use_workspace,
+            model=model,
             instance_name=instance_name,
         )
     except Exception as exc:
@@ -319,6 +444,296 @@ async def _run_batch(
     return results
 
 
+def summarize_test_results(test_results: list[TestResult]) -> tuple[int, int, float, float]:
+    """Aggregate assertion counts and mean token/turn usage over ``test_results``.
+
+    A result without a ``validation_result`` counts as a single assertion that passes
+    only when ``result.success`` is truthy. Averages are ``0.0`` for an empty list.
+    """
+    passed = total = 0
+    for item in test_results:
+        outcome = item.validation_result
+        if outcome:
+            passed += outcome.assertions_passed
+            total += outcome.assertions_total
+        else:
+            total += 1
+            passed += 1 if item.success else 0
+
+    n = len(test_results)
+    if not n:
+        return passed, total, 0.0, 0.0
+    mean_tokens = sum(item.stats.total_tokens for item in test_results) / n
+    mean_turns = sum(item.stats.turns for item in test_results) / n
+    return passed, total, mean_tokens, mean_turns
+
+
+def build_default_judge_evaluation(summary: str) -> JudgeEvaluation:
+    """Create the fallback judgement used when no judge result is available.
+
+    Every name in ``JUDGE_CRITERIA`` receives a score of 3 and a fixed rationale.
+    """
+    fallback_rationale = "Judge output unavailable; using neutral fallback score."
+    neutral_scores: list[JudgeCriterionScore] = []
+    for name in JUDGE_CRITERIA:
+        neutral_scores.append(
+            JudgeCriterionScore(criterion=name, score=3, rationale=fallback_rationale)
+        )
+
+    return JudgeEvaluation(summary=summary, criteria=neutral_scores)
+
+
+async def judge_test_result(
+    task: str,
+    skill: Skill | list[Skill],
+    test_result: TestResult,
+    judge: LlmAgent,
+    *,
+    judge_model: str | None = None,
+    criteria: tuple[str, ...] | list[str] | None = None,
+    instance_name: str | None = None,
+) -> JudgeEvaluation:
+    """Score one executed test result with a detached clone of the ``judge`` agent."""
+    # Judge errors and empty results are replaced by build_default_judge_evaluation().
+    clone: LlmAgent | None = None
+    try:
+        clone = await judge.spawn_detached_instance(name=instance_name)
+        if judge_model is not None:
+            await clone.set_model(judge_model)
+        # Accept either a single skill or a bundle of skills.
+        skill_bundle = skill if isinstance(skill, list) else [skill]
+        artifact_sections = []
+        for artifact in test_result.artifacts:
+            artifact_sections.append(
+                f"Artifact path: {artifact.path}\n"
+                f"Artifact content:\n{artifact.content}"
+            )
+        artifact_block = "\n\n".join(artifact_sections) if artifact_sections else "none"
+        rubric = tuple(criteria or JUDGE_CRITERIA)  # default rubric when none is supplied
+        skill_sections = []
+        for item in skill_bundle:
+            skill_sections.append(
+                f"Skill name: {item.name}\n"
+                f"Skill description: {item.description}\n"
+                f"Skill body:\n{item.body}"
+            )
+        verifier_payload = [
+            spec.model_dump(mode="json") for spec in test_result.test_case.effective_verifiers()
+        ]
+        validation_payload = (
+            test_result.validation_result.model_dump(mode="json")
+            if test_result.validation_result
+            else "none"
+        )
+        # The prompt carries the task, skills, test input, verifiers, output and outcome.
+        prompt = (
+            f"Original task:\n{task}\n\n"
+            f"Candidate skill bundle:\n{chr(10).join(skill_sections)}\n\n"
+            f"Test case input:\n{test_result.test_case.input}\n\n"
+            f"Verifiers:\n{verifier_payload}\n\n"
+            f"Agent output:\n{test_result.output or ''}\n\n"
+            f"Captured artifacts:\n{artifact_block}\n\n"
+            f"Execution success: {test_result.success}\n"
+            f"Execution error: {test_result.error or ''}\n"
+            f"Validation result: {validation_payload}\n\n"
+            "Score this executed candidate against the rubric. "
+            "Return structured data with exactly these criteria: "
+            f"{', '.join(rubric)}."
+        )
+        result, _ = await clone.structured(prompt, JudgeEvaluation)
+        if result is None:
+            return build_default_judge_evaluation("Judge returned no structured result.")
+        return result
+    except Exception as exc:
+        logger.exception("Judge evaluation failed", exc_info=exc)
+        return build_default_judge_evaluation(f"Judge evaluation failed: {exc}")
+    finally:
+        if clone is not None:
+            try:
+                await clone.shutdown()  # release the clone on success and on failure
+            except Exception as exc:
+                logger.exception("Failed to shutdown judge clone", exc_info=exc)
+        _hide_progress_task(instance_name)
+
+
+def rank_candidate_results(
+    task: str,
+    candidate_results: list[CandidateEvalResult],
+    *,
+    skill_generation_model: str,
+    evaluation_model: str,
+    judge_model: str | None,
+    judge_strategy: str,
+    tests: list[TestCase],
+    judge_weight: float = 0.3,
+) -> RankedSkillBatch:
+    """Score and order candidates; mutates each result (scores, gate flag, candidate_id)."""
+
+    if not candidate_results:
+        return RankedSkillBatch(
+            task=task,
+            skill_generation_model=skill_generation_model,
+            evaluation_model=evaluation_model,
+            judge_model=judge_model,
+            judge_strategy=judge_strategy,
+            candidate_count=0,
+            tests=tests,
+        )
+    # Min-max normalize average tokens: the cheapest candidate scores 1.0, the costliest 0.0.
+    min_avg_tokens = min(result.average_tokens for result in candidate_results)
+    max_avg_tokens = max(result.average_tokens for result in candidate_results)
+    token_range = max_avg_tokens - min_avg_tokens
+
+    for result in candidate_results:
+        if token_range <= 0:
+            result.token_efficiency_score = 1.0
+        else:
+            result.token_efficiency_score = 1 - (
+                (result.average_tokens - min_avg_tokens) / token_range
+            )
+
+        hard_score = result.hard_score
+        judge_score = result.judge_score
+        token_score = result.token_efficiency_score
+        result.composite_score = (
+            0.6 * hard_score
+            + judge_weight * judge_score
+            + 0.1 * token_score
+        )
+    # Gate: flag candidates whose hard score is more than 0.2 below the best one.
+    best_hard_score = max(result.hard_score for result in candidate_results)
+    hard_gate_threshold = max(0.0, best_hard_score - 0.2)
+    for result in candidate_results:
+        result.hard_gate_failed = result.hard_score < hard_gate_threshold
+    # Ordering is lexicographic: gate first, then hard, judge and token scores.
+    ordered = sorted(
+        candidate_results,
+        key=lambda result: (
+            result.hard_gate_failed,
+            -result.hard_score,
+            -result.judge_score,
+            -result.token_efficiency_score,
+            -result.composite_score,  # NOTE(review): derived from the 3 keys above; never decisive
+            result.candidate_id,
+        ),
+    )
+
+    ranked_results: list[RankedSkillResult] = []
+    for index, result in enumerate(ordered, start=1):
+        result.skill.metadata.candidate_id = result.candidate_id
+        summary = None
+        if result.judge_evaluations:
+            summaries = [item.summary for item in result.judge_evaluations if item.summary]
+            summary = summaries[0] if summaries else None
+        margin = None
+        if index < len(ordered):  # ordered[index] is the next-ranked result (rank is 1-based)
+            margin = result.composite_score - ordered[index].composite_score
+        ranked_results.append(
+            RankedSkillResult(
+                rank=index,
+                candidate=result,
+                judge_model=judge_model,
+                judge_summary=summary,
+                score_margin_from_next=margin,
+            )
+        )
+
+    return RankedSkillBatch(
+        task=task,
+        skill_generation_model=skill_generation_model,
+        evaluation_model=evaluation_model,
+        judge_model=judge_model,
+        judge_strategy=judge_strategy,
+        candidate_count=len(candidate_results),
+        ranked_results=ranked_results,
+        tests=tests,
+    )
+
+
+async def evaluate_skill_candidates(
+    task: str,
+    candidates: list[Skill],
+    test_cases: list[TestCase],
+    evaluator: LlmAgent,
+    judge: LlmAgent | None,
+    *,
+    skill_generation_model: str | None = None,
+    evaluation_model: str,
+    judge_model: str | None,
+    judge_strategy: str = "pointwise",
+    judge_weight: float = 0.3,
+) -> RankedSkillBatch:
+    """Evaluate each candidate sequentially, optionally judge it, then rank the batch."""
+    # Raises ValueError for any judge_strategy other than "pointwise".
+    if judge_strategy != "pointwise":
+        raise ValueError("Only pointwise judge strategy is supported in v1.")
+
+    candidate_results: list[CandidateEvalResult] = []
+    for index, skill in enumerate(candidates, start=1):
+        eval_results = await evaluate_skill(
+            skill,
+            test_cases=test_cases,
+            evaluator=evaluator,
+            model=evaluation_model,
+            run_baseline=False,
+            show_baseline_progress=False,
+        )
+        assertions_passed, assertions_total, avg_tokens, avg_turns = summarize_test_results(
+            eval_results.with_skill_results
+        )
+        hard_score = (
+            assertions_passed / assertions_total if assertions_total else 0.0
+        )
+        # One judge call per executed test result; skipped entirely without a judge.
+        judge_evaluations: list[JudgeEvaluation] = []
+        if judge is not None:
+            for test_index, test_result in enumerate(eval_results.with_skill_results, start=1):
+                judge_evaluations.append(
+                    await judge_test_result(
+                        task,
+                        skill,
+                        test_result,
+                        judge,
+                        judge_model=judge_model,
+                        instance_name=(
+                            f"judge ({skill.metadata.candidate_id or index} test {test_index})"
+                        ),
+                    )
+                )
+        # Without any judge evaluations the judge score defaults to 0.0.
+        judge_score = (
+            sum(item.normalized_score for item in judge_evaluations) / len(judge_evaluations)
+            if judge_evaluations
+            else 0.0
+        )
+        candidate_id = skill.metadata.candidate_id or f"candidate-{index}"
+        candidate_results.append(
+            CandidateEvalResult(
+                candidate_id=candidate_id,
+                skill=skill,
+                test_results=eval_results.with_skill_results,
+                judge_evaluations=judge_evaluations,
+                assertions_passed=assertions_passed,
+                assertions_total=assertions_total,
+                hard_score=hard_score,
+                judge_score=judge_score,
+                average_tokens=avg_tokens,
+                average_turns=avg_turns,
+            )
+        )
+
+    return rank_candidate_results(
+        task,
+        candidate_results,
+        skill_generation_model=skill_generation_model or evaluation_model,
+        evaluation_model=evaluation_model,
+        judge_model=judge_model,
+        judge_strategy=judge_strategy,
+        tests=test_cases,
+        judge_weight=judge_weight,
+    )
+
+
 def get_failure_descriptions(results: EvalResults) -> list[str]:
     """Extract descriptions of failed tests for refinement."""
     failures = []
diff --git a/src/upskill/fastagent_integration.py b/src/upskill/fastagent_integration.py
index 32f8349..44b9ec2 100644
--- a/src/upskill/fastagent_integration.py
+++ b/src/upskill/fastagent_integration.py
@@ -12,5 +12,32 @@ def compose_instruction(instruction: str, skill: Skill | None) -> str:
     """Inject the skill content into an instruction when provided."""
     if not skill:
         return instruction
-    return f"{instruction}\n\n## Skill: {skill.name}\n\n{skill.body}"
+    return compose_instruction_bundle(instruction, [skill])
 
+
+def compose_instruction_bundle(
+    instruction: str,
+    skills: list[Skill],
+    *,
+    mounted_paths: dict[str, str] | None = None,
+) -> str:
+    """Append a ``## Skill: <name>`` section per skill to ``instruction``.
+
+    When ``mounted_paths`` maps any of the skill names, a ``## Mounted Skills``
+    listing is placed between the instruction and the skill sections. With no
+    skills the instruction is returned unchanged; empty parts are dropped.
+    """
+    if not skills:
+        return instruction
+
+    paths = mounted_paths or {}
+    mounted_lines = "\n".join(
+        f"- {skill.name}: {paths[skill.name]}" for skill in skills if skill.name in paths
+    )
+    skill_block = "\n\n".join(f"## Skill: {skill.name}\n\n{skill.body}" for skill in skills)
+
+    parts = [instruction]
+    if mounted_lines:
+        parts.append("## Mounted Skills\n" + mounted_lines)
+    parts.append(skill_block)
+    return "\n\n".join(filter(None, parts))
diff --git a/src/upskill/generate.py b/src/upskill/generate.py
index 1cca065..5f17c39 100644
--- a/src/upskill/generate.py
+++ b/src/upskill/generate.py
@@ -3,9 +3,14 @@
 from __future__ import annotations
 
 from datetime import UTC, datetime
+from typing import Any
 
-from fast_agent.interfaces import AgentProtocol
-from fast_agent.skills.registry import SkillManifest
+try:
+    from fast_agent.interfaces import AgentProtocol
+    from fast_agent.skills.registry import SkillManifest
+except ModuleNotFoundError:  # pragma: no cover - enables unit tests without fast-agent
+    AgentProtocol = Any
+    SkillManifest = Any
 
 from upskill.manifest_utils import parse_skill_manifest_text
 from upskill.models import Skill, SkillMetadata, TestCase, TestCaseSuite
@@ -128,6 +133,35 @@ async def generate_skill(
     )
 
 
+async def generate_skill_candidates(
+    task: str,
+    generator: AgentProtocol,
+    count: int,
+    examples: list[str] | None = None,
+    model: str | None = None,
+) -> list[Skill]:
+    """Sequentially generate ``max(1, count)`` variants, tagged ``candidate-<n>`` (1-based)."""
+
+    total = max(1, count)
+    candidates: list[Skill] = []
+    for number in range(1, total + 1):
+        variant_task = (
+            f"{task}\n\n"
+            f"Candidate variant {number} of {total}. Produce a distinct but valid skill "
+            "that teaches the same task. Vary structure, examples, and phrasing while keeping "
+            "the behavior correct and practical."
+        )
+        skill = await generate_skill(
+            task=variant_task,
+            examples=examples,
+            generator=generator,
+            model=model,
+        )
+        skill.metadata.candidate_id = f"candidate-{number}"
+        candidates.append(skill)
+    return candidates
+
+
 async def generate_tests(
     task: str,
     generator: AgentProtocol,
diff --git a/src/upskill/logging.py b/src/upskill/logging.py
index 59c723e..cbea94d 100644
--- a/src/upskill/logging.py
+++ b/src/upskill/logging.py
@@ -6,12 +6,28 @@
 import json
 from datetime import datetime
 from pathlib import Path
-
-from fast_agent import ConversationSummary
-from fast_agent.constants import FAST_AGENT_TIMING, FAST_AGENT_USAGE
-from fast_agent.mcp.helpers.content_helpers import get_text
-
-from upskill.models import BatchSummary, ConversationStats, RunMetadata, RunResult, TestResult
+from typing import Any
+
+try:
+    from fast_agent import ConversationSummary
+    from fast_agent.constants import FAST_AGENT_TIMING, FAST_AGENT_USAGE
+    from fast_agent.mcp.helpers.content_helpers import get_text
+except ModuleNotFoundError:  # pragma: no cover - enables unit tests without fast-agent
+    ConversationSummary = Any
+    FAST_AGENT_TIMING = "fast-agent-timing"
+    FAST_AGENT_USAGE = "fast-agent-usage"
+
+    def get_text(content: object) -> str | None:
+        return getattr(content, "text", None)
+
+from upskill.models import (
+    BatchSummary,
+    ConversationStats,
+    RankedSkillBatch,
+    RunMetadata,
+    RunResult,
+    TestResult,
+)
 
 # CSV field names for run summaries (matching skills-test format)
 FIELDNAMES = [
@@ -107,6 +123,27 @@ def load_run_result(run_folder: Path) -> RunResult | None:
         return None
 
 
+def write_ranking_summary(batch_folder: Path, ranking: RankedSkillBatch) -> None:
+    """Serialize ``ranking`` to ``ranking_summary.json`` inside ``batch_folder``.
+
+    The folder is not created here; an existing summary file is overwritten.
+    """
+    payload = json.dumps(ranking.model_dump(mode="json"), indent=2)
+    (batch_folder / "ranking_summary.json").write_text(payload, encoding="utf-8")
+
+
+def load_ranking_summary(batch_folder: Path) -> RankedSkillBatch | None:
+    """Load ``ranking_summary.json`` from ``batch_folder``; None if missing or invalid."""
+    path = batch_folder / "ranking_summary.json"
+    if not path.exists():
+        return None
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+        return RankedSkillBatch.model_validate(data)  # a non-object root fails validation
+    except (json.JSONDecodeError, ValueError):
+        return None
+
+
 def extract_tokens_from_messages(
     messages: list,
 ) -> tuple[int, int, int, list[dict[str, object]]]:
diff --git a/src/upskill/manifest_utils.py b/src/upskill/manifest_utils.py
index c84c6f2..5724361 100644
--- a/src/upskill/manifest_utils.py
+++ b/src/upskill/manifest_utils.py
@@ -3,8 +3,13 @@
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
 
-from fast_agent.skills.registry import SkillManifest, SkillRegistry
+try:
+    from fast_agent.skills.registry import SkillManifest, SkillRegistry
+except ModuleNotFoundError:  # pragma: no cover - enables unit tests without fast-agent
+    SkillManifest = Any
+    SkillRegistry = None
 
 
 def parse_skill_manifest_text(
@@ -21,4 +26,6 @@ def parse_skill_manifest_text(
     Returns:
         Tuple of (SkillManifest | None, error message | None).
     """
+    if SkillRegistry is None:
+        return None, "fast-agent-mcp is required to parse skill manifests."
     return SkillRegistry.parse_manifest_text(manifest_text, path=path)
diff --git a/src/upskill/models.py b/src/upskill/models.py
index cf0a6ea..5e772cc 100644
--- a/src/upskill/models.py
+++ b/src/upskill/models.py
@@ -6,8 +6,9 @@
 import re
 from datetime import datetime
 from pathlib import Path
+from typing import Literal
 
-from pydantic import BaseModel, ConfigDict, Field, field_validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 
 class SkillMetadata(BaseModel):
@@ -20,6 +21,7 @@ class SkillMetadata(BaseModel):
     test_pass_rate: float | None = None
     license: str | None = None
     compatibility: str | None = None
+    candidate_id: str | None = None
 
 
 class ValidationResult(BaseModel):
@@ -31,6 +33,38 @@ class ValidationResult(BaseModel):
     metrics_count: int = 0
     benchmarks_found: list[str] = Field(default_factory=list)
     error_message: str | None = None
+    details: list[str] = Field(default_factory=list)
+
+
+class VerifierSpec(BaseModel):
+    """Configuration for one deterministic check attached to a test case.
+
+    ``type`` names the check to run; the remaining optional fields are the
+    inputs the individual check kinds read (``name``, ``values``/``text``,
+    ``path``, ``cmd``, ``config``). Unknown keys are rejected because the
+    model is configured with ``extra="forbid"``.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    type: str
+    name: str | None = None
+    values: list[str] = Field(default_factory=list)
+    path: str | None = None
+    text: str | None = None
+    cmd: str | None = None
+    config: dict[str, str | int | float | bool] | None = None
+
+    @field_validator("values", mode="before")
+    @classmethod
+    def coerce_values(cls, value: str | list[str] | None) -> list[str]:
+        """Normalize ``values``: ``None`` becomes ``[]`` and a bare string becomes a one-item list."""
+        if isinstance(value, str):
+            return [value]
+        return [] if value is None else value
+
+
+class CapturedArtifact(BaseModel):
+    """Text artifact captured from a test workspace."""
+
+    # Location of the captured file (presumably relative to the workspace; confirm with producer).
+    path: str
+    # Text content that was captured for the artifact.
+    content: str
+    # Defaults to False. NOTE(review): the truncation rule lives in the code that builds this model.
+    truncated: bool = False
 
 
 class ExpectedSpec(BaseModel):
@@ -63,13 +97,41 @@ class TestCase(BaseModel):
 
     input: str  # Task/prompt to give the agent
     context: TestCaseContext | None = None  # Files, env vars, etc.
-    expected: ExpectedSpec  # Expected output checks
+    expected: ExpectedSpec | None = None  # Legacy expected output checks
+    verifiers: list[VerifierSpec] = Field(default_factory=list)
 
     # Custom validator support
     output_file: str | None = None  # File to validate instead of agent output
     validator: str | None = None  # Validator name (e.g., "hf_eval_yaml")
     validator_config: dict[str, str | int | float | bool] | None = None
 
+    @model_validator(mode="after")
+    def validate_expectations(self) -> TestCase:
+        """Reject a test case that declares no way of checking its result.
+
+        Raises:
+            ValueError: when ``expected``, ``verifiers`` and ``validator`` are all unset/empty.
+        """
+        has_expectation = (
+            self.expected is not None or bool(self.verifiers) or self.validator is not None
+        )
+        if not has_expectation:
+            raise ValueError(
+                "TestCase requires at least one of expected, verifiers, or validator."
+            )
+        return self
+
+    def effective_verifiers(self) -> list[VerifierSpec]:
+        """Return the verifier list with the legacy fields folded in.
+
+        Ordering: a ``contains`` spec derived from ``expected.contains`` (if any)
+        comes first, then the explicitly declared ``verifiers``, then a
+        ``validator`` spec derived from ``validator``/``output_file``/
+        ``validator_config`` (if a validator name is set).
+        """
+        leading: list[VerifierSpec] = []
+        if self.expected is not None and self.expected.contains:
+            leading.append(VerifierSpec(type="contains", values=self.expected.contains))
+
+        trailing: list[VerifierSpec] = []
+        if self.validator is not None:
+            legacy_validator = VerifierSpec(
+                type="validator",
+                name=self.validator,
+                path=self.output_file,
+                config=self.validator_config,
+            )
+            trailing.append(legacy_validator)
+
+        return [*leading, *self.verifiers, *trailing]
+
 
 
 
@@ -324,6 +386,40 @@ class TestResult(BaseModel):
 
     # Detailed validation results (for custom validators)
     validation_result: ValidationResult | None = None
+    artifacts: list[CapturedArtifact] = Field(default_factory=list)
+
+
+class JudgeCriterionScore(BaseModel):
+    """Score for a single judge rubric criterion."""
+
+    # Name of the rubric criterion being scored.
+    criterion: str
+    # Required integer rating; validation rejects values outside 1..5 (ge=1, le=5).
+    score: int = Field(..., ge=1, le=5)
+    # Free-text justification accompanying the score.
+    rationale: str
+
+
+class JudgeEvaluation(BaseModel):
+    """Structured judge verdict: a summary plus per-criterion scores.
+
+    The derived properties aggregate ``criteria``; each criterion contributes
+    at most 5 points to ``max_score``.
+    """
+
+    summary: str
+    criteria: list[JudgeCriterionScore] = Field(default_factory=list)
+
+    @property
+    def total_score(self) -> int:
+        """Sum of all criterion scores (0 when there are no criteria)."""
+        return sum(criterion.score for criterion in self.criteria)
+
+    @property
+    def max_score(self) -> int:
+        """Highest reachable total: five points per criterion."""
+        return 5 * len(self.criteria)
+
+    @property
+    def normalized_score(self) -> float:
+        """``total_score / max_score``, or ``0.0`` when there are no criteria."""
+        ceiling = self.max_score
+        return self.total_score / ceiling if ceiling else 0.0
 
 
 class EvalResults(BaseModel):
@@ -363,6 +459,136 @@ def is_beneficial(self) -> bool:
         return self.skill_lift > 0.05 or (self.skill_lift >= 0 and self.token_savings > 0.2)
 
 
+class CandidateEvalResult(BaseModel):
+    """Evaluation data for one candidate skill.
+
+    Only ``candidate_id`` and ``skill`` are required; every count and score
+    defaults to zero and ``hard_gate_failed`` defaults to False. The model
+    itself computes nothing; the values are filled in by the code that
+    constructs it.
+    """
+
+    candidate_id: str
+    skill: Skill
+    test_results: list[TestResult] = Field(default_factory=list)
+    judge_evaluations: list[JudgeEvaluation] = Field(default_factory=list)
+    assertions_passed: int = 0
+    assertions_total: int = 0
+    # NOTE(review): score ranges are not enforced here; presumably 0-1, confirm with the scorer.
+    hard_score: float = 0.0
+    judge_score: float = 0.0
+    token_efficiency_score: float = 0.0
+    composite_score: float = 0.0
+    hard_gate_failed: bool = False
+    average_tokens: float = 0.0
+    average_turns: float = 0.0
+
+
+class RankedSkillResult(BaseModel):
+    """Ranked wrapper around one candidate result."""
+
+    # Position assigned by the ranking code (NOTE(review): presumably 1-based; confirm).
+    rank: int
+    # The evaluated candidate this rank refers to.
+    candidate: CandidateEvalResult
+    # Optional judge metadata; None when not provided.
+    judge_model: str | None = None
+    judge_summary: str | None = None
+    # Optional score gap to the following entry; None when not provided.
+    score_margin_from_next: float | None = None
+
+
+class RankedSkillBatch(BaseModel):
+    """Ranking output for one batch of generated candidate skills.
+
+    Holds the task, the model names used, the ranked candidates and the tests
+    they were evaluated with.
+    """
+
+    task: str
+    skill_generation_model: str
+    evaluation_model: str
+    judge_model: str | None = None
+    judge_strategy: str = "pointwise"
+    candidate_count: int = 0
+    ranked_results: list[RankedSkillResult] = Field(default_factory=list)
+    tests: list[TestCase] = Field(default_factory=list)
+
+    @property
+    def winner(self) -> RankedSkillResult | None:
+        """First entry of ``ranked_results`` (assumed best-first), or ``None`` when empty."""
+        return self.ranked_results[0] if self.ranked_results else None
+
+
+class ScenarioJudgeConfig(BaseModel):
+    """Judge configuration for a scenario."""
+
+    # Unknown keys in the manifest are rejected.
+    model_config = ConfigDict(extra="forbid")
+
+    # Judge is off unless explicitly enabled.
+    enabled: bool = False
+    # Optional rubric criterion names; None means none were specified here.
+    criteria: list[str] | None = None
+
+
+class EvalScenario(BaseModel):
+    """Scenario definition for CI evaluation."""
+
+    # Unknown keys in the manifest are rejected.
+    model_config = ConfigDict(extra="forbid")
+
+    # Required, non-empty scenario identifier.
+    id: str = Field(..., min_length=1)
+    # Skill entries for the scenario (the fixtures in this change use paths such as "skills/example-skill").
+    skills: list[str] = Field(default_factory=list)
+    # Reference to the test suite (the fixtures use a path such as "evals/example.yaml").
+    tests: str
+    # Optional judge settings; None when the scenario declares no judge block.
+    judge: ScenarioJudgeConfig | None = None
+    # Defaults to False; NOTE(review): consumed by the CI runner, which is not shown here.
+    include_baseline: bool = False
+
+
+class EvalManifest(BaseModel):
+    """Top-level CI manifest."""
+
+    # Unknown top-level keys are rejected.
+    model_config = ConfigDict(extra="forbid")
+
+    # Declared scenarios; an omitted key yields an empty list.
+    scenarios: list[EvalScenario] = Field(default_factory=list)
+
+
+class ScenarioVariantResult(BaseModel):
+    """Aggregate result for one scenario variant."""
+
+    variant_id: str
+    # Restricted to exactly these three literals by validation.
+    variant_type: Literal["bundle", "ablation", "baseline"]
+    skills: list[str] = Field(default_factory=list)
+    # None unless set by the producer (NOTE(review): presumably set for ablation variants; confirm).
+    omitted_skill: str | None = None
+    # Required overall verdict for the variant.
+    passed: bool
+    assertions_passed: int = 0
+    assertions_total: int = 0
+    hard_score: float = 0.0
+    # Judge fields stay None when no judge result is attached.
+    judge_score: float | None = None
+    judge_summary: str | None = None
+    total_tokens: int = 0
+    average_turns: float = 0.0
+    # Optional location of the run artifacts, stored as a string.
+    run_folder: str | None = None
+
+
+class ScenarioContribution(BaseModel):
+    """Contribution delta for leaving one skill out of a bundle."""
+
+    # The skill this contribution entry refers to.
+    skill: str
+    # NOTE(review): sign convention is defined by the producer, which is not shown here.
+    hard_score_delta: float = 0.0
+    # None when no judge score delta is available.
+    judge_score_delta: float | None = None
+    passed_without_skill: bool = False
+
+
+class ScenarioReport(BaseModel):
+    """Report for one selected scenario."""
+
+    scenario_id: str
+    skills: list[str] = Field(default_factory=list)
+    tests_path: str
+    # Required overall verdict for the scenario.
+    passed: bool
+    # Required result of the bundle variant.
+    bundle: ScenarioVariantResult
+    # Results of ablation variants; empty when none were recorded.
+    ablations: list[ScenarioVariantResult] = Field(default_factory=list)
+    # Optional baseline variant result; None when absent.
+    baseline: ScenarioVariantResult | None = None
+    # Per-skill contribution entries; empty when none were recorded.
+    contributions: list[ScenarioContribution] = Field(default_factory=list)
+
+
+class CiReport(BaseModel):
+    """Machine-readable report for a CI evaluation run."""
+
+    manifest_path: str
+    # Stored as a free-form string (the CLI documented in this change offers "changed" and "all").
+    scope: str
+    base_ref: str | None = None
+    changed_files: list[str] = Field(default_factory=list)
+    changed_skills: list[str] = Field(default_factory=list)
+    # Identifiers of the scenarios that were selected for the run.
+    selected_scenarios: list[str] = Field(default_factory=list)
+    # Defaults to True; the producer is expected to overwrite it on failure.
+    success: bool = True
+    scenarios: list[ScenarioReport] = Field(default_factory=list)
+
+
 # Run logging models (similar to skills-test)
 
 
@@ -391,6 +617,16 @@ class RunResult(BaseModel):
     # For plot command: distinguish baseline vs with-skill runs
     run_type: str = "with_skill"  # "with_skill" | "baseline"
     skill_name: str | None = None  # Name of the skill being evaluated
+    judge_model: str | None = None
+    judge_score: float | None = None
+    judge_summary: str | None = None
+    candidate_id: str | None = None
+    rank: int | None = None
+    scenario_id: str | None = None
+    variant_id: str | None = None
+    variant_type: str | None = None
+    skills: list[str] = Field(default_factory=list)
+    omitted_skill: str | None = None
 
 
 class BatchSummary(BaseModel):
diff --git a/src/upskill/verifiers.py b/src/upskill/verifiers.py
new file mode 100644
index 0000000..ef49528
--- /dev/null
+++ b/src/upskill/verifiers.py
@@ -0,0 +1,238 @@
+"""Deterministic verifier execution for upskill test cases."""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+from upskill.models import TestCase, ValidationResult, VerifierSpec
+from upskill.validators import get_validator
+
+DEFAULT_COMMAND_TIMEOUT_SECONDS = 60
+MAX_COMMAND_OUTPUT_CHARS = 1200
+
+
+def _build_validation_result(
+    passed: bool,
+    *,
+    error_message: str | None = None,
+    details: list[str] | None = None,
+) -> ValidationResult:
+    """Build the result for a single verifier, which always counts as one assertion.
+
+    Args:
+        passed: Outcome of the verifier.
+        error_message: Optional failure description.
+        details: Optional extra lines; ``None`` or an empty list becomes ``[]``.
+    """
+    passed_count = 1 if passed else 0
+    detail_lines = details if details else []
+    return ValidationResult(
+        passed=passed,
+        assertions_passed=passed_count,
+        assertions_total=1,
+        error_message=error_message,
+        details=detail_lines,
+    )
+
+
+def _format_command_failure(output: str) -> str:
+    """Turn captured command output into a bounded error message.
+
+    Whitespace is stripped; empty output yields a generic message, and output
+    longer than ``MAX_COMMAND_OUTPUT_CHARS`` is cut there and suffixed with ``...``.
+    """
+    trimmed = output.strip()
+    if not trimmed:
+        return "command exited with a non-zero status"
+    if len(trimmed) <= MAX_COMMAND_OUTPUT_CHARS:
+        return trimmed
+    return trimmed[:MAX_COMMAND_OUTPUT_CHARS].rstrip() + "..."
+
+
+def _resolve_values(spec: VerifierSpec) -> list[str]:
+    """Return the strings a verifier must find: ``values`` if non-empty, else ``[text]``, else ``[]``."""
+    fallback = [spec.text] if spec.text else []
+    return spec.values or fallback
+
+
+def _run_contains_verifier(spec: VerifierSpec, output: str) -> ValidationResult:
+    """Check, case-insensitively, that every required string occurs in ``output``.
+
+    Blank required strings are ignored; a spec with nothing left to check fails.
+    """
+    needles = [candidate for candidate in _resolve_values(spec) if candidate.strip()]
+    if not needles:
+        return _build_validation_result(False, error_message="contains verifier is missing values")
+
+    haystack = output.lower()
+    absent = [needle for needle in needles if needle.lower() not in haystack]
+    if not absent:
+        return _build_validation_result(True)
+    return _build_validation_result(
+        False,
+        error_message=f"missing required output text: {absent[0]}",
+        details=[f"missing: {needle}" for needle in absent],
+    )
+
+
+def _require_workspace(spec: VerifierSpec, workspace: Path | None) -> ValidationResult | None:
+    """Return a failing result when ``workspace`` is missing, otherwise ``None``."""
+    if workspace is None:
+        return _build_validation_result(
+            False,
+            error_message=f"{spec.type} verifier requires a workspace",
+        )
+    return None
+
+
+def _run_file_exists_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    """Pass when ``spec.path`` exists under ``workspace``.
+
+    Fails when the workspace is missing, the spec has no path, or the path
+    does not exist. Any existing filesystem entry satisfies the check.
+    """
+    missing_workspace = _require_workspace(spec, workspace)
+    if missing_workspace is not None:
+        return missing_workspace
+
+    relative_path = spec.path
+    if not relative_path:
+        return _build_validation_result(False, error_message="file_exists verifier is missing path")
+
+    if not (workspace / relative_path).exists():
+        return _build_validation_result(
+            False,
+            error_message=f"expected file does not exist: {relative_path}",
+        )
+    return _build_validation_result(True)
+
+
+def _run_file_contains_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    """Check, case-insensitively, that a workspace file contains every required string.
+
+    Args:
+        spec: Verifier spec; reads ``path`` plus ``values``/``text``.
+        workspace: Directory that ``spec.path`` is resolved against.
+
+    Returns:
+        A single-assertion result. The check fails (rather than raising) when
+        the workspace or path is missing, the file does not exist, nothing is
+        left to look for, or the file cannot be read as UTF-8 text.
+    """
+    workspace_error = _require_workspace(spec, workspace)
+    if workspace_error is not None:
+        return workspace_error
+    if not spec.path:
+        return _build_validation_result(
+            False,
+            error_message="file_contains verifier is missing path",
+        )
+
+    target = workspace / spec.path
+    if not target.exists():
+        return _build_validation_result(
+            False,
+            error_message=f"expected file does not exist: {spec.path}",
+        )
+
+    required = [value for value in _resolve_values(spec) if value.strip()]
+    if not required:
+        return _build_validation_result(
+            False,
+            error_message="file_contains verifier is missing text or values",
+        )
+
+    # The file is produced by the agent under test, so it may be a directory,
+    # unreadable, or binary. Report that as a failed check instead of letting
+    # the exception abort the whole verifier run.
+    try:
+        content = target.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        return _build_validation_result(
+            False,
+            error_message=f"could not read file as UTF-8 text: {spec.path} ({exc})",
+        )
+
+    content_lower = content.lower()
+    missing = [item for item in required if item.lower() not in content_lower]
+    if missing:
+        return _build_validation_result(
+            False,
+            error_message=f"missing required file text: {missing[0]}",
+            details=[f"missing: {item}" for item in missing],
+        )
+    return _build_validation_result(True)
+
+
+def _run_command_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    """Run ``spec.cmd`` in the workspace and pass when it exits with status 0.
+
+    Args:
+        spec: Verifier spec; reads ``cmd`` and the optional
+            ``config["timeout_seconds"]`` (default ``DEFAULT_COMMAND_TIMEOUT_SECONDS``).
+        workspace: Working directory for the command.
+
+    Returns:
+        A single-assertion result. A non-zero exit, a timeout, an invalid
+        timeout value, or a failure to launch the command is reported as a
+        failed check instead of raising.
+    """
+    workspace_error = _require_workspace(spec, workspace)
+    if workspace_error is not None:
+        return workspace_error
+    if not spec.cmd:
+        return _build_validation_result(False, error_message="command verifier is missing cmd")
+
+    timeout_seconds = DEFAULT_COMMAND_TIMEOUT_SECONDS
+    if spec.config and "timeout_seconds" in spec.config:
+        raw_timeout = spec.config["timeout_seconds"]
+        try:
+            timeout_seconds = int(raw_timeout)
+        except (TypeError, ValueError, OverflowError):
+            return _build_validation_result(
+                False,
+                error_message=f"command verifier has an invalid timeout_seconds: {raw_timeout!r}",
+            )
+
+    # NOTE(security): the command string runs through the shell. It comes from
+    # the test suite definition, so it must only ever be loaded from trusted files.
+    try:
+        completed = subprocess.run(
+            spec.cmd,
+            shell=True,
+            cwd=workspace,
+            text=True,
+            capture_output=True,
+            timeout=timeout_seconds,
+            check=False,
+        )
+    except subprocess.TimeoutExpired:
+        # subprocess.run kills the child before re-raising; report a failed check.
+        return _build_validation_result(
+            False,
+            error_message=f"command timed out after {timeout_seconds} seconds",
+        )
+    except OSError as exc:
+        # For example, the workspace directory no longer exists.
+        return _build_validation_result(
+            False,
+            error_message=f"command could not be started: {exc}",
+        )
+
+    if completed.returncode == 0:
+        return _build_validation_result(True)
+
+    combined_output = "\n".join(
+        part for part in (completed.stdout, completed.stderr) if part
+    )
+    return _build_validation_result(
+        False,
+        error_message=_format_command_failure(combined_output),
+    )
+
+
+def _run_legacy_validator_verifier(spec: VerifierSpec, workspace: Path | None) -> ValidationResult:
+    """Delegate to the registered validator named by ``spec.name``.
+
+    The validator is called with the workspace, ``spec.path`` (or ``""``) as
+    ``output_file``, and every ``spec.config`` entry as a keyword argument; its
+    result is returned unchanged. A missing workspace, missing name or unknown
+    validator yields a failing result.
+    """
+    missing_workspace = _require_workspace(spec, workspace)
+    if missing_workspace is not None:
+        return missing_workspace
+
+    validator_name = spec.name
+    if not validator_name:
+        return _build_validation_result(False, error_message="validator verifier is missing name")
+
+    validator = get_validator(validator_name)
+    if validator is None:
+        return _build_validation_result(
+            False,
+            error_message=f"unknown validator: {validator_name}",
+        )
+
+    extra_kwargs = spec.config if spec.config else {}
+    target_file = spec.path if spec.path else ""
+    return validator(workspace=workspace, output_file=target_file, **extra_kwargs)
+
+
+def run_verifier(
+    spec: VerifierSpec,
+    *,
+    output: str,
+    workspace: Path | None,
+) -> ValidationResult:
+    """Dispatch ``spec`` to the runner matching its ``type``.
+
+    Args:
+        spec: The verifier to execute.
+        output: Agent output text (used by the ``contains`` verifier).
+        workspace: Workspace directory (used by the file, command and validator verifiers).
+
+    Returns:
+        The runner's result, or a failing result for an unknown ``type``.
+    """
+    # Thunks keep the differing runner signatures behind one lookup.
+    runners = {
+        "contains": lambda: _run_contains_verifier(spec, output),
+        "file_exists": lambda: _run_file_exists_verifier(spec, workspace),
+        "file_contains": lambda: _run_file_contains_verifier(spec, workspace),
+        "command": lambda: _run_command_verifier(spec, workspace),
+        "validator": lambda: _run_legacy_validator_verifier(spec, workspace),
+    }
+    runner = runners.get(spec.type)
+    if runner is not None:
+        return runner()
+    return _build_validation_result(
+        False,
+        error_message=f"unsupported verifier type: {spec.type}",
+    )
+
+
+def run_verifiers(
+    test_case: TestCase,
+    *,
+    output: str,
+    workspace: Path | None,
+) -> ValidationResult:
+    """Run every effective verifier of ``test_case`` and aggregate the results.
+
+    Args:
+        test_case: Test case whose ``effective_verifiers()`` are executed in order.
+        output: Agent output text passed to each verifier.
+        workspace: Workspace directory passed to each verifier.
+
+    Returns:
+        One combined result: assertion counts are summed, error messages are
+        joined with ``"; "`` and detail lines are concatenated. The combined
+        result passes only when every verifier reported ``passed`` and all
+        counted assertions passed. A test case without verifiers fails with
+        zero assertions.
+    """
+
+    specs = test_case.effective_verifiers()
+    if not specs:
+        return ValidationResult(
+            passed=False,
+            assertions_passed=0,
+            assertions_total=0,
+            error_message="no verifiers configured",
+        )
+
+    results = [run_verifier(spec, output=output, workspace=workspace) for spec in specs]
+
+    passed = sum(result.assertions_passed for result in results)
+    total = sum(result.assertions_total for result in results)
+    error_messages = [result.error_message for result in results if result.error_message]
+    details = [line for result in results for line in result.details]
+
+    # Counts alone are not enough: a custom validator may report passed=False
+    # while its passed/total counts are equal (for example 0 of 0), which would
+    # otherwise be aggregated into an overall pass.
+    every_verifier_passed = all(result.passed for result in results)
+
+    return ValidationResult(
+        passed=every_verifier_passed and passed == total,
+        assertions_passed=passed,
+        assertions_total=total,
+        error_message="; ".join(error_messages) if error_messages else None,
+        details=details,
+    )
diff --git a/tests/fixtures/ci_action_repo/.upskill/evals.yaml b/tests/fixtures/ci_action_repo/.upskill/evals.yaml
new file mode 100644
index 0000000..52da53d
--- /dev/null
+++ b/tests/fixtures/ci_action_repo/.upskill/evals.yaml
@@ -0,0 +1,5 @@
+scenarios:
+  - id: fixture-scenario
+    skills:
+      - skills/example-skill
+    tests: evals/example.yaml
diff --git a/tests/fixtures/ci_action_repo/evals/example.yaml b/tests/fixtures/ci_action_repo/evals/example.yaml
new file mode 100644
index 0000000..031a2eb
--- /dev/null
+++ b/tests/fixtures/ci_action_repo/evals/example.yaml
@@ -0,0 +1,6 @@
+cases:
+  - input: "Say fixture"
+    expected:
+      contains:
+        - fixture
+        - response
diff --git a/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md
new file mode 100644
index 0000000..70b1902
--- /dev/null
+++ b/tests/fixtures/ci_action_repo/skills/example-skill/SKILL.md
@@ -0,0 +1,6 @@
+---
+name: example-skill
+description: Example fixture skill for action smoke tests
+---
+
+Respond with the fixture response.
diff --git a/tests/test_ci.py b/tests/test_ci.py
new file mode 100644
index 0000000..35f9e20
--- /dev/null
+++ b/tests/test_ci.py
@@ -0,0 +1,298 @@
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+
+from click.testing import CliRunner
+
+from upskill.ci import (
+    load_eval_manifest,
+    render_ci_report_markdown,
+    run_ci_suite,
+    select_scenarios,
+    write_ci_report,
+)
+from upskill.cli import main
+from upskill.models import JudgeCriterionScore, JudgeEvaluation
+
+
+def _write_skill(path: Path, name: str, description: str, body: str) -> None:
+    """Create ``path`` (with parents) and write a ``SKILL.md`` with front matter and ``body``."""
+    path.mkdir(parents=True, exist_ok=True)
+    front_matter = f"---\nname: {name}\ndescription: {description}\n---\n"
+    skill_file = path / "SKILL.md"
+    skill_file.write_text(f"{front_matter}\n{body}\n", encoding="utf-8")
+
+
+def _write_fixture_repo(root: Path) -> Path:
+    """Lay out a small two-skill repository under ``root`` and return the manifest path.
+
+    Creates ``skills/alpha-skill`` and ``skills/beta-skill``, a
+    ``scripts/assert_report.py`` checker, an ``evals/bundle.yaml`` test suite
+    and the ``.upskill/evals.yaml`` manifest that ties them together.
+    """
+    skills_dir = root / "skills"
+    _write_skill(
+        skills_dir / "alpha-skill",
+        "alpha-skill",
+        "alpha helper",
+        "Alpha bundle helper.",
+    )
+    _write_skill(
+        skills_dir / "beta-skill",
+        "beta-skill",
+        "beta helper",
+        "Beta bundle helper.",
+    )
+
+    # Checker script used by the command verifier: it exits with an error
+    # unless the given file contains exactly 'bundle ok'.
+    scripts_dir = root / "scripts"
+    scripts_dir.mkdir(parents=True, exist_ok=True)
+    (scripts_dir / "assert_report.py").write_text(
+        "\n".join(
+            [
+                "from __future__ import annotations",
+                "",
+                "from pathlib import Path",
+                "import sys",
+                "",
+                "target = Path(sys.argv[1])",
+                "content = target.read_text(encoding='utf-8').strip()",
+                "if content != 'bundle ok':",
+                "    raise SystemExit(f'unexpected content: {content}')",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    # Test suite with one case: report.txt must exist and satisfy the checker script.
+    evals_dir = root / "evals"
+    evals_dir.mkdir(parents=True, exist_ok=True)
+    (evals_dir / "bundle.yaml").write_text(
+        "\n".join(
+            [
+                "cases:",
+                "  - input: write the report",
+                "    output_file: report.txt",
+                "    verifiers:",
+                "      - type: file_exists",
+                "        path: report.txt",
+                "      - type: command",
+                "        cmd: python scripts/assert_report.py report.txt",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    # Manifest with a single scenario bundling both skills, judge enabled.
+    manifest_dir = root / ".upskill"
+    manifest_dir.mkdir(parents=True, exist_ok=True)
+    manifest_path = manifest_dir / "evals.yaml"
+    manifest_path.write_text(
+        "\n".join(
+            [
+                "scenarios:",
+                "  - id: bundle-scenario",
+                "    skills:",
+                "      - skills/alpha-skill",
+                "      - skills/beta-skill",
+                "    tests: evals/bundle.yaml",
+                "    judge:",
+                "      enabled: true",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    return manifest_path
+
+
+class _FakeEvaluatorClone:
+    """Fake evaluator instance that writes ``report.txt`` based on its instruction.
+
+    The report content depends on which skill bodies ("Alpha bundle helper.",
+    "Beta bundle helper.") appear in the instruction set on the clone.
+    """
+
+    shell_runtime_enabled = True
+
+    def __init__(self, parent: _FakeEvaluatorAgent) -> None:
+        self._parent = parent
+        self.instruction = ""
+        self.message_history: list[object] = []
+        self.workspace: Path | None = None
+
+    def enable_shell(self, working_directory: Path) -> None:
+        """Remember the working directory as the clone's workspace."""
+        self.workspace = Path(working_directory)
+
+    def set_instruction(self, instruction: str) -> None:
+        """Store the instruction that ``send`` inspects."""
+        self.instruction = instruction
+
+    async def set_model(self, model: str) -> None:
+        """Record the model name on the parent agent."""
+        self._parent.model = model
+
+    async def send(self, user_content: str) -> str:
+        """Write ``report.txt`` into the workspace and echo the prompt plus its content."""
+        assert self.workspace is not None
+        skills_root = self.workspace / "skills"
+        # Both skill directories must be present in the workspace, whichever
+        # skills ended up in the instruction.
+        for skill_name in ("alpha-skill", "beta-skill"):
+            assert (skills_root / skill_name / "SKILL.md").exists()
+
+        instruction_markers = (
+            "Alpha bundle helper." in self.instruction,
+            "Beta bundle helper." in self.instruction,
+        )
+        content = {
+            (True, True): "bundle ok",
+            (True, False): "alpha only",
+            (False, True): "beta only",
+            (False, False): "baseline",
+        }[instruction_markers]
+
+        report_file = self.workspace / "report.txt"
+        report_file.write_text(content, encoding="utf-8")
+        return f"{user_content}\n{content}"
+
+    async def shutdown(self) -> None:
+        """No resources to release."""
+        return None
+
+
+class _FakeEvaluatorAgent:
+    """Stand-in evaluator agent that hands out ``_FakeEvaluatorClone`` instances."""
+
+    def __init__(self) -> None:
+        self.instruction = "Base evaluator instruction"
+        # Last model name passed to set_model, by this agent or by one of its clones.
+        self.model: str | None = None
+
+    async def set_model(self, model: str) -> None:
+        self.model = model
+
+    async def spawn_detached_instance(self, name: str | None = None) -> _FakeEvaluatorClone:
+        # `name` is accepted for interface compatibility but not used by the fake.
+        return _FakeEvaluatorClone(self)
+
+
+class _FakeJudgeClone:
+    """Fake judge instance that scores 5 when the prompt mentions "bundle ok", else 2."""
+
+    def __init__(self, parent: _FakeJudgeAgent) -> None:
+        self._parent = parent
+
+    async def set_model(self, model: str) -> None:
+        """Record the model name on the parent agent."""
+        self._parent.model = model
+
+    async def structured(self, prompt: str, schema: type[JudgeEvaluation]):
+        """Return ``(schema instance, None)`` with one uniform score per rubric criterion."""
+        bundle_seen = "bundle ok" in prompt
+        score = 5 if bundle_seen else 2
+        rubric = (
+            "instruction_quality",
+            "helpfulness",
+            "robustness",
+            "concision",
+            "generalizability",
+        )
+        criteria = [
+            JudgeCriterionScore(criterion=name, score=score, rationale="test rationale")
+            for name in rubric
+        ]
+        evaluation = schema(summary="strong" if bundle_seen else "weak", criteria=criteria)
+        return evaluation, None
+
+    async def shutdown(self) -> None:
+        """No resources to release."""
+        return None
+
+
+class _FakeJudgeAgent:
+    """Stand-in judge agent that hands out ``_FakeJudgeClone`` instances."""
+
+    def __init__(self) -> None:
+        # Set by a clone's set_model; stays None until then.
+        self.model: str | None = None
+
+    async def spawn_detached_instance(self, name: str | None = None) -> _FakeJudgeClone:
+        # `name` is accepted for interface compatibility but not used by the fake.
+        return _FakeJudgeClone(self)
+
+
+def test_manifest_selection_uses_changed_skills(tmp_path) -> None:
+    """A change to one bundled skill selects the scenario that lists it."""
+    manifest = load_eval_manifest(_write_fixture_repo(tmp_path))
+
+    chosen = select_scenarios(
+        manifest,
+        scope="changed",
+        changed_skills=["skills/beta-skill"],
+    )
+    chosen_ids = [scenario.id for scenario in chosen]
+
+    assert chosen_ids == ["bundle-scenario"]
+
+
+def test_run_ci_suite_executes_bundle_and_ablations(tmp_path) -> None:
+    """Run the full suite against the fixture repo with fake evaluator and judge agents.
+
+    The fake evaluator only writes 'bundle ok' when both skill bodies are in
+    its instruction, and the fixture's checker script rejects anything else,
+    so the full bundle is expected to pass while each ablation fails.
+    """
+    manifest_path = _write_fixture_repo(tmp_path)
+    report = asyncio.run(
+        run_ci_suite(
+            manifest_path,
+            evaluator=_FakeEvaluatorAgent(),
+            judge=_FakeJudgeAgent(),
+            scope="all",
+            eval_model="haiku",
+            judge_model="judge-mini",
+            working_dir=tmp_path,
+            runs_dir=tmp_path / "runs",
+        )
+    )
+
+    assert report.success is True
+    assert report.selected_scenarios == ["bundle-scenario"]
+    assert len(report.scenarios) == 1
+
+    scenario = report.scenarios[0]
+    assert scenario.bundle.passed is True
+    assert scenario.bundle.judge_score is not None
+    # One ablation per bundled skill, each expected to fail.
+    assert len(scenario.ablations) == 2
+    assert all(item.passed is False for item in scenario.ablations)
+    assert {item.skill for item in scenario.contributions} == {
+        "skills/alpha-skill",
+        "skills/beta-skill",
+    }
+    assert all(item.hard_score_delta > 0 for item in scenario.contributions)
+
+    # The markdown rendering names the scenario and at least one ablation variant.
+    markdown = render_ci_report_markdown(report)
+    assert "bundle-scenario" in markdown
+    assert "without-alpha-skill" in markdown
+
+    # The JSON report can be written to disk.
+    report_path = tmp_path / "report.json"
+    write_ci_report(report_path, report)
+    assert report_path.exists()
+
+
+def test_ci_cli_forwards_options(monkeypatch, tmp_path) -> None:
+    """The ``ci`` command passes every CLI option through to ``_ci_async`` as keywords."""
+    manifest_path = tmp_path / "evals.yaml"
+    manifest_path.write_text("scenarios: []\n", encoding="utf-8")
+    captured: dict[str, object] = {}
+
+    async def _fake_ci_async(**kwargs):
+        captured.update(kwargs)
+
+    monkeypatch.setattr("upskill.cli._ci_async", _fake_ci_async)
+
+    cli_args = ["ci"]
+    cli_args += ["--manifest", str(manifest_path)]
+    cli_args += ["--scope", "all"]
+    cli_args += ["--base-ref", "origin/release"]
+    cli_args += ["--eval-model", "haiku"]
+    cli_args += ["--judge-model", "judge-mini"]
+    cli_args += ["--summary-json", "report.json"]
+    cli_args += ["--runs-dir", "runs"]
+    cli_args += ["--fail-on-no-scenarios"]
+
+    result = CliRunner().invoke(main, cli_args)
+
+    assert result.exit_code == 0, result.output
+    expected_strings = (
+        ("manifest_path", str(manifest_path)),
+        ("scope", "all"),
+        ("base_ref", "origin/release"),
+        ("eval_model", "haiku"),
+        ("judge_model", "judge-mini"),
+        ("summary_json", "report.json"),
+        ("runs_dir", "runs"),
+    )
+    for key, expected in expected_strings:
+        assert captured[key] == expected
+    assert captured["fail_on_no_scenarios"] is True
diff --git a/tests/test_config.py b/tests/test_config.py
index 2ec6e1f..b8c53fb 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -46,3 +46,23 @@ def test_config_save_uses_env_override_path_when_file_is_missing(tmp_path, monke
         saved = yaml.safe_load(f) or {}
 
     assert saved["skill_generation_model"] == "haiku"
+
+
+def test_effective_judge_model_falls_back_to_eval_then_generation_model() -> None:
+    """Without a judge model, the eval model is used; without both, the generation model."""
+    with_eval_model = Config(skill_generation_model="sonnet", eval_model="haiku")
+    assert with_eval_model.effective_judge_model == "haiku"
+
+    generation_only = Config(skill_generation_model="sonnet", eval_model=None, judge_model=None)
+    assert generation_only.effective_judge_model == "sonnet"
+
+
+def test_effective_judge_model_prefers_explicit_judge_model() -> None:
+    """An explicitly configured judge model wins over the eval and generation models."""
+    settings = {
+        "skill_generation_model": "sonnet",
+        "eval_model": "haiku",
+        "judge_model": "opus",
+    }
+
+    assert Config(**settings).effective_judge_model == "opus"
diff --git a/tests/test_ranking.py b/tests/test_ranking.py
new file mode 100644
index 0000000..32af450
--- /dev/null
+++ b/tests/test_ranking.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+from click.testing import CliRunner
+
+from upskill.cli import main
+from upskill.evaluate import build_default_judge_evaluation, rank_candidate_results
+from upskill.logging import load_ranking_summary, write_ranking_summary
+from upskill.models import (
+    CandidateEvalResult,
+    JudgeCriterionScore,
+    JudgeEvaluation,
+    RankedSkillBatch,
+    Skill,
+)
+
+
+def _make_skill(name: str) -> Skill:
+    """Build a minimal ``Skill`` whose description and body are derived from ``name``."""
+    description = f"{name} desc"
+    body = f"# {name}"
+    return Skill(name=name, description=description, body=body)
+
+
+def _judge_eval(score: int, summary: str) -> JudgeEvaluation:
+    """Build a ``JudgeEvaluation`` giving the same ``score`` to each of five rubric criteria."""
+    rubric = (
+        "instruction_quality",
+        "helpfulness",
+        "robustness",
+        "concision",
+        "generalizability",
+    )
+    scored = [
+        JudgeCriterionScore(criterion=name, score=score, rationale=f"{name} rationale")
+        for name in rubric
+    ]
+    return JudgeEvaluation(summary=summary, criteria=scored)
+
+
+def test_rank_candidate_results_prefers_hard_score_over_judge_score() -> None:
+    """A candidate with the better hard score outranks one with the better judge score."""
+    hard_leader = CandidateEvalResult(
+        candidate_id="candidate-1",
+        skill=_make_skill("strong-skill"),
+        assertions_passed=9,
+        assertions_total=10,
+        hard_score=0.9,
+        judge_score=0.4,
+        average_tokens=200,
+    )
+    judge_favourite = CandidateEvalResult(
+        candidate_id="candidate-2",
+        skill=_make_skill("flashy-skill"),
+        assertions_passed=6,
+        assertions_total=10,
+        hard_score=0.6,
+        judge_score=1.0,
+        average_tokens=100,
+    )
+
+    # The judge favourite is passed first so input order cannot explain the result.
+    ranking = rank_candidate_results(
+        "task",
+        [judge_favourite, hard_leader],
+        skill_generation_model="sonnet",
+        evaluation_model="sonnet",
+        judge_model="haiku",
+        judge_strategy="pointwise",
+        tests=[],
+    )
+
+    winner = ranking.winner
+    assert winner is not None
+    assert winner.candidate.candidate_id == "candidate-1"
+    runner_up = ranking.ranked_results[1]
+    assert runner_up.candidate.hard_gate_failed is True
+
+
+def test_build_default_judge_evaluation_is_neutral() -> None:
+    """The fallback evaluation has five criteria totalling 15 points (normalized 0.6)."""
+    fallback = build_default_judge_evaluation("fallback")
+
+    assert fallback.summary == "fallback"
+    assert len(fallback.criteria) == 5
+    assert fallback.total_score == 15
+    assert fallback.normalized_score == 0.6
+
+
+def test_ranking_summary_round_trip(tmp_path) -> None:
+    """A ranking written to a folder loads back with an identical JSON dump."""
+    original = RankedSkillBatch(
+        task="task",
+        skill_generation_model="sonnet",
+        evaluation_model="haiku",
+        judge_model="haiku",
+        candidate_count=1,
+    )
+
+    write_ranking_summary(tmp_path, original)
+    restored = load_ranking_summary(tmp_path)
+
+    assert restored is not None
+    expected_dump = original.model_dump(mode="json")
+    assert restored.model_dump(mode="json") == expected_dump
+
+
+def test_generate_cli_forwards_judge_options(monkeypatch, tmp_path) -> None:
+    """The ``generate`` command forwards the judge and ranking options positionally."""
+    captured: dict[str, object] = {}
+    # Positional order in which the CLI is expected to call ``_generate_async``.
+    positional_names = (
+        "task",
+        "examples",
+        "from_skill",
+        "from_trace",
+        "model",
+        "test_gen_model",
+        "output",
+        "no_eval",
+        "eval_model",
+        "candidates",
+        "judge_model",
+        "rank_with_judge",
+        "judge_strategy",
+        "runs_dir",
+        "log_runs",
+    )
+
+    async def _fake_generate_async(*args):
+        # Fail like tuple unpacking would when the argument count drifts.
+        if len(args) != len(positional_names):
+            raise ValueError(
+                f"expected {len(positional_names)} positional arguments, got {len(args)}"
+            )
+        captured.update(zip(positional_names, args))
+
+    monkeypatch.setattr("upskill.cli._generate_async", _fake_generate_async)
+
+    cli_args = ["generate", "rank skills"]
+    cli_args += ["--candidates", "3"]
+    cli_args += ["--judge-model", "haiku"]
+    cli_args += ["--rank-with-judge"]
+    cli_args += ["--judge-strategy", "pointwise"]
+    cli_args += ["--runs-dir", str(tmp_path)]
+    cli_args += ["--no-log-runs"]
+
+    result = CliRunner().invoke(main, cli_args)
+
+    assert result.exit_code == 0, result.output
+    assert captured["task"] == "rank skills"
+    assert captured["candidates"] == 3
+    assert captured["judge_model"] == "haiku"
+    assert captured["rank_with_judge"] is True
+    assert captured["judge_strategy"] == "pointwise"
+    assert captured["log_runs"] is False
diff --git a/tests/test_verifiers.py b/tests/test_verifiers.py
new file mode 100644
index 0000000..78fc7f6
--- /dev/null
+++ b/tests/test_verifiers.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from upskill.models import TestCase as UpskillTestCase
+from upskill.models import ValidationResult
+from upskill.validators import register_validator
+from upskill.verifiers import run_verifiers
+
+
+@register_validator("test-counting-validator")
+def _test_counting_validator(
+    workspace: Path,
+    output_file: str,
+    **_: object,
+) -> ValidationResult:
+    """Score 2/2 when ``output_file`` exists under ``workspace``, otherwise 0/2."""
+    found = (workspace / output_file).exists()
+    return ValidationResult(
+        passed=found,
+        assertions_passed=2 if found else 0,
+        assertions_total=2,
+        error_message=None if found else f"missing file: {output_file}",
+    )
+
+
+def test_run_verifiers_supports_legacy_expected_contains() -> None:
+    """A legacy ``expected.contains`` list is scored as a single assertion."""
+    legacy_case = UpskillTestCase(
+        input="say hello",
+        expected={"contains": ["hello", "world"]},
+    )
+    outcome = run_verifiers(legacy_case, output="Hello, world!", workspace=None)
+
+    assert outcome.passed is True
+    assert outcome.assertions_passed == 1
+    assert outcome.assertions_total == 1
+
+
+def test_run_verifiers_supports_file_verifiers(tmp_path) -> None:
+    """``file_exists`` and ``file_contains`` both pass against a real file."""
+    (tmp_path / "report.txt").write_text("bundle ok", encoding="utf-8")
+    file_case = UpskillTestCase(
+        input="write file",
+        verifiers=[
+            {"type": "file_exists", "path": "report.txt"},
+            {"type": "file_contains", "path": "report.txt", "text": "bundle ok"},
+        ],
+    )
+
+    outcome = run_verifiers(file_case, output="", workspace=tmp_path)
+
+    assert outcome.passed is True
+    assert outcome.assertions_passed == 2
+    assert outcome.assertions_total == 2
+
+
+def test_run_verifiers_supports_command_verifier(tmp_path) -> None:
+    """A ``command`` verifier whose script exits cleanly counts as 1/1 passed."""
+    (tmp_path / "check.py").write_text("print('ok')\n", encoding="utf-8")
+    test_case = UpskillTestCase(
+        input="run assertion script",
+        verifiers=[{"type": "command", "cmd": "python check.py"}],
+    )
+    result = run_verifiers(test_case, output="", workspace=tmp_path)
+
+    assert result.passed is True
+    assert result.assertions_passed == 1
+    assert result.assertions_total == 1
+
+
+def test_run_verifiers_translates_legacy_validator(tmp_path) -> None:
+    """A legacy ``validator`` name yields the registered validator's 2/2 counts."""
+    (tmp_path / "artifact.txt").write_text("ok", encoding="utf-8")
+    legacy_case = UpskillTestCase(
+        input="validate artifact",
+        validator="test-counting-validator",
+        output_file="artifact.txt",
+    )
+
+    outcome = run_verifiers(legacy_case, output="", workspace=tmp_path)
+
+    assert outcome.passed is True
+    assert outcome.assertions_passed == 2
+    assert outcome.assertions_total == 2
+
+
+def test_run_verifiers_reports_failures(tmp_path) -> None:
+    """Two failing verifiers give 0/2 passed and a populated error message."""
+    failing_case = UpskillTestCase(
+        input="write report",
+        verifiers=[
+            {"type": "file_exists", "path": "report.txt"},
+            {"type": "command", "cmd": "python -c 'import sys; sys.exit(1)'"},
+        ],
+    )
+    outcome = run_verifiers(failing_case, output="", workspace=tmp_path)
+
+    assert outcome.passed is False
+    assert outcome.assertions_passed == 0
+    assert outcome.assertions_total == 2
+    assert outcome.error_message is not None