17 changes: 17 additions & 0 deletions .github/workflows/ci.yml
@@ -40,3 +40,20 @@ jobs:

      - name: Run tests
        run: uv run pytest -v

  action-smoke:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Run local upskill action against fixture repo
        uses: ./
        with:
          working-directory: tests/fixtures/ci_action_repo
          scope: changed
          base-ref: HEAD
          summary-json: upskill-report.json
          fail-on-no-scenarios: "false"

      - name: Verify smoke report exists
        run: test -f tests/fixtures/ci_action_repo/upskill-report.json
65 changes: 65 additions & 0 deletions README.md
@@ -216,6 +216,71 @@ sonnet
]
```

### `upskill ci`

Run scenario-based CI evaluation for changed or declared skill bundles.

```bash
upskill ci [OPTIONS]
```

**Options:**
- `--manifest PATH` - Scenario manifest (default: `./.upskill/evals.yaml`)
- `--scope [changed|all]` - Run only impacted scenarios or the full suite
- `--base-ref REF` - Base ref for changed-skill selection (default: `origin/main`)
- `--eval-model MODEL` - Evaluator model override
- `--judge-model MODEL` - Judge model override
- `--summary-json PATH` - Output path for the machine-readable report
- `--runs-dir PATH` - Directory for run artifacts
- `--fail-on-no-scenarios / --no-fail-on-no-scenarios` - Exit non-zero (or succeed) when no scenarios are selected
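Changed-scope selection can be pictured as intersecting the paths git reports as modified with each scenario's declared skill directories. A minimal sketch, assuming scenarios are plain dicts (the real logic lives in `upskill.ci` and may differ):

```python
# Hypothetical sketch of `--scope changed`: keep scenarios whose declared
# skill directories contain at least one modified file.
def select_changed(scenarios: list[dict], changed_paths: list[str]) -> list[dict]:
    selected = []
    for scenario in scenarios:
        hit = any(
            path == skill or path.startswith(skill.rstrip("/") + "/")
            for skill in scenario["skills"]
            for path in changed_paths
        )
        if hit:
            selected.append(scenario)
    return selected


scenarios = [
    {"id": "hf-model-card-readme", "skills": ["skills/hf-cli"]},
    {"id": "unrelated", "skills": ["skills/other"]},
]
print([s["id"] for s in select_changed(scenarios, ["skills/hf-cli/SKILL.md"])])
# → ['hf-model-card-readme']
```

In practice the changed paths would come from something like `git diff --name-only <base-ref>...HEAD`.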

**Scenario manifest example:**

```yaml
scenarios:
  - id: hf-model-card-readme
    skills:
      - skills/hugging-face-evaluation-manager
      - skills/hf-cli
    tests: evals/hf-model-card-readme.yaml
    judge:
      enabled: true
```

**Test suite example:**

```yaml
cases:
  - input: "Read README and write olmo_7b_evaluations.yaml"
    output_file: olmo_7b_evaluations.yaml
    verifiers:
      - type: file_exists
        path: olmo_7b_evaluations.yaml
      - type: command
        cmd: python test_eval_assertions.py
```
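A deterministic verifier of the kinds shown above could be dispatched roughly as follows. This is an illustrative sketch, not the shipped implementation; the two types mirror the example (`file_exists`, `command`):

```python
import os
import subprocess


# Illustrative dispatcher for the two verifier types shown above.
def run_verifier(spec: dict, workdir: str = ".") -> bool:
    if spec["type"] == "file_exists":
        return os.path.isfile(os.path.join(workdir, spec["path"]))
    if spec["type"] == "command":
        # A non-zero exit code fails the verifier.
        result = subprocess.run(spec["cmd"], shell=True, cwd=workdir)
        return result.returncode == 0
    raise ValueError(f"unknown verifier type: {spec['type']}")
```

Because verifiers are deterministic, a single failing verifier fails the case outright, while judge scores only annotate the report.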

The CI command runs the full declared bundle, then leave-one-out ablations for each
contributing skill. Deterministic verifiers gate pass/fail; judge scoring is advisory.
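The run plan this implies (full bundle first, then one ablation per skill) can be sketched as follows; this is a simplified illustration of the ordering, not the actual internals:

```python
# Sketch of the evaluation plan: the full declared bundle, then a
# leave-one-out ablation for each contributing skill.
def run_plan(skills: list[str]) -> list[list[str]]:
    plans = [list(skills)]
    for skill in skills:
        plans.append([s for s in skills if s != skill])
    return plans


print(run_plan(["skills/hugging-face-evaluation-manager", "skills/hf-cli"]))
# → [['skills/hugging-face-evaluation-manager', 'skills/hf-cli'],
#    ['skills/hf-cli'],
#    ['skills/hugging-face-evaluation-manager']]
```

Comparing the full-bundle result against each ablation shows how much each skill contributes to the pass rate.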

## GitHub Action

Use the reusable action from another repository after `actions/checkout`:

```yaml
- uses: huggingface/upskill@vX
  with:
    working-directory: .
    manifest-path: .upskill/evals.yaml
    scope: changed
    base-ref: origin/main
    eval-model: haiku
    judge-model: openai.gpt-4.1-mini
```

The action installs `upskill` from the tagged action source, writes `upskill-report.json`
by default, and uploads the JSON report plus run artifacts.

### `upskill list`

List all generated skills in a tree view.
119 changes: 119 additions & 0 deletions action.yml
@@ -0,0 +1,119 @@
name: upskill-ci
description: Run upskill scenario-based CI evaluation for changed or declared skill bundles.

inputs:
  manifest-path:
    description: Path to the upskill scenario manifest, relative to the working directory.
    required: false
    default: .upskill/evals.yaml
  scope:
    description: Run only changed scenarios or the entire manifest.
    required: false
    default: changed
  base-ref:
    description: Git base ref used when scope is changed.
    required: false
    default: origin/main
  eval-model:
    description: Model used for evaluator execution.
    required: false
  judge-model:
    description: Model used for advisory judge scoring.
    required: false
  working-directory:
    description: Repository directory where the manifest and skills live.
    required: false
    default: .
  runs-dir:
    description: Directory, relative to the working directory, for run artifacts.
    required: false
    default: runs
  summary-json:
    description: JSON report path, relative to the working directory.
    required: false
    default: upskill-report.json
  fail-on-no-scenarios:
    description: Exit non-zero when no scenarios are selected.
    required: false
    default: "false"
  upload-artifacts:
    description: Upload the JSON report and run artifacts.
    required: false
    default: "true"

outputs:
  summary-json:
    description: Path to the generated JSON report.
    value: ${{ steps.paths.outputs.summary_json }}
  runs-dir:
    description: Path to the generated run artifacts directory.
    value: ${{ steps.paths.outputs.runs_dir }}

runs:
  using: composite
  steps:
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.13"

    - name: Install uv
      uses: astral-sh/setup-uv@v5

    - name: Resolve output paths
      id: paths
      shell: bash
      run: |
        set -euo pipefail
        workdir="${{ inputs.working-directory }}"
        summary_json="${workdir}/${{ inputs.summary-json }}"
        runs_dir="${workdir}/${{ inputs.runs-dir }}"
        echo "summary_json=${summary_json}" >> "$GITHUB_OUTPUT"
        echo "runs_dir=${runs_dir}" >> "$GITHUB_OUTPUT"

    - name: Install upskill from action source
      shell: bash
      run: |
        set -euo pipefail
        uv pip install --system "${{ github.action_path }}"

    - name: Run upskill ci
      shell: bash
      working-directory: ${{ inputs.working-directory }}
      run: |
        set -euo pipefail
        args=(
          --manifest "${{ inputs.manifest-path }}"
          --scope "${{ inputs.scope }}"
          --base-ref "${{ inputs.base-ref }}"
          --runs-dir "${{ inputs.runs-dir }}"
          --summary-json "${{ inputs.summary-json }}"
        )
        if [[ -n "${{ inputs.eval-model }}" ]]; then
          args+=(--eval-model "${{ inputs.eval-model }}")
        fi
        if [[ -n "${{ inputs.judge-model }}" ]]; then
          args+=(--judge-model "${{ inputs.judge-model }}")
        fi
        if [[ "${{ inputs.fail-on-no-scenarios }}" == "true" ]]; then
          args+=(--fail-on-no-scenarios)
        else
          args+=(--no-fail-on-no-scenarios)
        fi
        upskill ci "${args[@]}"

    - name: Upload CI report
      if: ${{ always() && inputs.upload-artifacts == 'true' }}
      uses: actions/upload-artifact@v4
      with:
        name: upskill-report
        path: ${{ steps.paths.outputs.summary_json }}
        if-no-files-found: error

    - name: Upload run artifacts
      if: ${{ always() && inputs.upload-artifacts == 'true' }}
      uses: actions/upload-artifact@v4
      with:
        name: upskill-runs
        path: ${{ steps.paths.outputs.runs_dir }}
        if-no-files-found: warn
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -40,3 +40,6 @@ target-version = "py313"

[tool.ruff.lint]
select = ["E", "F", "I", "UP"]

[tool.pytest.ini_options]
testpaths = ["tests"]
11 changes: 11 additions & 0 deletions src/upskill/__init__.py
@@ -2,6 +2,7 @@

__version__ = "0.2.0"

from upskill.ci import load_eval_manifest, run_ci_suite
from upskill.config import Config
from upskill.evaluate import evaluate_skill
from upskill.generate import generate_skill, generate_tests, refine_skill
@@ -16,14 +17,18 @@
)
from upskill.models import (
    BatchSummary,
    CiReport,
    ConversationStats,
    EvalManifest,
    EvalResults,
    EvalScenario,
    RunMetadata,
    RunResult,
    Skill,
    SkillMetadata,
    TestCase,
    TestResult,
    VerifierSpec,
)

__all__ = [
@@ -39,12 +44,18 @@
    "RunResult",
    "ConversationStats",
    "BatchSummary",
    "VerifierSpec",
    "EvalScenario",
    "EvalManifest",
    "CiReport",
    # Generation
    "generate_skill",
    "generate_tests",
    "refine_skill",
    # Evaluation
    "evaluate_skill",
    "run_ci_suite",
    "load_eval_manifest",
    # Logging
    "create_batch_folder",
    "create_run_folder",
13 changes: 13 additions & 0 deletions src/upskill/agent_cards/judge.md
@@ -0,0 +1,13 @@
---
type: agent
description: Judge executed skill candidates with a structured rubric.
---
You are an expert judge for AI agent skills.

Score the executed skill candidate using the provided rubric only.

Return structured output with:
- a short summary
- one entry for each criterion
- integer scores from 1 to 5
- concise rationales grounded in the provided test case, output, and validation result