diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 81945b4..3fa9e69 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,6 +7,27 @@ on:
     branches: [main]
 
 jobs:
+  format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13.5"
+
+      - name: Install dependencies
+        run: uv sync --frozen --extra dev
+
+      - name: Check formatting
+        run: uv run scripts/format.py
+
   lint:
     runs-on: ubuntu-latest
     steps:
@@ -14,15 +35,61 @@ jobs:
 
       - name: Install uv
         uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
 
       - name: Set up Python
-        run: uv python install 3.13.5
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13.5"
 
       - name: Install dependencies
-        run: uv sync --extra dev
+        run: uv sync --frozen --extra dev
 
       - name: Lint with ruff
-        run: uv run ruff check src/
+        run: uv run scripts/lint.py
+
+  cpd:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13.5"
+
+      - name: Install dependencies
+        run: uv sync --frozen --extra dev
+
+      - name: Check for duplicated code
+        run: uv run scripts/cpd.py --check
+
+  typecheck:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13.5"
+
+      - name: Install dependencies
+        run: uv sync --frozen --extra dev
+
+      - name: Run ty
+        run: uv run scripts/typecheck.py
 
   test:
     runs-on: ubuntu-latest
@@ -31,12 +98,16 @@ jobs:
 
       - name: Install uv
         uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
 
       - name: Set up Python
-        run: uv python install 3.13.5
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13.5"
 
       - name: Install dependencies
-        run: uv sync --extra dev
+        run: uv sync --frozen --extra dev
 
       - name: Run tests
         run: uv run pytest -v
diff --git a/.gitignore b/.gitignore
index 6ed73f9..6e1c571 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,3 +50,7 @@ Thumbs.db
 *.tmp
 *.temp
 *.txt
+.smoke-test/
+.fast-agent/
+.fast-agent-old/
+skills/
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..240d2e4
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,61 @@
+# Agent guidance
+
+## Code quality baseline
+
+- Target Python `3.13`.
+- Keep changes compatible with the repo's `pyproject.toml` settings.
+- Prefer small, typed, composable functions over large command-style blocks.
+- Avoid introducing new complexity suppressions unless there is a strong reason.
+
+## Required local checks
+
+Before finishing a change, run:
+
+```bash
+uv sync --extra dev
+uv run scripts/format.py
+uv run scripts/lint.py
+uv run scripts/typecheck.py
+uv run --extra dev pytest -v
+```
+
+CI enforces the same flow in `.github/workflows/ci.yml`, plus a duplicate-code gate (`uv run scripts/cpd.py --check`).
+
+## Ruff rules
+
+Formatting and linting are enforced with Ruff.
+
+- Line length: `100`
+- Target version: `py313`
+- Enabled lint families:
+  - `B` - bugbear
+  - `C90` - cyclomatic complexity
+  - `E` - pycodestyle errors
+  - `F` - pyflakes
+  - `I` - import sorting
+  - `RUF` - Ruff-specific rules
+  - `SIM` - simplifications
+  - `TCH` - type-checking import hygiene
+  - `UP` - pyupgrade
+- `E501` is ignored; let `ruff format` own line wrapping.
+- Cyclomatic complexity limit: `15`
+
+## Type checking
+
+- `ty` is required for `src`, `tests`, and `scripts`.
+- Add or improve annotations when touching code that is ambiguous to the type checker.
+- Prefer explicit protocols / typed helper structures over `object` when wiring dynamic APIs.
+- Keep type-only imports behind `TYPE_CHECKING` when appropriate.
+
+## Tests
+
+- Add or update tests for behavior changes.
+- Keep the test suite passing with `pytest`.
+- Use focused unit-style tests for logic changes when possible.
+
+## Practical authoring guidance
+
+- Prefer refactoring over adding broad ignores.
+- If a function is nearing the complexity limit, split it before adding more branches.
+- Keep CLI orchestration, model resolution, and persistence logic separated when possible.
+- When adding developer tooling, update the README and CI together.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..75f563d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,91 @@
+SHELL := /usr/bin/env bash
+
+# Common
+ARTIFACT_REPO ?=
+FLAVOR ?= cpu-basic
+TIMEOUT ?= 45m
+SECRETS ?= HF_TOKEN,OPENAI_API_KEY
+
+# Fast-agent mode (lean)
+SKILLS_DIR ?=
+CARD_DIR ?=
+FAST_AGENT ?=
+FAST_MODEL ?= haiku
+MESSAGE ?= Write a concise conventional commit message for: add password reset endpoint with tests.
+PROMPT_FILE ?=
+PROMPTS_JSONL ?=
+
+.PHONY: \
+	format format-write lint typecheck test check \
+	hf-go-check hf-go-smoke hf-go-prompt hf-go-batch
+
+format:
+	uv run --extra dev scripts/format.py
+
+format-write:
+	uv run --extra dev scripts/format.py --write
+
+lint:
+	uv run --extra dev scripts/lint.py
+
+typecheck:
+	uv run --extra dev scripts/typecheck.py
+
+test:
+	uv run --extra dev pytest -v
+
+check: format lint typecheck test
+
+hf-go-check:
+	@test -n "$(ARTIFACT_REPO)" || (echo "ARTIFACT_REPO is required" && exit 1)
+	@test -n "$(SKILLS_DIR)" || (echo "SKILLS_DIR is required" && exit 1)
+	@test -d "$(SKILLS_DIR)" || (echo "SKILLS_DIR not found: $(SKILLS_DIR)" && exit 1)
+	@test -x scripts/hf/submit_hf_job.sh || (echo "scripts/hf/submit_hf_job.sh missing or not executable" && exit 1)
+	@test -x scripts/hf/job_entrypoint_fast_agent.sh || (echo "scripts/hf/job_entrypoint_fast_agent.sh missing or not executable" && exit 1)
+	@hf auth whoami >/dev/null || (echo "hf auth required: run 'hf auth login'" && exit 1)
+
+hf-go-smoke: hf-go-check
+	@cmd=(scripts/hf/submit_hf_job.sh \
+	  --artifact-repo "$(ARTIFACT_REPO)" \
+	  --skills-dir "$(SKILLS_DIR)" \
+	  --model "$(FAST_MODEL)" \
+	  --message "$(MESSAGE)" \
+	  --flavor "$(FLAVOR)" \
+	  --timeout "$(TIMEOUT)" \
+	  --secrets "$(SECRETS)"); \
+	if [[ -n "$(CARD_DIR)" ]]; then cmd+=(--card-dir "$(CARD_DIR)"); fi; \
+	if [[ -n "$(FAST_AGENT)" ]]; then cmd+=(--agent "$(FAST_AGENT)"); fi; \
+	echo "Running: $${cmd[*]}"; \
+	"$${cmd[@]}"
+
+hf-go-prompt: hf-go-check
+	@test -n "$(PROMPT_FILE)" || (echo "PROMPT_FILE is required" && exit 1)
+	@test -f "$(PROMPT_FILE)" || (echo "PROMPT_FILE not found: $(PROMPT_FILE)" && exit 1)
+	@cmd=(scripts/hf/submit_hf_job.sh \
+	  --artifact-repo "$(ARTIFACT_REPO)" \
+	  --skills-dir "$(SKILLS_DIR)" \
+	  --model "$(FAST_MODEL)" \
+	  --prompt-file "$(PROMPT_FILE)" \
+	  --flavor "$(FLAVOR)" \
+	  --timeout "$(TIMEOUT)" \
+	  --secrets "$(SECRETS)"); \
+	if [[ -n "$(CARD_DIR)" ]]; then cmd+=(--card-dir "$(CARD_DIR)"); fi; \
+	if [[ -n "$(FAST_AGENT)" ]]; then cmd+=(--agent "$(FAST_AGENT)"); fi; \
+	echo "Running: $${cmd[*]}"; \
+	"$${cmd[@]}"
+
+hf-go-batch: hf-go-check
+	@test -n "$(PROMPTS_JSONL)" || (echo "PROMPTS_JSONL is required" && exit 1)
+	@test -f "$(PROMPTS_JSONL)" || (echo "PROMPTS_JSONL not found: $(PROMPTS_JSONL)" && exit 1)
+	@cmd=(scripts/hf/submit_hf_job.sh \
+	  --artifact-repo "$(ARTIFACT_REPO)" \
+	  --skills-dir "$(SKILLS_DIR)" \
+	  --model "$(FAST_MODEL)" \
+	  --prompts-jsonl "$(PROMPTS_JSONL)" \
+	  --flavor "$(FLAVOR)" \
+	  --timeout "$(TIMEOUT)" \
+	  --secrets "$(SECRETS)"); \
+	if [[ -n "$(CARD_DIR)" ]]; then cmd+=(--card-dir "$(CARD_DIR)"); fi; \
+	if [[ -n "$(FAST_AGENT)" ]]; then cmd+=(--agent "$(FAST_AGENT)"); fi; \
+	echo "Running: $${cmd[*]}"; \
+	"$${cmd[@]}"
diff --git a/README.md b/README.md
index 24e9088..e10eb97 100644
--- a/README.md
+++ b/README.md
@@ -4,12 +4,18 @@
 
 Generate and evaluate agent skills based on traces with agents. Create skills with teacher models (expensive/slow) that student models (cheap/fast) can use to perform harder tasks reliably.
 
+
+> [!TIP] 
+> 
+> UPskill v2: the recommended default config file now runs evaluations on Hugging Face Jobs. Make sure
+> to set your `HF_TOKEN` and pass `--artifact-repo <dataset-name>` for job creation and result capture.
+
 ## Quick Start
 
 Install upskill:
 
 ```bash
-pip install upskill
+uv pip install upskill
 # or just use uv
 uvx upskill
 ```
@@ -42,6 +48,53 @@ View the results later.
 upskill runs --skill git-commit-messages
 ```
 
+## Development checks
+
+This repo uses a CI flow inspired by `fast-agent` with separate format, lint, duplicate-code (CPD),
+typecheck, and test stages.
+
+Install dev dependencies:
+
+```bash
+uv sync --extra dev
+```
+
+Run the quality gates locally:
+
+```bash
+uv run scripts/format.py
+uv run scripts/lint.py
+uv run scripts/typecheck.py
+uv run scripts/cpd.py --check
+uv run --extra dev pytest -v
+```
+
+Or use the helper script to run the whole sequence:
+
+```bash
+uv run scripts/check.py
+```
+
+Add `--sync` to include `uv sync --extra dev`, or `--skip-tests` for a faster static-only pass.
+
+To auto-format before re-running checks:
+
+```bash
+uv run --extra dev scripts/format.py --write
+```
+
+Current enforced standards:
+
+- `ruff format --check` for formatting
+- `ruff check` for style, imports, modernization, bugbear, simplify, and import-hygiene rules
+- cyclomatic complexity via Ruff `C90` with `max-complexity = 15`
+- `ty check` across `src`, `tests`, and `scripts`
+- `pmd cpd` via `scripts/cpd.py --check` to flag duplicated code in `src/`
+- `pytest` for the test suite
+
+CI enforcement lives in `.github/workflows/ci.yml` and runs on pushes and pull requests targeting
+`main`.
+
 ## Model Handling Overview
 
 upskill uses distinct phases with explicit model roles:
@@ -82,13 +135,15 @@ upskill generate TASK [OPTIONS]
 
 **Options:**
 - `-e, --example` - Input -> output example (can be repeated)
-- `--tool` - Generate from MCP tool schema (path#tool_name)
 - `-f, --from PATH` - Improve from existing skill dir or agent trace file (auto-detected)
 - `-m, --model MODEL` - Skill generation model (e.g., 'sonnet', 'haiku', 'anthropic.claude-sonnet-4-20250514')
 - `--test-gen-model MODEL` - Override test generation model for this run
 - `-o, --output PATH` - Output directory for skill
 - `--no-eval` - Skip evaluation and refinement
 - `--eval-model MODEL` - Different model to evaluate skill on
+- `--executor [local|jobs]` - Execution backend for evaluation/refinement; overrides config
+- `--artifact-repo TEXT` - Dataset repo for remote fast-agent job artifacts (required with `--executor jobs`)
+- `--max-parallel N` - Max concurrent evaluation executions; overrides config
 - `--runs-dir PATH` - Directory for run logs (default: ./runs)
 - `--log-runs / --no-log-runs` - Log run data (default: enabled)
 
@@ -101,6 +156,9 @@ upskill generate "parse JSON Schema files"
 # Make and evaluate skills for less powerful models
 upskill generate "write git commits" --model sonnet --eval-model haiku
 
+# Remote execution on Hugging Face Jobs
+upskill generate "parse invoices" --executor jobs --artifact-repo <user>/upskill-tests
+
 # Improve an existing skill (auto-detected as directory)
 upskill generate "add more error handling examples" --from ./skills/api-errors/
 
@@ -147,9 +205,11 @@ upskill eval SKILL_PATH [OPTIONS]
 - `-t, --tests PATH` - Test cases JSON file
 - `-m, --model MODEL` - Model(s) to evaluate against (repeatable for multi-model benchmarking)
 - `--test-gen-model MODEL` - Override test generation model when tests must be generated
-- `--runs N` - Number of runs per model (default: 1)
+- `--runs N` - Number of runs per model; overrides config
 - `--no-baseline` - Skip baseline comparison (simple eval mode only; ignored in benchmark mode)
 - `-v, --verbose` - Show per-test results
+- `--executor [local|jobs]` - Execution backend for evaluation; overrides config
+- `--max-parallel N` - Max concurrent evaluation executions; overrides config
 - `--log-runs / --no-log-runs` - Log run data (default: enabled)
 - `--runs-dir PATH` - Directory for run logs
 
@@ -403,7 +463,13 @@ eval_model: haiku               # Default evaluation model (optional)
 test_gen_model: null            # Optional test generation model
 skills_dir: ./skills            # Where to save skills
 runs_dir: ./runs                # Where to save run logs
-max_refine_attempts: 3          # Refinement iterations
+max_refine_attempts: 2          # Refinement iterations
+executor: local                 # Default execution backend
+num_runs: 1                     # Default eval/benchmark runs when --runs is omitted
+max_parallel: 5                 # Default concurrent evaluation executions
+jobs_secrets: HF_TOKEN          # Comma-separated HF Jobs env var names to forward
+jobs_image: ghcr.io/astral-sh/uv:python3.13-bookworm  # HF Jobs container image
+# fastagent_config: ./fastagent.config.yaml  # Optional FastAgent config override
 ```
 
 `test_gen_model` fallback behavior:
@@ -417,6 +483,22 @@ max_refine_attempts: 3          # Refinement iterations
 Backward compatibility: `model` is still accepted in config files as a legacy alias for
 `skill_generation_model`.
 
+CLI flags override config values for execution settings:
+
+- `--executor` overrides `executor`
+- `--runs` overrides `num_runs`
+- `--max-parallel` overrides `max_parallel`
+- `--jobs-secrets` overrides `jobs_secrets`
+
+If you set `executor: jobs`, you still need the required jobs-specific CLI inputs such as
+`--artifact-repo`.
+
+`jobs_secrets` is a comma-separated list of environment variable names to forward into
+remote HF Jobs runs. It should contain secret names such as `HF_TOKEN` or
+`ANTHROPIC_API_KEY`, not literal secret values.
+
+`jobs_image` controls which container image HF Jobs uses for remote execution.
+
 Config lookup order:
 
 1. `UPSKILL_CONFIG` environment variable (path)
diff --git a/fastagent.config.yaml b/fastagent.config.yaml
index 67747f3..d7b381d 100644
--- a/fastagent.config.yaml
+++ b/fastagent.config.yaml
@@ -6,22 +6,7 @@
 # Examples: anthropic.claude-sonnet-4-20250514, openai.gpt-4.1
 # Aliases: haiku, sonnet, opus (Anthropic), gpt-4.1, o3-mini (OpenAI)
 # Local models: generic.<model_name> (e.g., generic.llama3.2:latest)
-default_model: kimi
-
-# Generic provider for local OpenAI-compatible endpoints (Ollama, llama.cpp, etc.)
-# Override with GENERIC_BASE_URL and GENERIC_API_KEY environment variables
-generic:
-  api_key: "local"
-  base_url: "http://localhost:11434/v1"
-
-# MCP timeline display settings
-mcp_timeline:
-  steps: 20
-  step_seconds: 15
-
-# Shell execution settings
-shell_execution:
-  show_bash: true
+#default_model: kimi
 
 # Logging and Console Configuration
 logger:
@@ -29,4 +14,5 @@ logger:
   show_chat: false
   show_tools: false
   truncate_tools: true
-  streaming: none
+  streaming: markdown
+
diff --git a/pyproject.toml b/pyproject.toml
index 160bcec..fb2f0c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ readme = "README.md"
 requires-python = ">=3.13.5,<3.14"
 dependencies = [
     "click>=8.1",
-    "fast-agent-mcp>=0.4.53",
+    "fast-agent-mcp>=0.6.7",
     "pydantic>=2.0",
     "python-dotenv>=1.0",
     "pyyaml>=6.0",
@@ -21,7 +21,8 @@ Repository = "https://github.com/huggingface/upskill"
 dev = [
     "pytest>=8.0",
     "pytest-asyncio>=0.23",
-    "ruff>=0.4",
+    "ruff>=0.11",
+    "ty>=0.0.23",
 ]
 
 [project.scripts]
@@ -39,4 +40,21 @@ line-length = 100
 target-version = "py313"
 
 [tool.ruff.lint]
-select = ["E", "F", "I", "UP"]
+select = [
+    "B",
+    "C90",
+    "E",
+    "F",
+    "I",
+    "RUF",
+    "SIM",
+    "TC",  # flake8-type-checking; the legacy `TCH` prefix was renamed to `TC` in Ruff 0.8
+    "UP",
+]
+ignore = ["E501"]
+
+[tool.ruff.lint.mccabe]
+max-complexity = 15
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..902e2a6
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1 @@
+"""Developer scripts for the upskill repository."""
diff --git a/scripts/check.py b/scripts/check.py
new file mode 100644
index 0000000..a59ba13
--- /dev/null
+++ b/scripts/check.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Final
+
+PROJECT_ROOT: Final = Path(__file__).resolve().parent.parent
+
+
+@dataclass(frozen=True)
+class CheckStep:
+    """A named local quality-gate command: a display label plus the argv that runs it."""
+
+    name: str  # label shown in the "==> name: command" banner printed by run_step
+    command: tuple[str, ...]  # argv passed unchanged to subprocess.run
+
+
+def build_check_steps(*, skip_tests: bool = False) -> list[CheckStep]:
+    """Return the ordered quality-gate steps; pytest is appended unless ``skip_tests`` is set."""
+    static_checks: list[tuple[str, str, tuple[str, ...]]] = [
+        ("format", "format.py", ()),
+        ("lint", "lint.py", ()),
+        ("typecheck", "typecheck.py", ()),
+        ("cpd", "cpd.py", ("--check",)),
+    ]
+    steps = [
+        CheckStep(label, (sys.executable, str(PROJECT_ROOT / "scripts" / script), *extra))
+        for label, script, extra in static_checks
+    ]
+    if skip_tests:
+        return steps
+    return [*steps, CheckStep("pytest", (sys.executable, "-m", "pytest", "-v"))]
+
+
+def run_step(step: CheckStep) -> int:
+    """Run one step from PROJECT_ROOT; return its exit code, or 1 if it could not be launched."""
+    print(f"\n==> {step.name}: {' '.join(step.command)}", flush=True)
+    try:
+        completed = subprocess.run(step.command, cwd=PROJECT_ROOT, check=False)
+    except OSError as error:  # also covers PermissionError etc., not just a missing executable
+        print(f"Error: failed to execute {step.name}: {error}", file=sys.stderr)
+        return 1
+    return completed.returncode
+
+
+def main() -> int:
+    """Parse CLI flags, optionally sync dev dependencies, then run the gates until one fails.
+
+    Returns 0 when every step succeeds and 1 as soon as any step exits non-zero.
+    """
+    cli = argparse.ArgumentParser(description="Run the local quality-gate sequence.")
+    cli.add_argument(
+        "--skip-tests",
+        help="Skip pytest after running the static checks.",
+        action="store_true",
+    )
+    cli.add_argument(
+        "--sync",
+        help="Run `uv sync --extra dev` before the quality gates.",
+        action="store_true",
+    )
+    options = cli.parse_args()
+
+    pending: list[CheckStep] = []
+    if options.sync:
+        pending.append(CheckStep("sync", ("uv", "sync", "--extra", "dev")))
+    pending.extend(build_check_steps(skip_tests=options.skip_tests))
+
+    return 1 if any(run_step(step) != 0 for step in pending) else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/cpd.py b/scripts/cpd.py
new file mode 100644
index 0000000..286a828
--- /dev/null
+++ b/scripts/cpd.py
@@ -0,0 +1,342 @@
+#!/usr/bin/env python3
+"""Copy/Paste Detector (CPD) runner for upskill.
+
+Uses PMD's CPD tool to detect duplicated code in the Python source tree.
+If Java or PMD are not already available, the script downloads them into
+``~/tools`` and reuses them on later runs.
+
+Usage:
+    uv run scripts/cpd.py [--min-tokens N] [--format FORMAT] [--report FILE] [--check]
+
+Options:
+    --min-tokens N   Minimum token count for duplication (default: 100)
+    --format FORMAT  Output format: text, csv, xml (default: text)
+    --report FILE    Write report to file (default: stdout)
+    --check          Exit with error code if duplications are found
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import platform
+import shutil
+import subprocess
+import sys
+import tarfile
+import urllib.request
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Final
+
+JRE_VERSION: Final = "17.0.9+9"
+PMD_VERSION: Final = "7.9.0"
+
+TOOLS_DIR: Final = Path.home() / "tools"
+JRE_DIR: Final = TOOLS_DIR / f"jdk-{JRE_VERSION}-jre"
+PMD_DIR: Final = TOOLS_DIR / f"pmd-bin-{PMD_VERSION}"
+
+JRE_URL_TEMPLATE: Final = (
+    "https://github.com/adoptium/temurin17-binaries/releases/download/"
+    "jdk-{version}/OpenJDK17U-jre_{arch}_{os}_hotspot_{archive_version}.tar.gz"
+)
+PMD_URL: Final = (
+    "https://github.com/pmd/pmd/releases/download/pmd_releases%2F"
+    f"{PMD_VERSION}/pmd-dist-{PMD_VERSION}-bin.zip"
+)
+
+CPD_EXCLUSIONS: Final[dict[str, str]] = {}
+
+
+@dataclass(frozen=True)
+class PlatformConfig:
+    """Platform names and download labels used to locate the Java and PMD tooling."""
+
+    system: str
+    arch: str
+    os_label: str
+    arch_label: str
+
+    @property
+    def archive_version(self) -> str:
+        return "_".join(JRE_VERSION.split("+"))  # "+" becomes "_" for the archive file name
+
+    @property
+    def version_label(self) -> str:
+        return "%2B".join(JRE_VERSION.split("+"))  # "+" percent-encoded for the release URL
+
+    @property
+    def java_name(self) -> str:
+        return "java" if self.system != "windows" else "java.exe"
+
+    @property
+    def pmd_name(self) -> str:
+        return "pmd" if self.system != "windows" else "pmd.bat"
+
+    @property
+    def jre_filename(self) -> str:
+        return f"OpenJDK17U-jre_{self.arch_label}_{self.os_label}_hotspot_" + self.archive_version
+
+    @property
+    def jre_url(self) -> str:
+        return JRE_URL_TEMPLATE.format(
+            os=self.os_label,
+            arch=self.arch_label,
+            version=self.version_label,
+            archive_version=self.archive_version,
+        )
+
+
+def resolve_platform(*, system: str | None = None, arch: str | None = None) -> PlatformConfig:
+    """Build a PlatformConfig from the given (or detected) OS and CPU architecture names.
+
+    Names are lower-cased; known aliases map to download labels, unknown names pass through.
+    """
+    system_key = (system or platform.system()).lower()
+    arch_key = (arch or platform.machine()).lower()
+
+    os_labels = {
+        "darwin": "mac",
+        "linux": "linux",
+        "windows": "windows",
+    }
+    arch_labels = {
+        **dict.fromkeys(("x86_64", "amd64"), "x64"),
+        **dict.fromkeys(("aarch64", "arm64"), "aarch64"),
+    }
+
+    return PlatformConfig(
+        system=system_key,
+        arch=arch_key,
+        os_label=os_labels.get(system_key, system_key),
+        arch_label=arch_labels.get(arch_key, arch_key),
+    )
+
+
+def download_file(url: str, destination: Path, description: str) -> None:
+    """Download ``url`` to ``destination``; on failure remove any partial file and exit."""
+    print(f"Downloading {description}...")
+    try:
+        urllib.request.urlretrieve(url, destination)
+    except Exception as error:  # pragma: no cover - network failures are environment-specific
+        destination.unlink(missing_ok=True)  # a truncated archive must not be reused next run
+        raise SystemExit(f"  Failed to download: {error}") from error
+    print(f"  Downloaded to {destination}")
+
+
+def extract_tar_archive(archive_path: Path, destination: Path) -> None:
+    """Extract a tar.gz archive into ``destination`` while guarding against path traversal.
+
+    Raises RuntimeError when a member name resolves outside ``destination``; the ``data``
+    extraction filter additionally rejects unsafe link targets (raising tarfile.FilterError).
+    """
+    destination_root = destination.resolve()
+    with tarfile.open(archive_path, "r:gz") as archive:
+        for member in archive.getmembers():
+            if not (destination_root / member.name).resolve().is_relative_to(destination_root):
+                raise RuntimeError(f"Unsafe path in archive {archive_path}: {member.name}")
+        archive.extractall(destination_root, filter="data")
+
+
+def ensure_jre(platform_config: PlatformConfig) -> Path:
+    """Return a Java home: cached JRE, else system Java 17/21, else a freshly downloaded JRE."""
+    java_bin = JRE_DIR / "bin" / platform_config.java_name
+    if java_bin.exists():
+        return JRE_DIR
+
+    system_java = shutil.which("java")
+    if system_java:
+        try:
+            result = subprocess.run(
+                [system_java, "-version"],
+                capture_output=True,
+                check=False,
+                text=True,
+            )
+        except OSError:
+            pass
+        else:
+            # Parse the major from `version "17..."`; a substring test also matches build dates.
+            quoted_version = (result.stderr + result.stdout).partition('version "')[2]
+            if quoted_version.split('"', 1)[0].split(".", 1)[0] in {"17", "21"}:
+                print(f"Using system Java: {system_java}")
+                return Path(system_java).resolve().parent.parent
+
+    TOOLS_DIR.mkdir(parents=True, exist_ok=True)
+    archive_path = TOOLS_DIR / f"{platform_config.jre_filename}.tar.gz"
+    if not archive_path.exists():
+        download_file(platform_config.jre_url, archive_path, f"Java JRE {JRE_VERSION}")
+
+    print("Extracting Java JRE...")
+    extract_tar_archive(archive_path, TOOLS_DIR)
+
+    if JRE_DIR.exists():
+        return JRE_DIR
+
+    for candidate in TOOLS_DIR.iterdir():
+        if candidate.is_dir() and candidate.name.startswith("jdk-17"):
+            return candidate
+
+    raise RuntimeError(f"Unable to locate extracted Java runtime under {TOOLS_DIR}")
+
+
+def ensure_pmd(platform_config: PlatformConfig) -> Path:
+    """Return the PMD install directory, fetching and unpacking the release zip when absent."""
+    launcher = PMD_DIR / "bin" / platform_config.pmd_name
+    if not launcher.exists():
+        TOOLS_DIR.mkdir(parents=True, exist_ok=True)
+        bundle = TOOLS_DIR / f"pmd-{PMD_VERSION}.zip"
+        if not bundle.exists():
+            download_file(PMD_URL, bundle, f"PMD {PMD_VERSION}")
+
+        print("Extracting PMD...")
+        with zipfile.ZipFile(bundle, "r") as unpacker:
+            unpacker.extractall(TOOLS_DIR)
+
+        # The launcher is explicitly marked executable after extraction on non-Windows
+        # systems; NOTE(review): presumably because zip extraction drops mode bits.
+        if platform_config.system != "windows":
+            launcher.chmod(0o755)
+
+    return PMD_DIR
+
+
+def build_cpd_command(
+    *,
+    platform_config: PlatformConfig,
+    pmd_dir: Path,
+    src_dir: Path,
+    excluded_paths: list[Path],
+    min_tokens: int,
+    output_format: str,
+) -> list[str]:
+    """Assemble the argv for ``pmd cpd`` over ``src_dir``.
+
+    Fixed options come first, followed by one ``--exclude`` pair per excluded path.
+    """
+    launcher = pmd_dir / "bin" / platform_config.pmd_name
+    options = {
+        "--language": "python",
+        "--minimum-tokens": str(min_tokens),
+        "--dir": str(src_dir),
+        "--format": output_format,
+    }
+    argv = [str(launcher), "cpd"]
+    for flag, value in options.items():
+        argv += [flag, value]
+    for skipped in excluded_paths:
+        argv += ["--exclude", str(skipped)]
+    return argv
+
+
+def run_cpd(
+    *,
+    platform_config: PlatformConfig,
+    java_home: Path,
+    pmd_dir: Path,
+    src_dir: Path,
+    excluded_paths: list[Path],
+    min_tokens: int = 100,
+    output_format: str = "text",
+) -> tuple[int, str]:
+    """Invoke CPD with the given Java runtime; return (exit code, stdout followed by stderr)."""
+    # Expose the chosen runtime to the PMD launcher via JAVA_HOME and a PATH prefix.
+    child_env = os.environ | {"JAVA_HOME": str(java_home)}
+    child_env["PATH"] = f"{java_home / 'bin'}{os.pathsep}{child_env.get('PATH', '')}"
+
+    argv = build_cpd_command(
+        platform_config=platform_config,
+        pmd_dir=pmd_dir,
+        src_dir=src_dir,
+        excluded_paths=excluded_paths,
+        min_tokens=min_tokens,
+        output_format=output_format,
+    )
+    finished = subprocess.run(argv, capture_output=True, check=False, text=True, env=child_env)
+    return finished.returncode, finished.stdout + finished.stderr
+
+
+def resolve_cli_exit_code(*, cpd_exit_code: int, check: bool) -> int:
+    """Translate a PMD CPD exit code into this script's exit code (4 is handled per ``check``)."""
+    if cpd_exit_code != 4:
+        return cpd_exit_code
+    return int(check)
+
+
+def main() -> int:
+    """Run CPD over the project's ``src`` directory and return the CLI exit status."""
+    cli = argparse.ArgumentParser(description="Detect duplicated code in upskill source")
+    cli.add_argument(
+        "--min-tokens",
+        default=100,
+        type=int,
+        help="Minimum token count for duplication (default: 100)",
+    )
+    cli.add_argument(
+        "--format",
+        default="text",
+        choices=["text", "csv", "xml"],
+        help="Output format (default: text)",
+    )
+    cli.add_argument(
+        "--report",
+        type=Path,
+        help="Write report to file (default: stdout)",
+    )
+    cli.add_argument(
+        "--check",
+        action="store_true",
+        help="Exit with error code if duplications are found",
+    )
+    options = cli.parse_args()
+
+    project_root = Path(__file__).resolve().parents[1]
+    src_dir = project_root / "src"
+    if not src_dir.exists():
+        print(f"Source directory not found: {src_dir}", file=sys.stderr)
+        return 1
+
+    platform_config = resolve_platform()
+    excluded_paths = [project_root / relative_path for relative_path in CPD_EXCLUSIONS]
+
+    print("Checking dependencies...")
+    java_home = ensure_jre(platform_config)
+    pmd_dir = ensure_pmd(platform_config)
+    print()
+
+    print(f"Running CPD on {src_dir} (min-tokens={options.min_tokens})...")
+    if excluded_paths:
+        print("Excluding intentional duplicates:")
+        for relative_path, reason in CPD_EXCLUSIONS.items():
+            print(f"  - {relative_path}: {reason}")
+    print()
+
+    cpd_exit_code, output = run_cpd(
+        platform_config=platform_config,
+        java_home=java_home,
+        pmd_dir=pmd_dir,
+        src_dir=src_dir,
+        excluded_paths=excluded_paths,
+        min_tokens=options.min_tokens,
+        output_format=options.format,
+    )
+
+    if options.report is None:
+        print(output)
+    else:
+        options.report.write_text(output, encoding="utf-8")
+        print(f"Report written to {options.report}")
+
+    if cpd_exit_code == 0:
+        print("\n✅ No duplicated code found.")
+    elif cpd_exit_code == 4:
+        print("\n⚠️  Duplicated code detected!")
+    else:
+        print(f"\n❌ CPD failed with exit code {cpd_exit_code}", file=sys.stderr)
+
+    return resolve_cli_exit_code(cpd_exit_code=cpd_exit_code, check=options.check)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/format.py b/scripts/format.py
new file mode 100644
index 0000000..39fa78f
--- /dev/null
+++ b/scripts/format.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+
+DEFAULT_PATHS = ["src", "tests", "scripts"]
+
+
+def main() -> int:
+    """Run ``ruff format`` over the given paths, check-only unless ``--write`` is passed.
+
+    Returns ruff's exit status, or 1 when the ``ruff`` executable cannot be found.
+    """
+    cli = argparse.ArgumentParser(description="Run ruff format for the repo.")
+    write_help = "Apply formatting changes instead of checking only."
+    cli.add_argument("--write", action="store_true", help=write_help)
+    cli.add_argument("paths", nargs="*", default=DEFAULT_PATHS, help="Optional paths to format.")
+    options = cli.parse_args()
+
+    # `--check` sits directly after the subcommand, ahead of the paths.
+    mode_flags = [] if options.write else ["--check"]
+    ruff_argv = ["ruff", "format", *mode_flags, *options.paths]
+
+    # A missing `ruff` binary surfaces from subprocess.run as FileNotFoundError.
+    try:
+        return subprocess.run(ruff_argv, check=False).returncode
+    except FileNotFoundError:
+        print("Error: `ruff` is not installed in the current environment.", file=sys.stderr)
+        return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/hf/job_entrypoint_eval_fast_agent.sh b/scripts/hf/job_entrypoint_eval_fast_agent.sh
new file mode 100644
index 0000000..98e757f
--- /dev/null
+++ b/scripts/hf/job_entrypoint_eval_fast_agent.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# Run every request of an eval bundle through `fast-agent go`, each in its own
+# temporary workspace.
+#
+# Usage: job_entrypoint_eval_fast_agent.sh <bundle_dir> <out_dir>
+# Environment: FAST_MODEL (required) is passed to fast-agent as --model.
+#
+# Written under <out_dir> for each <bundle_dir>/requests/<id>/ directory:
+#   results/<id>.json            path handed to fast-agent via --results
+#   logs/<id>.command.txt        the command line that was run
+#   logs/<id>.out.txt, .err.txt  captured stdout / stderr
+#   status/<id>.exit_code.txt    exit status of the fast-agent run
+#   workspaces/<id>/             copy of the workspace after the run
+#
+# Exits non-zero when any request fails or leaves no results file.
+set -euo pipefail
+
+BUNDLE_DIR="${1:?bundle_dir required}"
+OUT_DIR="${2:?out_dir required}"
+
+if [[ ! -d "$BUNDLE_DIR" ]]; then
+  echo "Bundle dir not found: $BUNDLE_DIR" >&2
+  exit 2
+fi
+
+mkdir -p "$OUT_DIR/results" "$OUT_DIR/logs" "$OUT_DIR/status" "$OUT_DIR/workspaces"
+
+# Every request runs after `cd` into a temporary workspace, where relative
+# bundle/output arguments would no longer resolve. Pin both to absolute paths.
+BUNDLE_DIR="$(cd "$BUNDLE_DIR" && pwd)"
+OUT_DIR="$(cd "$OUT_DIR" && pwd)"
+
+# The manifest is informational; keep going when the bundle has none.
+cp -f "$BUNDLE_DIR/manifest.json" "$OUT_DIR/manifest.json" || true
+
+COMMON=(fast-agent go --skills-dir "$BUNDLE_DIR/skills" --quiet)
+if [[ -f "$BUNDLE_DIR/fastagent.config.yaml" ]]; then
+  COMMON+=(--config-path "$BUNDLE_DIR/fastagent.config.yaml")
+fi
+if [[ -d "$BUNDLE_DIR/cards" ]]; then
+  COMMON+=(--card "$BUNDLE_DIR/cards")
+fi
+if [[ -f "$BUNDLE_DIR/agent.txt" ]]; then
+  COMMON+=(--agent "$(cat "$BUNDLE_DIR/agent.txt")")
+fi
+
+FAST_MODEL="${FAST_MODEL:?FAST_MODEL is required}"
+overall_status=0
+
+for request_dir in "$BUNDLE_DIR"/requests/*; do
+  [[ -d "$request_dir" ]] || continue
+  request_id="$(basename "$request_dir")"
+  prompt_path="$request_dir/prompt.txt"
+  workspace_src="$request_dir/workspace"
+
+  # Work on a scratch copy so the bundle's own workspace is left untouched.
+  workspace_tmp="$(mktemp -d)"
+  if [[ -d "$workspace_src" ]]; then
+    cp -a "$workspace_src/." "$workspace_tmp/" 2>/dev/null || true
+  fi
+  if [[ -f "$BUNDLE_DIR/fastagent.config.yaml" ]]; then
+    cp -f "$BUNDLE_DIR/fastagent.config.yaml" "$workspace_tmp/fastagent.config.yaml"
+  fi
+
+  cmd=("${COMMON[@]}" --model "$FAST_MODEL" --prompt-file "$prompt_path" --results "$OUT_DIR/results/$request_id.json")
+
+  printf '%s\n' "${cmd[*]}" > "$OUT_DIR/logs/$request_id.command.txt"
+
+  # Capture the exit status instead of letting `set -e` abort the whole batch.
+  set +e
+  (
+    cd "$workspace_tmp"
+    "${cmd[@]}" >"$OUT_DIR/logs/$request_id.out.txt" 2>"$OUT_DIR/logs/$request_id.err.txt"
+  )
+  status=$?
+  set -e
+
+  printf '%s\n' "$status" > "$OUT_DIR/status/$request_id.exit_code.txt"
+  mkdir -p "$OUT_DIR/workspaces/$request_id"
+  cp -a "$workspace_tmp/." "$OUT_DIR/workspaces/$request_id/" 2>/dev/null || true
+  rm -rf "$workspace_tmp"
+
+  if [[ "$status" -ne 0 ]]; then
+    overall_status="$status"
+  fi
+  # A zero exit without a results file still counts as a failure.
+  if [[ ! -f "$OUT_DIR/results/$request_id.json" ]]; then
+    overall_status=1
+  fi
+done
+
+exit "$overall_status"
diff --git a/scripts/hf/job_entrypoint_fast_agent.sh b/scripts/hf/job_entrypoint_fast_agent.sh
new file mode 100755
index 0000000..26e1980
--- /dev/null
+++ b/scripts/hf/job_entrypoint_fast_agent.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Run `fast-agent go` for a job bundle: once, or once per prompts.jsonl record.
+#
+# Usage: job_entrypoint_fast_agent.sh <bundle_dir> <out_dir>
+# Optional environment: FAST_AGENT, FAST_MODEL, FAST_MESSAGE.
+#
+# Batch mode (<bundle_dir>/prompts.jsonl exists): one run per JSON line, writing
+# results/<id>.json, logs/<id>.out.txt, logs/<id>.err.txt and summary.json;
+# exits 1 if any run failed.
+# Single mode: takes FAST_MESSAGE, else <bundle_dir>/prompt.txt (exit 2 if
+# neither exists), writing results/default.json, logs/default.out.txt and
+# command.txt.
+set -euo pipefail
+
+BUNDLE_DIR="${1:?bundle_dir required}"
+OUT_DIR="${2:?out_dir required}"
+
+mkdir -p "$OUT_DIR/results" "$OUT_DIR/logs"
+# The manifest is informational; keep going when the bundle has none.
+cp -f "$BUNDLE_DIR/manifest.json" "$OUT_DIR/manifest.json" || true
+
+# Flags used by single mode; batch mode rebuilds the same list in Python below.
+COMMON=(fast-agent go)
+if [[ -f "$BUNDLE_DIR/fastagent.config.yaml" ]]; then
+  COMMON+=(--config-path "$BUNDLE_DIR/fastagent.config.yaml")
+fi
+COMMON+=(--skills-dir "$BUNDLE_DIR/skills")
+if [[ -d "$BUNDLE_DIR/cards" ]]; then
+  COMMON+=(--card "$BUNDLE_DIR/cards")
+fi
+if [[ -n "${FAST_AGENT:-}" ]]; then
+  COMMON+=(--agent "$FAST_AGENT")
+fi
+
+if [[ -f "$BUNDLE_DIR/prompts.jsonl" ]]; then
+  export BUNDLE_DIR OUT_DIR
+  python - <<'PY'
+import json, os, subprocess, pathlib, sys
+bundle = pathlib.Path(os.environ["BUNDLE_DIR"])
+out = pathlib.Path(os.environ["OUT_DIR"])
+base = ["fast-agent", "go"]
+if (bundle / "fastagent.config.yaml").exists():
+    base += ["--config-path", str(bundle / "fastagent.config.yaml")]
+base += ["--skills-dir", str(bundle / "skills")]
+if (bundle / "cards").exists():
+    base += ["--card", str(bundle / "cards")]
+agent = os.environ.get("FAST_AGENT", "")
+if agent:
+    base += ["--agent", agent]
+default_model = os.environ.get("FAST_MODEL", "")
+
+failures = 0
+summary = []
+for idx, line in enumerate((bundle / "prompts.jsonl").read_text(encoding="utf-8").splitlines(), start=1):
+    line = line.strip()
+    if not line:
+        continue
+    rec = json.loads(line)
+    rid = rec.get("id") or f"case_{idx:03d}"
+    # The id becomes a file name under results/ and logs/; refuse values that
+    # would leave those directories or point into a missing subdirectory.
+    rid_name = str(rid)
+    if rid_name in {".", ".."} or "/" in rid_name or "\\" in rid_name:
+        raise SystemExit(f"invalid id at line {idx}: {rid_name!r}")
+    msg = rec.get("message")
+    if not msg:
+        raise SystemExit(f"missing message at line {idx}")
+    model = rec.get("model") or default_model
+    result_path = out / "results" / f"{rid}.json"
+    cmd = base + ["--message", msg, "--results", str(result_path)]
+    if model:
+        cmd += ["--model", model]
+    stdout_path = out / "logs" / f"{rid}.out.txt"
+    stderr_path = out / "logs" / f"{rid}.err.txt"
+    with stdout_path.open("w", encoding="utf-8") as so, stderr_path.open("w", encoding="utf-8") as se:
+        proc = subprocess.run(cmd, stdout=so, stderr=se)
+    summary.append({"id": rid, "exit_code": proc.returncode, "model": model})
+    if proc.returncode != 0:
+        failures += 1
+
+(out / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
+if failures:
+    sys.exit(1)
+PY
+else
+  CMD=("${COMMON[@]}" --results "$OUT_DIR/results/default.json")
+  if [[ -n "${FAST_MODEL:-}" ]]; then
+    CMD+=(--model "$FAST_MODEL")
+  fi
+
+  if [[ -n "${FAST_MESSAGE:-}" ]]; then
+    CMD+=(--message "$FAST_MESSAGE")
+  elif [[ -f "$BUNDLE_DIR/prompt.txt" ]]; then
+    CMD+=(--prompt-file "$BUNDLE_DIR/prompt.txt")
+  else
+    echo "No message/prompt input found" >&2
+    exit 2
+  fi
+
+  echo "Running: ${CMD[*]}" | tee "$OUT_DIR/command.txt"
+  # pipefail (set above) makes this pipeline fail when fast-agent fails, even though tee succeeds.
+  ("${CMD[@]}") 2>&1 | tee "$OUT_DIR/logs/default.out.txt"
+fi
diff --git a/scripts/hf/submit_hf_job.sh b/scripts/hf/submit_hf_job.sh
new file mode 100755
index 0000000..468f1e4
--- /dev/null
+++ b/scripts/hf/submit_hf_job.sh
@@ -0,0 +1,298 @@
+#!/usr/bin/env bash
+# Bundle skills (plus optional cards, prompts and fastagent.config.yaml), upload
+# the bundle to a Hugging Face dataset repo, and submit an HF Job that runs it.
+set -euo pipefail
+
+readonly DEFAULT_IMAGE="ghcr.io/astral-sh/uv:python3.13-bookworm"
+# The job entrypoint is looked up next to this file so any cwd works.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+readonly SCRIPT_DIR
+
+usage() {
+  cat <<'USAGE'
+Usage:
+  submit_hf_job.sh \
+    --artifact-repo <namespace/repo> \
+    --skills-dir <path> \
+    [--card-dir <path>] \
+    [--agent <name>] \
+    [--model <name>] \
+    [--message <text> | --prompt-file <path> | --prompts-jsonl <path>] \
+    [--flavor cpu-basic] \
+    [--timeout 45m] \
+    [--image ghcr.io/astral-sh/uv:python3.13-bookworm] \
+    [--secrets HF_TOKEN,OPENAI_API_KEY] \
+    [--namespace my-org] \
+    [--yes] \
+    [--json]
+
+Notes:
+  - Artifacts are stored in dataset repo under inputs/<run_id>/ and outputs/<run_id>/
+  - prompts-jsonl mode expects one JSON object per line:
+      {"id":"case1","message":"...","model":"haiku"}
+USAGE
+}
+
+# Print a message to stderr and exit with status 1.
+fail() {
+  echo "$*" >&2
+  exit 1
+}
+
+# Fail clearly when an option is the last argument and so has no value.
+# Call as: require_value "$@"
+require_value() {
+  [[ $# -ge 2 ]] || fail "Missing value for $1"
+}
+
+# Trim surrounding whitespace by passing the value through xargs.
+trim() {
+  xargs <<<"$1"
+}
+
+# Escape a value for use inside a double-quoted JSON string. Covers backslash,
+# double quote, newline, carriage return and tab; nothing else is rewritten.
+json_escape() {
+  local value="$1"
+  value=${value//\\/\\\\}
+  value=${value//\"/\\\"}
+  value=${value//$'\n'/\\n}
+  value=${value//$'\r'/\\r}
+  value=${value//$'\t'/\\t}
+  printf '%s' "$value"
+}
+
+# Build the `--secrets NAME` flags from the comma-separated $SECRETS list and
+# report which names are set in the local environment.
+prepare_secret_flags() {
+  secret_keys=()
+  IFS=',' read -r -a secret_keys <<< "$SECRETS"
+  secret_flags=()
+  echo "Secrets to forward:"
+  # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a bare
+  # "${arr[@]}" is an unbound-variable error under `set -u` before bash 4.4.
+  for raw_key in ${secret_keys[@]+"${secret_keys[@]}"}; do
+    key="$(trim "$raw_key")"
+    [[ -n "$key" ]] || continue
+    # The ${!key} lookup below needs a valid variable name.
+    [[ "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || fail "Invalid secret name: $key"
+    if [[ -n "${!key:-}" ]]; then
+      echo "  - $key (present locally)"
+    else
+      echo "  - $key (NOT set locally)"
+    fi
+    secret_flags+=(--secrets "$key")
+  done
+}
+
+# Abort unless the artifact dataset repo is accessible with the current credentials.
+check_artifact_repo() {
+  hf download "$ARTIFACT_REPO" --repo-type dataset --dry-run --quiet >/dev/null || \
+    fail "Artifact repo $ARTIFACT_REPO is not accessible. Create it first and ensure your current Hugging Face credentials can access it."
+}
+
+# Upload the bundle, ask for confirmation (unless --yes), submit the job and
+# print its identifiers. The upload happens before the confirmation prompt.
+submit_bundle_job() {
+  check_artifact_repo
+
+  tar -czf "$tmpdir/bundle.tar.gz" -C "$tmpdir" bundle
+  hf upload "$ARTIFACT_REPO" "$tmpdir/bundle.tar.gz" "inputs/$RUN_ID/bundle.tar.gz" \
+    --repo-type dataset \
+    --commit-message "inputs: $RUN_ID" >/dev/null
+
+  prepare_secret_flags
+
+  if [[ "$AUTO_CONFIRM" != "1" ]]; then
+    read -r -p "Proceed with HF Job submission? [y/N] " confirm
+    [[ "$confirm" =~ ^[Yy]$ ]] || fail "Cancelled."
+  fi
+
+  ns_flags=()
+  if [[ -n "$NAMESPACE" ]]; then
+    ns_flags+=(--namespace "$NAMESPACE")
+  fi
+
+  # Guarded expansions keep empty flag arrays safe under `set -u` (bash < 4.4).
+  job_id="$(
+    hf jobs run \
+      --detach \
+      --flavor "$FLAVOR" \
+      --timeout "$TIMEOUT" \
+      ${ns_flags[@]+"${ns_flags[@]}"} \
+      ${secret_flags[@]+"${secret_flags[@]}"} \
+      ${env_flags[@]+"${env_flags[@]}"} \
+      -- \
+      "$IMAGE" \
+      bash -lc "$job_cmd"
+  )"
+
+  # Keep only the last line of output, trimmed, as the job id.
+  job_id="$(echo "$job_id" | tail -n 1 | xargs)"
+
+  if [[ "$JSON_OUTPUT" == "1" ]]; then
+    printf '{"job_id":"%s","run_id":"%s","artifact_repo":"%s"}\n' \
+      "$(json_escape "$job_id")" \
+      "$(json_escape "$RUN_ID")" \
+      "$(json_escape "$ARTIFACT_REPO")"
+  else
+    echo "JOB_ID=$job_id"
+    echo "RUN_ID=$RUN_ID"
+    echo "ARTIFACT_REPO=$ARTIFACT_REPO"
+  fi
+}
+
+ARTIFACT_REPO=""
+SKILLS_DIR=""
+CARD_DIR=""
+AGENT=""
+MODEL=""
+MESSAGE=""
+PROMPT_FILE=""
+PROMPTS_JSONL=""
+FLAVOR="cpu-basic"
+TIMEOUT="45m"
+IMAGE="$DEFAULT_IMAGE"
+SECRETS="HF_TOKEN"
+NAMESPACE=""
+RUN_ID=""
+AUTO_CONFIRM="0"
+JSON_OUTPUT="0"
+tmpdir=""
+job_cmd=""
+secret_flags=()
+env_flags=()
+ns_flags=()
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --artifact-repo) require_value "$@"; ARTIFACT_REPO="$2"; shift 2 ;;
+    --skills-dir) require_value "$@"; SKILLS_DIR="$2"; shift 2 ;;
+    --card-dir) require_value "$@"; CARD_DIR="$2"; shift 2 ;;
+    --agent) require_value "$@"; AGENT="$2"; shift 2 ;;
+    --model) require_value "$@"; MODEL="$2"; shift 2 ;;
+    --message) require_value "$@"; MESSAGE="$2"; shift 2 ;;
+    --prompt-file) require_value "$@"; PROMPT_FILE="$2"; shift 2 ;;
+    --prompts-jsonl) require_value "$@"; PROMPTS_JSONL="$2"; shift 2 ;;
+    --flavor) require_value "$@"; FLAVOR="$2"; shift 2 ;;
+    --timeout) require_value "$@"; TIMEOUT="$2"; shift 2 ;;
+    --image) require_value "$@"; IMAGE="$2"; shift 2 ;;
+    --secrets) require_value "$@"; SECRETS="$2"; shift 2 ;;
+    --namespace) require_value "$@"; NAMESPACE="$2"; shift 2 ;;
+    --yes) AUTO_CONFIRM="1"; shift 1 ;;
+    --json) JSON_OUTPUT="1"; shift 1 ;;
+    -h|--help) usage; exit 0 ;;
+    *) fail "Unknown arg: $1" ;;
+  esac
+done
+
+[[ -n "$ARTIFACT_REPO" && -n "$SKILLS_DIR" ]] || {
+  usage
+  exit 1
+}
+[[ -d "$SKILLS_DIR" ]] || fail "Skills dir not found: $SKILLS_DIR"
+
+input_modes=0
+[[ -n "$MESSAGE" ]] && input_modes=$((input_modes + 1))
+[[ -n "$PROMPT_FILE" ]] && input_modes=$((input_modes + 1))
+[[ -n "$PROMPTS_JSONL" ]] && input_modes=$((input_modes + 1))
+[[ "$input_modes" -eq 1 ]] || fail "Provide exactly one of --message, --prompt-file, or --prompts-jsonl"
+
+if [[ -n "$PROMPT_FILE" && ! -f "$PROMPT_FILE" ]]; then
+  fail "Prompt file not found: $PROMPT_FILE"
+fi
+if [[ -n "$PROMPTS_JSONL" && ! -f "$PROMPTS_JSONL" ]]; then
+  fail "Prompts JSONL not found: $PROMPTS_JSONL"
+fi
+if [[ -n "$CARD_DIR" && ! -d "$CARD_DIR" ]]; then
+  fail "Card dir not found: $CARD_DIR"
+fi
+if [[ -n "$AGENT" && -z "$CARD_DIR" ]]; then
+  fail "--agent requires --card-dir"
+fi
+
+# RUN_ID was reset to "" above, so a fresh timestamped id is always generated.
+RUN_ID="${RUN_ID:-$(date -u +'%Y%m%dT%H%M%SZ')_fast-agent}"
+echo "RUN_ID=$RUN_ID"
+
+tmpdir="$(mktemp -d)"
+trap 'rm -rf "$tmpdir"' EXIT
+bundle_dir="$tmpdir/bundle"
+mkdir -p "$bundle_dir"
+
+cp -R "$SKILLS_DIR" "$bundle_dir/skills"
+cp "$SCRIPT_DIR/job_entrypoint_fast_agent.sh" "$bundle_dir/job_entrypoint.sh"
+chmod +x "$bundle_dir/job_entrypoint.sh"
+
+if [[ -n "$CARD_DIR" ]]; then
+  cp -R "$CARD_DIR" "$bundle_dir/cards"
+fi
+if [[ -n "$PROMPT_FILE" ]]; then
+  cp "$PROMPT_FILE" "$bundle_dir/prompt.txt"
+fi
+if [[ -n "$PROMPTS_JSONL" ]]; then
+  cp "$PROMPTS_JSONL" "$bundle_dir/prompts.jsonl"
+fi
+if [[ -f "fastagent.config.yaml" ]]; then
+  cp "fastagent.config.yaml" "$bundle_dir/fastagent.config.yaml"
+fi
+
+mode_name="prompts-jsonl"
+if [[ -n "$MESSAGE" ]]; then
+  mode_name="message"
+elif [[ -n "$PROMPT_FILE" ]]; then
+  mode_name="prompt-file"
+fi
+
+# Values are JSON-escaped so quotes or backslashes in paths/names stay valid JSON.
+cat > "$bundle_dir/manifest.json" <<JSON
+{
+  "run_id": "$(json_escape "$RUN_ID")",
+  "artifact_repo": "$(json_escape "$ARTIFACT_REPO")",
+  "skills_dir": "$(json_escape "$SKILLS_DIR")",
+  "card_dir": "$(json_escape "$CARD_DIR")",
+  "agent": "$(json_escape "$AGENT")",
+  "model": "$(json_escape "$MODEL")",
+  "mode": "$mode_name",
+  "created_at_utc": "$(date -u +'%Y-%m-%dT%H:%M:%SZ')"
+}
+JSON
+
+# Runs inside the job container. Single-quoted so the variables expand there;
+# ARTIFACT_REPO and RUN_ID reach the container through env_flags below.
+job_cmd='
+set -euo pipefail
+WORK=/workspace
+mkdir -p "$WORK/out"
+cd "$WORK"
+
+uv pip install --system "huggingface_hub==1.7.2" "fast-agent-mcp==0.6.7"
+
+hf download "$ARTIFACT_REPO" "inputs/$RUN_ID/bundle.tar.gz" --repo-type dataset --local-dir "$WORK"
+tar -xzf "$WORK/inputs/$RUN_ID/bundle.tar.gz" -C "$WORK"
+
+set +e
+bash "$WORK/bundle/job_entrypoint.sh" "$WORK/bundle" "$WORK/out"
+status=$?
+set -e
+
+echo "$status" > "$WORK/out/exit_code.txt"
+
+hf upload "$ARTIFACT_REPO" "$WORK/out" "outputs/$RUN_ID" \
+  --repo-type dataset \
+  --commit-message "outputs: $RUN_ID (exit=$status)"
+
+exit "$status"
+'
+
+env_flags=(
+  --env "ARTIFACT_REPO=$ARTIFACT_REPO"
+  --env "RUN_ID=$RUN_ID"
+  --env "FAST_MODEL=$MODEL"
+  --env "FAST_AGENT=$AGENT"
+)
+if [[ -n "$MESSAGE" ]]; then
+  env_flags+=(--env "FAST_MESSAGE=$MESSAGE")
+fi
+
+submit_bundle_job
diff --git a/scripts/lint.py b/scripts/lint.py
new file mode 100644
index 0000000..2ba01cb
--- /dev/null
+++ b/scripts/lint.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+
+DEFAULT_PATHS = ["src", "tests", "scripts"]
+
+
+def main() -> int:
+    """Run ``ruff check`` over the given paths and return ruff's exit code.
+
+    ``--fix`` is forwarded to ruff when requested. Returns 1 when the ``ruff``
+    executable cannot be found.
+    """
+    cli = argparse.ArgumentParser(description="Run ruff lint checks for the repo.")
+    cli.add_argument("--fix", action="store_true", help="Apply safe ruff fixes.")
+    cli.add_argument("paths", nargs="*", default=DEFAULT_PATHS, help="Optional paths to lint.")
+    options = cli.parse_args()
+
+    fix_flags = ["--fix"] if options.fix else []
+    ruff_argv = ["ruff", "check", *fix_flags, *options.paths]
+
+    try:
+        exit_code = subprocess.run(ruff_argv, check=False).returncode
+    except FileNotFoundError:
+        print("Error: `ruff` is not installed in the current environment.", file=sys.stderr)
+        return 1
+
+    return exit_code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/typecheck.py b/scripts/typecheck.py
new file mode 100644
index 0000000..775e753
--- /dev/null
+++ b/scripts/typecheck.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+
+DEFAULT_PATHS = ["src", "tests", "scripts"]
+
+
+def main() -> int:
+    """Run ``ty check`` over the given paths and return ty's exit code.
+
+    Returns 1 when the ``ty`` executable cannot be found.
+    """
+    cli = argparse.ArgumentParser(description="Run ty type checks for the repo.")
+    cli.add_argument(
+        "paths",
+        nargs="*",
+        default=DEFAULT_PATHS,
+        help="Optional paths to type check.",
+    )
+    targets = cli.parse_args().paths
+
+    try:
+        exit_code = subprocess.run(["ty", "check", *targets], check=False).returncode
+    except FileNotFoundError:
+        print("Error: `ty` is not installed in the current environment.", file=sys.stderr)
+        return 1
+
+    return exit_code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/smoke_test.sh b/smoke_test.sh
new file mode 100755
index 0000000..fed7d08
--- /dev/null
+++ b/smoke_test.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+# End-to-end smoke test: generate a skill and tests, then evaluate the skill
+# remotely (HF Jobs) and locally. START_AT=prepare|remote|local picks the first
+# stage to run; later stages always run.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$ROOT_DIR"
+
+TASK="${TASK:-write a good pull request description}"
+MODEL="${MODEL:-qwen35}"
+GENERATE_MODEL="${GENERATE_MODEL:-$MODEL}"
+TEST_GEN_MODEL="${TEST_GEN_MODEL:-opus}"
+START_AT="${START_AT:-prepare}"
+ARTIFACT_REPO="${ARTIFACT_REPO:?Set ARTIFACT_REPO to <namespace>/upskill-evals}"
+JOBS_SECRETS="${JOBS_SECRETS:?Set JOBS_SECRETS, e.g. HF_TOKEN,OPENROUTER_API_KEY}"
+JOBS_TIMEOUT="${JOBS_TIMEOUT:-45m}"
+JOBS_FLAVOR="${JOBS_FLAVOR:-cpu-basic}"
+OUT_ROOT="${OUT_ROOT:-$ROOT_DIR/.smoke-test}"
+SKILL_OUTPUT="${SKILL_OUTPUT:-$OUT_ROOT/generated-skill}"
+LOCAL_RUNS_DIR="${LOCAL_RUNS_DIR:-$OUT_ROOT/local-runs}"
+REMOTE_RUNS_DIR="${REMOTE_RUNS_DIR:-$OUT_ROOT/remote-runs}"
+
+mkdir -p "$OUT_ROOT"
+
+if [[ "$START_AT" != "prepare" && "$START_AT" != "remote" && "$START_AT" != "local" ]]; then
+  echo "START_AT must be one of: prepare, remote, local" >&2
+  exit 1
+fi
+
+has_prepared_skill=0
+if [[ -f "$SKILL_OUTPUT/SKILL.md" && -f "$SKILL_OUTPUT/skill_meta.json" ]]; then
+  has_prepared_skill=1
+fi
+
+echo "== Model secret check =="
+# Informational only: a failing check must not stop the smoke test.
+fast-agent check models --for-model "$MODEL" --json || true
+
+if [[ "$START_AT" == "prepare" ]]; then
+  echo
+  echo "== Prepare skill + tests (no eval) =="
+  rm -rf "$SKILL_OUTPUT"
+  mkdir -p "$(dirname "$SKILL_OUTPUT")"
+  export SMOKE_TASK="$TASK"
+  export SMOKE_GENERATE_MODEL="$GENERATE_MODEL"
+  export SMOKE_TEST_GEN_MODEL="$TEST_GEN_MODEL"
+  export SMOKE_SKILL_OUTPUT="$SKILL_OUTPUT"
+  uv run python - <<'PY'
+import asyncio
+import os
+from pathlib import Path
+
+from upskill.cli import _fast_agent_context
+from upskill.config import Config
+from upskill.generate import generate_skill, generate_tests
+
+
+async def main() -> None:
+    task = os.environ["SMOKE_TASK"]
+    generate_model = os.environ["SMOKE_GENERATE_MODEL"]
+    test_gen_model = os.environ["SMOKE_TEST_GEN_MODEL"]
+    output_path = Path(os.environ["SMOKE_SKILL_OUTPUT"])
+    config = Config.load()
+
+    # The skill_gen / test_gen agent cards take their models from the
+    # `$system.skill_gen` / `$system.test_gen` references, so supply both slots
+    # when the session is created.
+    model_references = {
+        "system": {
+            "skill_gen": generate_model,
+            "test_gen": test_gen_model,
+        },
+    }
+
+    async with _fast_agent_context(config, model_references=model_references) as agent:
+        skill = await generate_skill(
+            task=task,
+            generator=agent.skill_gen,
+            model=generate_model,
+        )
+        tests = await generate_tests(
+            task=task,
+            generator=agent.test_gen,
+            model=test_gen_model,
+        )
+    skill.save(output_path, tests=tests)
+    print(f"Prepared skill with tests at {output_path}")
+
+
+asyncio.run(main())
+PY
+  has_prepared_skill=1
+else
+  echo
+  echo "== Reusing prepared skill =="
+  if [[ "$has_prepared_skill" != "1" ]]; then
+    echo "Prepared skill not found at $SKILL_OUTPUT" >&2
+    echo "Run with START_AT=prepare first." >&2
+    exit 1
+  fi
+  echo "Using $SKILL_OUTPUT"
+fi
+
+if [[ "$START_AT" == "prepare" || "$START_AT" == "remote" ]]; then
+  echo
+  echo "== Remote eval via HF Jobs =="
+  uv run upskill eval "$SKILL_OUTPUT" \
+    --executor jobs \
+    --artifact-repo "$ARTIFACT_REPO" \
+    -m "$MODEL" \
+    --wait \
+    --jobs-timeout "$JOBS_TIMEOUT" \
+    --jobs-flavor "$JOBS_FLAVOR" \
+    --jobs-secrets "$JOBS_SECRETS" \
+    --runs-dir "$REMOTE_RUNS_DIR"
+fi
+
+# START_AT was validated above, so this condition holds for every accepted value.
+if [[ "$START_AT" == "prepare" || "$START_AT" == "remote" || "$START_AT" == "local" ]]; then
+  echo
+  echo "== Local eval via local shell-out executor =="
+  uv run upskill eval "$SKILL_OUTPUT" \
+    --executor local \
+    -m "$GENERATE_MODEL" \
+    --runs-dir "$LOCAL_RUNS_DIR"
+fi
+
+echo
+echo "Smoke test complete."
+echo "  Skill output:   $SKILL_OUTPUT"
+echo "  Local runs:     $LOCAL_RUNS_DIR"
+echo "  Remote runs:    $REMOTE_RUNS_DIR"
diff --git a/src/upskill/__init__.py b/src/upskill/__init__.py
index 40b0c50..088a9c6 100644
--- a/src/upskill/__init__.py
+++ b/src/upskill/__init__.py
@@ -22,33 +22,32 @@
     RunResult,
     Skill,
     SkillMetadata,
+    SkillRecord,
+    SkillState,
     TestCase,
     TestResult,
 )
 
 __all__ = [
-    # Config
+    "BatchSummary",
     "Config",
-    # Models
+    "ConversationStats",
+    "EvalResults",
+    "RunMetadata",
+    "RunResult",
     "Skill",
     "SkillMetadata",
+    "SkillRecord",
+    "SkillState",
     "TestCase",
     "TestResult",
-    "EvalResults",
-    "RunMetadata",
-    "RunResult",
-    "ConversationStats",
-    "BatchSummary",
-    # Generation
-    "generate_skill",
-    "generate_tests",
-    "refine_skill",
-    # Evaluation
-    "evaluate_skill",
-    # Logging
     "create_batch_folder",
     "create_run_folder",
+    "evaluate_skill",
     "extract_stats_from_summary",
+    "generate_skill",
+    "generate_tests",
+    "refine_skill",
     "summarize_runs_to_csv",
     "write_batch_summary",
     "write_run_metadata",
diff --git a/src/upskill/agent_cards/evaluator.md b/src/upskill/agent_cards/evaluator.md
index ecc2cd6..e321bcc 100644
--- a/src/upskill/agent_cards/evaluator.md
+++ b/src/upskill/agent_cards/evaluator.md
@@ -1,8 +1,24 @@
 ---
+# This file describes the agent that is used to evaluate the skill.
+# The system prompt used for the agent is below the frontmatter.
+# Content can be included with {{file:}}, {{fileSilent:}} or {{url:https://....}} (useful for remote control).
 description: Evaluate skill performance against test cases.
-skills: ["./skills"]
 # you can add mcp servers in here if needed. (reference name from config file)
+
+
+#mcp_connect:
+#  - target: "https://huggingface.co/mcp"
+#    headers:
+#      Authorization: "Bearer ${TOKEN}"
+
+# Note: MCP Servers hosted on Hugging Face get HF_TOKEN handling automatically
+# Target can include npx/uvx package names, or a shell command to start STDIO
+
 ---
-You are an evaluator of skills. You are given a skill and a test case. You need to evaluate the skill on the test case and return a score.
+You are an evaluator of skills. You are given a skill and a test case. 
+
+You need to evaluate the skill on the test case and return a score.
+
+{{agentSkills}}
 
-{{agentSkills}}
\ No newline at end of file
+{{env}}
diff --git a/src/upskill/agent_cards/skill_gen.md b/src/upskill/agent_cards/skill_gen.md
index b0ab3b0..a595a93 100644
--- a/src/upskill/agent_cards/skill_gen.md
+++ b/src/upskill/agent_cards/skill_gen.md
@@ -1,6 +1,9 @@
 ---
 type: agent
 description: Generate skill documents from task descriptions.
+skills: []
+shell: false
+model: $system.skill_gen
 ---
 You generate "skills" - instruction documents that teach AI coding agents how to perform tasks.
 
diff --git a/src/upskill/agent_cards/test_gen.md b/src/upskill/agent_cards/test_gen.md
index 3de8ebe..607a6a1 100644
--- a/src/upskill/agent_cards/test_gen.md
+++ b/src/upskill/agent_cards/test_gen.md
@@ -1,5 +1,6 @@
 ---
 type: agent
 description: Generate test cases for evaluating skills.
+model: $system.test_gen
 ---
 You generate test cases for evaluating AI agent skills. Output only valid JSON.
diff --git a/src/upskill/artifacts.py b/src/upskill/artifacts.py
new file mode 100644
index 0000000..b80da7f
--- /dev/null
+++ b/src/upskill/artifacts.py
@@ -0,0 +1,182 @@
+"""Helpers for evaluation artifact materialization."""
+
+from __future__ import annotations
+
+import json
+import re
+import shutil
+from dataclasses import asdict
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from upskill.executors.contracts import ExecutionRequest
+
+# Matches runs of characters outside [a-z0-9]; used to build artifact names.
+_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")
+# File suffixes that bundle_agent_card() treats as agent card definitions.
+_AGENT_CARD_FILE_EXTENSIONS = {
+    ".json",
+    ".markdown",
+    ".md",
+    ".yaml",
+    ".yml",
+}
+
+
+def sanitize_artifact_name(value: str) -> str:
+    """Convert a human-facing label into a filesystem-friendly name.
+
+    The label is stripped and lowercased, each run of characters outside
+    ``[a-z0-9]`` becomes a single ``-``, and leading/trailing dashes are dropped.
+    Returns ``"execution"`` when nothing remains.
+    """
+    normalized = _NON_ALNUM_RE.sub("-", value.strip().lower()).strip("-")
+    return normalized or "execution"
+
+
+def ensure_directory(path: Path) -> Path:
+    """Create ``path`` (and any missing parents) if needed and return it."""
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+
+
+def validate_workspace_relative_path(relative_path: str) -> Path:
+    """Validate that a workspace file path stays within the workspace root.
+
+    Returns the path as a ``Path``.
+
+    Raises:
+        ValueError: If the path is absolute, contains a ``..`` component, or is
+            empty / ``.`` and therefore names the workspace root itself.
+    """
+    normalized = Path(relative_path)
+    if normalized.is_absolute():
+        raise ValueError(f"Workspace file path must be relative: {relative_path}")
+    if any(part == ".." for part in normalized.parts):
+        raise ValueError(f"Workspace file path must not traverse parents: {relative_path}")
+    # "" and "." both normalize to zero parts, i.e. the workspace root. Writing a
+    # file there can only fail, so reject it during validation instead.
+    if not normalized.parts:
+        raise ValueError(f"Workspace file path must name a file: {relative_path!r}")
+    return normalized
+
+
+def materialize_workspace(workspace_dir: Path, workspace_files: dict[str, str]) -> None:
+    """Write test workspace files into a preserved workspace directory.
+
+    ``workspace_files`` maps workspace-relative paths to UTF-8 text content.
+    Each path is validated first; missing parent directories are created.
+
+    Raises:
+        ValueError: If a path is rejected by ``validate_workspace_relative_path``.
+    """
+    ensure_directory(workspace_dir)
+    for relative_path, content in workspace_files.items():
+        file_path = workspace_dir / validate_workspace_relative_path(relative_path)
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        file_path.write_text(content, encoding="utf-8")
+
+
+def bundle_cards(
+    source_dir: Path,
+    destination_dir: Path,
+) -> Path:
+    """Copy the agent card bundle into the artifact directory.
+
+    Any existing ``destination_dir`` is removed first. Returns ``destination_dir``.
+    """
+    if destination_dir.exists():
+        shutil.rmtree(destination_dir)
+    shutil.copytree(source_dir, destination_dir)
+    return destination_dir
+
+
+def bundle_agent_card(
+    source_dir: Path,
+    destination_dir: Path,
+    *,
+    agent_name: str,
+) -> Path:
+    """Copy only the selected agent card plus shared non-card resources.
+
+    Any existing ``destination_dir`` is removed first. ``source_dir`` may be a
+    single card file, which is copied when its stem equals ``agent_name``. For a
+    directory, subdirectories and files without a card suffix are copied, the
+    card whose stem equals ``agent_name`` is copied, and other cards are skipped.
+
+    Raises:
+        FileNotFoundError: If no card matching ``agent_name`` is found.
+    """
+    if destination_dir.exists():
+        shutil.rmtree(destination_dir)
+    ensure_directory(destination_dir)
+
+    if source_dir.is_file():
+        if source_dir.stem != agent_name:
+            raise FileNotFoundError(
+                f"Requested agent card {agent_name!r} does not match source file {source_dir.name!r}."
+            )
+        shutil.copy2(source_dir, destination_dir / source_dir.name)
+        return destination_dir
+
+    matched_card = False
+    for item in source_dir.iterdir():
+        destination = destination_dir / item.name
+        if item.is_dir():
+            shutil.copytree(item, destination)
+            continue
+        if item.stem == agent_name and item.suffix in _AGENT_CARD_FILE_EXTENSIONS:
+            shutil.copy2(item, destination)
+            matched_card = True
+            continue
+        # Remaining card-suffixed files belong to other agents and are left out.
+        if item.suffix not in _AGENT_CARD_FILE_EXTENSIONS:
+            shutil.copy2(item, destination)
+
+    if not matched_card:
+        raise FileNotFoundError(
+            f"Could not find an agent card named {agent_name!r} in {source_dir}."
+        )
+    return destination_dir
+
+
+def materialize_skill_bundle(
+    destination_dir: Path,
+    request: ExecutionRequest,
+) -> Path:
+    """Create the explicit skills root for a run.
+
+    When the request carries a skill it is saved under
+    ``destination_dir / request.skill.name``. Returns ``destination_dir``.
+    """
+    ensure_directory(destination_dir)
+    if request.skill is not None:
+        request.skill.save(destination_dir / request.skill.name)
+    return destination_dir
+
+
+def write_request_file(path: Path, request: ExecutionRequest) -> None:
+    """Persist request metadata for debugging and provenance.
+
+    The request is converted with ``dataclasses.asdict``; a present skill is
+    replaced by its ``model_dump(mode="json")`` form. Values ``json`` cannot
+    encode natively are written via ``str``.
+    """
+    payload = asdict(request)
+    if request.skill is not None:
+        payload["skill"] = request.skill.model_dump(mode="json")
+    path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")
+
+
+def copy_config_file(source: Path, destination: Path) -> Path | None:
+    """Preserve the fast-agent config used for a run when one exists.
+
+    Returns ``destination`` after copying, or ``None`` when ``source`` is missing.
+    """
+    if not source.exists():
+        return None
+
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(source, destination)
+    return destination
diff --git a/src/upskill/cli.py b/src/upskill/cli.py
index 1a9d63c..d38396a 100644
--- a/src/upskill/cli.py
+++ b/src/upskill/cli.py
@@ -1,15 +1,15 @@
 """CLI interface for upskill."""
+
 from __future__ import annotations
 
 import asyncio
-import inspect
 import json
 import sys
-from collections.abc import AsyncIterator
+from collections.abc import Callable, Mapping
 from contextlib import asynccontextmanager
 from importlib import resources
 from pathlib import Path
-from typing import TypedDict
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict, TypeVar, cast
 
 import click
 from dotenv import load_dotenv
@@ -20,8 +20,11 @@
 from rich.tree import Tree
 
 from upskill.config import Config, resolve_upskill_config_path
-from upskill.evaluate import evaluate_skill, get_failure_descriptions
+from upskill.evaluate import build_eval_requests, evaluate_skill, get_failure_descriptions
+from upskill.executors.local_fast_agent import LocalFastAgentExecutor
+from upskill.executors.remote_fast_agent import RemoteFastAgentExecutor
 from upskill.generate import generate_skill, generate_tests, improve_skill, refine_skill
+from upskill.hf_jobs import JobsConfig, verify_artifact_repo_access
 from upskill.logging import (
     aggregate_conversation_stats,
     create_batch_folder,
@@ -33,23 +36,144 @@
     write_run_metadata,
     write_run_result,
 )
-from upskill.model_resolution import ResolvedModels, resolve_models
+from upskill.model_resolution import (
+    ResolvedModels,
+    build_fastagent_model_references,
+    resolve_models,
+)
 from upskill.models import (
     BatchSummary,
+    EvalResults,
     RunMetadata,
     RunResult,
     Skill,
+    SkillRecord,
     TestCase,
     TestResult,
 )
 
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from fast_agent.agents.llm_agent import LlmAgent
+    from fast_agent.interfaces import AgentProtocol
+
+    from upskill.executors.base import Executor
+
 load_dotenv()
 
 console = Console()
 
 
+class FastAgentSession(Protocol):
+    """Typed view of the loaded fast-agent session used by upskill."""
+
+    skill_gen: AgentProtocol  # agent used as the skill generator
+    test_gen: AgentProtocol  # agent used as the test-case generator
+    evaluator: LlmAgent  # presumably the agent_cards/evaluator.md agent — TODO confirm
+
+
+class FastAgentConfig(Protocol):
+    """Typed view of the fast-agent settings object used by upskill."""
+
+    model_references: Mapping[str, Mapping[str, str]]  # namespace -> {slot name -> model}
+
+
+class FastAgentApp(Protocol):
+    """Typed view of the fast-agent app container used by upskill."""
+
+    _config_or_path: FastAgentConfig  # NOTE(review): private attr; may hold a path — confirm
+
+
+class FastAgentWithConfig(Protocol):
+    """Typed view of FastAgent for model-reference injection."""
+
+    app: FastAgentApp  # read as `fast.app._config_or_path` when installing model references
+
+
+EvalPlotLabelField = Literal["model", "skill_name"]
+ExecutorName = Literal["local", "jobs"]
+CommandFunction = TypeVar("CommandFunction", bound=Callable[..., object])
+
+
+def _jobs_execution_options(
+    *,
+    executor_help: str,
+    runs_dir_help: str,
+) -> Callable[[CommandFunction], CommandFunction]:
+    """Return a decorator that adds the shared execution and run-logging options to a command."""
+    # TODO: add a resumable remote-job collection flow before revisiting the wait-by-default
+    # behavior for generate/eval jobs.
+    options = (
+        click.option(
+            "--executor",
+            type=click.Choice(["local", "jobs"]),
+            default=None,
+            help=f"{executor_help}. Overrides `executor` in upskill.config.yaml.",
+        ),
+        click.option(
+            "--artifact-repo",
+            help="Dataset repo for remote job artifacts (required with --executor jobs)",
+        ),
+        click.option(
+            "--wait/--no-wait",
+            default=True,
+            help="Wait for remote jobs and download results (default: wait)",
+        ),
+        click.option(
+            "--jobs-timeout",
+            default="2h",
+            show_default=True,
+            help="HF Jobs timeout for remote runs",
+        ),
+        click.option(
+            "--jobs-flavor",
+            default="cpu-basic",
+            show_default=True,
+            help="HF Jobs hardware flavor for remote runs",
+        ),
+        click.option(
+            "--jobs-secrets",
+            default=None,
+            help=(
+                "Comma-separated HF Job secret names to forward (environment variables). Overrides "
+                "`jobs_secrets` in upskill.config.yaml."
+            ),
+        ),
+        click.option(
+            "--jobs-namespace",
+            help="Hugging Face Jobs namespace (recommended for remote jobs)",
+        ),
+        click.option(
+            "--max-parallel",
+            type=click.IntRange(min=1),
+            default=None,
+            help=(
+                "Maximum concurrent evaluation executions per phase. Overrides "
+                "`max_parallel` in upskill.config.yaml."
+            ),
+        ),
+        click.option("--runs-dir", type=click.Path(), help=runs_dir_help),
+        click.option(
+            "--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)"
+        ),
+    )
+
+    def decorator(function: CommandFunction) -> CommandFunction:
+        wrapped = function
+        for option in reversed(options):  # last-to-first, like stacked decorators
+            wrapped = option(wrapped)
+        return wrapped
+
+    return decorator
+
+
 @asynccontextmanager
-async def _fast_agent_context(config: Config | None = None) -> AsyncIterator[object]:
+async def _fast_agent_context(
+    config: Config | None = None,
+    *,
+    model_references: Mapping[str, Mapping[str, str]] | None = None,
+) -> AsyncIterator[FastAgentSession]:
     config = config or Config.load()
     fast = FastAgent(
         "upskill",
@@ -59,27 +183,41 @@ async def _fast_agent_context(config: Config | None = None) -> AsyncIterator[obj
     )
 
     @fast.agent()
-    async def empty():
+    async def empty() -> None:
         pass
 
     cards = resources.files("upskill").joinpath("agent_cards")
     with resources.as_file(cards) as cards_path:
         fast.load_agents(cards_path)
 
+    _install_fast_agent_model_references(
+        cast("FastAgentWithConfig", fast),
+        model_references=model_references,
+    )
+
     async with fast.run() as agent:
-        yield agent
+        yield cast("FastAgentSession", agent)
 
 
-async def _set_agent_model(agent: object, model: str | None) -> None:
-    """Best-effort model assignment for a fast-agent instance."""
-    if not model:
-        return
-    set_model = getattr(agent, "set_model", None)
-    if not callable(set_model):
+def _install_fast_agent_model_references(
+    fast: FastAgentWithConfig,
+    *,
+    model_references: Mapping[str, Mapping[str, str]] | None,
+) -> None:
+    """Merge upskill's model slots into the fast-agent config before agent creation."""
+    if not model_references:
         return
-    result = set_model(model)
-    if inspect.isawaitable(result):
-        await result
+
+    fast_config = fast.app._config_or_path
+    merged_references = {
+        namespace: dict(entries) for namespace, entries in fast_config.model_references.items()
+    }
+
+    for namespace, entries in model_references.items():
+        namespace_references = merged_references.setdefault(namespace, {})
+        namespace_references.update(entries)
+
+    fast_config.model_references = merged_references
 
 
 def _require_resolved_model(value: str | None, *, field: str, command: str) -> str:
@@ -100,6 +238,106 @@ def _require_resolved_models(values: list[str], *, field: str, command: str) ->
     return values
 
 
+def _require_path(value: Path | None, *, field: str, command: str) -> Path:
+    """Return ``value`` unchanged, raising ``RuntimeError`` if it was never resolved."""
+    if value is not None:
+        return value
+    raise RuntimeError(f"Internal bug: `{command}` requires `{field}` to be set.")
+
+
+def _build_executor(
+    name: ExecutorName,
+    *,
+    jobs_config: JobsConfig | None = None,
+    progress_callback: Callable[[str], None] | None = None,
+) -> Executor:
+    """Map an executor name to an instance; remote runs preflight the artifact repo."""
+    if name != "local":
+        if jobs_config is None:
+            raise click.ClickException("The jobs executor requires jobs configuration.")
+        _ensure_jobs_artifact_repo_access(jobs_config)
+        return RemoteFastAgentExecutor(
+            jobs_config=jobs_config,
+            progress_callback=progress_callback,
+        )
+    return LocalFastAgentExecutor()
+
+
+def _resolve_executor_name(config: Config, cli_executor_name: ExecutorName | None) -> ExecutorName:
+    """Choose the execution backend: a truthy CLI value wins, else ``config.executor``."""
+    return cli_executor_name if cli_executor_name else config.executor
+
+
+def _resolve_num_runs(
+    config: Config,
+    cli_num_runs: int | None,
+    *,
+    command: Literal["eval", "benchmark"],
+) -> int:
+    """Choose the run count: the CLI value when given, else the config's per-command value."""
+    if cli_num_runs is None:
+        return config.effective_num_runs(command)
+    return cli_num_runs
+
+
+def _resolve_max_parallel(config: Config, cli_max_parallel: int | None) -> int:
+    """Choose the concurrency limit: the CLI value when given, else ``config.max_parallel``."""
+    if cli_max_parallel is None:
+        return config.max_parallel
+    return cli_max_parallel
+
+
+def _resolve_jobs_secrets(config: Config, cli_jobs_secrets: str | None) -> str:
+    """Choose the HF Jobs secrets: the CLI value when given, else ``config.jobs_secrets``."""
+    if cli_jobs_secrets is None:
+        return config.jobs_secrets
+    return cli_jobs_secrets
+
+
+def _require_jobs_config(
+    *,
+    executor_name: ExecutorName,
+    artifact_repo: str | None,
+    wait: bool,
+    jobs_timeout: str,
+    jobs_flavor: str,
+    jobs_secrets: str,
+    jobs_namespace: str | None,
+    jobs_image: str,
+) -> JobsConfig | None:
+    """Return a ``JobsConfig`` for the jobs executor, or ``None`` for any other executor."""
+    if executor_name != "jobs":
+        return None
+    if artifact_repo:
+        return JobsConfig(
+            artifact_repo=artifact_repo,
+            wait=wait,
+            jobs_timeout=jobs_timeout,
+            jobs_flavor=jobs_flavor,
+            jobs_secrets=jobs_secrets,
+            jobs_namespace=jobs_namespace,
+            jobs_image=jobs_image,
+        )
+    raise click.ClickException("--artifact-repo is required when using --executor jobs.")
+
+
+def _ensure_jobs_artifact_repo_access(jobs_config: JobsConfig) -> None:
+    """Check the artifact repo is reachable; convert ``RuntimeError`` into a Click error."""
+    try:
+        verify_artifact_repo_access(jobs_config.artifact_repo)
+    except RuntimeError as exc:
+        detail = str(exc)
+        hint = ""
+        if "Repository Not Found" in detail or "404 Not Found" in detail:
+            hint = "\nThe dataset repo does not exist or the name is wrong."
+        raise click.ClickException(
+            "Artifact repo is not accessible.\n"
+            f"Repo: {jobs_config.artifact_repo}\n"
+            "Create it before submitting jobs and ensure the current Hugging Face "
+            f"credentials can access it.{hint}"
+        ) from exc
+
+
 def _print_model_plan(command: str, resolved: ResolvedModels, runs: int | None = None) -> None:
     """Print resolved model plan for command execution."""
     console.print("[dim]Resolved model plan:[/dim]")
@@ -117,12 +355,12 @@ def _print_model_plan(command: str, resolved: ResolvedModels, runs: int | None =
         console.print(f"  Evaluation Model(s): {models}")
         if runs is not None:
             console.print(f"  Runs per model: {runs}")
-        baseline_state = "off (benchmark mode)" if resolved.is_benchmark_mode else (
-            "on" if resolved.run_baseline else "off"
-        )
-        console.print(
-            f"  Baseline: {baseline_state}"
+        baseline_state = (
+            "off (benchmark mode)"
+            if resolved.is_benchmark_mode
+            else ("on" if resolved.run_baseline else "off")
         )
+        console.print(f"  Baseline: {baseline_state}")
         console.print(f"  Test Generation Model: {resolved.test_generation_model}")
 
 
@@ -131,11 +369,16 @@ def _render_bar(value: float, width: int = 20) -> str:
     if width <= 0:
         return ""
     clamped = max(0.0, min(1.0, value))
-    filled = int(round(clamped * width))
+    filled = round(clamped * width)
     empty = width - filled
     return "█" * filled + "░" * empty
 
 
+def _print_eval_progress(message: str) -> None:
+    """Print ``message`` to the console wrapped in ``[dim]`` markup."""
+    console.print(f"[dim]{message}[/dim]")
+
+
 class EvalPlotResult(TypedDict):
     """Structured plot data for eval runs."""
 
@@ -169,6 +412,562 @@ def _select_baseline_run(
     return baseline_runs[-1]
 
 
+def _build_logged_run_result(
+    *,
+    model: str,
+    task: str,
+    batch_id: str,
+    run_number: int,
+    test_results: list[TestResult],
+    assertions_total: int,
+    passed: bool,
+    run_type: str,
+    skill_name: str,
+) -> RunResult:
+    """Summarize test results into a ``RunResult``, tallying passed/total assertions."""
+    passed_count = 0
+    total_count = 0
+    for test_result in test_results:
+        validation = test_result.validation_result
+        if validation is None:
+            passed_count += int(test_result.success)
+            total_count += 1
+        else:
+            passed_count += validation.assertions_passed
+            total_count += validation.assertions_total
+
+    return RunResult(
+        metadata=RunMetadata(
+            model=model,
+            task=task,
+            batch_id=batch_id,
+            run_number=run_number,
+        ),
+        stats=aggregate_conversation_stats(test_results),
+        passed=passed,
+        assertions_passed=passed_count,
+        assertions_total=total_count or assertions_total,
+        run_type=run_type,
+        skill_name=skill_name,
+    )
+
+
+def _persist_logged_run(run_folder: Path, run_result: RunResult) -> None:
+    """Write a run's metadata and then its result summary into ``run_folder``."""
+    write_run_metadata(run_folder, run_result.metadata)
+    write_run_result(run_folder, run_result)
+
+
+def _persist_comparison_run_results(
+    *,
+    batch_folder: Path,
+    model: str,
+    task: str,
+    batch_id: str,
+    first_run_number: int,
+    results: EvalResults,
+    assertions_total: int,
+    run_baseline: bool,
+    with_skill_passed: bool,
+    skill_name: str,
+) -> list[RunResult]:
+    """Write run summaries for the phases of one evaluation pass.
+
+    The optional baseline phase is written first under ``first_run_number``; the
+    with-skill phase takes the next run number, or ``first_run_number`` itself when
+    no baseline is logged. Returns the summaries in the order they were written.
+    """
+    # Collect (run_type, test results, passed flag) per phase, baseline first.
+    phases: list[tuple[str, list[TestResult], bool]] = []
+    if run_baseline:
+        # A baseline counts as passed when its success rate exceeds 0.5.
+        baseline_passed = results.baseline_success_rate > 0.5
+        phases.append(("baseline", results.baseline_results, baseline_passed))
+    phases.append(("with_skill", results.with_skill_results, with_skill_passed))
+
+    persisted_results: list[RunResult] = []
+    # Run numbers are consecutive, starting at ``first_run_number``.
+    for offset, (run_type, test_results, passed) in enumerate(phases):
+        run_number = first_run_number + offset
+        run_result = _build_logged_run_result(
+            model=model,
+            task=task,
+            batch_id=batch_id,
+            run_number=run_number,
+            test_results=test_results,
+            assertions_total=assertions_total,
+            passed=passed,
+            run_type=run_type,
+            skill_name=skill_name,
+        )
+        # Each phase is persisted into its own freshly created run folder.
+        _persist_logged_run(create_run_folder(batch_folder, run_number), run_result)
+        persisted_results.append(run_result)
+
+    return persisted_results
+
+
+def _load_test_cases_from_payload(data: object) -> list[TestCase]:
+    """Accept a bare list or a ``{"cases": [...]}`` wrapper and validate each entry."""
+    payload: object = data
+    if isinstance(data, dict):
+        # A dict without a ``cases`` key stays a dict and is rejected below.
+        wrapper = cast("dict[object, object]", data)
+        payload = wrapper.get("cases", data)
+
+    if isinstance(payload, list):
+        return [TestCase.model_validate(entry) for entry in payload]
+    raise click.ClickException("Test payload must be a list or an object with `cases`.")
+
+
+async def _load_test_cases(
+    *,
+    config: Config,
+    skill_record: SkillRecord,
+    tests_path: str | None,
+    test_gen_model: str,
+    model_references: Mapping[str, Mapping[str, str]],
+) -> tuple[list[TestCase], str]:
+    """Return test cases plus a source label: tests file, persisted state, or generated."""
+    if tests_path:
+        raw_payload = Path(tests_path).read_text(encoding="utf-8")
+        loaded_cases = _load_test_cases_from_payload(json.loads(raw_payload))
+        return loaded_cases, f"tests file: {tests_path}"
+
+    if skill_record.state.tests:
+        return skill_record.state.tests, "skill_meta.json"
+
+    async with _fast_agent_context(config, model_references=model_references) as session:
+        console.print(f"Generating test cases from skill with {test_gen_model}...", style="dim")
+        generated_cases = await generate_tests(
+            skill_record.skill.description,
+            generator=session.test_gen,
+        )
+    return generated_cases, "generated"
+
+
+def _count_invalid_expected_cases(test_cases: list[TestCase]) -> int:
+    """Return how many test cases list fewer than two non-blank expected strings."""
+
+    def _has_too_few_expected(test_case: TestCase) -> bool:
+        non_blank = [value for value in test_case.expected.contains if value.strip()]
+        return len(non_blank) < 2
+
+    return sum(1 for test_case in test_cases if _has_too_few_expected(test_case))
+
+
+def _raise_on_execution_errors(results: EvalResults, *, context: str) -> None:
+    """Abort with a Click error previewing up to three per-test execution failures."""
+    phases = (
+        ("with-skill", results.with_skill_results),
+        ("baseline", results.baseline_results),
+    )
+    execution_errors = [
+        f"{phase_label} test {position}: {outcome.error}"
+        for phase_label, phase_results in phases
+        for position, outcome in enumerate(phase_results, start=1)
+        if outcome.error is not None
+    ]
+    if execution_errors:
+        shown = execution_errors[:3]
+        hidden = len(execution_errors) - 3
+        preview = "\n".join(f"  - {message}" for message in shown)
+        remainder = f"\n  ... and {hidden} more" if hidden > 0 else ""
+        message = f"{context} encountered execution errors:\n{preview}{remainder}"
+        raise click.ClickException(message)
+
+
+def _load_trace_context(trace_path: Path) -> str:
+    """Return up to 4000 chars of a trace; parseable ``.json`` files are re-indented first."""
+    text = trace_path.read_text(encoding="utf-8")
+    if trace_path.suffix.lower() == ".json":
+        try:
+            # Normalize the formatting before truncating.
+            text = json.dumps(json.loads(text), indent=2)
+        except json.JSONDecodeError:
+            # Not actually JSON: fall back to the raw file contents.
+            pass
+    return text[:4000]
+
+
+async def _create_generate_skill_record(
+    *,
+    task: str,
+    examples: list[str] | None,
+    from_skill: str | None,
+    from_trace: str | None,
+    agent: FastAgentSession,
+    skill_gen_model: str,
+) -> tuple[SkillRecord, str]:
+    """Produce the skill record for ``generate`` plus the task text that was actually used."""
+    if from_trace:  # checked first, so a trace wins over ``from_skill``
+        trace_path = Path(from_trace)
+        console.print(f"Generating skill from trace: {from_trace}", style="dim")
+        task_with_trace = (
+            f"{task}\n\nBased on this agent trace:\n\n{_load_trace_context(trace_path)}"
+        )
+        console.print(f"Generating skill with {skill_gen_model}...", style="dim")
+        return (
+            await generate_skill(
+                task=task_with_trace,
+                examples=examples,
+                generator=agent.skill_gen,
+                model=skill_gen_model,
+            ),
+            task_with_trace,
+        )
+
+    if from_skill:
+        existing_skill = SkillRecord.load(Path(from_skill))
+        console.print(
+            f"Improving [bold]{existing_skill.skill.name}[/bold] with {skill_gen_model}...",
+            style="dim",
+        )
+        return (
+            await improve_skill(
+                existing_skill,
+                instructions=task,
+                generator=agent.skill_gen,
+                model=skill_gen_model,
+            ),
+            task,
+        )
+
+    console.print(f"Generating skill with {skill_gen_model}...", style="dim")
+    return (
+        await generate_skill(
+            task=task,
+            examples=examples,
+            generator=agent.skill_gen,
+            model=skill_gen_model,
+        ),
+        task,
+    )
+
+
+async def _submit_remote_eval_jobs(
+    *,
+    skill: Skill,
+    test_cases: list[TestCase],
+    model: str,
+    jobs_config: JobsConfig,
+    fastagent_config_path: Path,
+    cards_path: Path,
+    artifact_root: Path,
+    run_baseline: bool,
+    operation: str,
+) -> list[str]:
+    """Submit one remote job per built eval request and return the job ids in order."""
+    _ensure_jobs_artifact_repo_access(jobs_config)
+    remote_executor = RemoteFastAgentExecutor(
+        jobs_config=jobs_config,
+        progress_callback=_print_eval_progress,
+    )
+    requests = build_eval_requests(
+        skill=skill,
+        test_cases=test_cases,
+        model=model,
+        fastagent_config_path=fastagent_config_path,
+        cards_source_dir=cards_path,
+        artifact_root=artifact_root,
+        run_baseline=run_baseline,
+        operation=operation,
+    )
+    job_refs: list[str] = []
+    for pending_request in requests:  # submissions are awaited one at a time
+        submission = await remote_executor.submit(pending_request.request)
+        job_refs.append(submission.job_id)
+    return job_refs
+
+
+async def _submit_generate_jobs_eval(
+    *,
+    skill: Skill,
+    test_cases: list[TestCase],
+    model: str,
+    jobs_config: JobsConfig,
+    config: Config,
+    cards_path: Path,
+    batch_folder: Path,
+) -> list[str]:
+    """Submit the generate-time eval (baseline included) as remote jobs; return job ids."""
+    return await _submit_remote_eval_jobs(
+        skill=skill,
+        test_cases=test_cases,
+        model=model,
+        jobs_config=jobs_config,
+        fastagent_config_path=config.effective_fastagent_config,
+        cards_path=cards_path,
+        artifact_root=batch_folder / "remote_downloads" / "attempt_1",
+        run_baseline=True,
+        operation="generate",
+    )
+
+
+async def _run_generate_refinement_loop(
+    *,
+    skill_record: SkillRecord,
+    task: str,
+    test_cases: list[TestCase],
+    executor: Executor,
+    config: Config,
+    cards_path: Path,
+    batch_id: str,
+    batch_folder: Path,
+    skill_gen_model: str,
+    log_runs: bool,
+    max_parallel: int,
+    agent: FastAgentSession,
+) -> tuple[SkillRecord, EvalResults | None, list[RunResult]]:
+    """Evaluate and refine the skill until it is beneficial, plateaus, or attempts run out."""
+    run_results: list[RunResult] = []
+    prev_success_rate = 0.0
+    results: EvalResults | None = None
+    attempts = max(1, config.max_refine_attempts)
+
+    for attempt in range(attempts):
+        attempt_number = attempt + 1
+        console.print(f"Evaluating on {skill_gen_model}... (attempt {attempt_number})", style="dim")
+        console.print("[dim]Starting evaluation run...[/dim]")
+
+        results = await evaluate_skill(
+            skill_record.skill,
+            test_cases=test_cases,
+            executor=executor,
+            model=skill_gen_model,
+            fastagent_config_path=config.effective_fastagent_config,
+            cards_source_dir=cards_path,
+            artifact_root=batch_folder / f"attempt_{attempt_number}",
+            max_parallel=max_parallel,
+            progress_callback=_print_eval_progress,
+            operation="generate",
+        )
+        _raise_on_execution_errors(results, context="Generate refinement")
+
+        if log_runs:
+            run_results.extend(
+                _persist_comparison_run_results(
+                    batch_folder=batch_folder,
+                    model=skill_gen_model,
+                    task=task,
+                    batch_id=batch_id,
+                    first_run_number=attempt * 2 + 1,
+                    results=results,
+                    assertions_total=len(test_cases),
+                    run_baseline=True,
+                    with_skill_passed=results.is_beneficial,
+                    skill_name=skill_record.skill.name,
+                )
+            )
+
+        lift = results.skill_lift
+        lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
+
+        if results.is_beneficial:
+            console.print(
+                f"  {results.baseline_success_rate:.0%} -> "
+                f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]"
+            )
+            break
+
+        console.print(
+            f"  {results.baseline_success_rate:.0%} -> "
+            f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough"
+        )
+        # A plateau needs a previous attempt; attempt 1 would otherwise compare against 0.0.
+        if attempt > 0 and abs(results.with_skill_success_rate - prev_success_rate) < 0.05:
+            console.print("  [yellow]Plateaued, stopping[/yellow]")
+            break
+
+        prev_success_rate = results.with_skill_success_rate
+        if attempt >= attempts - 1:
+            continue  # last attempt: no further evaluation would use a refinement
+        # Failures from this attempt drive the next revision of the skill.
+        console.print("Refining...", style="dim")
+        failures = get_failure_descriptions(results)
+        skill_record = await refine_skill(
+            skill_record,
+            failures,
+            generator=agent.skill_gen,
+            model=skill_gen_model,
+        )
+
+    return skill_record, results, run_results
+
+
+async def _run_generate_extra_eval(
+    *,
+    skill_record: SkillRecord,
+    task: str,
+    test_cases: list[TestCase],
+    executor: Executor,
+    config: Config,
+    cards_path: Path,
+    batch_id: str,
+    batch_folder: Path,
+    model: str,
+    log_runs: bool,
+    max_parallel: int,
+    first_run_number: int,
+) -> tuple[EvalResults, list[RunResult]]:
+    """Evaluate the skill on one extra model for ``generate``, logging runs when enabled."""
+    console.print(f"Evaluating on {model}...", style="dim")
+    results = await evaluate_skill(
+        skill_record.skill,
+        test_cases,
+        executor=executor,
+        model=model,
+        fastagent_config_path=config.effective_fastagent_config,
+        cards_source_dir=cards_path,
+        artifact_root=batch_folder / f"eval_{model}",  # NOTE(review): "/" in model nests dirs
+        max_parallel=max_parallel,
+        progress_callback=_print_eval_progress,
+        operation="generate",
+    )
+    _raise_on_execution_errors(results, context=f"Generate eval on {model}")
+
+    run_results: list[RunResult] = []
+    if log_runs:
+        run_results = _persist_comparison_run_results(
+            batch_folder=batch_folder,
+            model=model,
+            task=task,
+            batch_id=batch_id,
+            first_run_number=first_run_number,
+            results=results,
+            assertions_total=len(test_cases),
+            run_baseline=True,
+            with_skill_passed=results.is_beneficial,
+            skill_name=skill_record.skill.name,
+        )
+
+    lift = results.skill_lift
+    lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
+    console.print(
+        f"  {results.baseline_success_rate:.0%} -> "
+        f"{results.with_skill_success_rate:.0%} ({lift_str})"
+    )
+    return results, run_results
+
+
+async def _run_with_skill_benchmark(
+    *,
+    skill_record: SkillRecord,
+    evaluation_models: list[str],
+    num_runs: int,
+    test_cases: list[TestCase],
+    executor: Executor,
+    config: Config,
+    cards_path: Path,
+    batch_id: str,
+    batch_folder: Path,
+    verbose: bool,
+    log_runs: bool,
+    max_parallel: int,
+) -> tuple[dict[str, list[RunResult]], list[RunResult]]:
+    """Run ``num_runs`` with-skill evaluations per model; return per-model and flat results."""
+    skill = skill_record.skill
+    model_results: dict[str, list[RunResult]] = {model: [] for model in evaluation_models}
+    all_run_results: list[RunResult] = []
+
+    for model in evaluation_models:
+        console.print(f"[bold]{model}[/bold]")
+
+        for run_num in range(1, num_runs + 1):
+            run_folder = create_run_folder(batch_folder, len(all_run_results) + 1)
+            results = await evaluate_skill(
+                skill,
+                test_cases,
+                executor=executor,
+                model=model,
+                fastagent_config_path=config.effective_fastagent_config,
+                cards_source_dir=cards_path,
+                artifact_root=run_folder / "eval",
+                run_baseline=False,
+                max_parallel=max_parallel,
+                progress_callback=_print_eval_progress if verbose else None,
+                operation="benchmark",
+            )
+            _raise_on_execution_errors(results, context=f"Benchmark run on {model}")
+            run_result = _build_logged_run_result(
+                model=model,
+                task=skill.description,
+                batch_id=batch_id,
+                run_number=run_num,  # NOTE(review): per-model; folder index is global
+                test_results=results.with_skill_results,
+                assertions_total=len(test_cases),
+                passed=results.with_skill_success_rate > 0.5,
+                run_type="with_skill",
+                skill_name=skill.name,
+            )
+
+            if log_runs:
+                _persist_logged_run(run_folder, run_result)
+
+            model_results[model].append(run_result)
+            all_run_results.append(run_result)
+
+            if verbose:
+                status = "[green]PASS[/green]" if run_result.passed else "[red]FAIL[/red]"
+                console.print(
+                    f"  Run {run_num}: {status} "
+                    f"({run_result.assertions_passed}/{run_result.assertions_total} "
+                    "assertions passed)"
+                )
+
+        console.print()
+
+    return model_results, all_run_results
+
+
+def _print_benchmark_summary(model_results: dict[str, list[RunResult]]) -> None:
+    """Print run counts, pass rate, and average tokens/turns for each model."""
+    console.print("\n[bold]Summary[/bold]\n")
+    for model, runs in model_results.items():
+        run_count = len(runs)
+        pass_count = sum(1 for run in runs if run.passed)
+        if run_count:
+            avg_tokens = sum(run.stats.total_tokens for run in runs) / run_count
+            avg_turns = sum(run.stats.turns for run in runs) / run_count
+            pass_rate = pass_count / run_count
+        else:
+            # No runs recorded for this model: report zeros instead of dividing by zero.
+            avg_tokens = avg_turns = pass_rate = 0
+
+        if pass_rate > 0.5:
+            color = "green"
+        elif pass_rate > 0:
+            color = "yellow"
+        else:
+            color = "red"
+
+        console.print(f"[bold]{model}[/bold]")
+        rate_markup = f"[{color}]{pass_rate:.0%}[/{color}]"
+        console.print(f"  Runs: {run_count} | Passed: {pass_count} ({rate_markup})")
+        console.print(f"  Avg tokens: {avg_tokens:.0f} | Avg turns: {avg_turns:.1f}")
+        console.print()
+
+
+def _write_benchmark_summary(
+    *,
+    batch_folder: Path,
+    batch_id: str,
+    evaluation_models: list[str],
+    task: str,
+    all_run_results: list[RunResult],
+) -> None:
+    """Build a ``BatchSummary`` over all runs and write it to the batch folder."""
+    summary = BatchSummary(
+        batch_id=batch_id,
+        model=", ".join(evaluation_models),  # one label covering every evaluated model
+        task=task,
+        total_runs=len(all_run_results),
+        passed_runs=sum(1 for result in all_run_results if result.passed),
+        results=all_run_results,
+    )
+    write_batch_summary(batch_folder, summary)
+
+
 def _load_eval_results(runs_path: Path) -> list[EvalPlotResult]:
     """Load eval results from batch summaries or run folders."""
     results: list[EvalPlotResult] = []
@@ -238,7 +1037,6 @@ def main():
 @main.command()
 @click.argument("task")
 @click.option("-e", "--example", multiple=True, help="Input -> output example")
-@click.option("--tool", help="Generate from MCP tool schema (path#tool_name)")
 @click.option(
     "-f",
     "--from",
@@ -258,18 +1056,27 @@ def main():
 @click.option("-o", "--output", type=click.Path(), help="Output directory for skill")
 @click.option("--no-eval", is_flag=True, help="Skip eval and refinement")
 @click.option("--eval-model", help="Optional extra cross-model eval pass after generation")
-@click.option("--runs-dir", type=click.Path(), help="Directory for run logs (default: ./runs)")
-@click.option("--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)")
+@_jobs_execution_options(
+    executor_help="Execution backend for evaluation/refinement runs",
+    runs_dir_help="Directory for run logs (default: ./runs)",
+)
 def generate(
     task: str,
     example: tuple[str, ...],
-    tool: str | None,  # noqa: ARG001
     from_source: str | None,
     model: str | None,
     test_gen_model: str | None,
     output: str | None,
     no_eval: bool,
     eval_model: str | None,
+    executor: ExecutorName | None,
+    artifact_repo: str | None,
+    wait: bool,
+    jobs_timeout: str,
+    jobs_flavor: str,
+    jobs_secrets: str | None,
+    jobs_namespace: str | None,
+    max_parallel: int | None,
     runs_dir: str | None,
     log_runs: bool,
 ):
@@ -287,6 +1094,10 @@ def generate(
 
         upskill generate "validate forms" -o ./my-skills/validation
 
+        # Remote execution on Hugging Face Jobs:
+
+        upskill generate "parse invoices" --executor jobs --artifact-repo <user>/upskill-tests
+
         # Improve an existing skill (auto-detects directory):
 
         upskill generate "add more error handling examples" --from ./skills/api-errors/
@@ -324,6 +1135,14 @@ def generate(
             output,
             no_eval,
             eval_model,
+            executor,
+            artifact_repo,
+            wait,
+            jobs_timeout,
+            jobs_flavor,
+            jobs_secrets,
+            jobs_namespace,
+            max_parallel,
             runs_dir,
             log_runs,
         )
@@ -340,11 +1159,32 @@ async def _generate_async(
     output: str | None,
     no_eval: bool,
     eval_model: str | None,
+    executor_name: ExecutorName | None,
+    artifact_repo: str | None,
+    wait: bool,
+    jobs_timeout: str,
+    jobs_flavor: str,
+    jobs_secrets: str | None,
+    jobs_namespace: str | None,
+    max_parallel: int | None,
     runs_dir: str | None,
     log_runs: bool,
 ):
     """Async implementation of generate command."""
     config = Config.load()
+    executor_name = _resolve_executor_name(config, executor_name)
+    max_parallel = _resolve_max_parallel(config, max_parallel)
+    jobs_secrets = _resolve_jobs_secrets(config, jobs_secrets)
+    jobs_config = _require_jobs_config(
+        executor_name=executor_name,
+        artifact_repo=artifact_repo,
+        wait=wait,
+        jobs_timeout=jobs_timeout,
+        jobs_flavor=jobs_flavor,
+        jobs_secrets=jobs_secrets,
+        jobs_namespace=jobs_namespace,
+        jobs_image=config.jobs_image,
+    )
     resolved = resolve_models(
         "generate",
         config=config,
@@ -363,310 +1203,148 @@ async def _generate_async(
         command="generate",
     )
     extra_eval_model = resolved.extra_eval_model
+    model_references = build_fastagent_model_references(config=config, resolved=resolved)
 
     _print_model_plan("generate", resolved)
 
-    # Setup run logging if enabled
-    batch_id = None
-    batch_folder = None
+    # Setup artifact storage and optional run logging
+    runs_path = Path(runs_dir) if runs_dir else config.runs_dir
+    batch_id, batch_folder = create_batch_folder(runs_path)
     run_results: list[RunResult] = []
-
+    console.print(f"Artifacts saved to: {batch_folder}", style="dim")
     if log_runs:
-        runs_path = Path(runs_dir) if runs_dir else config.runs_dir
-        batch_id, batch_folder = create_batch_folder(runs_path)
         console.print(f"Logging runs to: {batch_folder}", style="dim")
 
-    async with _fast_agent_context(config) as agent:
-        # Generate from trace file
-        if from_trace:
-            console.print(f"Generating skill from trace: {from_trace}", style="dim")
-            trace_path = Path(from_trace)
-            with open(trace_path, encoding="utf-8") as f:
-                trace_content = f.read()
-
-            # Try to parse as JSON, otherwise use as plain text
-            if trace_path.suffix.lower() == ".json":
-                try:
-                    trace_data = json.loads(trace_content)
-                    trace_context = json.dumps(trace_data, indent=2)[:4000]
-                except json.JSONDecodeError:
-                    trace_context = trace_content[:4000]
-            else:
-                # Plain text, markdown, etc.
-                trace_context = trace_content[:4000]
-
-            task = f"{task}\n\nBased on this agent trace:\n\n{trace_context}"
-            console.print(f"Generating skill with {skill_gen_model}...", style="dim")
-            await _set_agent_model(agent.skill_gen, skill_gen_model)
-            skill = await generate_skill(
+    async with _fast_agent_context(config, model_references=model_references) as agent:
+        cards = resources.files("upskill").joinpath("agent_cards")
+        with resources.as_file(cards) as cards_path:
+            skill_record, eval_task = await _create_generate_skill_record(
                 task=task,
                 examples=examples,
-                generator=agent.skill_gen,
-                model=skill_gen_model,
-            )
-        # Improve existing skill
-        elif from_skill:
-            existing_skill = Skill.load(Path(from_skill))
-            console.print(
-                f"Improving [bold]{existing_skill.name}[/bold] with {skill_gen_model}...",
-                style="dim",
-            )
-            await _set_agent_model(agent.skill_gen, skill_gen_model)
-            skill = await improve_skill(
-                existing_skill,
-                instructions=task,
-                generator=agent.skill_gen,
-                model=skill_gen_model,
-            )
-        else:
-            console.print(f"Generating skill with {skill_gen_model}...", style="dim")
-            await _set_agent_model(agent.skill_gen, skill_gen_model)
-            skill = await generate_skill(
-                task=task,
-                examples=examples,
-                generator=agent.skill_gen,
-                model=skill_gen_model,
-            )
-        if no_eval:
-            _save_and_display(skill, output, config)
-            return
-
-        console.print("Generating test cases...", style="dim")
-        await _set_agent_model(agent.test_gen, test_gen_model)
-        test_cases = await generate_tests(task, generator=agent.test_gen, model=test_gen_model)
-
-        # Eval loop with refinement (on skill generation model)
-        prev_success_rate = 0.0
-        results = None
-        attempts = max(1, config.max_refine_attempts)
-        for attempt in range(attempts):
-            console.print(
-                f"Evaluating on {skill_gen_model}... (attempt {attempt + 1})",
-                style="dim",
+                from_skill=from_skill,
+                from_trace=from_trace,
+                agent=agent,
+                skill_gen_model=skill_gen_model,
             )
 
-            # Create run folder for logging (2 folders per attempt: baseline + with_skill)
-            run_folder = None
-            if log_runs and batch_folder:
-                baseline_run_num = attempt * 2 + 1
-                run_folder = create_run_folder(batch_folder, baseline_run_num)
-                write_run_metadata(
-                    run_folder,
-                    RunMetadata(
-                        model=skill_gen_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=baseline_run_num,
-                    ),
-                )
-
-            console.print("[dim]Starting evaluation run...[/dim]")
-
-            results = await evaluate_skill(
-                skill,
-                test_cases=test_cases,
-                evaluator=agent.evaluator,
-                model=skill_gen_model,
-                show_baseline_progress=False,
+            console.print(f"Generating test cases with {test_gen_model}...", style="dim")
+            test_cases = await generate_tests(
+                eval_task,
+                generator=agent.test_gen,
             )
-
-            # Log run results (both baseline and with-skill for plot command)
-            if log_runs and run_folder:
-                # Log baseline result
-                baseline_result = RunResult(
-                    metadata=RunMetadata(
-                        model=skill_gen_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=baseline_run_num,
-                    ),
-                    stats=aggregate_conversation_stats(results.baseline_results),
-                    passed=results.baseline_success_rate > 0.5,
-                    assertions_passed=int(results.baseline_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="baseline",
-                    skill_name=skill.name,
-                )
-                write_run_result(run_folder, baseline_result)
-                run_results.append(baseline_result)
-
-                # Log with-skill result (in a separate folder)
-                with_skill_folder = create_run_folder(batch_folder, attempt * 2 + 2)
-                with_skill_result = RunResult(
-                    metadata=RunMetadata(
-                        model=skill_gen_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=attempt * 2 + 2,
-                    ),
-                    stats=aggregate_conversation_stats(results.with_skill_results),
-                    passed=results.is_beneficial,
-                    assertions_passed=int(results.with_skill_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="with_skill",
-                    skill_name=skill.name,
+            skill_record.state.tests = list(test_cases)
+
+            if no_eval:
+                _save_and_display(skill_record, output, config, artifact_path=batch_folder)
+                return
+
+            if executor_name == "jobs" and not wait:
+                if jobs_config is None:
+                    raise RuntimeError("Jobs config was not initialized.")
+                job_refs = await _submit_generate_jobs_eval(
+                    skill=skill_record.skill,
+                    test_cases=test_cases,
+                    model=skill_gen_model,
+                    jobs_config=jobs_config,
+                    config=config,
+                    cards_path=cards_path,
+                    batch_folder=batch_folder,
                 )
-                write_run_metadata(with_skill_folder, with_skill_result.metadata)
-                write_run_result(with_skill_folder, with_skill_result)
-                run_results.append(with_skill_result)
-
-            lift = results.skill_lift
-            lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
-
-            if results.is_beneficial:
                 console.print(
-                    f"  {results.baseline_success_rate:.0%} -> "
-                    f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]"
+                    "[yellow]Remote fast-agent requests submitted without --wait; "
+                    "refinement is skipped for this run.[/yellow]"
                 )
-                break
-
-            console.print(
-                f"  {results.baseline_success_rate:.0%} -> "
-                f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough"
+                console.print(f"Remote fast-agent job id(s): {', '.join(job_refs)}")
+                _save_and_display(skill_record, output, config, artifact_path=batch_folder)
+                return
+
+            executor = _build_executor(
+                executor_name,
+                jobs_config=jobs_config,
+                progress_callback=_print_eval_progress,
             )
 
-            if abs(results.with_skill_success_rate - prev_success_rate) < 0.05:
-                console.print("  [yellow]Plateaued, stopping[/yellow]")
-                break
-
-            prev_success_rate = results.with_skill_success_rate
-
-            if attempt < attempts - 1:
-                console.print("Refining...", style="dim")
-                failures = get_failure_descriptions(results)
-                await _set_agent_model(agent.skill_gen, skill_gen_model)
-                skill = await refine_skill(
-                    skill,
-                    failures,
-                    generator=agent.skill_gen,
-                    model=skill_gen_model,
-                )
-
-        # If eval_model specified, also eval on that model
-        eval_results = None
-        if extra_eval_model:
-            console.print(f"Evaluating on {extra_eval_model}...", style="dim")
-
-            # Create run folder for eval model
-            run_folder = None
-            if log_runs and batch_folder:
-                run_number = len(run_results) + 1
-                run_folder = create_run_folder(batch_folder, run_number)
-                write_run_metadata(
-                    run_folder,
-                    RunMetadata(
-                        model=extra_eval_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=run_number,
-                    ),
-                )
-
-            eval_results = await evaluate_skill(
-                skill,
-                test_cases,
-                evaluator=agent.evaluator,
-                model=extra_eval_model,
-                show_baseline_progress=False,
+            skill_record, results, run_results = await _run_generate_refinement_loop(
+                skill_record=skill_record,
+                task=eval_task,
+                test_cases=test_cases,
+                executor=executor,
+                config=config,
+                cards_path=cards_path,
+                batch_id=batch_id,
+                batch_folder=batch_folder,
+                skill_gen_model=skill_gen_model,
+                log_runs=log_runs,
+                max_parallel=max_parallel,
+                agent=agent,
             )
 
-            # Log eval run results (both baseline and with-skill)
-            if log_runs and run_folder:
-                # Log baseline result
-                baseline_result = RunResult(
-                    metadata=RunMetadata(
-                        model=extra_eval_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=run_number,
-                    ),
-                    stats=aggregate_conversation_stats(eval_results.baseline_results),
-                    passed=eval_results.baseline_success_rate > 0.5,
-                    assertions_passed=int(eval_results.baseline_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="baseline",
-                    skill_name=skill.name,
-                )
-                write_run_result(run_folder, baseline_result)
-                run_results.append(baseline_result)
-
-                # Log with-skill result
-                with_skill_folder = create_run_folder(batch_folder, run_number + 1)
-                with_skill_result = RunResult(
-                    metadata=RunMetadata(
-                        model=extra_eval_model,
-                        task=task,
-                        batch_id=batch_id or "",
-                        run_number=run_number + 1,
-                    ),
-                    stats=aggregate_conversation_stats(eval_results.with_skill_results),
-                    passed=eval_results.is_beneficial,
-                    assertions_passed=int(eval_results.with_skill_success_rate * len(test_cases)),
-                    assertions_total=len(test_cases),
-                    run_type="with_skill",
-                    skill_name=skill.name,
+            # If eval_model specified, also eval on that model
+            eval_results = None
+            if extra_eval_model:
+                eval_results, extra_run_results = await _run_generate_extra_eval(
+                    skill_record=skill_record,
+                    task=eval_task,
+                    test_cases=test_cases,
+                    executor=executor,
+                    config=config,
+                    cards_path=cards_path,
+                    batch_id=batch_id,
+                    batch_folder=batch_folder,
+                    model=extra_eval_model,
+                    log_runs=log_runs,
+                    max_parallel=max_parallel,
+                    first_run_number=len(run_results) + 1,
                 )
-                write_run_metadata(with_skill_folder, with_skill_result.metadata)
-                write_run_result(with_skill_folder, with_skill_result)
-                run_results.append(with_skill_result)
-
-            lift = eval_results.skill_lift
-            lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}"
-            console.print(
-                f"  {eval_results.baseline_success_rate:.0%} -> "
-                f"{eval_results.with_skill_success_rate:.0%} ({lift_str})"
-            )
+                run_results.extend(extra_run_results)
 
-        # Write batch summary
-        if log_runs and batch_folder and batch_id:
-            summary = BatchSummary(
-                batch_id=batch_id,
-                model=skill_gen_model,
-                task=task,
-                total_runs=len(run_results),
-                passed_runs=sum(1 for r in run_results if r.passed),
-                results=run_results,
-            )
-            write_batch_summary(batch_folder, summary)
+            # Write batch summary
+            if log_runs:
+                summary = BatchSummary(
+                    batch_id=batch_id,
+                    model=skill_gen_model,
+                    task=eval_task,
+                    total_runs=len(run_results),
+                    passed_runs=sum(1 for r in run_results if r.passed),
+                    results=run_results,
+                )
+                write_batch_summary(batch_folder, summary)
 
-    if not no_eval and skill is not None:
+    if not no_eval:
         if results:
-            skill.metadata.test_pass_rate = results.with_skill_success_rate
+            skill_record.state.metadata.test_pass_rate = results.with_skill_success_rate
         else:
             console.print(
                 "[yellow]No evaluation results available; skipping report output.[/yellow]"
             )
 
         _save_and_display(
-            skill,
+            skill_record,
             output,
             config,
             results,
             eval_results,
             skill_gen_model,
             extra_eval_model,
+            batch_folder,
         )
 
 
-
-
-
 def _save_and_display(
-    skill: Skill,
+    skill_record: SkillRecord,
     output: str | None,
     config: Config,
-    results=None,
-    eval_results=None,
+    results: EvalResults | None = None,
+    eval_results: EvalResults | None = None,
     skill_gen_model: str | None = None,
     eval_model: str | None = None,
+    artifact_path: Path | None = None,
 ):
     """Save skill and display summary."""
-    if output:
-        output_path = Path(output)
-    else:
-        output_path = config.skills_dir / skill.name
+    skill = skill_record.skill
+    output_path = Path(output) if output else config.skills_dir / skill.name
 
-    skill.save(output_path)
+    skill_record.save(output_path)
 
     console.print("[dim]Rendering report output...[/dim]")
 
@@ -733,6 +1411,8 @@ def _save_and_display(
 
     console.print()
     console.print(f"Saved to {output_path}")
+    if artifact_path is not None:
+        console.print(f"Artifacts saved to {artifact_path}")
 
 
 @main.command("eval")
@@ -749,23 +1429,39 @@ def _save_and_display(
     "--test-gen-model",
     help="Override test generation model when tests must be generated",
 )
-@click.option("--runs", "num_runs", type=int, default=1, help="Number of runs per model")
+@click.option(
+    "--runs",
+    "num_runs",
+    type=click.IntRange(min=1),
+    default=None,
+    help="Number of runs per model. Overrides `num_runs` in upskill.config.yaml.",
+)
 @click.option(
     "--no-baseline",
     is_flag=True,
     help="Skip baseline comparison in simple eval mode (ignored in benchmark mode)",
 )
 @click.option("-v", "--verbose", is_flag=True, help="Show per-test results")
-@click.option("--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)")
-@click.option("--runs-dir", type=click.Path(), help="Directory for run logs")
+@_jobs_execution_options(
+    executor_help="Execution backend for evaluation runs",
+    runs_dir_help="Directory for run logs",
+)
 def eval_cmd(
     skill_path: str,
     tests: str | None,
     models: tuple[str, ...],
     test_gen_model: str | None,
-    num_runs: int,
+    num_runs: int | None,
     no_baseline: bool,
     verbose: bool,
+    executor: ExecutorName | None,
+    artifact_repo: str | None,
+    wait: bool,
+    jobs_timeout: str,
+    jobs_flavor: str,
+    jobs_secrets: str | None,
+    jobs_namespace: str | None,
+    max_parallel: int | None,
     log_runs: bool,
     runs_dir: str | None,
 ):
@@ -803,27 +1499,62 @@ def eval_cmd(
             num_runs,
             no_baseline,
             verbose,
+            executor,
+            artifact_repo,
+            wait,
+            jobs_timeout,
+            jobs_flavor,
+            jobs_secrets,
+            jobs_namespace,
+            max_parallel,
             log_runs,
             runs_dir,
         )
     )
 
 
-async def _eval_async(
+async def _eval_async(  # noqa: C901
     skill_path: str,
     tests: str | None,
     models: list[str] | None,
     test_gen_model: str | None,
-    num_runs: int,
+    num_runs: int | None,
     no_baseline: bool,
     verbose: bool,
+    executor_name: ExecutorName | None,
+    artifact_repo: str | None,
+    wait: bool,
+    jobs_timeout: str,
+    jobs_flavor: str,
+    jobs_secrets: str | None,
+    jobs_namespace: str | None,
+    max_parallel: int | None,
     log_runs: bool,
     runs_dir: str | None,
 ):
     """Async implementation of eval command."""
-    from upskill.evaluate import run_test
-
     config = Config.load()
+    executor_name = _resolve_executor_name(config, executor_name)
+    num_runs = _resolve_num_runs(config, num_runs, command="eval")
+    max_parallel = _resolve_max_parallel(config, max_parallel)
+    jobs_secrets = _resolve_jobs_secrets(config, jobs_secrets)
+    jobs_config = _require_jobs_config(
+        executor_name=executor_name,
+        artifact_repo=artifact_repo,
+        wait=wait,
+        jobs_timeout=jobs_timeout,
+        jobs_flavor=jobs_flavor,
+        jobs_secrets=jobs_secrets,
+        jobs_namespace=jobs_namespace,
+        jobs_image=config.jobs_image,
+    )
+    executor = None
+    if executor_name == "local" or wait:
+        executor = _build_executor(
+            executor_name,
+            jobs_config=jobs_config,
+            progress_callback=_print_eval_progress,
+        )
     resolved = resolve_models(
         "eval",
         config=config,
@@ -842,6 +1573,7 @@ async def _eval_async(
         field="test_generation_model",
         command="eval",
     )
+    model_references = build_fastagent_model_references(config=config, resolved=resolved)
 
     _print_model_plan("eval", resolved, runs=num_runs)
     if resolved.is_benchmark_mode and no_baseline:
@@ -852,202 +1584,110 @@ async def _eval_async(
     skill_dir = Path(skill_path)
 
     try:
-        skill = Skill.load(skill_dir)
+        skill_record = SkillRecord.load(skill_dir)
     except FileNotFoundError:
         console.print(f"[red]No SKILL.md found in {skill_dir}[/red]")
         sys.exit(1)
+    skill = skill_record.skill
 
-    async with _fast_agent_context(config) as agent:
-        # Load test cases
-        test_cases: list[TestCase] = []
-        if tests:
-            with open(tests, encoding="utf-8") as f:
-                data = json.load(f)
-            if "cases" in data:
-                test_cases = [TestCase(**tc) for tc in data["cases"]]
-            else:
-                test_cases = [TestCase(**tc) for tc in data]
-            test_source = f"tests file: {tests}"
-        elif skill.tests:
-            test_cases = skill.tests
-            test_source = "skill_meta.json"
-        else:
-            console.print("Generating test cases from skill...", style="dim")
-            await _set_agent_model(agent.test_gen, test_gen_model)
-            test_cases = await generate_tests(
-                skill.description,
-                generator=agent.test_gen,
-                model=test_gen_model,
-            )
-            test_source = "generated"
+    test_cases, test_source = await _load_test_cases(
+        config=config,
+        skill_record=skill_record,
+        tests_path=tests,
+        test_gen_model=test_gen_model,
+        model_references=model_references,
+    )
 
-        invalid_expected = 0
-        for tc in test_cases:
-            expected_values = [value.strip() for value in tc.expected.contains if value.strip()]
-            if len(expected_values) < 2:
-                invalid_expected += 1
-        console.print(
-            f"[dim]Loaded {len(test_cases)} test case(s) from {test_source}[/dim]"
-        )
-        if invalid_expected:
-            console.print(
-                f"[yellow]{invalid_expected} test case(s) missing expected strings[/yellow]"
+    invalid_expected = _count_invalid_expected_cases(test_cases)
+    console.print(f"[dim]Loaded {len(test_cases)} test case(s) from {test_source}[/dim]")
+    if invalid_expected:
+        console.print(f"[yellow]{invalid_expected} test case(s) missing expected strings[/yellow]")
+
+    runs_path = Path(runs_dir) if runs_dir else config.runs_dir
+    batch_id, batch_folder = create_batch_folder(runs_path)
+    console.print(f"Artifacts saved to: {batch_folder}", style="dim")
+    if log_runs:
+        console.print(f"Logging to: {batch_folder}", style="dim")
+
+    if executor_name == "jobs" and not wait:
+        if jobs_config is None:
+            raise RuntimeError("Jobs config was not initialized.")
+        cards = resources.files("upskill").joinpath("agent_cards")
+        with resources.as_file(cards) as cards_path:
+            if resolved.is_benchmark_mode:
+                submitted_job_refs: list[str] = []
+
+                for model in evaluation_models:
+                    console.print(f"[bold]{model}[/bold]")
+                    for run_num in range(1, num_runs + 1):
+                        job_refs = await _submit_remote_eval_jobs(
+                            skill=skill,
+                            test_cases=test_cases,
+                            model=model,
+                            jobs_config=jobs_config,
+                            fastagent_config_path=config.effective_fastagent_config,
+                            cards_path=cards_path,
+                            artifact_root=batch_folder
+                            / "remote_downloads"
+                            / model
+                            / f"run_{run_num}",
+                            run_baseline=False,
+                            operation="benchmark",
+                        )
+                        submitted_job_refs.extend(job_refs)
+                        console.print(f"Remote fast-agent job id(s): {', '.join(job_refs)}")
+                console.print(
+                    f"Submitted remote fast-agent job id(s): {', '.join(submitted_job_refs)}"
+                )
+                return
+
+            job_refs = await _submit_remote_eval_jobs(
+                skill=skill,
+                test_cases=test_cases,
+                model=evaluation_models[0],
+                jobs_config=jobs_config,
+                fastagent_config_path=config.effective_fastagent_config,
+                cards_path=cards_path,
+                artifact_root=batch_folder / "remote_downloads",
+                run_baseline=resolved.run_baseline,
+                operation="eval",
             )
+        console.print(f"Remote fast-agent job id(s): {', '.join(job_refs)}")
+        return
 
-        # Setup run logging
-        batch_id = None
-        batch_folder = None
-        if log_runs:
-            runs_path = Path(runs_dir) if runs_dir else config.runs_dir
-            batch_id, batch_folder = create_batch_folder(runs_path)
-            console.print(f"Logging to: {batch_folder}", style="dim")
+    if executor is None:
+        raise RuntimeError("Local executor was not initialized.")
 
+    cards = resources.files("upskill").joinpath("agent_cards")
+    with resources.as_file(cards) as cards_path:
         if resolved.is_benchmark_mode:
-            # Benchmark mode: multiple models and/or runs
             console.print(
                 f"\nEvaluating [bold]{skill.name}[/bold] across {len(evaluation_models)} model(s)"
             )
-            console.print(
-                f"  {len(test_cases)} test case(s), "
-                f"{num_runs} run(s) per model\n"
+            console.print(f"  {len(test_cases)} test case(s), {num_runs} run(s) per model\n")
+            model_results, all_run_results = await _run_with_skill_benchmark(
+                skill_record=skill_record,
+                evaluation_models=evaluation_models,
+                num_runs=num_runs,
+                test_cases=test_cases,
+                executor=executor,
+                config=config,
+                cards_path=cards_path,
+                batch_id=batch_id,
+                batch_folder=batch_folder,
+                verbose=verbose,
+                log_runs=log_runs,
+                max_parallel=max_parallel,
             )
-
-            model_results: dict[str, list[RunResult]] = {m: [] for m in evaluation_models}
-            all_run_results: list[RunResult] = []
-
-            for model in evaluation_models:
-                console.print(f"[bold]{model}[/bold]")
-
-                for run_num in range(1, num_runs + 1):
-                    run_folder = None
-                    if log_runs and batch_folder:
-                        run_folder = create_run_folder(
-                            batch_folder, len(all_run_results) + 1
-                        )
-
-                    # Run each test case
-                    total_assertions_passed = 0
-                    total_assertions = 0
-                    all_passed = True
-                    run_test_results: list[TestResult] = []
-
-                    for tc_idx, tc in enumerate(test_cases, 1):
-                        if verbose:
-                            console.print(
-                                f"  Running test {tc_idx}/{len(test_cases)}...",
-                                style="dim",
-                            )
-
-                        try:
-                            result = await run_test(
-                                tc,
-                                evaluator=agent.evaluator,
-                                skill=skill,
-                                model=model,
-                                instance_name=(
-                                    f"eval ({model} run {run_num} test {tc_idx})"
-                                ),
-                            )
-                        except Exception as e:
-                            console.print(f"  [red]Test error: {e}[/red]")
-                            result = TestResult(test_case=tc, success=False, error=str(e))
-
-                        # Extract assertion counts
-                        if result.validation_result:
-                            total_assertions_passed += result.validation_result.assertions_passed
-                            total_assertions += result.validation_result.assertions_total
-                            if verbose and result.validation_result.error_message:
-                                console.print(
-                                    f"    Validation: {result.validation_result.error_message}",
-                                    style="dim",
-                                )
-                        elif result.error:
-                            if verbose:
-                                console.print(f"    Error: {result.error}", style="dim")
-                            total_assertions += 1
-                        else:
-                            total_assertions += 1
-                            if result.success:
-                                total_assertions_passed += 1
-
-                        run_test_results.append(result)
-                        if not result.success:
-                            all_passed = False
-
-                    aggregated_stats = aggregate_conversation_stats(run_test_results)
-
-                    run_result = RunResult(
-                        metadata=RunMetadata(
-                            model=model,
-                            task=skill.description,
-                            batch_id=batch_id or "",
-                            run_number=run_num,
-                        ),
-                        stats=aggregated_stats,
-                        passed=all_passed,
-                        assertions_passed=total_assertions_passed,
-                        assertions_total=total_assertions,
-                        run_type="with_skill",
-                        skill_name=skill.name,
-                    )
-
-                    if run_folder:
-                        write_run_metadata(run_folder, run_result.metadata)
-                        write_run_result(run_folder, run_result)
-
-                    model_results[model].append(run_result)
-                    all_run_results.append(run_result)
-
-                    # Display progress
-                    status = "[green]PASS[/green]" if all_passed else "[red]FAIL[/red]"
-                    if verbose:
-                        console.print(
-                            f"  Run {run_num}: {status} "
-                            f"({total_assertions_passed}/{total_assertions} assertions passed)"
-                        )
-
-                console.print()
-
-            # Summary report
-            console.print("\n[bold]Summary[/bold]\n")
-
-            for model, results in model_results.items():
-                total_runs = len(results)
-                passed_runs = sum(1 for r in results if r.passed)
-                avg_tokens = (
-                    sum(r.stats.total_tokens for r in results) / total_runs if total_runs else 0
-                )
-                avg_turns = sum(r.stats.turns for r in results) / total_runs if total_runs else 0
-
-                pass_rate = passed_runs / total_runs if total_runs else 0
-                pass_rate_str = f"{pass_rate:.0%}"
-                if pass_rate > 0.5:
-                    pass_rate_style = "green"
-                elif pass_rate > 0:
-                    pass_rate_style = "yellow"
-                else:
-                    pass_rate_style = "red"
-
-                console.print(f"[bold]{model}[/bold]")
-                console.print(
-                    "  Runs: "
-                    f"{total_runs} | Passed: {passed_runs} ([{pass_rate_style}]"
-                    f"{pass_rate_str}[/{pass_rate_style}])"
-                )
-                console.print(f"  Avg tokens: {avg_tokens:.0f} | Avg turns: {avg_turns:.1f}")
-                console.print()
-
-            # Write batch summary
-            if log_runs and batch_folder and batch_id:
-                summary = BatchSummary(
+            _print_benchmark_summary(model_results)
+            if log_runs:
+                _write_benchmark_summary(
+                    batch_folder=batch_folder,
                     batch_id=batch_id,
-                    model=", ".join(evaluation_models),
+                    evaluation_models=evaluation_models,
                     task=skill.description,
-                    total_runs=len(all_run_results),
-                    passed_runs=sum(1 for r in all_run_results if r.passed),
-                    results=all_run_results,
+                    all_run_results=all_run_results,
                 )
-                write_batch_summary(batch_folder, summary)
 
         else:
             # Simple eval mode: single model, single run
@@ -1057,64 +1697,41 @@ async def _eval_async(
             results = await evaluate_skill(
                 skill,
                 test_cases,
-                evaluator=agent.evaluator,
+                executor=executor,
                 model=model,
+                fastagent_config_path=config.effective_fastagent_config,
+                cards_source_dir=cards_path,
+                artifact_root=batch_folder / "eval",
                 run_baseline=resolved.run_baseline,
-                show_baseline_progress=verbose,
+                max_parallel=max_parallel,
+                progress_callback=_print_eval_progress,
+                operation="eval",
             )
+            _raise_on_execution_errors(results, context=f"Evaluation on {model}")
 
             # Log results (both baseline and with-skill)
             run_results: list[RunResult] = []
-            if log_runs and batch_folder:
-                # Log baseline result
-                if resolved.run_baseline:
-                    baseline_folder = create_run_folder(batch_folder, 1)
-                    baseline_result = RunResult(
-                        metadata=RunMetadata(
-                            model=model,
-                            task=skill.description,
-                            batch_id=batch_id or "",
-                            run_number=1,
-                        ),
-                        stats=aggregate_conversation_stats(results.baseline_results),
-                        passed=results.baseline_success_rate > 0.5,
-                        assertions_passed=int(results.baseline_success_rate * len(test_cases)),
-                        assertions_total=len(test_cases),
-                        run_type="baseline",
-                        skill_name=skill.name,
-                    )
-                    write_run_metadata(baseline_folder, baseline_result.metadata)
-                    write_run_result(baseline_folder, baseline_result)
-                    run_results.append(baseline_result)
-
-                # Log with-skill result
-                with_skill_folder = create_run_folder(
-                    batch_folder,
-                    2 if resolved.run_baseline else 1,
-                )
-                with_skill_result = RunResult(
-                    metadata=RunMetadata(
-                        model=model,
-                        task=skill.description,
-                        batch_id=batch_id or "",
-                        run_number=2 if resolved.run_baseline else 1,
-                    ),
-                    stats=aggregate_conversation_stats(results.with_skill_results),
-                    passed=results.is_beneficial
-                    if resolved.run_baseline
-                    else results.with_skill_success_rate > 0.5,
-                    assertions_passed=int(results.with_skill_success_rate * len(test_cases)),
+            if log_runs:
+                run_results = _persist_comparison_run_results(
+                    batch_folder=batch_folder,
+                    model=model,
+                    task=skill.description,
+                    batch_id=batch_id,
+                    first_run_number=1,
+                    results=results,
                     assertions_total=len(test_cases),
-                    run_type="with_skill",
+                    run_baseline=resolved.run_baseline,
+                    with_skill_passed=(
+                        results.is_beneficial
+                        if resolved.run_baseline
+                        else results.with_skill_success_rate > 0.5
+                    ),
                     skill_name=skill.name,
                 )
-                write_run_metadata(with_skill_folder, with_skill_result.metadata)
-                write_run_result(with_skill_folder, with_skill_result)
-                run_results.append(with_skill_result)
 
                 # Write batch summary
                 summary = BatchSummary(
-                    batch_id=batch_id or "",
+                    batch_id=batch_id,
                     model=model,
                     task=skill.description,
                     total_runs=len(run_results),
@@ -1126,7 +1743,8 @@ async def _eval_async(
             if verbose and resolved.run_baseline:
                 console.print()
                 for i, (with_r, base_r) in enumerate(
-                    zip(results.with_skill_results, results.baseline_results), 1
+                    zip(results.with_skill_results, results.baseline_results, strict=True),
+                    1,
                 ):
                     base_icon = "[green]OK[/green]" if base_r.success else "[red]FAIL[/red]"
                     skill_icon = "[green]OK[/green]" if with_r.success else "[red]FAIL[/red]"
@@ -1171,6 +1789,7 @@ async def _eval_async(
                 console.print(f"  with skill {with_skill_bar}  {with_skill_rate:>5.0%}")
                 console.print(f"  tokens: {results.with_skill_total_tokens}")
 
+            console.print(f"\nArtifacts saved to: {batch_folder}")
             if resolved.run_baseline:
                 if results.is_beneficial:
                     console.print("\n[green]Recommendation: keep skill[/green]")
@@ -1184,10 +1803,7 @@ async def _eval_async(
 def list_cmd(skills_dir: str | None, verbose: bool):
     """List generated skills."""
     config = Config.load()
-    if skills_dir:
-        path = Path(skills_dir)
-    else:
-        path = config.skills_dir
+    path = Path(skills_dir) if skills_dir else config.skills_dir
 
     if not path.exists():
         console.print(f"No skills directory found at {path}")
@@ -1266,22 +1882,76 @@ def list_cmd(skills_dir: str | None, verbose: bool):
     required=True,
     help="Evaluation model(s) to benchmark (repeatable)",
 )
-@click.option("--runs", "num_runs", type=int, default=3, help="Runs per model (default: 3)")
+@click.option(
+    "--runs",
+    "num_runs",
+    type=click.IntRange(min=1),
+    default=None,
+    help="Runs per model. Overrides `num_runs` in upskill.config.yaml.",
+)
 @click.option("-t", "--tests", type=click.Path(exists=True), help="Test cases JSON file")
 @click.option(
     "--test-gen-model",
     help="Override test generation model when tests must be generated",
 )
+@click.option(
+    "--executor",
+    type=click.Choice(["local", "jobs"]),
+    default=None,
+    help="Execution backend for benchmark runs. Overrides `executor` in upskill.config.yaml.",
+)
+@click.option("--artifact-repo", help="Dataset repo for remote fast-agent job artifacts")
+@click.option(
+    "--wait/--no-wait", default=True, help="Wait for remote fast-agent jobs and download results"
+)
+@click.option(
+    "--jobs-timeout",
+    default="2h",
+    show_default=True,
+    help="HF Jobs timeout for remote fast-agent runs",
+)
+@click.option(
+    "--jobs-flavor",
+    default="cpu-basic",
+    show_default=True,
+    help="HF Jobs hardware flavor for remote fast-agent runs",
+)
+@click.option(
+    "--jobs-secrets",
+    default=None,
+    help=(
+        "Comma-separated HF Job secret names to forward. Overrides "
+        "`jobs_secrets` in upskill.config.yaml."
+    ),
+)
+@click.option("--jobs-namespace", help="Optional Hugging Face Jobs namespace")
 @click.option("-o", "--output", type=click.Path(), help="Output directory for results")
 @click.option("-v", "--verbose", is_flag=True, help="Show per-run details")
+@click.option(
+    "--max-parallel",
+    type=click.IntRange(min=1),
+    default=None,
+    help=(
+        "Maximum concurrent evaluation executions per phase. Overrides "
+        "`max_parallel` in upskill.config.yaml."
+    ),
+)
 def benchmark_cmd(
     skill_path: str,
     models: tuple[str, ...],
-    num_runs: int,
+    num_runs: int | None,
     tests: str | None,
     test_gen_model: str | None,
+    executor: ExecutorName | None,
+    artifact_repo: str | None,
+    wait: bool,
+    jobs_timeout: str,
+    jobs_flavor: str,
+    jobs_secrets: str | None,
+    jobs_namespace: str | None,
     output: str | None,
     verbose: bool,
+    max_parallel: int | None,
 ):
     """Benchmark a skill across multiple models.
 
@@ -1306,8 +1976,16 @@ def benchmark_cmd(
             test_gen_model,
             num_runs,
             tests,
+            executor,
+            artifact_repo,
+            wait,
+            jobs_timeout,
+            jobs_flavor,
+            jobs_secrets,
+            jobs_namespace,
             output,
             verbose,
+            max_parallel,
         )
     )
 
@@ -1316,15 +1994,45 @@ async def _benchmark_async(
     skill_path: str,
     models: list[str],
     test_gen_model: str | None,
-    num_runs: int,
+    num_runs: int | None,
     tests_path: str | None,
+    executor_name: ExecutorName | None,
+    artifact_repo: str | None,
+    wait: bool,
+    jobs_timeout: str,
+    jobs_flavor: str,
+    jobs_secrets: str | None,
+    jobs_namespace: str | None,
     output_dir: str | None,
     verbose: bool,
+    max_parallel: int | None,
 ):
     """Async implementation of benchmark command."""
-    from upskill.evaluate import run_test
-
     config = Config.load()
+    executor_name = _resolve_executor_name(config, executor_name)
+    num_runs = _resolve_num_runs(config, num_runs, command="benchmark")
+    max_parallel = _resolve_max_parallel(config, max_parallel)
+    jobs_secrets = _resolve_jobs_secrets(config, jobs_secrets)
+    jobs_config = _require_jobs_config(
+        executor_name=executor_name,
+        artifact_repo=artifact_repo,
+        wait=wait,
+        jobs_timeout=jobs_timeout,
+        jobs_flavor=jobs_flavor,
+        jobs_secrets=jobs_secrets,
+        jobs_namespace=jobs_namespace,
+        jobs_image=config.jobs_image,
+    )
+    if executor_name == "jobs" and not wait:
+        raise click.ClickException(
+            "`benchmark --executor jobs` currently requires `--wait` to assemble results from "
+            "downloaded fast-agent artifacts."
+        )
+    executor = _build_executor(
+        executor_name,
+        jobs_config=jobs_config,
+        progress_callback=_print_eval_progress,
+    )
     resolved = resolve_models(
         "benchmark",
         config=config,
@@ -1342,169 +2050,56 @@ async def _benchmark_async(
         field="test_generation_model",
         command="benchmark",
     )
+    model_references = build_fastagent_model_references(config=config, resolved=resolved)
 
     _print_model_plan("benchmark", resolved, runs=num_runs)
 
-    skill = Skill.load(Path(skill_path))
+    skill_record = SkillRecord.load(Path(skill_path))
+    skill = skill_record.skill
 
-    async with _fast_agent_context(config) as agent:
-        # Load test cases
-        if tests_path:
-            with open(tests_path, encoding="utf-8") as f:
-                data = json.load(f)
-            if "cases" in data:
-                test_cases = [TestCase(**tc) for tc in data["cases"]]
-            else:
-                test_cases = [TestCase(**tc) for tc in data]
-        elif skill.tests:
-            test_cases = skill.tests
-        else:
-            console.print("Generating test cases from skill...", style="dim")
-            await _set_agent_model(agent.test_gen, test_gen_model)
-            test_cases = await generate_tests(
-                skill.description,
-                generator=agent.test_gen,
-                model=test_gen_model,
-            )
+    cards = resources.files("upskill").joinpath("agent_cards")
+    with resources.as_file(cards) as cards_path:
+        test_cases, _ = await _load_test_cases(
+            config=config,
+            skill_record=skill_record,
+            tests_path=tests_path,
+            test_gen_model=test_gen_model,
+            model_references=model_references,
+        )
 
         # Setup output directory
-        if output_dir:
-            out_path = Path(output_dir)
-        else:
-            out_path = config.runs_dir
+        out_path = Path(output_dir) if output_dir else config.runs_dir
 
         batch_id, batch_folder = create_batch_folder(out_path)
         console.print(f"Results will be saved to: {batch_folder}", style="dim")
 
-        # Track results per model
-        model_results: dict[str, list[RunResult]] = {m: [] for m in evaluation_models}
-        all_run_results: list[RunResult] = []
-
         console.print(
             f"\nBenchmarking [bold]{skill.name}[/bold] across {len(evaluation_models)} model(s)"
         )
         console.print(f"  {len(test_cases)} test case(s), {num_runs} run(s) per model\n")
-
-        for model in evaluation_models:
-            console.print(f"[bold]{model}[/bold]")
-
-            for run_num in range(1, num_runs + 1):
-                run_folder = create_run_folder(batch_folder, len(all_run_results) + 1)
-
-                # Run each test case
-                total_assertions_passed = 0
-                total_assertions = 0
-                all_passed = True
-                run_results: list[TestResult] = []
-
-                for tc_idx, tc in enumerate(test_cases, 1):
-                    if verbose:
-                        console.print(f"  Running test {tc_idx}/{len(test_cases)}...", style="dim")
-
-                    try:
-                        result = await run_test(
-                            tc,
-                            evaluator=agent.evaluator,
-                            skill=skill,
-                            model=model,
-                            instance_name=(
-                                f"benchmark ({model} run {run_num} test {tc_idx})"
-                            ),
-                        )
-                    except Exception as e:
-                        console.print(f"  [red]Test error: {e}[/red]")
-                        result = TestResult(test_case=tc, success=False, error=str(e))
-
-                    # Extract assertion counts from validation result
-                    if result.validation_result:
-                        total_assertions_passed += result.validation_result.assertions_passed
-                        total_assertions += result.validation_result.assertions_total
-                        if verbose and result.validation_result.error_message:
-                            console.print(
-                                f"    Validation: {result.validation_result.error_message}",
-                                style="dim",
-                            )
-                    elif result.error:
-                        if verbose:
-                            console.print(f"    Error: {result.error}", style="dim")
-                        # Legacy: count as 1 assertion (failed)
-                        total_assertions += 1
-                    else:
-                        # Legacy: count as 1 assertion
-                        total_assertions += 1
-                        if result.success:
-                            total_assertions_passed += 1
-
-                    run_results.append(result)
-
-                    if not result.success:
-                        all_passed = False
-
-                aggregated_stats = aggregate_conversation_stats(run_results)
-
-                # Create run result
-                run_result = RunResult(
-                    metadata=RunMetadata(
-                        model=model,
-                        task=skill.description,
-                        batch_id=batch_id,
-                        run_number=run_num,
-                    ),
-                    stats=aggregated_stats,
-                    passed=all_passed,
-                    assertions_passed=total_assertions_passed,
-                    assertions_total=total_assertions,
-                    run_type="with_skill",
-                    skill_name=skill.name,
-                )
-
-                write_run_metadata(run_folder, run_result.metadata)
-                write_run_result(run_folder, run_result)
-                model_results[model].append(run_result)
-                all_run_results.append(run_result)
-
-                # Display progress
-                status = "[green]PASS[/green]" if all_passed else "[red]FAIL[/red]"
-                if verbose:
-                    console.print(
-                        f"  Run {run_num}: {status} "
-                        f"({total_assertions_passed}/{total_assertions} assertions passed)"
-                    )
-
-        console.print("\n[bold]Summary[/bold]\n")
-
-        for model, results in model_results.items():
-            total_runs = len(results)
-            passed_runs = sum(1 for r in results if r.passed)
-            avg_tokens = (
-                sum(r.stats.total_tokens for r in results) / total_runs if total_runs else 0
-            )
-            avg_turns = (
-                sum(r.stats.turns for r in results) / total_runs if total_runs else 0
-            )
-
-            pass_rate = passed_runs / total_runs if total_runs else 0
-            pass_rate_str = f"{pass_rate:.0%}"
-            pass_rate_style = "green" if pass_rate > 0.5 else "yellow" if pass_rate > 0 else "red"
-
-            console.print(f"[bold]{model}[/bold]")
-            console.print(
-                "  Runs: "
-                f"{total_runs} | Passed: {passed_runs} ([{pass_rate_style}]"
-                f"{pass_rate_str}[/{pass_rate_style}])"
-            )
-            console.print(f"  Avg tokens: {avg_tokens:.0f} | Avg turns: {avg_turns:.1f}")
-            console.print()
-
-        summary = BatchSummary(
+        model_results, all_run_results = await _run_with_skill_benchmark(
+            skill_record=skill_record,
+            evaluation_models=evaluation_models,
+            num_runs=num_runs,
+            test_cases=test_cases,
+            executor=executor,
+            config=config,
+            cards_path=cards_path,
+            batch_id=batch_id,
+            batch_folder=batch_folder,
+            verbose=verbose,
+            log_runs=True,
+            max_parallel=max_parallel,
+        )
+        _print_benchmark_summary(model_results)
+        _write_benchmark_summary(
+            batch_folder=batch_folder,
             batch_id=batch_id,
-            model=", ".join(evaluation_models),
+            evaluation_models=evaluation_models,
             task=skill.description,
-            total_runs=len(all_run_results),
-            passed_runs=sum(1 for r in all_run_results if r.passed),
-            results=all_run_results,
+            all_run_results=all_run_results,
         )
-        write_batch_summary(batch_folder, summary)
+
 
 @main.command("runs")
 @click.option("-d", "--dir", "runs_dir", type=click.Path(exists=True), help="Runs directory")
@@ -1582,13 +2177,13 @@ def runs_cmd(
         sys.exit(0)
 
     # Aggregate by model and skill (take most recent / highest)
-    aggregated: dict[tuple[str, str], dict] = {}
+    aggregated: dict[tuple[str, str], EvalPlotResult] = {}
     for r in all_results:
         key = (r["model"], r["skill_name"])
         if key not in aggregated or r["with_skill_rate"] > aggregated[key]["with_skill_rate"]:
             aggregated[key] = r
 
-    results_list = list(aggregated.values())
+    results_list: list[EvalPlotResult] = list(aggregated.values())
 
     # Determine display mode
     unique_skills = set(r["skill_name"] for r in results_list)
@@ -1598,7 +2193,7 @@ def runs_cmd(
 
     if len(unique_skills) == 1 and len(unique_models) >= 1:
         # Single skill, multiple models - use Panel
-        skill_name = list(unique_skills)[0]
+        skill_name = next(iter(unique_skills))
 
         # Build content for panel
         content_lines = []
@@ -1611,7 +2206,7 @@ def runs_cmd(
 
     elif len(unique_models) == 1 and len(unique_skills) >= 1:
         # Single model, multiple skills - use Panel
-        model_name = list(unique_models)[0]
+        model_name = next(iter(unique_models))
 
         content_lines = []
         for r in sorted(results_list, key=lambda x: x["skill_name"]):
@@ -1667,10 +2262,10 @@ def plot_cmd(
 def _format_comparison_bars(
     result: EvalPlotResult,
     metric: str,
-    label_field: str = "model",
+    label_field: EvalPlotLabelField = "model",
 ) -> str:
     """Format baseline vs with-skill comparison bars for a single result as string."""
-    label = result[label_field]
+    label = result["skill_name"] if label_field == "skill_name" else result["model"]
     has_baseline = result["has_baseline"]
     lines = [f"[bold]{label}[/bold]"]
 
@@ -1694,8 +2289,7 @@ def _format_comparison_bars(
             )
         else:
             lines.append(
-                "  with skill "
-                f"{with_skill_bar}  {with_skill_val:>5.0%}  [dim](no baseline)[/dim]"
+                f"  with skill {with_skill_bar}  {with_skill_val:>5.0%}  [dim](no baseline)[/dim]"
             )
     else:  # tokens
         with_skill_val = result["with_skill_tokens"]
@@ -1720,8 +2314,7 @@ def _format_comparison_bars(
         else:
             with_skill_bar = _render_bar(1.0 if with_skill_val > 0 else 0)
             lines.append(
-                "  with skill "
-                f"{with_skill_bar}  {with_skill_val:>6}  [dim](no baseline)[/dim]"
+                f"  with skill {with_skill_bar}  {with_skill_val:>6}  [dim](no baseline)[/dim]"
             )
 
     return "\n".join(lines)
@@ -1730,7 +2323,7 @@ def _format_comparison_bars(
 def _print_comparison_bars(
     result: EvalPlotResult,
     metric: str,
-    label_field: str = "model",
+    label_field: EvalPlotLabelField = "model",
 ) -> None:
     """Print baseline vs with-skill comparison bars for a single result."""
     console.print(_format_comparison_bars(result, metric, label_field))
diff --git a/src/upskill/config.py b/src/upskill/config.py
index 01f1f5c..9b86b56 100644
--- a/src/upskill/config.py
+++ b/src/upskill/config.py
@@ -5,6 +5,7 @@
 import os
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Literal
 
 import yaml
 from pydantic import AliasChoices, BaseModel, ConfigDict, Field
@@ -130,9 +131,32 @@ class Config(BaseModel):
     )
 
     # Generation settings
-    auto_eval: bool = Field(default=True, description="Run eval after generation")
     max_refine_attempts: int = Field(default=2, description="Max refinement iterations")
 
+    # Execution settings
+    executor: Literal["local", "jobs"] = Field(
+        default="local",
+        description="Default execution backend for evaluation and refinement",
+    )
+    num_runs: int | None = Field(
+        default=None,
+        ge=1,
+        description="Default runs per model for eval/benchmark when CLI --runs is omitted",
+    )
+    max_parallel: int = Field(
+        default=5,
+        ge=1,
+        description="Maximum concurrent evaluation executions per phase",
+    )
+    jobs_secrets: str = Field(
+        default="HF_TOKEN",
+        description="Comma-separated env var names to forward to HF Jobs when using executor=jobs",
+    )
+    jobs_image: str = Field(
+        default="ghcr.io/astral-sh/uv:python3.13-bookworm",
+        description="Container image used for HF Jobs when using executor=jobs",
+    )
+
     # FastAgent settings
     fastagent_config: Path | None = Field(default=None, description="Path to fastagent.config.yaml")
 
@@ -176,6 +200,14 @@ def effective_eval_model(self) -> str:
         """Get the model to use for evaluation."""
         return self.eval_model or self.skill_generation_model
 
+    def effective_num_runs(self, command: Literal["eval", "benchmark"]) -> int:
+        """Get the number of runs to use when CLI ``--runs`` is omitted."""
+        if self.num_runs is not None:
+            return self.num_runs
+        if command == "benchmark":
+            return 3
+        return 1
+
     @property
     def model(self) -> str:
         """Backward-compatible alias for ``skill_generation_model``."""
diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py
index fdc07c4..b5412ca 100644
--- a/src/upskill/evaluate.py
+++ b/src/upskill/evaluate.py
@@ -1,29 +1,22 @@
-"""Skill evaluation - compare agent performance with and without skills using FastAgent."""
+"""Skill evaluation orchestration backed by an execution backend."""
 
 from __future__ import annotations
 
 import asyncio
+import json
 import logging
-import shutil
-import tempfile
-from collections.abc import Generator
-from contextlib import contextmanager, nullcontext
-from pathlib import Path
-
-from fast_agent import ConversationSummary
-from fast_agent.agents.llm_agent import LlmAgent
-
-try:
-    from fast_agent.ui.rich_progress import progress_display
-except Exception:  # pragma: no cover - defensive import for older fast-agent versions
-    progress_display = None
-
-from upskill.fastagent_integration import (
-    compose_instruction,
-)
-from upskill.logging import extract_stats_from_summary
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from pathlib import Path
+
+    from upskill.executors.base import Executor
+
+from upskill.artifacts import ensure_directory, sanitize_artifact_name
+from upskill.executors.contracts import ExecutionRequest
 from upskill.models import (
-    ConversationStats,
     EvalResults,
     ExpectedSpec,
     Skill,
@@ -33,49 +26,117 @@
 )
 from upskill.validators import get_validator
 
+logger = logging.getLogger(__name__)
 
-def _hide_progress_task(task_name: str | None) -> None:
-    """Best-effort hide of a completed task from the shared progress display."""
-    if not task_name or progress_display is None:
-        return
-    hide_task = getattr(progress_display, "hide_task", None)
-    if not callable(hide_task):
-        return
-    try:
-        hide_task(task_name)
-    except Exception:
-        # Progress cleanup is best-effort and should never fail evaluations.
-        return
 
-logger = logging.getLogger(__name__)
+@dataclass(frozen=True, slots=True)
+class PendingEvaluationRequest:
+    """A single evaluation request prepared for backend submission."""
 
-PROMPT = (
-    "You are an evaluator of skills. You are given a skill and a test case. "
-    "You need to evaluate the skill on the test case and return a score."
-)
+    phase_label: str
+    test_index: int
+    request: ExecutionRequest
 
 
-@contextmanager
-def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Generator[Path]:
-    """Create an isolated workspace for a test run.
+def _format_execution_error(
+    error: str,
+    *,
+    metadata: dict[str, str | int | float | bool | None] | None,
+) -> str:
+    """Append useful execution identifiers to surfaced backend errors."""
+    if metadata is None:
+        return error
+
+    job_id = metadata.get("job_id")
+    if isinstance(job_id, str) and job_id:
+        return f"{error} (job {job_id})"
+    return error
+
+
+def _write_test_result_summary(path: Path, result: TestResult) -> None:
+    """Persist a per-test result summary alongside raw artifacts."""
+    path.write_text(
+        json.dumps(result.model_dump(mode="json"), indent=2),
+        encoding="utf-8",
+    )
 
-    Args:
-        base_dir: Optional parent directory for the workspace
-        cleanup: Whether to clean up the workspace after (default True)
 
-    Yields:
-        Path to the temporary workspace directory
-    """
-    workspace = tempfile.mkdtemp(dir=base_dir, prefix="upskill_run_")
-    workspace_path = Path(workspace)
+def _load_test_result_summary(path: Path) -> TestResult | None:
+    """Load a persisted per-test result summary."""
+    if not path.exists():
+        return None
     try:
-        yield workspace_path
-    finally:
-        if cleanup:
-            try:
-                shutil.rmtree(workspace_path, ignore_errors=True)
-            except Exception:
-                pass  # Ignore cleanup errors
+        return TestResult.model_validate_json(path.read_text(encoding="utf-8"))
+    except ValueError:
+        return None
+
+
+def apply_eval_metrics(results: EvalResults, test_cases: list[TestCase]) -> EvalResults:
+    """Populate aggregate metrics on an ``EvalResults`` instance."""
+    successes = sum(1 for r in results.with_skill_results if r.success)
+    results.with_skill_success_rate = successes / len(test_cases) if test_cases else 0
+    results.with_skill_total_tokens = sum(r.stats.total_tokens for r in results.with_skill_results)
+    results.with_skill_avg_turns = (
+        sum(r.stats.turns for r in results.with_skill_results) / len(test_cases)
+        if test_cases
+        else 0
+    )
+
+    if results.baseline_results:
+        successes = sum(1 for r in results.baseline_results if r.success)
+        results.baseline_success_rate = successes / len(test_cases) if test_cases else 0
+        results.baseline_total_tokens = sum(r.stats.total_tokens for r in results.baseline_results)
+        results.baseline_avg_turns = (
+            sum(r.stats.turns for r in results.baseline_results) / len(test_cases)
+            if test_cases
+            else 0
+        )
+
+    return results
+
+
+def load_eval_results_from_artifact_root(
+    *,
+    skill_name: str,
+    model: str,
+    artifact_root: Path,
+) -> EvalResults | None:
+    """Reconstruct eval results from persisted per-test summaries."""
+    if not artifact_root.exists():
+        return None
+
+    with_skill_results = [
+        loaded
+        for loaded in (
+            _load_test_result_summary(summary_path)
+            for summary_path in sorted(
+                (artifact_root / "with-skill").glob("test_*/test_result.json")
+            )
+        )
+        if loaded is not None
+    ]
+    baseline_results = [
+        loaded
+        for loaded in (
+            _load_test_result_summary(summary_path)
+            for summary_path in sorted((artifact_root / "baseline").glob("test_*/test_result.json"))
+        )
+        if loaded is not None
+    ]
+
+    if not with_skill_results and not baseline_results:
+        return None
+
+    reconstructed = EvalResults(
+        skill_name=skill_name,
+        model=model,
+        with_skill_results=with_skill_results,
+        baseline_results=baseline_results,
+    )
+    test_cases = [result.test_case for result in with_skill_results]
+    if not test_cases:
+        test_cases = [result.test_case for result in baseline_results]
+    return apply_eval_metrics(reconstructed, test_cases)
 
 
 def check_expected(
@@ -115,208 +176,285 @@ def check_expected(
     return True, None
 
 
+def format_test_prompt(test_case: TestCase) -> str:
+    """Return the test input followed by one fenced block per inline context file."""
+    context = test_case.context
+    inline_files = context.files if context and context.files else {}
+    # Legacy prompt shape: each file is appended as a fence tagged with its filename.
+    fenced = [f"\n\n```{name}\n{body}\n```" for name, body in inline_files.items()]
+    return test_case.input + "".join(fenced)
+
+
+def build_eval_execution_request(
+    test_case: TestCase,
+    *,
+    skill: Skill | None,
+    model: str,
+    fastagent_config_path: Path,
+    cards_source_dir: Path,
+    artifact_dir: Path,
+    agent_name: str = "evaluator",
+    instance_name: str | None = None,
+    operation: str = "eval",
+) -> ExecutionRequest:
+    """Assemble the ExecutionRequest for one test case, resolving every path argument."""
+    context = test_case.context
+    # Copy the inline context files; executors write them into the run workspace.
+    workspace_files = dict(context.files) if context and context.files else {}
+    return ExecutionRequest(
+        prompt=format_test_prompt(test_case),
+        model=model,
+        agent=agent_name,
+        fastagent_config_path=fastagent_config_path.resolve(),
+        artifact_dir=artifact_dir.resolve(),
+        cards_source_dir=cards_source_dir.resolve(),
+        # Explicit instance name wins; otherwise the skill name, or "baseline" without one.
+        label=instance_name or (skill.name if skill else "baseline"),
+        skill=skill,
+        workspace_files=workspace_files,
+        metadata={
+            "instance_name": instance_name,
+            "operation": operation,
+            "skill_name": skill.name if skill else None,
+            "has_validator": bool(test_case.validator),
+        },
+    )
+
+
+def build_eval_requests(
+    *,
+    skill: Skill,
+    test_cases: list[TestCase],
+    model: str,
+    fastagent_config_path: Path,
+    cards_source_dir: Path,
+    artifact_root: Path,
+    run_baseline: bool = True,
+    operation: str = "eval",
+) -> list[PendingEvaluationRequest]:
+    """Build one pending request per (phase, test case) pair for an evaluation run."""
+    pending: list[PendingEvaluationRequest] = []
+    for phase_label, phase_skill in _iter_evaluation_phases(skill, run_baseline):
+        # Ensure the phase directory first; test n (1-based) uses test_<n> inside it.
+        phase_root = ensure_directory(artifact_root / sanitize_artifact_name(phase_label))
+        for test_number, case in enumerate(test_cases, start=1):
+            execution_request = build_eval_execution_request(
+                case,
+                skill=phase_skill,
+                model=model,
+                fastagent_config_path=fastagent_config_path,
+                cards_source_dir=cards_source_dir,
+                artifact_dir=phase_root / f"test_{test_number}",
+                instance_name=f"eval ({phase_label} test {test_number})",
+                operation=operation,
+            )
+            pending.append(
+                PendingEvaluationRequest(
+                    phase_label=phase_label,
+                    test_index=test_number,
+                    request=execution_request,
+                )
+            )
+
+    return pending
+
+
+def _iter_evaluation_phases(
+    skill: Skill,
+    run_baseline: bool,
+) -> list[tuple[str, Skill | None]]:
+    """Return (phase label, skill) pairs: with-skill first, then baseline if requested."""
+    with_skill: tuple[str, Skill | None] = ("with-skill", skill)
+    baseline: tuple[str, Skill | None] = ("baseline", None)
+    return [with_skill, baseline] if run_baseline else [with_skill]
+
+
 async def _run_test_with_evaluator(
     test_case: TestCase,
-    evaluator: LlmAgent,
-    instruction: str | None,
+    executor: Executor,
     *,
-    use_workspace: bool | None = None,
+    skill: Skill | None,
+    model: str,
+    fastagent_config_path: Path,
+    cards_source_dir: Path,
+    artifact_dir: Path,
+    agent_name: str = "evaluator",
     instance_name: str | None = None,
+    operation: str = "eval",
 ) -> TestResult:
-    """Run a single test case using a provided evaluator agent."""
-    user_content = test_case.input
-    if test_case.context and test_case.context.files:
-        for filename, content in test_case.context.files.items():
-            user_content += f"\n\n```{filename}\n{content}\n```"
-
-    # Determine if we need workspace isolation
-    needs_workspace = use_workspace if use_workspace is not None else bool(test_case.validator)
-
-    async def _run_in_workspace(workspace: Path | None) -> TestResult:
-        clone: LlmAgent | None = None
-        try:
-            clone = await evaluator.spawn_detached_instance(name=instance_name)
-            if workspace is not None:
-                enable_shell = getattr(clone, "enable_shell", None)
-                shell_enabled = getattr(clone, "shell_runtime_enabled", False)
-                if shell_enabled and callable(enable_shell):
-                    enable_shell(working_directory=workspace)
-
-            if instruction is None:
-                clone.set_instruction("")
-            else:
-                clone.set_instruction(instruction)
-            output = await clone.send(user_content)
-            stats = ConversationStats()
-
-            # Extract stats from agent history
-            try:
-                history = clone.message_history
-                summary = ConversationSummary(messages=history)
-                stats = extract_stats_from_summary(summary)
-            except Exception as exc:
-                logger.exception("Failed to extract stats from evaluator history", exc_info=exc)
-
-            # Check expected with custom validator support
-            if workspace and test_case.validator:
-                success, validation_result = check_expected(
-                    output or "",
-                    test_case.expected,
-                    workspace,
-                    test_case,
-                )
-            else:
-                success, validation_result = check_expected(
-                    output or "",
-                    test_case.expected,
-                    None,
-                    test_case,
-                )
+    """Run one test case via the executor and persist its summary to test_result.json."""
+    request = build_eval_execution_request(
+        test_case,
+        skill=skill,
+        model=model,
+        fastagent_config_path=fastagent_config_path,
+        cards_source_dir=cards_source_dir,
+        artifact_dir=artifact_dir,
+        agent_name=agent_name,
+        instance_name=instance_name,
+        operation=operation,
+    )
+    normalized_artifact_dir = request.artifact_dir  # already resolved by the request builder
 
-            return TestResult(
-                test_case=test_case,
-                success=success,
-                output=output,
-                tokens_used=stats.total_tokens,
-                turns=stats.turns,
-                stats=stats,
-                validation_result=validation_result,
-            )
-        except Exception as exc:
-            return TestResult(test_case=test_case, success=False, error=str(exc))
-        finally:
-            if clone is not None:
-                try:
-                    await clone.shutdown()
-                except Exception as exc:
-                    logger.exception("Failed to shutdown evaluator clone", exc_info=exc)
-            _hide_progress_task(instance_name)
-
-    if needs_workspace:
-        with isolated_workspace() as workspace:
-            return await _run_in_workspace(workspace)
-    return await _run_in_workspace(None)
+    try:
+        handle = await executor.execute(request)
+        execution_result = await executor.collect(handle)
+    except Exception as exc:  # executor raised: report a failed result rather than propagate
+        logger.exception("Evaluation execution failed", exc_info=exc)
+        result = TestResult(test_case=test_case, success=False, error=str(exc))
+        _write_test_result_summary(normalized_artifact_dir / "test_result.json", result)
+        return result
+
+    if execution_result.error is not None:  # the run finished but reported an error
+        result = TestResult(
+            test_case=test_case,
+            success=False,
+            output=execution_result.output_text,
+            tokens_used=execution_result.stats.total_tokens,
+            turns=execution_result.stats.turns,
+            error=_format_execution_error(
+                execution_result.error,
+                metadata=execution_result.metadata,
+            ),
+            stats=execution_result.stats,
+        )
+        _write_test_result_summary(normalized_artifact_dir / "test_result.json", result)
+        return result
+
+    success, validation_result = check_expected(
+        execution_result.output_text or "",
+        test_case.expected,
+        execution_result.workspace_dir,
+        test_case,
+    )
+    result = TestResult(
+        test_case=test_case,
+        success=success,
+        output=execution_result.output_text,
+        tokens_used=execution_result.stats.total_tokens,
+        turns=execution_result.stats.turns,
+        stats=execution_result.stats,
+        validation_result=validation_result,
+    )
+    _write_test_result_summary(normalized_artifact_dir / "test_result.json", result)
+    return result
 
 
 async def run_test(
     test_case: TestCase,
-    evaluator: LlmAgent,
+    executor: Executor,
     skill: Skill | None,
-    use_workspace: bool | None = None,
-    model: str | None = None,
+    *,
+    model: str,
+    fastagent_config_path: Path,
+    cards_source_dir: Path,
+    artifact_dir: Path,
     instance_name: str | None = None,
+    operation: str = "eval",
 ) -> TestResult:
-    """Run a single test case using an evaluator agent.
+    """Run a single test case via the execution backend and return its TestResult.
 
     Args:
         test_case: The test case to run
-        evaluator: Evaluator agent to run the test case
+        executor: Execution backend to use
         skill: Optional skill to inject (None for baseline)
-        use_workspace: Force workspace isolation (auto-detected from test_case.validator)
         model: Model to evaluate with for this test case
+        fastagent_config_path: Fast-agent config to pass through to execution
+        cards_source_dir: Source directory for bundled agent cards
+        artifact_dir: Output directory for raw artifacts and the test_result.json summary
         instance_name: Optional evaluator instance display name
+        operation: High-level command family for labeling submitted jobs
     """
-
-    try:
-        if model is not None:
-            await evaluator.set_model(model)
-        instruction = compose_instruction(evaluator.instruction, skill) if skill else None
-        return await _run_test_with_evaluator(
-            test_case,
-            evaluator,
-            instruction,
-            use_workspace=use_workspace,
-            instance_name=instance_name,
-        )
-    except Exception as exc:
-        return TestResult(test_case=test_case, success=False, error=str(exc))
+    return await _run_test_with_evaluator(  # agent_name keeps its "evaluator" default
+        test_case,
+        executor,
+        skill=skill,
+        model=model,
+        fastagent_config_path=fastagent_config_path,
+        cards_source_dir=cards_source_dir,
+        artifact_dir=artifact_dir,
+        instance_name=instance_name,
+        operation=operation,
+    )
 
 
 async def evaluate_skill(
     skill: Skill,
     test_cases: list[TestCase],
-    evaluator: LlmAgent,
-    model: str | None = None,
+    executor: Executor,
+    *,
+    model: str,
+    fastagent_config_path: Path,
+    cards_source_dir: Path,
+    artifact_root: Path,
     run_baseline: bool = True,
-    show_baseline_progress: bool = False,
+    max_parallel: int = 5,
+    progress_callback: Callable[[str], None] | None = None,
+    operation: str = "eval",
 ) -> EvalResults:
     """Evaluate a skill against test cases using FastAgent.
 
     Args:
         skill: The skill to evaluate
         test_cases: Test cases to run
-        evaluator: Evaluator agent to run the test cases
-        model: Model to evaluate on (defaults to config.eval_model)
+        executor: Execution backend to use
+        model: Model to evaluate on
+        fastagent_config_path: Fast-agent config path to propagate
+        cards_source_dir: Source directory for evaluator cards
+        artifact_root: Artifact root for preserved raw execution outputs
         run_baseline: Whether to also run without the skill
-        show_baseline_progress: Whether to render baseline progress output
+        max_parallel: Maximum number of concurrent test executions (clamped to >= 1)
+        progress_callback: Optional callback for lightweight progress updates
+        operation: High-level command family for labeling submitted jobs
 
     Returns:
         EvalResults comparing skill vs baseline
     """
-
     results = EvalResults(skill_name=skill.name, model=model)
-
-    base_instruction = evaluator.instruction
+    semaphore = asyncio.Semaphore(max(1, max_parallel))  # 0 would deadlock every test
+    ensure_directory(artifact_root)
 
     async def _run_batch(
-        instruction: str | None,
+        batch_skill: Skill | None,
         label: str,
     ) -> list[TestResult]:
-        tasks = []
-        for index, tc in enumerate(test_cases, start=1):
+        batch_root = ensure_directory(artifact_root / sanitize_artifact_name(label))
+
+        async def _run_single(index: int, test_case: TestCase) -> TestResult:
             instance_name = f"eval ({label} test {index})"
-            tasks.append(
-                _run_test_with_evaluator(
-                    tc,
-                    evaluator,
-                    instruction,
+            test_artifact_dir = batch_root / f"test_{index}"
+            if progress_callback is not None:
+                progress_callback(f"starting {label} test {index}/{len(test_cases)}")
+            async with semaphore:
+                result = await run_test(
+                    test_case,
+                    executor,
+                    batch_skill,
+                    model=model,
+                    fastagent_config_path=fastagent_config_path,
+                    cards_source_dir=cards_source_dir,
+                    artifact_dir=test_artifact_dir,
                     instance_name=instance_name,
+                    operation=operation,
                 )
-            )
+            if progress_callback is not None:
+                status = "ok" if result.success else "failed"
+                progress_callback(f"finished {label} test {index}/{len(test_cases)} ({status})")
+            return result
+
+        tasks = [
+            asyncio.create_task(_run_single(index, test_case))
+            for index, test_case in enumerate(test_cases, start=1)
+        ]
         return await asyncio.gather(*tasks)
 
-    if model is not None:
-        await evaluator.set_model(model)
-
     # Run with skill
-    skill_instruction = compose_instruction(base_instruction, skill)
-    results.with_skill_results = await _run_batch(skill_instruction, "with-skill")
-
-    # Calculate with-skill metrics
-    successes = sum(1 for r in results.with_skill_results if r.success)
-    results.with_skill_success_rate = successes / len(test_cases) if test_cases else 0
-    results.with_skill_total_tokens = sum(
-        r.stats.total_tokens for r in results.with_skill_results
-    )
-    results.with_skill_avg_turns = (
-        sum(r.stats.turns for r in results.with_skill_results) / len(test_cases)
-        if test_cases
-        else 0
-    )
+    results.with_skill_results = await _run_batch(skill, "with-skill")
 
     # Run baseline if requested
     if run_baseline:
-        pause_cm = nullcontext()
-        if not show_baseline_progress and progress_display is not None:
-            paused = getattr(progress_display, "paused", None)
-            if callable(paused):
-                pause_cm = paused()
-
-        with pause_cm:
-            results.baseline_results = await _run_batch(None, "baseline")
-
-        successes = sum(1 for r in results.baseline_results if r.success)
-        results.baseline_success_rate = successes / len(test_cases) if test_cases else 0
-        results.baseline_total_tokens = sum(
-            r.stats.total_tokens for r in results.baseline_results
-        )
-        results.baseline_avg_turns = (
-            sum(r.stats.turns for r in results.baseline_results) / len(test_cases)
-            if test_cases
-            else 0
-        )
-
-    return results
+        results.baseline_results = await _run_batch(None, "baseline")
+    return apply_eval_metrics(results, test_cases)
 
 
 def get_failure_descriptions(results: EvalResults) -> list[str]:
diff --git a/src/upskill/executors/__init__.py b/src/upskill/executors/__init__.py
new file mode 100644
index 0000000..916d7ba
--- /dev/null
+++ b/src/upskill/executors/__init__.py
@@ -0,0 +1 @@
+"""Execution backends for evaluation flows."""
diff --git a/src/upskill/executors/base.py b/src/upskill/executors/base.py
new file mode 100644
index 0000000..d3bad5e
--- /dev/null
+++ b/src/upskill/executors/base.py
@@ -0,0 +1,21 @@
+"""Internal executor protocol for evaluation runs."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Protocol
+
+if TYPE_CHECKING:
+    from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult
+
+
+class Executor(Protocol):
+    """Structural interface that evaluation orchestration expects of an execution backend."""
+
+    async def execute(self, request: ExecutionRequest) -> ExecutionHandle:
+        """Start execution for a single request and return its in-flight handle."""
+
+    async def collect(self, handle: ExecutionHandle) -> ExecutionResult:
+        """Wait for a previously started execution and collect artifacts/results."""
+
+    async def cancel(self, handle: ExecutionHandle) -> None:
+        """Cancel a previously started execution identified by its handle."""
diff --git a/src/upskill/executors/contracts.py b/src/upskill/executors/contracts.py
new file mode 100644
index 0000000..2c7f680
--- /dev/null
+++ b/src/upskill/executors/contracts.py
@@ -0,0 +1,53 @@
+"""Execution request and result contracts for evaluation."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from upskill.models import ConversationStats, Skill
+
+if TYPE_CHECKING:
+    import asyncio
+    from pathlib import Path
+
+ExecutionMetadataValue = str | int | float | bool | None
+
+
+@dataclass(slots=True)
+class ExecutionRequest:
+    """Everything an executor needs to run one evaluation test case."""
+
+    prompt: str  # prompt text; executors persist it as prompt.txt
+    model: str
+    agent: str  # agent card name bundled from cards_source_dir
+    fastagent_config_path: Path  # copied into the artifact dir and the workspace
+    artifact_dir: Path  # per-request output directory
+    cards_source_dir: Path
+    label: str  # display label, e.g. in remote job progress messages
+    skill: Skill | None = None  # None for baseline runs
+    workspace_files: dict[str, str] = field(default_factory=dict)  # filename -> file content
+    metadata: dict[str, ExecutionMetadataValue] = field(default_factory=dict)  # scalar tags
+
+
+@dataclass(slots=True)
+class ExecutionResult:
+    """Outcome of one execution: parsed output, stats, error, and artifact paths."""
+
+    output_text: str | None  # None when results were missing or could not be parsed
+    raw_results_path: Path | None  # results.json, or None when the run produced none
+    stdout_path: Path
+    stderr_path: Path
+    artifact_dir: Path
+    workspace_dir: Path  # the "workspace" directory under artifact_dir
+    stats: ConversationStats = field(default_factory=ConversationStats)
+    error: str | None = None  # None means the execution itself reported no error
+    metadata: dict[str, ExecutionMetadataValue] = field(default_factory=dict)
+
+
+@dataclass(slots=True)
+class ExecutionHandle:
+    """In-flight execution: the originating request plus the task producing its result."""
+
+    request: ExecutionRequest
+    task: asyncio.Task[ExecutionResult]  # awaited by collect(), cancelled by cancel()
diff --git a/src/upskill/executors/local_fast_agent.py b/src/upskill/executors/local_fast_agent.py
new file mode 100644
index 0000000..29ec02d
--- /dev/null
+++ b/src/upskill/executors/local_fast_agent.py
@@ -0,0 +1,142 @@
+"""Local shell-out executor for fast-agent-backed evaluation."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from dataclasses import replace
+
+from upskill.artifacts import (
+    bundle_agent_card,
+    copy_config_file,
+    ensure_directory,
+    materialize_skill_bundle,
+    materialize_workspace,
+    write_request_file,
+)
+from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult
+from upskill.fast_agent_cli import build_fast_agent_command
+from upskill.models import ConversationStats
+from upskill.result_parsing import parse_fast_agent_results
+
+
+class LocalFastAgentExecutor:
+    """Execute evaluation requests by shelling out to ``fast-agent`` locally."""
+
+    def __init__(self, *, fast_agent_bin: str = "fast-agent") -> None:
+        self._fast_agent_bin = fast_agent_bin
+
+    async def execute(self, request: ExecutionRequest) -> ExecutionHandle:
+        """Schedule the request as a background task and return its handle."""
+        task = asyncio.create_task(self._run_request(request))
+        return ExecutionHandle(request=request, task=task)
+
+    async def collect(self, handle: ExecutionHandle) -> ExecutionResult:
+        """Wait for a previously started execution and return its result."""
+        return await handle.task
+
+    async def cancel(self, handle: ExecutionHandle) -> None:
+        """Cancel a started execution; the task kills its subprocess while unwinding."""
+        handle.task.cancel()
+        try:
+            await handle.task
+        except asyncio.CancelledError:
+            return
+
+    async def _run_request(self, request: ExecutionRequest) -> ExecutionResult:
+        """Write the run artifacts, execute ``fast-agent``, and parse its results file."""
+        artifact_dir = ensure_directory(request.artifact_dir.resolve())
+        normalized_request = replace(
+            request,
+            fastagent_config_path=request.fastagent_config_path.resolve(),
+            artifact_dir=artifact_dir,
+            cards_source_dir=request.cards_source_dir.resolve(),
+        )
+        workspace_dir = ensure_directory(artifact_dir / "workspace")
+        materialize_workspace(workspace_dir, normalized_request.workspace_files)
+
+        cards_dir = bundle_agent_card(
+            normalized_request.cards_source_dir,
+            artifact_dir / "cards",
+            agent_name=normalized_request.agent,
+        )
+        skills_dir = materialize_skill_bundle(artifact_dir / "skills", normalized_request)
+        source_config = normalized_request.fastagent_config_path
+        for config_parent in (artifact_dir, workspace_dir):
+            copy_config_file(source_config, config_parent / "fastagent.config.yaml")
+
+        write_request_file(artifact_dir / "request.json", normalized_request)
+
+        prompt_path = artifact_dir / "prompt.txt"
+        prompt_path.write_text(normalized_request.prompt, encoding="utf-8")
+
+        results_path = artifact_dir / "results.json"
+        stdout_path = artifact_dir / "stdout.txt"
+        stderr_path = artifact_dir / "stderr.txt"
+        command = build_fast_agent_command(
+            normalized_request,
+            config_path=normalized_request.fastagent_config_path
+            if normalized_request.fastagent_config_path.exists()
+            else None,
+            cards_dir=cards_dir,
+            skills_dir=skills_dir,
+            prompt_path=prompt_path,
+            results_path=results_path,
+            fast_agent_bin=self._fast_agent_bin,
+        )
+        command_path = artifact_dir / "command.json"
+        command_path.write_text(json.dumps(command, indent=2), encoding="utf-8")
+
+        process = await asyncio.create_subprocess_exec(
+            *command,
+            cwd=workspace_dir,
+            stdin=asyncio.subprocess.DEVNULL,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        try:
+            stdout_bytes, stderr_bytes = await process.communicate()
+        except asyncio.CancelledError:
+            # Cancelling the task does not stop the child; kill it so it is not orphaned.
+            if process.returncode is None:
+                process.kill()
+            await process.wait()
+            raise
+        stdout_text = stdout_bytes.decode("utf-8", errors="replace")
+        stderr_text = stderr_bytes.decode("utf-8", errors="replace")
+        stdout_path.write_text(stdout_text, encoding="utf-8")
+        stderr_path.write_text(stderr_text, encoding="utf-8")
+
+        error: str | None = None
+        parsed_output: str | None = None
+        parsed_stats = None
+
+        if not results_path.exists():
+            error = "fast-agent run did not produce a results artifact."
+        else:
+            try:
+                parsed = parse_fast_agent_results(results_path)
+            except Exception as exc:
+                error = f"Failed to parse fast-agent results: {exc}"
+            else:
+                parsed_output = parsed.output_text
+                parsed_stats = parsed.stats
+
+        if process.returncode != 0:
+            exit_error = f"fast-agent exited with code {process.returncode}."
+            error = f"{error} {exit_error}".strip() if error else exit_error
+
+        return ExecutionResult(
+            output_text=parsed_output,
+            raw_results_path=results_path if results_path.exists() else None,
+            stdout_path=stdout_path,
+            stderr_path=stderr_path,
+            artifact_dir=artifact_dir,
+            workspace_dir=workspace_dir,
+            stats=parsed_stats or ConversationStats(),
+            error=error,
+            metadata={
+                **normalized_request.metadata,
+                "return_code": process.returncode,
+            },
+        )
diff --git a/src/upskill/executors/remote_fast_agent.py b/src/upskill/executors/remote_fast_agent.py
new file mode 100644
index 0000000..fef26be
--- /dev/null
+++ b/src/upskill/executors/remote_fast_agent.py
@@ -0,0 +1,322 @@
+"""Remote HF Jobs-backed executor for fast-agent evaluation."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import shutil
+import tarfile
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from upskill.artifacts import (
+    bundle_agent_card,
+    copy_config_file,
+    ensure_directory,
+    materialize_skill_bundle,
+    materialize_workspace,
+    write_request_file,
+)
+from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult
+from upskill.hf_jobs import (
+    JobsConfig,
+    SubmittedJob,
+    _make_run_id,
+    _sanitize_hf_job_label_value,
+    _submit_bundle_job,
+    parse_duration_seconds,
+    wait_for_job_outputs,
+)
+from upskill.models import ConversationStats
+from upskill.result_parsing import parse_fast_agent_results
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+class RemoteFastAgentExecutor:
+    """Execute evaluation requests by submitting one HF job per request."""
+
+    def __init__(
+        self,
+        *,
+        jobs_config: JobsConfig,
+        progress_callback: Callable[[str], None] | None = None,
+    ) -> None:
+        self._jobs_config = jobs_config  # forwarded to job submission; supplies jobs_timeout
+        self._progress_callback = progress_callback  # optional sink for status messages
+
+    async def execute(self, request: ExecutionRequest) -> ExecutionHandle:
+        """Run submit-and-wait for the request on a worker thread and return its handle."""
+        task = asyncio.create_task(asyncio.to_thread(self._run_request_sync, request))
+        return ExecutionHandle(request=request, task=task)
+
+    async def submit(self, request: ExecutionRequest) -> SubmittedJob:
+        """Prepare and submit the request on a worker thread without waiting for outputs."""
+        return await asyncio.to_thread(self._submit_request_sync, request)
+
+    async def collect(self, handle: ExecutionHandle) -> ExecutionResult:
+        """Await the handle's task and return the collected execution result."""
+        return await handle.task
+
+    async def cancel(self, handle: ExecutionHandle) -> None:
+        """Cancel the task of a previously started remote execution and await it."""
+        handle.task.cancel()  # NOTE(review): nothing here cancels the submitted HF job itself
+        try:
+            await handle.task
+        except asyncio.CancelledError:
+            return
+
+    def _submit_request_sync(self, request: ExecutionRequest) -> SubmittedJob:
+        """Prepare the request's local artifacts, then submit it as a job."""
+        return self._submit_prepared_request(*self._prepare_request(request))
+
+    def _submit_prepared_request(
+        self,
+        request: ExecutionRequest,
+        artifact_dir: Path,
+    ) -> SubmittedJob:
+        """Bundle and submit an already-prepared request, then record the job ids."""
+        temp_root, bundle_archive = self._create_bundle_archive(request)
+        try:
+            submitted = self._submit_bundle(request, bundle_archive)
+        finally:
+            # The temporary bundle root is removed whether or not submission succeeded.
+            shutil.rmtree(temp_root, ignore_errors=True)
+
+        # Persist the job identifiers alongside the request's other artifacts.
+        job_record = {
+            "job_id": submitted.job_id,
+            "run_id": submitted.run_id,
+            "artifact_repo": submitted.artifact_repo,
+        }
+        (artifact_dir / "submitted_job.json").write_text(
+            json.dumps(job_record, indent=2),
+            encoding="utf-8",
+        )
+        return submitted
+
+    def _run_request_sync(self, request: ExecutionRequest) -> ExecutionResult:
+        normalized_request, artifact_dir = self._prepare_request(request)
+        workspace_dir = artifact_dir / "workspace"
+        submission = self._submit_prepared_request(normalized_request, artifact_dir)
+
+        remote_output_dir = wait_for_job_outputs(  # blocking wait, bounded by jobs_timeout
+            submission,
+            destination_root=artifact_dir / "remote_download",
+            wait_timeout_seconds=parse_duration_seconds(self._jobs_config.jobs_timeout),
+            progress_callback=self._progress_callback,
+        )
+
+        stdout_path = artifact_dir / "stdout.txt"
+        stderr_path = artifact_dir / "stderr.txt"
+        results_path = artifact_dir / "results.json"
+        self._materialize_remote_outputs(
+            remote_output_dir=remote_output_dir,
+            artifact_dir=artifact_dir,
+            workspace_dir=workspace_dir,
+            stdout_path=stdout_path,
+            stderr_path=stderr_path,
+            results_path=results_path,
+        )
+
+        exit_code = self._read_exit_code(remote_output_dir)  # compared as a string below
+        error: str | None = None
+        parsed_output: str | None = None
+        parsed_stats = ConversationStats()
+
+        if not results_path.exists():
+            error = "fast-agent run did not produce a results artifact."
+        else:
+            try:
+                parsed = parse_fast_agent_results(results_path)
+            except Exception as exc:
+                error = f"Failed to parse fast-agent results: {exc}"
+            else:
+                parsed_output = parsed.output_text
+                parsed_stats = parsed.stats
+
+        if exit_code not in {"", "0"}:  # an empty exit code is not treated as a failure
+            exit_error = f"fast-agent exited with code {exit_code}."
+            error = f"{error} {exit_error}".strip() if error else exit_error
+
+        metadata = {
+            **normalized_request.metadata,
+            "job_id": submission.job_id,
+            "run_id": submission.run_id,
+            "return_code": int(exit_code) if exit_code else None,  # None when empty
+        }
+        return ExecutionResult(
+            output_text=parsed_output,
+            raw_results_path=results_path if results_path.exists() else None,
+            stdout_path=stdout_path,
+            stderr_path=stderr_path,
+            artifact_dir=artifact_dir,
+            workspace_dir=workspace_dir,
+            stats=parsed_stats,
+            error=error,
+            metadata=metadata,
+        )
+
+    def _prepare_request(self, request: ExecutionRequest) -> tuple[ExecutionRequest, Path]:
+        """Normalize the request's paths and write its local artifacts.
+
+        Returns the normalized request and the resolved artifact directory.
+        """
+        artifact_dir = ensure_directory(request.artifact_dir.resolve())
+        source_config = request.fastagent_config_path.resolve()
+        # Rebuilt field by field so the mapping fields become fresh copies.
+        normalized_request = ExecutionRequest(
+            prompt=request.prompt,
+            model=request.model,
+            agent=request.agent,
+            fastagent_config_path=source_config,
+            artifact_dir=artifact_dir,
+            cards_source_dir=request.cards_source_dir.resolve(),
+            label=request.label,
+            skill=request.skill,
+            workspace_files=dict(request.workspace_files),
+            metadata=dict(request.metadata),
+        )
+        workspace_dir = ensure_directory(artifact_dir / "workspace")
+        materialize_workspace(workspace_dir, normalized_request.workspace_files)
+
+        bundle_agent_card(
+            normalized_request.cards_source_dir,
+            artifact_dir / "cards",
+            agent_name=normalized_request.agent,
+        )
+        materialize_skill_bundle(artifact_dir / "skills", normalized_request)
+        # One config copy lives with the artifacts, the other inside the workspace.
+        for config_parent in (artifact_dir, workspace_dir):
+            copy_config_file(source_config, config_parent / "fastagent.config.yaml")
+
+        write_request_file(artifact_dir / "request.json", normalized_request)
+        (artifact_dir / "prompt.txt").write_text(normalized_request.prompt, encoding="utf-8")
+        return normalized_request, artifact_dir
+
+    def _submit_bundle(self, request: ExecutionRequest, bundle_archive: Path) -> SubmittedJob:
+        """Submit ``bundle_archive`` under a fresh run id and report it via the callback."""
+        run_id = _make_run_id("request", request.model, request.label)
+        job_labels = self._build_job_labels(request, run_id=run_id)
+        submission = _submit_bundle_job(
+            bundle_archive=bundle_archive,
+            jobs_config=self._jobs_config,
+            run_id=run_id,
+            model=request.model,
+            labels=job_labels,
+        )
+        notify = self._progress_callback
+        if notify is not None:
+            job_ref = f"{submission.job_id} (run_id={submission.run_id})"
+            notify(f"submitted remote request {request.label} as job {job_ref}")
+        return submission
+
+    def _build_job_labels(self, request: ExecutionRequest, *, run_id: str) -> dict[str, str]:
+        """Build the sanitized ``upskill-*`` label mapping attached to a submitted job."""
+        clean = _sanitize_hf_job_label_value
+        operation = request.metadata.get("operation")
+        # Missing or non-string operation metadata is labelled as "eval".
+        operation_name = operation if isinstance(operation, str) else "eval"
+        labels = {
+            "upskill-agent": clean(request.agent, default="agent"),
+            "upskill-executor": "remote-fast-agent",
+            "upskill-model": clean(request.model, default="model"),
+            "upskill-operation": clean(operation_name, default="eval"),
+            "upskill-request": clean(request.label, default="request"),
+            "upskill-run-id": clean(run_id, default="run"),
+        }
+        skill = request.skill
+        if skill is None:
+            return labels
+        labels["upskill-skill"] = clean(skill.name, default="skill")
+        return labels
+
+    def _create_bundle_archive(self, request: ExecutionRequest) -> tuple[Path, Path]:
+        """Stage the request bundle in a temp dir and pack it as ``bundle.tar.gz``.
+
+        Returns ``(temp_root, bundle_archive)``. ``temp_root`` is not removed on
+        success, so cleanup is left to the caller. If staging fails, it is removed here.
+        """
+        temp_root = Path(tempfile.mkdtemp(prefix="upskill_hf_request_"))
+        # Nothing else knows about temp_root until we return, so clean up on any failure.
+        try:
+            bundle_root = temp_root / "bundle"
+            ensure_directory(bundle_root)
+            ensure_directory(bundle_root / "skills")
+            if request.skill is not None:
+                request.skill.save(bundle_root / "skills" / request.skill.name)
+            bundle_agent_card(
+                request.cards_source_dir,
+                bundle_root / "cards",
+                agent_name=request.agent,
+            )
+            copy_config_file(request.fastagent_config_path, bundle_root / "fastagent.config.yaml")
+            (bundle_root / "agent.txt").write_text(request.agent, encoding="utf-8")
+
+            request_dir = ensure_directory(bundle_root / "requests" / "request_1")
+            (request_dir / "prompt.txt").write_text(request.prompt, encoding="utf-8")
+            request_workspace_dir = ensure_directory(request_dir / "workspace")
+            materialize_workspace(request_workspace_dir, request.workspace_files)
+
+            # The bundle always carries exactly one request, named "request_1".
+            has_files = bool(request.workspace_files)
+            request_entry = {"id": "request_1", "index": 1, "has_workspace_files": has_files}
+            manifest = {"request_count": 1, "requests": [request_entry]}
+            manifest_text = json.dumps(manifest, indent=2)
+            (bundle_root / "manifest.json").write_text(manifest_text, encoding="utf-8")
+
+            # The entrypoint script is located relative to this source file.
+            scripts_dir = Path(__file__).resolve().parents[3] / "scripts" / "hf"
+            entrypoint_source = scripts_dir / "job_entrypoint_eval_fast_agent.sh"
+            shutil.copy2(entrypoint_source, bundle_root / "job_entrypoint.sh")
+
+            bundle_archive = temp_root / "bundle.tar.gz"
+            with tarfile.open(bundle_archive, "w:gz") as archive:
+                archive.add(bundle_root, arcname="bundle")
+        except BaseException:
+            # Do not leak the staging directory when bundling fails; re-raise unchanged.
+            shutil.rmtree(temp_root, ignore_errors=True)
+            raise
+        return temp_root, bundle_archive
+
+    def _materialize_remote_outputs(
+        self,
+        *,
+        remote_output_dir: Path,
+        artifact_dir: Path,
+        workspace_dir: Path,
+        stdout_path: Path,
+        stderr_path: Path,
+        results_path: Path,
+    ) -> None:
+        """Copy downloaded job outputs into the local artifact layout.
+
+        Missing logs become empty files; a missing result removes any stale local one.
+        """
+        preserved_output_dir = artifact_dir / "remote_output"
+        if preserved_output_dir.exists():
+            shutil.rmtree(preserved_output_dir)
+        shutil.copytree(remote_output_dir, preserved_output_dir)
+        for stream, local_log in (("out", stdout_path), ("err", stderr_path)):
+            remote_log = remote_output_dir / "logs" / f"request_1.{stream}.txt"
+            if remote_log.exists():
+                shutil.copy2(remote_log, local_log)
+            else:
+                local_log.write_text("", encoding="utf-8")
+        remote_results = remote_output_dir / "results" / "request_1.json"
+        if remote_results.exists():
+            shutil.copy2(remote_results, results_path)
+        else:
+            results_path.unlink(missing_ok=True)
+        remote_workspace = remote_output_dir / "workspaces" / "request_1"
+        if remote_workspace.exists():
+            shutil.rmtree(workspace_dir, ignore_errors=True)
+            shutil.copytree(remote_workspace, workspace_dir)
+
+    def _read_exit_code(self, remote_output_dir: Path) -> str:
+        """Return the recorded exit code text, or ``""`` when no status file exists."""
+        status_path = remote_output_dir / "status" / "request_1.exit_code.txt"
+        exists = status_path.exists()
+        return status_path.read_text(encoding="utf-8").strip() if exists else ""
diff --git a/src/upskill/fast_agent_cli.py b/src/upskill/fast_agent_cli.py
new file mode 100644
index 0000000..208ad3e
--- /dev/null
+++ b/src/upskill/fast_agent_cli.py
@@ -0,0 +1,44 @@
+"""Helpers for building fast-agent CLI invocations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from upskill.executors.contracts import ExecutionRequest
+
+
+def build_fast_agent_command(
+    request: ExecutionRequest,
+    *,
+    config_path: Path | None,
+    cards_dir: Path,
+    skills_dir: Path,
+    prompt_path: Path,
+    results_path: Path,
+    fast_agent_bin: str = "fast-agent",
+) -> list[str]:
+    """Assemble the argv list for a ``fast-agent go`` run of ``request``.
+
+    Args:
+        request: Supplies the ``--agent`` and ``--model`` values.
+        config_path: Passed as ``--config-path``; the flag is omitted when ``None``.
+        fast_agent_bin: Executable name placed first in the command.
+
+    Returns:
+        The command tokens, ending with ``--quiet``.
+    """
+    config_args = [] if config_path is None else ["--config-path", str(config_path)]
+    # Flag/value pairs in the order they appear on the command line.
+    option_pairs = [
+        ("--card", str(cards_dir)),
+        ("--agent", request.agent),
+        ("--model", request.model),
+        ("--skills-dir", str(skills_dir)),
+        ("--prompt-file", str(prompt_path)),
+        ("--results", str(results_path)),
+    ]
+    option_args = [token for pair in option_pairs for token in pair]
+    return [fast_agent_bin, "go", *config_args, *option_args, "--quiet"]
diff --git a/src/upskill/fastagent_integration.py b/src/upskill/fastagent_integration.py
index 32f8349..a7357aa 100644
--- a/src/upskill/fastagent_integration.py
+++ b/src/upskill/fastagent_integration.py
@@ -13,4 +13,3 @@ def compose_instruction(instruction: str, skill: Skill | None) -> str:
     if not skill:
         return instruction
     return f"{instruction}\n\n## Skill: {skill.name}\n\n{skill.body}"
-
diff --git a/src/upskill/generate.py b/src/upskill/generate.py
index 1cca065..8abc943 100644
--- a/src/upskill/generate.py
+++ b/src/upskill/generate.py
@@ -1,14 +1,16 @@
-"""Skill generation from task descriptions using FastAgent."""
+"""Skill generation from task descriptions using fast-agent."""
 
 from __future__ import annotations
 
 from datetime import UTC, datetime
-
-from fast_agent.interfaces import AgentProtocol
-from fast_agent.skills.registry import SkillManifest
+from typing import TYPE_CHECKING
 
 from upskill.manifest_utils import parse_skill_manifest_text
-from upskill.models import Skill, SkillMetadata, TestCase, TestCaseSuite
+from upskill.models import Skill, SkillMetadata, SkillRecord, SkillState, TestCase, TestCaseSuite
+
+if TYPE_CHECKING:
+    from fast_agent.interfaces import AgentProtocol
+    from fast_agent.skills.registry import SkillManifest
 
 # Few-shot examples for test generation
 TEST_EXAMPLES = """
@@ -72,31 +74,36 @@
     "## Your Task\n\n"
     f"Task: {TASK_PLACEHOLDER}\n\n"
     "Generate test cases that verify the agent can apply the skill correctly.\n\n"
-
     "Each TestCase MUST include at least a list of expected strings in the expected field.\n"
     "Focus on practical scenarios that test understanding of the core concepts."
 )
 
+
 def _build_skill_from_manifest(
     manifest: SkillManifest,
     *,
     model: str | None,
-    source_task: str,
-    base_skill: Skill | None = None,
-) -> Skill:
-    references = base_skill.references if base_skill else {}
-    scripts = base_skill.scripts if base_skill else {}
-    return Skill(
-        name=manifest.name,
-        description=manifest.description,
-        body=manifest.body,
-        ## treating these as future for now as skill generator doesn't generate additional artifacts
-        references=references,
-        scripts=scripts,
-        metadata=SkillMetadata(
-            generated_by=model,
-            generated_at=datetime.now(UTC),
-            source_task=source_task,
+    source_task: str | None,
+    base_skill: SkillRecord | None = None,
+) -> SkillRecord:
+    references = base_skill.skill.references if base_skill else {}
+    scripts = base_skill.skill.scripts if base_skill else {}
+    return SkillRecord(
+        skill=Skill(
+            name=manifest.name,
+            description=manifest.description,
+            body=manifest.body,
+            ## treating these as future for now as skill generator doesn't generate additional artifacts
+            references=references,
+            scripts=scripts,
+        ),
+        state=SkillState(
+            metadata=SkillMetadata(
+                generated_by=model,
+                generated_at=datetime.now(UTC),
+                source_task=source_task,
+            ),
+            tests=list(base_skill.state.tests) if base_skill else [],
         ),
     )
 
@@ -106,8 +113,8 @@ async def generate_skill(
     generator: AgentProtocol,
     examples: list[str] | None = None,
     model: str | None = None,
-) -> Skill:
-    """Generate a skill from a task description using FastAgent."""
+) -> SkillRecord:
+    """Generate a skill from a task description using fast-agent."""
 
     prompt = f"Create a skill document that teaches an AI agent how to: {task}"
     if examples:
@@ -115,7 +122,6 @@ async def generate_skill(
             f"- {ex}" for ex in examples
         )
 
-
     skill_text = await generator.send(prompt)
     manifest, error = parse_skill_manifest_text(skill_text)
     if manifest is None:
@@ -131,9 +137,8 @@ async def generate_skill(
 async def generate_tests(
     task: str,
     generator: AgentProtocol,
-    model: str | None = None,
 ) -> list[TestCase]:
-    """Generate synthetic test cases from a task description using FastAgent."""
+    """Generate synthetic test cases from a task description using fast-agent."""
 
     prompt = TEST_GENERATION_PROMPT.replace(TASK_PLACEHOLDER, task)
 
@@ -162,18 +167,18 @@ async def generate_tests(
 
 
 async def refine_skill(
-    skill: Skill,
+    skill: SkillRecord,
     failures: list[str],
     generator: AgentProtocol,
     model: str | None = None,
-) -> Skill:
-    """Refine a skill based on evaluation failures using FastAgent."""
+) -> SkillRecord:
+    """Refine a skill based on evaluation failures using fast-agent."""
 
     prompt = f"""Improve this skill based on failures:
 
-Name: {skill.name}
-Description: {skill.description}
-Body: {skill.body[:500]}...
+Name: {skill.skill.name}
+Description: {skill.skill.description}
+Body: {skill.skill.body[:500]}...
 
 Failures:
 {chr(10).join(f"- {f}" for f in failures[:3])}
@@ -190,7 +195,7 @@ async def refine_skill(
     return _build_skill_from_manifest(
         manifest,
         model=model,
-        source_task=skill.metadata.source_task,
+        source_task=skill.state.metadata.source_task,
         base_skill=skill,
     )
 
@@ -225,11 +230,11 @@ async def refine_skill(
 
 
 async def improve_skill(
-    skill: Skill,
+    skill: SkillRecord,
     instructions: str,
     generator: AgentProtocol,
     model: str | None = None,
-) -> Skill:
+) -> SkillRecord:
     """Improve an existing skill based on instructions.
 
     Args:
@@ -245,13 +250,12 @@ async def improve_skill(
     # model = model or config.skill_generation_model
 
     prompt = IMPROVE_PROMPT.format(
-        name=skill.name,
-        description=skill.description,
-        body=skill.body,
+        name=skill.skill.name,
+        description=skill.skill.description,
+        body=skill.skill.body,
         instructions=instructions,
     )
 
-
     skill_text = await generator.send(prompt)
     manifest, error = parse_skill_manifest_text(skill_text)
     if manifest is None:
@@ -260,6 +264,6 @@ async def improve_skill(
     return _build_skill_from_manifest(
         manifest,
         model=model,
-        source_task=f"Improved from {skill.name}: {instructions}",
+        source_task=f"Improved from {skill.skill.name}: {instructions}",
         base_skill=skill,
     )
diff --git a/src/upskill/hf_jobs.py b/src/upskill/hf_jobs.py
new file mode 100644
index 0000000..aba5766
--- /dev/null
+++ b/src/upskill/hf_jobs.py
@@ -0,0 +1,618 @@
+"""Helpers for submitting and collecting Hugging Face Jobs-based eval runs."""
+
+from __future__ import annotations
+
+import json
+import re
+import subprocess
+import threading
+import time
+import uuid
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Mapping
+
+
+@dataclass(frozen=True)
+class JobsConfig:
+    """Configuration for remote Jobs-backed execution."""
+
+    artifact_repo: str  # dataset repo id used for bundle uploads and output downloads
+    wait: bool = True  # not read in this module -- presumably consumed by callers; verify
+    jobs_timeout: str = "2h"  # passed to ``hf jobs run --timeout``
+    jobs_flavor: str = "cpu-basic"  # passed to ``hf jobs run --flavor``
+    jobs_secrets: str = "HF_TOKEN"  # comma-separated names, one ``--secrets`` flag each
+    jobs_namespace: str | None = None  # explicit ``--namespace``; else derived from the repo
+    jobs_image: str = "ghcr.io/astral-sh/uv:python3.13-bookworm"  # image argument of the job
+
+
+@dataclass(frozen=True)
+class SubmittedJob:
+    """A submitted Hugging Face Job plus its artifact identifiers."""
+
+    job_id: str  # normalized job reference (``job_id`` or ``namespace/job_id``)
+    run_id: str  # names this run's ``inputs/<run_id>`` and ``outputs/<run_id>`` repo paths
+    artifact_repo: str  # dataset repo the run's inputs and outputs live in
+
+
+_JOB_URL_RE = re.compile(r"https://huggingface\.co/jobs/(?P<namespace>[^/]+)/(?P<job_id>[^/\s]+)")
+_HF_UPLOAD_CONFLICT_MARKERS = (  # output substrings that mark a retryable upload conflict
+    "412 Precondition Failed",
+    "A commit has happened since. Please refresh and try again.",
+)
+_HF_AUTH_RATE_LIMIT_MARKERS = (  # output substrings that mark auth rate limiting
+    "rate limit for the /whoami-v2 endpoint",
+    "whoami-v2",  # substring of the marker above, so this entry alone decides the match
+)
+_HF_SUBMISSION_LOCK = threading.RLock()  # held around repo verification, upload and submit
+_VERIFIED_ARTIFACT_REPOS: set[str] = set()  # repos already verified in this process
+_HF_HUB_CLI_SPEC = "huggingface_hub==1.7.2"  # installed by the remote job script
+_FAST_AGENT_SPEC = "fast-agent-mcp==0.6.8"  # installed by the remote job script
+_MAX_HF_JOB_LABEL_VALUE_LENGTH = 63  # sanitized label values are truncated to this length
+_HF_RETRY_ATTEMPTS = 5  # total tries per CLI call, the first one included
+_HF_INITIAL_RETRY_DELAY_SECONDS = 2.0  # doubled after every retried failure
+
+
+def _normalize_job_id(value: str) -> str:
+    """Reduce a raw job reference to ``job_id`` or ``namespace/job_id`` form."""
+    text = value.strip()
+    url_match = _JOB_URL_RE.search(text)
+    if url_match is not None:
+        return "/".join(url_match.group("namespace", "job_id"))
+    prefix = "View at:"
+    # Drop a leading "View at:" prefix and normalize whatever follows it.
+    return _normalize_job_id(text.removeprefix(prefix)) if text.startswith(prefix) else text
+
+
+def _split_job_reference(value: str) -> tuple[str | None, str]:
+    """Split a job reference into ``(namespace, job_id)``; namespace may be ``None``."""
+    normalized = _normalize_job_id(value)
+    namespace, slash, job_id = normalized.rpartition("/")
+    # Without a slash the whole reference is the bare job id.
+    return (namespace, job_id) if slash else (None, normalized)
+
+
+def _namespace_from_repo_id(repo_id: str) -> str | None:
+    """Return the owner part of ``owner/name``, or ``None`` if it is absent or blank."""
+    owner, slash, _name = repo_id.partition("/")
+    if not slash:
+        return None
+    return owner.strip() or None
+
+
+def _resolve_jobs_namespace(
+    *,
+    job_id: str | None = None,
+    artifact_repo: str | None = None,
+    configured_namespace: str | None = None,
+) -> str | None:
+    """Pick a namespace: configured value, then job reference, then repo owner."""
+    if configured_namespace:
+        return configured_namespace
+    job_namespace = None if job_id is None else _split_job_reference(job_id)[0]
+    if job_namespace is not None:
+        return job_namespace
+    if artifact_repo is None:
+        return None
+    return _namespace_from_repo_id(artifact_repo)
+
+
+def _lookup_job_stage(job_id: str, *, namespace: str | None = None) -> str | None:
+    """Best-effort lookup of an HF job stage from ``hf jobs ps --format json``.
+
+    Returns ``None`` when the CLI fails, its output is not a JSON list, or no
+    matching entry carries a string stage.
+    """
+    bare_namespace, bare_job_id = _split_job_reference(job_id)
+    resolved_namespace = _resolve_jobs_namespace(
+        job_id=job_id,
+        configured_namespace=namespace or bare_namespace,
+    )
+
+    namespace_args = [] if resolved_namespace is None else ["--namespace", resolved_namespace]
+    completed = _run_hf_command(["hf", "jobs", "ps", "-a", "--format", "json", *namespace_args])
+    if completed.returncode != 0:
+        return None
+    try:
+        payload = json.loads(completed.stdout)
+    except json.JSONDecodeError:
+        return None
+
+    entries = payload if isinstance(payload, list) else []
+    for entry in entries:
+        if not isinstance(entry, dict) or str(entry.get("id", "")) != bare_job_id:
+            continue
+        owner = entry.get("owner")
+        owner_name = owner.get("name") if isinstance(owner, dict) else None
+        # Skip entries whose owner differs from the namespace being queried.
+        if resolved_namespace is not None and owner_name != resolved_namespace:
+            continue
+        status = entry.get("status")
+        stage = status.get("stage") if isinstance(status, dict) else None
+        if isinstance(stage, str):
+            return stage
+    return None
+
+
+def _repo_root() -> Path:
+    return Path(__file__).resolve().parents[2]  # parent of the ``src`` dir holding this module
+
+
+def _hf_command_output(completed: subprocess.CompletedProcess[str]) -> str:
+    """Join stdout and stderr with a newline so a marker in either stream can match."""
+    return "{}\n{}".format(completed.stdout, completed.stderr)
+
+
+def _has_retryable_hf_failure(
+    completed: subprocess.CompletedProcess[str],
+    *,
+    markers: tuple[str, ...],
+) -> bool:
+    """Report whether a failed HF CLI call's output contains one of ``markers``."""
+    failed = completed.returncode != 0
+    # Successful calls never count as retryable, whatever they printed.
+    haystack = _hf_command_output(completed) if failed else ""
+    return failed and any(needle in haystack for needle in markers)
+
+
+def _is_retryable_hf_upload_failure(completed: subprocess.CompletedProcess[str]) -> bool:
+    """Return whether a failed HF CLI call's output matches an upload-conflict marker."""
+    return _has_retryable_hf_failure(completed, markers=_HF_UPLOAD_CONFLICT_MARKERS)
+
+
+def _is_retryable_hf_auth_failure(completed: subprocess.CompletedProcess[str]) -> bool:
+    """Return whether a failed HF CLI call's output matches an auth rate-limit marker."""
+    return _has_retryable_hf_failure(completed, markers=_HF_AUTH_RATE_LIMIT_MARKERS)
+
+
+def _is_retryable_hf_failure(
+    completed: subprocess.CompletedProcess[str],
+    *,
+    retry_auth_rate_limit: bool,
+    retry_upload_conflicts: bool,
+) -> bool:
+    """Apply the enabled retry policies to a finished HF CLI call."""
+    if retry_auth_rate_limit and _is_retryable_hf_auth_failure(completed):
+        return True
+    return retry_upload_conflicts and _is_retryable_hf_upload_failure(completed)
+
+
+def _retry_exhausted_hf_failure_message(completed: subprocess.CompletedProcess[str]) -> str:
+    """Return an extra error-message line for a failure that matches a retryable marker."""
+    prefix = "The Hugging Face CLI continued hitting"
+    # Auth rate limiting is checked first, then upload conflicts.
+    if _is_retryable_hf_auth_failure(completed):
+        return f"{prefix} the /whoami-v2 auth rate limit after retrying.\n"
+    if _is_retryable_hf_upload_failure(completed):
+        return f"{prefix} a retryable upload conflict.\n"
+    # Not a known retryable failure: contribute nothing to the message.
+    return ""
+
+
+def _run_hf_command_with_retry(
+    command: list[str],
+    *,
+    retryable: Callable[[subprocess.CompletedProcess[str]], bool],
+    attempts: int = _HF_RETRY_ATTEMPTS,
+    initial_delay_seconds: float = _HF_INITIAL_RETRY_DELAY_SECONDS,
+) -> subprocess.CompletedProcess[str]:
+    """Run an HF CLI command, retrying known transient failures with backoff.
+
+    The wait doubles after every retried failure. The last result is returned
+    whether or not it succeeded; callers inspect ``returncode`` themselves.
+    """
+    if attempts < 1:
+        raise RuntimeError("HF CLI retry loop completed without executing a command.")
+    delay_seconds = initial_delay_seconds
+    attempt = 1
+    while True:
+        completed = subprocess.run(
+            command,
+            cwd=_repo_root(),
+            check=False,
+            capture_output=True,
+            text=True,
+        )
+        out_of_attempts = attempt >= attempts
+        if completed.returncode == 0 or out_of_attempts or not retryable(completed):
+            return completed
+        time.sleep(delay_seconds)
+        delay_seconds *= 2
+        attempt += 1
+
+
+def _run_hf_command(
+    command: list[str],
+    *,
+    retry_auth_rate_limit: bool = True,
+    retry_upload_conflicts: bool = False,
+    attempts: int = _HF_RETRY_ATTEMPTS,
+    initial_delay_seconds: float = _HF_INITIAL_RETRY_DELAY_SECONDS,
+) -> subprocess.CompletedProcess[str]:
+    """Run an HF CLI command, retrying only the failure kinds enabled by the flags."""
+
+    def should_retry(completed: subprocess.CompletedProcess[str]) -> bool:
+        return _is_retryable_hf_failure(
+            completed,
+            retry_auth_rate_limit=retry_auth_rate_limit,
+            retry_upload_conflicts=retry_upload_conflicts,
+        )
+
+    policy = {"attempts": attempts, "initial_delay_seconds": initial_delay_seconds}
+    return _run_hf_command_with_retry(command, retryable=should_retry, **policy)
+
+
+def _verify_artifact_repo_access(artifact_repo: str) -> None:
+    """Check once per process that the artifact dataset repo can be reached.
+
+    Raises ``RuntimeError`` when the dry-run download fails. Successful checks are
+    remembered, so later calls for the same repo return without running the CLI.
+    """
+    probe = ["hf", "download", artifact_repo, "--repo-type", "dataset", "--dry-run", "--quiet"]
+
+    with _HF_SUBMISSION_LOCK:
+        if artifact_repo in _VERIFIED_ARTIFACT_REPOS:
+            return
+
+        completed = _run_hf_command(probe)
+        if completed.returncode == 0:
+            _VERIFIED_ARTIFACT_REPOS.add(artifact_repo)
+            return
+
+        details = (
+            f"repo: {artifact_repo}\n"
+            f"{_retry_exhausted_hf_failure_message(completed)}"
+            f"stdout:\n{completed.stdout}\n"
+            f"stderr:\n{completed.stderr}"
+        )
+        raise RuntimeError(
+            "Artifact repo is not accessible. Create it before submitting jobs and "
+            "ensure the current Hugging Face credentials can access it:\n"
+            f"{details}"
+        )
+
+
+def verify_artifact_repo_access(artifact_repo: str) -> None:
+    """Check artifact repo access; public wrapper for ``_verify_artifact_repo_access``."""
+    _verify_artifact_repo_access(artifact_repo)
+
+
+def parse_duration_seconds(value: str) -> float:
+    """Parse an HF-style duration such as ``45m`` or ``2h`` into seconds.
+
+    A value without an ``s``/``m``/``h``/``d`` suffix is read as seconds; whitespace is ignored.
+
+    Raises:
+        ValueError: If the value is empty, not numeric, negative, or NaN.
+    """
+    text = (value or "").strip()
+    if not text:
+        raise ValueError("Duration value must not be empty.")
+    multiplier = {"s": 1.0, "m": 60.0, "h": 3600.0, "d": 86400.0}.get(text[-1])
+    number = text if multiplier is None else text[:-1]
+    try:
+        seconds = float(number) * (multiplier or 1.0)
+    except ValueError as exc:
+        raise ValueError(f"Invalid duration value: {value}") from exc
+    # ``not >= 0`` also rejects NaN, which would silently break deadline comparisons.
+    if not seconds >= 0:
+        raise ValueError(f"Invalid duration value: {value}")
+    return seconds
+
+
+def _sanitize_label(value: str) -> str:
+    """Lowercase ``value``, hyphenate runs outside ``[a-z0-9]``; ``"eval"`` if nothing is left."""
+    return re.sub(r"[^a-z0-9]+", "-", value.strip().lower()).strip("-") or "eval"
+
+
+def _sanitize_hf_job_label_value(value: str, *, default: str) -> str:
+    """Hyphenate and lowercase ``value``, cap its length, and fall back to ``default``."""
+    sanitized = re.sub(r"[^a-z0-9]+", "-", value.strip().lower()).strip("-")
+    return sanitized[:_MAX_HF_JOB_LABEL_VALUE_LENGTH].strip("-") or default
+
+
+def _make_run_id(*parts: str) -> str:
+    """Build a run id shaped ``<UTC timestamp>[_<sanitized parts>]_<8 hex chars>``."""
+    stamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
+    slug = "-".join(map(_sanitize_label, filter(None, parts)))
+    pieces = [stamp, slug, uuid.uuid4().hex[:8]]
+    return "_".join(piece for piece in pieces if piece)
+
+
+def wait_for_job_outputs(
+    job: SubmittedJob,
+    *,
+    destination_root: Path,
+    wait_timeout_seconds: float,
+    poll_interval_seconds: float = 15.0,
+    progress_callback: Callable[[str], None] | None = None,
+) -> Path:
+    """Poll for the job's exit marker, then download and return its outputs directory."""
+    deadline = time.monotonic() + wait_timeout_seconds
+    marker_path = f"outputs/{job.run_id}/exit_code.txt"  # uploaded by the remote job script
+    poll_count = 0
+
+    if progress_callback is not None:
+        progress_callback(f"waiting for job {job.job_id} (run_id={job.run_id})")
+
+    while time.monotonic() < deadline:
+        poll_count += 1
+        stage = _lookup_job_stage(  # best effort: ``None`` when the stage cannot be read
+            job.job_id,
+            namespace=_resolve_jobs_namespace(
+                job_id=job.job_id,
+                artifact_repo=job.artifact_repo,
+            ),
+        )
+        marker_download = _run_hf_command(
+            [
+                "hf",
+                "download",
+                job.artifact_repo,
+                marker_path,
+                "--repo-type",
+                "dataset",
+                "--local-dir",
+                str(destination_root),
+                "--quiet",
+            ],
+        )
+        if marker_download.returncode == 0:  # marker present: fetch the whole output tree
+            if progress_callback is not None:
+                progress_callback(f"job {job.job_id} completed; downloading artifacts")
+            full_download = _run_hf_command(
+                [
+                    "hf",
+                    "download",
+                    job.artifact_repo,
+                    "--repo-type",
+                    "dataset",
+                    "--include",
+                    f"outputs/{job.run_id}/**",
+                    "--local-dir",
+                    str(destination_root),
+                ],
+            )
+            if full_download.returncode != 0:
+                raise RuntimeError(
+                    "HF job finished but artifacts could not be downloaded:\n"
+                    f"{_retry_exhausted_hf_failure_message(full_download)}"
+                    f"stdout:\n{full_download.stdout}\n"
+                    f"stderr:\n{full_download.stderr}"
+                )
+            if progress_callback is not None:
+                progress_callback(f"downloaded artifacts for job {job.job_id}")
+            return destination_root / "outputs" / job.run_id
+        if _is_retryable_hf_auth_failure(marker_download):  # retries in _run_hf_command ran out
+            raise RuntimeError(
+                "Failed to check remote fast-agent job outputs after repeated Hugging Face "
+                "auth retries:\n"
+                f"stdout:\n{marker_download.stdout}\n"
+                f"stderr:\n{marker_download.stderr}"
+            )
+        if stage in {"ERROR", "CANCELED", "DELETED"}:  # treated as fatal: stop waiting
+            raise RuntimeError(
+                f"HF job {job.job_id} ended with stage {stage}. "
+                f"Inspect logs with `hf jobs logs {job.job_id}`."
+            )
+        if progress_callback is not None:
+            stage_suffix = f" ({stage.lower()})" if stage else ""  # empty when stage is unknown
+            progress_callback(f"poll {poll_count}: job {job.job_id} still running{stage_suffix}")
+        time.sleep(poll_interval_seconds)
+
+    raise TimeoutError(
+        f"Timed out waiting for HF job artifacts for job {job.job_id} (run_id={job.run_id})."
+    )
+
+
+def _hf_secret_flags(secrets: str) -> list[str]:
+    """Expand comma-separated secret names into repeated ``--secrets NAME`` flags.
+
+    Names are stripped of surrounding whitespace and blank entries are dropped.
+    """
+    names = [name for name in map(str.strip, secrets.split(",")) if name]
+    return [token for name in names for token in ("--secrets", name)]
+
+
+def _hf_label_flags(labels: Mapping[str, str] | None) -> list[str]:
+    """Render labels as ``--label key=value`` flags, ordered by sorted items.
+
+    ``None`` or an empty mapping yields an empty list.
+    """
+    pairs = sorted(labels.items()) if labels else []
+    return [token for key, value in pairs for token in ("--label", f"{key}={value}")]
+
+
+def _upload_bundle_input(
+    *,
+    bundle_archive: Path,
+    artifact_repo: str,
+    run_id: str,
+) -> subprocess.CompletedProcess[str]:
+    """Upload a request bundle to ``inputs/<run_id>/bundle.tar.gz`` in the dataset.
+
+    Runs under the submission lock with upload-conflict retries enabled.
+    """
+    upload_command = [
+        "hf",
+        "upload",
+        artifact_repo,
+        str(bundle_archive),
+        f"inputs/{run_id}/bundle.tar.gz",
+        *("--repo-type", "dataset"),
+        *("--commit-message", f"inputs: {run_id}"),
+    ]
+    with _HF_SUBMISSION_LOCK:
+        completed = _run_hf_command(upload_command, retry_upload_conflicts=True)
+    return completed
+
+
+def _render_bundle_job_script() -> str:
+    """Render the shell script executed inside the remote HF job container."""
+    return "\n".join(
+        [
+            "set -euo pipefail",
+            "run_hf_with_retries() {",  # shell retry loop using the same attempts and markers
+            "  local delay=2",
+            "  local attempt",
+            f"  for attempt in $(seq 1 {_HF_RETRY_ATTEMPTS}); do",
+            '    local log_file="$(mktemp)"',
+            '    if "$@" >"$log_file" 2>&1; then',
+            '      cat "$log_file"',
+            '      rm -f "$log_file"',
+            "      return 0",
+            "    fi",
+            f'    if [[ "$attempt" -lt {_HF_RETRY_ATTEMPTS} ]] && (',
+            '      grep -q "rate limit for the /whoami-v2 endpoint" "$log_file" ||',
+            '      grep -q "whoami-v2" "$log_file" ||',
+            '      grep -q "412 Precondition Failed" "$log_file" ||',
+            '      grep -q "A commit has happened since" "$log_file"',
+            "    ); then",
+            '      cat "$log_file" >&2',
+            '      rm -f "$log_file"',
+            '      sleep "$delay"',
+            "      delay=$((delay * 2))",
+            "      continue",
+            "    fi",
+            '    cat "$log_file" >&2',
+            '    rm -f "$log_file"',
+            "    return 1",
+            "  done",
+            "  return 1",
+            "}",
+            "download_with_retries() {",
+            '  local repo="$1"',
+            '  local path="$2"',
+            '  local local_dir="$3"',
+            '  run_hf_with_retries hf download "$repo" "$path" --repo-type dataset --local-dir "$local_dir"',
+            "}",
+            "upload_with_retries() {",
+            '  local repo="$1"',
+            '  local src="$2"',
+            '  local dest="$3"',
+            '  local message="$4"',
+            '  run_hf_with_retries hf upload "$repo" "$src" "$dest" --repo-type dataset --commit-message "$message"',
+            "}",
+            "WORK=/workspace",
+            'mkdir -p "$WORK/out"',
+            'cd "$WORK"',
+            f'uv pip install --system "{_HF_HUB_CLI_SPEC}" "{_FAST_AGENT_SPEC}"',
+            'download_with_retries "$ARTIFACT_REPO" "inputs/$RUN_ID/bundle.tar.gz" "$WORK"',
+            'tar -xzf "$WORK/inputs/$RUN_ID/bundle.tar.gz" -C "$WORK"',
+            "set +e",  # let the entrypoint fail without aborting, so its status is recorded
+            'bash "$WORK/bundle/job_entrypoint.sh" "$WORK/bundle" "$WORK/out"',
+            "status=$?",
+            "set -e",
+            'echo "$status" > "$WORK/out/exit_code.txt"',  # marker polled by wait_for_job_outputs
+            'upload_with_retries "$ARTIFACT_REPO" "$WORK/out" "outputs/$RUN_ID" '
+            '"outputs: $RUN_ID (exit=$status)"',
+            'exit "$status"',
+            "",  # makes the joined script end with a newline
+        ]
+    )
+
+
+def _build_hf_jobs_run_command(
+    *,
+    jobs_config: JobsConfig,
+    run_id: str,
+    model: str,
+    labels: Mapping[str, str] | None,
+    job_script: str,
+) -> list[str]:
+    """Build the ``hf jobs run`` argv for a prepared bundle submission.
+
+    Flags come first; the image and ``bash -lc <job_script>`` follow ``--``.
+    """
+    namespace = _resolve_jobs_namespace(
+        artifact_repo=jobs_config.artifact_repo,
+        configured_namespace=jobs_config.jobs_namespace,
+    )
+    env_values = {
+        "ARTIFACT_REPO": jobs_config.artifact_repo,
+        "RUN_ID": run_id,
+        "FAST_MODEL": model,
+    }
+    env_flags: list[str] = []
+    for name, value in env_values.items():
+        env_flags.extend(["--env", f"{name}={value}"])
+    namespace_flags = [] if namespace is None else ["--namespace", namespace]
+    return [
+        "hf",
+        "jobs",
+        "run",
+        "--detach",
+        *("--flavor", jobs_config.jobs_flavor),
+        *("--timeout", jobs_config.jobs_timeout),
+        *_hf_secret_flags(jobs_config.jobs_secrets),
+        *_hf_label_flags(labels),
+        *env_flags,
+        *namespace_flags,
+        # Everything after "--" is the image followed by the command run in it.
+        "--",
+        jobs_config.jobs_image,
+        "bash",
+        "-lc",
+        job_script,
+    ]
+
+
+def _submit_prepared_bundle_job(
+    *,
+    jobs_config: JobsConfig,
+    run_id: str,
+    model: str,
+    labels: Mapping[str, str] | None = None,
+) -> SubmittedJob:
+    """Submit a job for an already-uploaded bundle; ``RuntimeError`` if the CLI reports none."""
+    command = _build_hf_jobs_run_command(
+        jobs_config=jobs_config,
+        run_id=run_id,
+        model=model,
+        labels=labels,
+        job_script=_render_bundle_job_script(),
+    )
+    with _HF_SUBMISSION_LOCK:
+        completed = _run_hf_command(command)
+    output_lines = completed.stdout.strip().splitlines()
+    # The job reference is the last stdout line; without one, fail clearly, not via IndexError.
+    if completed.returncode != 0 or not output_lines:
+        raise RuntimeError(
+            "Failed to submit remote fast-agent job:\n"
+            f"{_retry_exhausted_hf_failure_message(completed)}"
+            f"stdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"
+        )
+    job_ref = _normalize_job_id(output_lines[-1])
+    return SubmittedJob(job_id=job_ref, run_id=run_id, artifact_repo=jobs_config.artifact_repo)
+
+
+def _submit_bundle_job(
+    *,
+    bundle_archive: Path,
+    jobs_config: JobsConfig,
+    run_id: str,
+    model: str,
+    labels: Mapping[str, str] | None = None,
+) -> SubmittedJob:
+    """Upload the bundle archive, then submit the job that runs it.
+
+    Raises ``RuntimeError`` if the upload fails; no job is submitted in that case.
+    """
+    repo = jobs_config.artifact_repo
+    upload = _upload_bundle_input(bundle_archive=bundle_archive, artifact_repo=repo, run_id=run_id)
+    # Only submit once the bundle is in the dataset.
+    if upload.returncode == 0:
+        return _submit_prepared_bundle_job(
+            jobs_config=jobs_config,
+            run_id=run_id,
+            model=model,
+            labels=labels,
+        )
+    # Upload failed: surface both output streams to the caller.
+    hint = _retry_exhausted_hf_failure_message(upload)
+    streams = f"stdout:\n{upload.stdout}\nstderr:\n{upload.stderr}"
+    raise RuntimeError(f"Failed to upload remote fast-agent bundle:\n{hint}{streams}")
diff --git a/src/upskill/logging.py b/src/upskill/logging.py
index 59c723e..705c330 100644
--- a/src/upskill/logging.py
+++ b/src/upskill/logging.py
@@ -5,14 +5,18 @@
 import csv
 import json
 from datetime import datetime
-from pathlib import Path
+from typing import TYPE_CHECKING
 
-from fast_agent import ConversationSummary
 from fast_agent.constants import FAST_AGENT_TIMING, FAST_AGENT_USAGE
 from fast_agent.mcp.helpers.content_helpers import get_text
 
 from upskill.models import BatchSummary, ConversationStats, RunMetadata, RunResult, TestResult
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from fast_agent import ConversationSummary
+
 # CSV field names for run summaries (matching skills-test format)
 FIELDNAMES = [
     "batch_id",
@@ -174,9 +178,6 @@ def extract_tokens_from_messages(
     return input_tokens, output_tokens, total_tokens, usage_summaries
 
 
-
-
-
 def extract_timing_from_messages(messages: list) -> list[dict[str, object]]:
     """Extract timing payloads from message channels."""
     timings: list[dict[str, object]] = []
@@ -250,8 +251,6 @@ def extract_stats_from_summary(summary: ConversationSummary) -> ConversationStat
     )
 
 
-
-
 def aggregate_conversation_stats(results: list[TestResult]) -> ConversationStats:
     """Aggregate ConversationStats across multiple test results."""
     aggregate = ConversationStats()
diff --git a/src/upskill/manifest_utils.py b/src/upskill/manifest_utils.py
index c84c6f2..9d8aede 100644
--- a/src/upskill/manifest_utils.py
+++ b/src/upskill/manifest_utils.py
@@ -2,9 +2,14 @@
 
 from __future__ import annotations
 
-from pathlib import Path
+from typing import TYPE_CHECKING
 
-from fast_agent.skills.registry import SkillManifest, SkillRegistry
+from fast_agent.skills.registry import SkillRegistry
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from fast_agent.skills.registry import SkillManifest
 
 
 def parse_skill_manifest_text(
diff --git a/src/upskill/model_resolution.py b/src/upskill/model_resolution.py
index e7ae7f3..6501d03 100644
--- a/src/upskill/model_resolution.py
+++ b/src/upskill/model_resolution.py
@@ -6,9 +6,10 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 
-from upskill.config import Config
+if TYPE_CHECKING:
+    from upskill.config import Config
 
 CommandName = Literal["generate", "eval", "benchmark"]
 
@@ -25,6 +26,26 @@ class ResolvedModels:
     run_baseline: bool = True
 
 
+def build_fastagent_model_references(
+    *,
+    config: Config,
+    resolved: ResolvedModels,
+) -> dict[str, dict[str, str]]:
+    """Build fast-agent model references for the standard upskill card slots.
+
+    Resolved models win over configured ones; test generation falls back to the skill model.
+    """
+    skill_model = resolved.skill_generation_model or config.skill_generation_model
+    test_model = resolved.test_generation_model or config.test_gen_model or skill_model
+    # The "default" slot reuses the skill generation model.
+    slots = {
+        "default": skill_model,
+        "skill_gen": skill_model,
+        "test_gen": test_model,
+    }
+    return {"system": slots}
+
+
 def resolve_models(
     command: CommandName,
     *,
diff --git a/src/upskill/models.py b/src/upskill/models.py
index cf0a6ea..c4de86f 100644
--- a/src/upskill/models.py
+++ b/src/upskill/models.py
@@ -5,10 +5,13 @@
 import json
 import re
 from datetime import datetime
-from pathlib import Path
+from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
 
 class SkillMetadata(BaseModel):
     """Metadata about how a skill was generated (stored in skill_meta.json)."""
@@ -59,6 +62,7 @@ class TestCaseContext(BaseModel):
 class TestCase(BaseModel):
     """A test case for skill evaluation."""
 
+    __test__ = False
     model_config = ConfigDict(extra="forbid")
 
     input: str  # Task/prompt to give the agent
@@ -71,8 +75,6 @@ class TestCase(BaseModel):
     validator_config: dict[str, str | int | float | bool] | None = None
 
 
-
-
 class TestCaseSuite(BaseModel):
     """Structured container for a list of test cases."""
 
@@ -91,6 +93,112 @@ class SkillDraft(BaseModel):
     scripts: dict[str, str] | None = None
 
 
+def _parse_skill_frontmatter(
+    content: str,
+    *,
+    default_name: str,
+) -> tuple[str, str, list[str] | None, str | None, bool, bool, str]:
+    """Parse SKILL.md frontmatter and return normalized fields.
+
+    The frontmatter is read as flat ``key: value`` lines between two ``---`` markers.
+    It is not parsed as YAML: values are kept as literal text (quotes included),
+    unknown keys and lines without a colon are ignored, and a ``---`` inside a value
+    is taken as the closing marker.
+
+    Args:
+        content: Full text of a SKILL.md file.
+        default_name: Name to use when the frontmatter has no ``name`` key.
+
+    Returns:
+        ``(name, description, allowed_tools, argument_hint, user_invocable,
+        disable_model_invocation, body)``. Without a leading ``---`` or a closing
+        marker, defaults are returned and ``body`` is ``content`` unchanged; otherwise
+        ``body`` is the stripped text after the closing marker. ``allowed_tools`` is
+        ``None`` when the key is absent and never contains empty entries.
+    """
+    # Defaults, used when the frontmatter is missing or does not set a key.
+    name = default_name
+    description = ""
+    allowed_tools: list[str] | None = None
+    argument_hint: str | None = None
+    user_invocable = True
+    disable_model_invocation = False
+    body = content
+
+    # A complete frontmatter splits into ["", frontmatter, body].
+    parts = content.split("---", 2) if content.startswith("---") else []
+    if len(parts) == 3:
+        body = parts[2].strip()
+        for line in parts[1].strip().splitlines():
+            # Only the first colon separates key from value.
+            key, separator, value = line.partition(":")
+            if not separator:
+                continue
+            key = key.strip()
+            value = value.strip()
+
+            if key == "name":
+                name = value
+            elif key == "description":
+                description = value
+            elif key == "allowed-tools":
+                # Drop empty entries left by a blank value or a stray comma.
+                allowed_tools = [tool for tool in map(str.strip, value.split(",")) if tool]
+            elif key == "argument-hint":
+                argument_hint = value
+            elif key == "user-invocable":
+                # Invocable unless explicitly set to "false" (any case).
+                user_invocable = value.lower() != "false"
+            elif key == "disable-model-invocation":
+                # Disabled only when explicitly set to "true" (any case).
+                disable_model_invocation = value.lower() == "true"
+
+    return (
+        name,
+        description,
+        allowed_tools,
+        argument_hint,
+        user_invocable,
+        disable_model_invocation,
+        body,
+    )
+
+
+class SkillState(BaseModel):
+    """Upskill-managed state stored separately from ``SKILL.md``."""
+
+    metadata: SkillMetadata = Field(default_factory=SkillMetadata)  # "metadata" in skill_meta.json
+    tests: list[TestCase] = Field(default_factory=list)  # "tests" in skill_meta.json
+
+
+def _load_skill_state(path: Path) -> SkillState:
+    """Read ``skill_meta.json`` under ``path``; return default state when the file is absent."""
+    meta_path = path / "skill_meta.json"
+    # A missing file behaves like an empty document: every field keeps its default.
+    payload = json.loads(meta_path.read_text()) if meta_path.exists() else {}
+
+    state = SkillState()
+    if "metadata" in payload:
+        state.metadata = SkillMetadata.model_validate(payload["metadata"])
+    if "tests" in payload:
+        # Validate each stored test case individually.
+        state.tests = [TestCase.model_validate(entry) for entry in payload["tests"]]
+    return state
+
+
+def _load_artifact_directory(path: Path, directory_name: str) -> dict[str, str]:
+    """Map file names to text for files in ``path/directory_name``; empty if not a directory."""
+    directory = path / directory_name
+    if not directory.is_dir():  # missing, or a plain file with that name: nothing to load
+        return {}
+
+    return {
+        entry.name: entry.read_text()
+        for entry in directory.iterdir()
+        if entry.is_file()  # subdirectories are skipped, not recursed into
+    }
+
+
 class Skill(BaseModel):
     """A generated agent skill following the Claude Code SKILL.md spec."""
 
@@ -102,17 +210,11 @@ class Skill(BaseModel):
     user_invocable: bool = True
     disable_model_invocation: bool = False
 
-    # upskill metadata (persisted to skill_meta.json)
-    metadata: SkillMetadata = Field(default_factory=SkillMetadata)
-
     # Content
     body: str  # Main instructions markdown
     references: dict[str, str] = Field(default_factory=dict)  # filename -> content
     scripts: dict[str, str] = Field(default_factory=dict)  # filename -> code
 
-    # Test cases (persisted to skill_meta.json)
-    tests: list[TestCase] = Field(default_factory=list)
-
     @field_validator("name")
     @classmethod
     def validate_name(cls, v: str) -> str:
@@ -145,28 +247,13 @@ def render(self) -> str:
 
         return "\n".join(frontmatter_lines) + "\n\n" + self.body
 
-    def save(self, path: Path, tests: list[TestCase] | None = None) -> None:
-        """Write skill directory with all files.
-
-        Args:
-            path: Directory to save skill to
-            tests: Optional test cases to persist (overrides self.tests if provided)
-        """
+    def save(self, path: Path) -> None:
+        """Write the skill document and artifact files."""
         path.mkdir(parents=True, exist_ok=True)
 
         # Write SKILL.md (Claude Code compatible)
         (path / "SKILL.md").write_text(self.render())
 
-        # Write skill_meta.json (upskill-specific metadata + tests)
-        tests_to_save = tests if tests is not None else self.tests
-        meta_dict = {
-            "metadata": self.metadata.model_dump(mode="json"),
-            "tests": [t.model_dump(mode="json") for t in tests_to_save],
-        }
-        (path / "skill_meta.json").write_text(
-            json.dumps(meta_dict, indent=2, default=str)
-        )
-
         # Write references
         if self.references:
             refs_dir = path / "references"
@@ -183,80 +270,23 @@ def save(self, path: Path, tests: list[TestCase] | None = None) -> None:
 
     @classmethod
     def load(cls, path: Path) -> Skill:
-        """Load a skill from a directory.
-
-        Args:
-            path: Directory containing SKILL.md and optionally skill_meta.json
-
-        Returns:
-            Loaded Skill instance
-        """
+        """Load a skill document from a directory."""
         skill_md_path = path / "SKILL.md"
         if not skill_md_path.exists():
             raise FileNotFoundError(f"SKILL.md not found in {path}")
 
         content = skill_md_path.read_text()
-
-        # Parse YAML frontmatter
-        name = path.name  # Default to directory name
-        description = ""
-        allowed_tools: list[str] | None = None
-        argument_hint: str | None = None
-        user_invocable = True
-        disable_model_invocation = False
-        body = content
-
-        if content.startswith("---"):
-            parts = content.split("---", 2)
-            if len(parts) >= 3:
-                frontmatter = parts[1].strip()
-                body = parts[2].strip()
-
-                for line in frontmatter.split("\n"):
-                    if ":" in line:
-                        key, value = line.split(":", 1)
-                        key = key.strip()
-                        value = value.strip()
-
-                        if key == "name":
-                            name = value
-                        elif key == "description":
-                            description = value
-                        elif key == "allowed-tools":
-                            allowed_tools = [t.strip() for t in value.split(",")]
-                        elif key == "argument-hint":
-                            argument_hint = value
-                        elif key == "user-invocable":
-                            user_invocable = value.lower() != "false"
-                        elif key == "disable-model-invocation":
-                            disable_model_invocation = value.lower() == "true"
-
-        # Load metadata and tests from skill_meta.json if present
-        metadata = SkillMetadata()
-        tests: list[TestCase] = []
-        meta_path = path / "skill_meta.json"
-        if meta_path.exists():
-            meta_dict = json.loads(meta_path.read_text())
-            if "metadata" in meta_dict:
-                metadata = SkillMetadata.model_validate(meta_dict["metadata"])
-            if "tests" in meta_dict:
-                tests = [TestCase.model_validate(t) for t in meta_dict["tests"]]
-
-        # Load references
-        references: dict[str, str] = {}
-        refs_dir = path / "references"
-        if refs_dir.exists():
-            for ref_file in refs_dir.iterdir():
-                if ref_file.is_file():
-                    references[ref_file.name] = ref_file.read_text()
-
-        # Load scripts
-        scripts: dict[str, str] = {}
-        scripts_dir = path / "scripts"
-        if scripts_dir.exists():
-            for script_file in scripts_dir.iterdir():
-                if script_file.is_file():
-                    scripts[script_file.name] = script_file.read_text()
+        (
+            name,
+            description,
+            allowed_tools,
+            argument_hint,
+            user_invocable,
+            disable_model_invocation,
+            body,
+        ) = _parse_skill_frontmatter(content, default_name=path.name)
+        references = _load_artifact_directory(path, "references")
+        scripts = _load_artifact_directory(path, "scripts")
 
         return cls(
             name=name,
@@ -265,11 +295,34 @@ def load(cls, path: Path) -> Skill:
             argument_hint=argument_hint,
             user_invocable=user_invocable,
             disable_model_invocation=disable_model_invocation,
-            metadata=metadata,
             body=body,
             references=references,
             scripts=scripts,
-            tests=tests,
+        )
+
+
+class SkillRecord(BaseModel):
+    """A ``Skill`` document paired with the upskill state kept in ``skill_meta.json``."""
+
+    skill: Skill
+    state: SkillState = Field(default_factory=SkillState)
+
+    def save(self, path: Path) -> None:
+        """Write the skill files, then ``skill_meta.json`` holding metadata and tests."""
+        path.mkdir(parents=True, exist_ok=True)
+        self.skill.save(path)
+        meta_json = self.state.metadata.model_dump(mode="json")
+        tests_json = [test.model_dump(mode="json") for test in self.state.tests]
+        # default=str stringifies any value json cannot encode natively.
+        payload = json.dumps({"metadata": meta_json, "tests": tests_json}, indent=2, default=str)
+        (path / "skill_meta.json").write_text(payload)
+
+    @classmethod
+    def load(cls, path: Path) -> SkillRecord:
+        """Build a record from ``Skill.load(path)`` and the state stored under ``path``."""
+        return cls(
+            skill=Skill.load(path),
+            state=_load_skill_state(path),
         )
 
 
@@ -314,6 +367,7 @@ def tokens(self) -> int:
 class TestResult(BaseModel):
     """Result of running a single test case."""
 
+    __test__ = False
     test_case: TestCase
     success: bool
     output: str | None = None
diff --git a/src/upskill/result_parsing.py b/src/upskill/result_parsing.py
new file mode 100644
index 0000000..908e3a0
--- /dev/null
+++ b/src/upskill/result_parsing.py
@@ -0,0 +1,48 @@
+"""Parse fast-agent result artifacts into upskill-friendly data."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from fast_agent import ConversationSummary
+from fast_agent.mcp.prompt_serialization import load_messages
+
+from upskill.logging import extract_stats_from_summary
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+
+    from upskill.models import ConversationStats
+
+
+@dataclass(slots=True, frozen=True)
+class ParsedExecutionResult:
+    """Parsed view of a fast-agent result export."""
+
+    output_text: str | None  # last non-empty assistant text; None when there is none
+    stats: ConversationStats  # derived from a ConversationSummary of the same messages
+
+
+def _extract_output_text(messages: Sequence[object]) -> str | None:
+    """Return the last non-empty ``last_text()`` among assistant messages, else ``None``."""
+    for candidate in reversed(messages):
+        if getattr(candidate, "role", None) != "assistant":
+            continue
+        reader = getattr(candidate, "last_text", None)
+        # A missing or non-callable accessor counts as "no text"; keep scanning backwards.
+        text = reader() if callable(reader) else None
+        if text:
+            return text
+    return None
+
+
+def parse_fast_agent_results(results_path: Path) -> ParsedExecutionResult:
+    """Load the message history at ``results_path`` and return its final text and stats."""
+    history = load_messages(str(results_path))
+    # Text comes from the raw messages; stats come from a summary built over the same list.
+    summary = ConversationSummary(messages=history)
+    final_text = _extract_output_text(history)
+    stats = extract_stats_from_summary(summary)
+    return ParsedExecutionResult(output_text=final_text, stats=stats)
diff --git a/tests/conftest.py b/tests/conftest.py
index fdcbc1f..13941ea 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,5 +5,6 @@
 
 ROOT = Path(__file__).resolve().parents[1]
 SRC = ROOT / "src"
-if str(SRC) not in sys.path:
-    sys.path.insert(0, str(SRC))
+for path in (ROOT, SRC):  # ROOT makes top-level packages (e.g. ``scripts``) importable
+    if str(path) not in sys.path:
+        sys.path.insert(0, str(path))  # prepended; if both are added, SRC precedes ROOT
diff --git a/tests/test_agent_card_guardrails.py b/tests/test_agent_card_guardrails.py
index fe8dfe5..76ce772 100644
--- a/tests/test_agent_card_guardrails.py
+++ b/tests/test_agent_card_guardrails.py
@@ -2,12 +2,7 @@
 
 from pathlib import Path
 
-import pytest
-
 AGENT_CARDS_DIR = Path("src/upskill/agent_cards")
-GUARDED_CARDS = ("skill_gen.md", "test_gen.md")
-# Intentional exceptions require both allowlist entry and frontmatter annotation.
-ALLOWED_MODEL_PIN_OVERRIDES: dict[str, str] = {}
 
 
 def _parse_frontmatter(path: Path) -> dict[str, str]:
@@ -36,29 +31,10 @@ def _parse_frontmatter(path: Path) -> dict[str, str]:
     return data
 
 
-@pytest.mark.parametrize("card_name", GUARDED_CARDS)
-def test_guarded_agent_cards_do_not_pin_model_unless_explicitly_allowed(card_name: str) -> None:
-    card_path = AGENT_CARDS_DIR / card_name
-    assert card_path.exists(), f"Missing guarded agent card: {card_path}"
-
-    frontmatter = _parse_frontmatter(card_path)
-    if "model" not in frontmatter:
-        return
-
-    assert card_name in ALLOWED_MODEL_PIN_OVERRIDES, (
-        f"Unexpected model pin in {card_name}. Remove `model:` from frontmatter or add an "
-        "explicit temporary override in ALLOWED_MODEL_PIN_OVERRIDES with a justification."
+def test_evaluator_card_does_not_pin_skills_dir() -> None:
+    """Evaluation skill loading should come from --skills-dir, not card frontmatter."""
+    frontmatter = _parse_frontmatter(AGENT_CARDS_DIR / "evaluator.md")  # cwd-relative path
+    assert "skills" not in frontmatter, (
+        "evaluator.md should not define `skills:` in frontmatter. "
+        "Evaluation availability must be controlled by the executor's --skills-dir."
     )
-    assert frontmatter.get("allow_model_pin", "").lower() == "true", (
-        f"{card_name} is allowlisted but missing `allow_model_pin: true` annotation in frontmatter."
-    )
-
-
-def test_default_guarded_cards_have_no_model_pin() -> None:
-    """Regression guard: current default cards should not define a model pin."""
-    for card_name in GUARDED_CARDS:
-        frontmatter = _parse_frontmatter(AGENT_CARDS_DIR / card_name)
-        assert "model" not in frontmatter, (
-            f"Unexpected model pin in guarded card {card_name}. "
-            "Model selection should come from runtime resolution."
-        )
diff --git a/tests/test_check_script.py b/tests/test_check_script.py
new file mode 100644
index 0000000..5a8cf4c
--- /dev/null
+++ b/tests/test_check_script.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from scripts.check import build_check_steps
+
+
+def test_build_check_steps_includes_cpd_and_pytest() -> None:
+    """Default steps are format, lint, typecheck, cpd, pytest, in that order."""
+    steps = build_check_steps()
+    assert [step.name for step in steps] == ["format", "lint", "typecheck", "cpd", "pytest"]
+    assert steps[3].command[-1] == "--check"  # cpd step ends with --check
+    assert steps[4].command[1:] == ("-m", "pytest", "-v")  # everything after command[0]
+
+
+def test_build_check_steps_can_skip_pytest() -> None:
+    """``skip_tests=True`` drops the trailing pytest step and keeps the other four."""
+    steps = build_check_steps(skip_tests=True)
+    assert [step.name for step in steps] == ["format", "lint", "typecheck", "cpd"]
diff --git a/tests/test_cli_eval_jobs.py b/tests/test_cli_eval_jobs.py
new file mode 100644
index 0000000..48d8b68
--- /dev/null
+++ b/tests/test_cli_eval_jobs.py
@@ -0,0 +1,560 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import click
+import pytest
+from click.testing import CliRunner
+
+from upskill.cli import _eval_async, _jobs_execution_options, _raise_on_execution_errors
+from upskill.config import Config
+from upskill.evaluate import apply_eval_metrics
+from upskill.hf_jobs import JobsConfig
+from upskill.logging import load_batch_summary, load_run_result
+from upskill.models import (
+    ConversationStats,
+    EvalResults,
+    ExpectedSpec,
+    Skill,
+    SkillRecord,
+    SkillState,
+    TestCase,
+    TestResult,
+)
+
+
+def _make_eval_results(
+    *,
+    skill: Skill,
+    model: str,
+    test_cases: list[TestCase],
+    run_baseline: bool,
+) -> EvalResults:
+    """Build canned results: passing with-skill runs plus optional failing baseline runs.
+
+    With-skill results report 10 total tokens, baseline results 20, one turn each.
+    """
+
+    def _canned(*, success: bool, total_tokens: int) -> list[TestResult]:
+        return [
+            TestResult(
+                test_case=test_case,
+                success=success,
+                stats=ConversationStats(total_tokens=total_tokens, turns=1),
+            )
+            for test_case in test_cases
+        ]
+
+    results = EvalResults(
+        skill_name=skill.name,
+        model=model,
+        with_skill_results=_canned(success=True, total_tokens=10),
+    )
+    if run_baseline:
+        results.baseline_results = _canned(success=False, total_tokens=20)
+    return apply_eval_metrics(results, test_cases)
+
+
+def _write_skill_fixture(skill_dir: Path) -> SkillRecord:
+    """Save a two-test ``pull-request-descriptions`` skill record to ``skill_dir``; return it."""
+    skill = Skill(
+        name="pull-request-descriptions",
+        description="Write good pull request descriptions.",
+        body="Use a clear structure.",
+    )
+    # Both test cases share the same expectation and differ only in the prompt.
+    tests = [
+        TestCase(input=prompt, expected=ExpectedSpec(contains=["answer"]))
+        for prompt in ("prompt one", "prompt two")
+    ]
+    record = SkillRecord(skill=skill, state=SkillState(tests=tests))
+    # Persist the record so tests can point the CLI at ``skill_dir``.
+    record.save(skill_dir)
+    return record
+
+
+def test_jobs_execution_options_waits_by_default() -> None:
+    """``wait`` defaults to True and ``--no-wait`` switches it to False."""
+
+    @click.command()
+    @_jobs_execution_options(
+        executor_help="Execution backend for tests",
+        runs_dir_help="Runs directory for tests",
+    )
+    def command(
+        executor: str | None,
+        artifact_repo: str | None,
+        wait: bool,
+        jobs_timeout: str,
+        jobs_flavor: str,
+        jobs_secrets: str | None,
+        jobs_namespace: str | None,
+        max_parallel: int | None,
+        runs_dir: str | None,
+        log_runs: bool,
+    ) -> None:
+        del (  # unused here; only ``wait`` is echoed
+            executor,
+            artifact_repo,
+            jobs_timeout,
+            jobs_flavor,
+            jobs_secrets,
+            jobs_namespace,
+            max_parallel,
+            runs_dir,
+            log_runs,
+        )
+        click.echo(f"wait={wait}")
+
+    runner = CliRunner()
+    default_result = runner.invoke(command)  # no flags: default behavior
+    assert default_result.exit_code == 0
+    assert "wait=True" in default_result.output
+    no_wait_result = runner.invoke(command, ["--no-wait"])  # explicit opt-out
+    assert no_wait_result.exit_code == 0
+    assert "wait=False" in no_wait_result.output
+
+
+def test_raise_on_execution_errors_surfaces_backend_failures() -> None:
+    """A with-skill result carrying ``error`` is raised as a ClickException naming the test."""
+    test_case = TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"]))
+    results = EvalResults(
+        skill_name="pull-request-descriptions",
+        model="haiku",
+        with_skill_results=[
+            TestResult(
+                test_case=test_case,
+                success=False,
+                error="fast-agent exited with code 1.",
+            )
+        ],
+    )
+    with pytest.raises(click.ClickException, match="execution errors") as exc_info:
+        _raise_on_execution_errors(results, context="Evaluation on haiku")
+
+    assert "with-skill test 1" in str(exc_info.value)
+    assert "fast-agent exited with code 1." in str(exc_info.value)
+
+
+@pytest.mark.asyncio
+async def test_eval_jobs_wait_persists_simple_run_summaries(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """A single-run jobs eval logs a baseline run, a with-skill run and a batch summary."""
+    skill_record = _write_skill_fixture(tmp_path / "skill")
+    skill = skill_record.skill
+    config = Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml")
+    fake_executor = object()
+    max_parallel_calls: list[int] = []
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr("upskill.cli._build_executor", lambda *args, **kwargs: fake_executor)
+
+    async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args
+        assert kwargs["executor"] is fake_executor
+        assert kwargs["operation"] == "eval"
+        max_parallel = kwargs["max_parallel"]
+        assert isinstance(max_parallel, int)
+        max_parallel_calls.append(max_parallel)
+        results = _make_eval_results(
+            skill=skill,
+            model=str(kwargs["model"]),
+            test_cases=skill_record.state.tests,
+            run_baseline=True,
+        )
+        return results
+
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill)
+
+    await _eval_async(
+        skill_path=str(tmp_path / "skill"),
+        tests=None,
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=1,
+        no_baseline=False,
+        verbose=False,
+        executor_name="jobs",
+        artifact_repo="ns/repo",
+        wait=True,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=3,
+        log_runs=True,
+        runs_dir=str(config.runs_dir),
+    )
+
+    batch_folder = next(config.runs_dir.iterdir())  # first entry found under runs_dir
+    summary = load_batch_summary(batch_folder)
+    assert summary is not None
+    assert summary.total_runs == 2
+    assert summary.passed_runs == 1
+
+    baseline_result = load_run_result(batch_folder / "run_1")
+    with_skill_result = load_run_result(batch_folder / "run_2")
+    assert baseline_result is not None
+    assert with_skill_result is not None
+    assert baseline_result.run_type == "baseline"
+    assert with_skill_result.run_type == "with_skill"
+    assert max_parallel_calls == [3]  # one evaluate call, given the CLI max_parallel
+
+
+@pytest.mark.asyncio
+async def test_eval_uses_config_execution_defaults_when_cli_unset(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Execution options left as None on the CLI fall back to the loaded Config values."""
+    skill_record = _write_skill_fixture(tmp_path / "skill")
+    config = Config(
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+        executor="jobs",
+        num_runs=2,
+        max_parallel=4,
+        jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY",
+        jobs_image="ghcr.io/example/custom:latest",
+    )
+    fake_executor = object()
+    build_calls: list[str] = []
+    calls: list[tuple[int, bool]] = []
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+
+    def fake_build_executor(name: str, **kwargs: object) -> object:
+        build_calls.append(name)
+        jobs_config = kwargs["jobs_config"]
+        assert isinstance(jobs_config, JobsConfig)
+        assert jobs_config.jobs_secrets == "HF_TOKEN,ANTHROPIC_API_KEY"
+        assert jobs_config.jobs_image == "ghcr.io/example/custom:latest"
+        return fake_executor
+
+    async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args
+        max_parallel = kwargs["max_parallel"]
+        run_baseline = kwargs["run_baseline"]
+        assert kwargs["executor"] is fake_executor
+        assert isinstance(max_parallel, int)
+        assert isinstance(run_baseline, bool)
+        calls.append((max_parallel, run_baseline))
+        return _make_eval_results(
+            skill=skill_record.skill,
+            model=str(kwargs["model"]),
+            test_cases=skill_record.state.tests,
+            run_baseline=False,
+        )
+
+    monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor)
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill)
+
+    await _eval_async(
+        skill_path=str(tmp_path / "skill"),
+        tests=None,
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=None,
+        no_baseline=False,
+        verbose=False,
+        executor_name=None,
+        artifact_repo="ns/repo",
+        wait=True,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets=None,
+        jobs_namespace=None,
+        max_parallel=None,
+        log_runs=True,
+        runs_dir=str(config.runs_dir),
+    )
+
+    assert build_calls == ["jobs"]  # executor name taken from config
+    assert calls == [(4, False), (4, False)]  # two evaluate calls with config max_parallel=4
+
+
+@pytest.mark.asyncio
+async def test_eval_cli_execution_options_override_config_defaults(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Explicit CLI executor, num_runs and max_parallel take precedence over Config values."""
+    skill_record = _write_skill_fixture(tmp_path / "skill")
+    config = Config(
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+        executor="jobs",
+        num_runs=2,
+        max_parallel=4,
+        jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY",
+    )
+    fake_executor = object()
+    build_calls: list[str] = []
+    calls: list[tuple[int, bool]] = []
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+
+    def fake_build_executor(name: str, **kwargs: object) -> object:
+        del kwargs
+        build_calls.append(name)
+        return fake_executor
+
+    async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args
+        max_parallel = kwargs["max_parallel"]
+        run_baseline = kwargs["run_baseline"]
+        assert kwargs["executor"] is fake_executor
+        assert isinstance(max_parallel, int)
+        assert isinstance(run_baseline, bool)
+        calls.append((max_parallel, run_baseline))
+        return _make_eval_results(
+            skill=skill_record.skill,
+            model=str(kwargs["model"]),
+            test_cases=skill_record.state.tests,
+            run_baseline=True,
+        )
+
+    monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor)
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill)
+
+    await _eval_async(
+        skill_path=str(tmp_path / "skill"),
+        tests=None,
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=1,
+        no_baseline=False,
+        verbose=False,
+        executor_name="local",
+        artifact_repo=None,
+        wait=True,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=1,
+        log_runs=True,
+        runs_dir=str(config.runs_dir),
+    )
+
+    assert build_calls == ["local"]  # CLI executor, not the config's "jobs"
+    assert calls == [(1, True)]  # one evaluate call with the CLI max_parallel
+
+
+@pytest.mark.asyncio
+async def test_eval_cli_jobs_secrets_override_config_defaults(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """A CLI ``jobs_secrets`` value replaces the Config value in the built JobsConfig."""
+    skill_record = _write_skill_fixture(tmp_path / "skill")
+    config = Config(
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+        executor="jobs",
+        jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY",
+    )
+    fake_executor = object()
+    build_calls: list[str] = []
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+
+    def fake_build_executor(name: str, **kwargs: object) -> object:
+        build_calls.append(name)
+        jobs_config = kwargs["jobs_config"]
+        assert isinstance(jobs_config, JobsConfig)
+        assert jobs_config.jobs_secrets == "HF_TOKEN,OPENAI_API_KEY"
+        return fake_executor
+
+    async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args, kwargs
+        return _make_eval_results(
+            skill=skill_record.skill,
+            model="haiku",
+            test_cases=skill_record.state.tests,
+            run_baseline=True,
+        )
+
+    monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor)
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill)
+
+    await _eval_async(
+        skill_path=str(tmp_path / "skill"),
+        tests=None,
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=1,
+        no_baseline=False,
+        verbose=False,
+        executor_name="jobs",
+        artifact_repo="ns/repo",
+        wait=True,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN,OPENAI_API_KEY",
+        jobs_namespace=None,
+        max_parallel=1,
+        log_runs=True,
+        runs_dir=str(config.runs_dir),
+    )
+
+    assert build_calls == ["jobs"]  # the fake builder ran, so its JobsConfig asserts executed
+
+
+@pytest.mark.asyncio
+async def test_eval_jobs_wait_persists_benchmark_run_summaries(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # A waiting jobs eval with num_runs=2 persists a batch summary and per-run results.
+    skill_record = _write_skill_fixture(tmp_path / "skill")
+    skill = skill_record.skill
+    config = Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml")
+    fake_executor = object()
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr("upskill.cli._build_executor", lambda *args, **kwargs: fake_executor)
+
+    calls: list[tuple[int, bool]] = []  # (max_parallel, run_baseline) per evaluate_skill call
+
+    async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args
+        artifact_root = kwargs["artifact_root"]
+        assert isinstance(artifact_root, Path)
+        assert kwargs["operation"] == "benchmark"  # expected tag for this num_runs=2 eval
+        max_parallel = kwargs["max_parallel"]
+        run_baseline = kwargs["run_baseline"]
+        assert isinstance(max_parallel, int)
+        assert isinstance(run_baseline, bool)
+        calls.append((max_parallel, run_baseline))
+        results = _make_eval_results(
+            skill=skill,
+            model=str(kwargs["model"]),
+            test_cases=skill_record.state.tests,
+            run_baseline=False,
+        )
+        return results
+
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill)
+
+    await _eval_async(
+        skill_path=str(tmp_path / "skill"),
+        tests=None,
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=2,
+        no_baseline=True,
+        verbose=False,
+        executor_name="jobs",
+        artifact_repo="ns/repo",
+        wait=True,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=4,
+        log_runs=True,
+        runs_dir=str(config.runs_dir),
+    )
+
+    batch_folder = next(config.runs_dir.iterdir())  # first entry created under runs_dir
+    summary = load_batch_summary(batch_folder)
+    assert summary is not None
+    assert summary.total_runs == 2
+    assert calls == [(4, False), (4, False)]  # two calls, max_parallel=4, baseline off
+    assert load_run_result(batch_folder / "run_1") is not None
+    assert load_run_result(batch_folder / "run_2") is not None
+
+
+@pytest.mark.asyncio
+async def test_eval_jobs_no_wait_submits_remote_requests(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # With wait=False the jobs path submits remote requests and evaluates nothing here.
+    _write_skill_fixture(tmp_path / "skill")
+    config = Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml")
+    submit_calls: list[tuple[str, bool, str]] = []  # (model, run_baseline, operation)
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+
+    async def fake_submit_remote_eval_jobs(**kwargs: object) -> list[str]:
+        submit_calls.append(
+            (
+                str(kwargs["model"]),
+                bool(kwargs["run_baseline"]),
+                str(kwargs["operation"]),
+            )
+        )
+        return ["evalstate/job-1", "evalstate/job-2"]  # canned job references
+
+    def fail_build_executor(*args: object, **kwargs: object) -> object:
+        del args, kwargs
+        raise AssertionError("_build_executor should not be used for jobs --no-wait submission")
+
+    async def fail_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args, kwargs
+        raise AssertionError("evaluate_skill should not be called for jobs --no-wait submission")
+
+    monkeypatch.setattr("upskill.cli._submit_remote_eval_jobs", fake_submit_remote_eval_jobs)
+    monkeypatch.setattr("upskill.cli._build_executor", fail_build_executor)  # tripwire
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fail_evaluate_skill)  # tripwire
+
+    await _eval_async(
+        skill_path=str(tmp_path / "skill"),
+        tests=None,
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=1,
+        no_baseline=False,
+        verbose=False,
+        executor_name="jobs",
+        artifact_repo="ns/repo",
+        wait=False,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=3,
+        log_runs=True,
+        runs_dir=str(config.runs_dir),
+    )
+
+    assert submit_calls == [("haiku", True, "eval")]  # one submission, baseline on, "eval" op
+
+
+@pytest.mark.asyncio
+async def test_eval_jobs_wait_fails_cleanly_when_artifact_repo_is_inaccessible(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    _write_skill_fixture(tmp_path / "skill")
+    config = Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml")
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr(
+        "upskill.cli.verify_artifact_repo_access",
+        lambda _repo: (_ for _ in ()).throw(
+            RuntimeError("404 Not Found\nRepository Not Found for url")
+        ),
+    )
+
+    with pytest.raises(click.ClickException, match="Artifact repo is not accessible") as exc_info:
+        await _eval_async(
+            skill_path=str(tmp_path / "skill"),
+            tests=None,
+            models=["haiku"],
+            test_gen_model=None,
+            num_runs=1,
+            no_baseline=False,
+            verbose=False,
+            executor_name="jobs",
+            artifact_repo="evalstate/uskill-test",
+            wait=True,
+            jobs_timeout="2h",
+            jobs_flavor="cpu-basic",
+            jobs_secrets="HF_TOKEN",
+            jobs_namespace=None,
+            max_parallel=3,
+            log_runs=True,
+            runs_dir=str(config.runs_dir),
+        )
+
+    assert "Repo: evalstate/uskill-test" in str(exc_info.value)
+    assert "name is wrong" in str(exc_info.value)
diff --git a/tests/test_cli_generate_benchmark.py b/tests/test_cli_generate_benchmark.py
new file mode 100644
index 0000000..9554543
--- /dev/null
+++ b/tests/test_cli_generate_benchmark.py
@@ -0,0 +1,637 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import TYPE_CHECKING
+
+import pytest
+from click.testing import CliRunner
+
+from upskill.cli import (
+    _benchmark_async,
+    _build_logged_run_result,
+    _generate_async,
+    _install_fast_agent_model_references,
+    _submit_generate_jobs_eval,
+    main,
+)
+from upskill.config import Config
+from upskill.evaluate import apply_eval_metrics
+from upskill.hf_jobs import JobsConfig
+from upskill.logging import load_batch_summary
+from upskill.models import (
+    ConversationStats,
+    EvalResults,
+    ExpectedSpec,
+    Skill,
+    SkillRecord,
+    SkillState,
+    TestCase,
+    TestResult,
+    ValidationResult,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class _FakeAgentContext:  # Async context-manager stub patched in for _fast_agent_context.
+    async def __aenter__(self) -> SimpleNamespace:
+        return SimpleNamespace(skill_gen=object(), test_gen=object())  # placeholder agents
+
+    async def __aexit__(self, exc_type: object, exc: object, tb: object) -> bool:
+        del exc_type, exc, tb
+        return False  # falsy result: exceptions raised inside the block are not suppressed
+
+
+def _make_eval_results(
+    *,
+    skill: Skill,
+    model: str,
+    test_cases: list[TestCase],
+    run_baseline: bool,
+) -> EvalResults:
+    with_skill_results = [
+        TestResult(
+            test_case=test_case,
+            success=True,
+            stats=ConversationStats(total_tokens=10, turns=1),
+        )
+        for test_case in test_cases
+    ]
+    results = EvalResults(
+        skill_name=skill.name,
+        model=model,
+        with_skill_results=with_skill_results,
+    )
+    if run_baseline:
+        results.baseline_results = [
+            TestResult(
+                test_case=test_case,
+                success=False,
+                stats=ConversationStats(total_tokens=20, turns=1),
+            )
+            for test_case in test_cases
+        ]
+    return apply_eval_metrics(results, test_cases)
+
+
+def test_build_logged_run_result_preserves_validator_assertion_counts() -> None:
+    test_case = TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"]))
+    run_result = _build_logged_run_result(  # totals checked below: 3 passed out of 4
+        model="haiku",
+        task="Write good pull request descriptions.",
+        batch_id="batch-1",
+        run_number=1,
+        test_results=[
+            TestResult(  # carries explicit validator counts: 2 of 3 assertions passed
+                test_case=test_case,
+                success=True,
+                validation_result=ValidationResult(
+                    passed=True,
+                    assertions_passed=2,
+                    assertions_total=3,
+                ),
+                stats=ConversationStats(total_tokens=10, turns=1),
+            ),
+            TestResult(  # no validation_result; only the success flag is set
+                test_case=test_case,
+                success=True,
+                stats=ConversationStats(total_tokens=12, turns=1),
+            ),
+        ],
+        assertions_total=2,  # note: differs from the total of 4 asserted below
+        passed=False,
+        run_type="with_skill",
+        skill_name="pull-request-descriptions",
+    )
+
+    assert run_result.assertions_passed == 3  # presumably 2 (validator) + 1 (plain success)
+    assert run_result.assertions_total == 4  # presumably 3 (validator) + 1 (plain success)
+
+
+def test_generate_help_does_not_expose_removed_tool_option() -> None:
+    runner = CliRunner()
+
+    result = runner.invoke(main, ["generate", "--help"])
+
+    assert result.exit_code == 0
+    assert "--tool" not in result.output
+    assert "--from PATH" in result.output
+    assert "--artifact-repo TEXT" in result.output
+    assert 'upskill generate "parse invoices"' in result.output
+    assert "--artifact-repo" in result.output
+    assert "<user>/upskill-tests" in result.output
+
+
+def test_install_fast_agent_model_references_merges_existing_namespaces() -> None:
+    fast = SimpleNamespace(
+        app=SimpleNamespace(
+            _config_or_path=SimpleNamespace(
+                model_references={
+                    "custom": {"router": "haiku"},
+                    "system": {"existing": "keep"},
+                }
+            )
+        )
+    )
+
+    _install_fast_agent_model_references(
+        fast,
+        model_references={"system": {"skill_gen": "sonnet", "test_gen": "opus"}},
+    )
+
+    assert fast.app._config_or_path.model_references == {
+        "custom": {"router": "haiku"},
+        "system": {
+            "existing": "keep",
+            "skill_gen": "sonnet",
+            "test_gen": "opus",
+        },
+    }
+
+
+@pytest.mark.asyncio
+async def test_generate_persists_generated_tests_in_skill_meta(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # A local generate run with eval enabled saves the generated tests with the skill.
+    config = Config(
+        skills_dir=tmp_path / "skills",
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+    )
+    test_cases = [
+        TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])),
+        TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])),
+    ]
+    fake_executor = object()
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr(
+        "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext()
+    )
+
+    monkeypatch.setattr("upskill.cli._build_executor", lambda *args, **kwargs: fake_executor)
+
+    async def fake_generate_skill(**kwargs: object) -> SkillRecord:
+        del kwargs
+        return SkillRecord(  # fixed skill; its name determines the directory loaded below
+            skill=Skill(
+                name="pull-request-descriptions",
+                description="Write good pull request descriptions.",
+                body="Use a clear structure.",
+            )
+        )
+
+    async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]:
+        del args, kwargs
+        return test_cases
+
+    async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        skill = args[0]
+        assert isinstance(skill, Skill)
+        assert kwargs["executor"] is fake_executor  # the object the patched builder returned
+        assert kwargs["operation"] == "generate"
+        return _make_eval_results(
+            skill=skill,
+            model=str(kwargs["model"]),
+            test_cases=test_cases,
+            run_baseline=True,
+        )
+
+    monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill)
+    monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests)
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill)
+
+    await _generate_async(
+        task="write good pull request descriptions",
+        examples=None,
+        from_skill=None,
+        from_trace=None,
+        model="haiku",
+        test_gen_model=None,
+        output=None,
+        no_eval=False,
+        eval_model=None,
+        executor_name="local",
+        artifact_repo=None,
+        wait=False,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=2,
+        runs_dir=str(config.runs_dir),
+        log_runs=True,
+    )
+
+    saved = SkillRecord.load(config.skills_dir / "pull-request-descriptions")  # reload
+    assert len(saved.state.tests) == 2
+    assert saved.state.tests[0].input == "prompt one"
+
+
+@pytest.mark.asyncio
+async def test_generate_no_eval_still_persists_generated_tests(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # With no_eval=True the tests are still saved and evaluate_skill is never called.
+    config = Config(
+        skills_dir=tmp_path / "skills",
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+    )
+    test_cases = [
+        TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])),
+        TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])),
+    ]
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr(
+        "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext()
+    )
+
+    async def fake_generate_skill(**kwargs: object) -> SkillRecord:
+        del kwargs
+        return SkillRecord(
+            skill=Skill(
+                name="pull-request-descriptions",
+                description="Write good pull request descriptions.",
+                body="Use a clear structure.",
+            )
+        )
+
+    async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]:
+        del args, kwargs
+        return test_cases
+
+    async def fail_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args, kwargs
+        raise AssertionError("evaluate_skill should not be called when --no-eval is set")
+
+    monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill)
+    monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests)
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fail_evaluate_skill)  # tripwire
+
+    await _generate_async(
+        task="write good pull request descriptions",
+        examples=None,
+        from_skill=None,
+        from_trace=None,
+        model="haiku",
+        test_gen_model=None,
+        output=None,
+        no_eval=True,
+        eval_model=None,
+        executor_name="local",
+        artifact_repo=None,
+        wait=False,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=2,
+        runs_dir=str(config.runs_dir),
+        log_runs=True,
+    )
+
+    saved = SkillRecord.load(config.skills_dir / "pull-request-descriptions")  # reload
+    assert len(saved.state.tests) == 2
+    assert saved.state.tests[1].input == "prompt two"
+
+
+@pytest.mark.asyncio
+async def test_generate_prints_test_generation_model(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # The progress output names the test_gen_model passed in ("opus").
+    config = Config(
+        skills_dir=tmp_path / "skills",
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+    )
+    test_cases = [
+        TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])),
+    ]
+    printed_messages: list[str] = []
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr(
+        "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext()
+    )
+    monkeypatch.setattr(  # capture each console.print call as one space-joined string
+        "upskill.cli.console.print",
+        lambda *args, **kwargs: printed_messages.append(" ".join(str(arg) for arg in args)),
+    )
+
+    async def fake_generate_skill(**kwargs: object) -> SkillRecord:
+        del kwargs
+        return SkillRecord(
+            skill=Skill(
+                name="pull-request-descriptions",
+                description="Write good pull request descriptions.",
+                body="Use a clear structure.",
+            )
+        )
+
+    async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]:
+        del args, kwargs
+        return test_cases
+
+    monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill)
+    monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests)
+
+    await _generate_async(
+        task="write good pull request descriptions",
+        examples=None,
+        from_skill=None,
+        from_trace=None,
+        model="haiku",
+        test_gen_model="opus",  # distinct from model so the message can be told apart
+        output=None,
+        no_eval=True,
+        eval_model=None,
+        executor_name="local",
+        artifact_repo=None,
+        wait=False,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=2,
+        runs_dir=str(config.runs_dir),
+        log_runs=True,
+    )
+
+    assert any("Generating test cases with opus..." in message for message in printed_messages)
+
+
+@pytest.mark.asyncio
+async def test_generate_jobs_no_wait_submits_remote_eval_requests(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # jobs + wait=False: tests are saved and the eval is only submitted remotely.
+    config = Config(
+        skills_dir=tmp_path / "skills",
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+    )
+    test_cases = [
+        TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])),
+        TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])),
+    ]
+    submit_models: list[str] = []
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr(
+        "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext()
+    )
+
+    async def fake_generate_skill(**kwargs: object) -> SkillRecord:
+        del kwargs
+        return SkillRecord(
+            skill=Skill(
+                name="pull-request-descriptions",
+                description="Write good pull request descriptions.",
+                body="Use a clear structure.",
+            )
+        )
+
+    async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]:
+        del args, kwargs
+        return test_cases
+
+    async def fake_submit_generate_jobs_eval(**kwargs: object) -> list[str]:
+        submit_models.append(str(kwargs["model"]))
+        assert kwargs["test_cases"] == test_cases  # the generated tests are what gets submitted
+        return ["evalstate/job-1", "evalstate/job-2"]
+
+    def fail_build_executor(*args: object, **kwargs: object) -> object:
+        del args, kwargs
+        raise AssertionError("_build_executor should not be used for jobs --no-wait submission")
+
+    async def fail_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        del args, kwargs
+        raise AssertionError("evaluate_skill should not be called for jobs --no-wait submission")
+
+    monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill)
+    monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests)
+    monkeypatch.setattr("upskill.cli._submit_generate_jobs_eval", fake_submit_generate_jobs_eval)
+    monkeypatch.setattr("upskill.cli._build_executor", fail_build_executor)  # tripwire
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fail_evaluate_skill)  # tripwire
+
+    await _generate_async(
+        task="write good pull request descriptions",
+        examples=None,
+        from_skill=None,
+        from_trace=None,
+        model="haiku",
+        test_gen_model=None,
+        output=None,
+        no_eval=False,
+        eval_model=None,
+        executor_name="jobs",
+        artifact_repo="ns/repo",
+        wait=False,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        max_parallel=2,
+        runs_dir=str(config.runs_dir),
+        log_runs=True,
+    )
+
+    saved = SkillRecord.load(config.skills_dir / "pull-request-descriptions")  # reload
+    assert len(saved.state.tests) == 2
+    assert submit_models == ["haiku"]  # eval_model is None here; "haiku" is the model argument
+
+
+@pytest.mark.asyncio
+async def test_submit_generate_jobs_eval_marks_operation_as_generate(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # The generate wrapper passes operation="generate" to the remote submitter.
+    config = Config(fastagent_config=tmp_path / "fastagent.config.yaml")
+    test_cases = [TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"]))]
+    skill = Skill(
+        name="pull-request-descriptions",
+        description="Write good pull request descriptions.",
+        body="Use a clear structure.",
+    )
+    operation_calls: list[str] = []  # "operation" kwarg seen on each submitter call
+
+    async def fake_submit_remote_eval_jobs(**kwargs: object) -> list[str]:
+        operation_calls.append(str(kwargs["operation"]))
+        return ["evalstate/job-1"]
+
+    monkeypatch.setattr("upskill.cli._submit_remote_eval_jobs", fake_submit_remote_eval_jobs)
+
+    job_refs = await _submit_generate_jobs_eval(
+        skill=skill,
+        test_cases=test_cases,
+        model="haiku",
+        jobs_config=JobsConfig(artifact_repo="ns/repo"),
+        config=config,
+        cards_path=tmp_path / "cards",
+        batch_folder=tmp_path / "runs" / "batch_1",
+    )
+
+    assert job_refs == ["evalstate/job-1"]  # submitter's return value is handed back as-is
+    assert operation_calls == ["generate"]
+
+
+@pytest.mark.asyncio
+async def test_benchmark_jobs_uses_remote_executor(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # A waiting jobs benchmark calls evaluate_skill with the built executor.
+    config = Config(
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+    )
+    skill_record = SkillRecord(
+        skill=Skill(
+            name="pull-request-descriptions",
+            description="Write good pull request descriptions.",
+            body="Use a clear structure.",
+        ),
+        state=SkillState(
+            tests=[
+                TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])),
+                TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])),
+            ]
+        ),
+    )
+    skill_dir = tmp_path / "skill"
+    skill_record.save(skill_dir)  # written to disk so the command can load it by path
+    fake_executor = object()
+    build_calls: list[str] = []
+    eval_calls: list[int] = []  # max_parallel seen on each evaluate_skill call
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+    monkeypatch.setattr(
+        "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext()
+    )
+
+    def fake_build_executor(name: str, **kwargs: object) -> object:
+        del kwargs
+        build_calls.append(name)
+        return fake_executor
+
+    async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults:
+        skill = args[0]
+        assert isinstance(skill, Skill)
+        assert kwargs["executor"] is fake_executor
+        assert kwargs["operation"] == "benchmark"
+        max_parallel = kwargs["max_parallel"]
+        assert isinstance(max_parallel, int)
+        eval_calls.append(max_parallel)
+        return _make_eval_results(
+            skill=skill,
+            model=str(kwargs["model"]),
+            test_cases=skill_record.state.tests,
+            run_baseline=False,
+        )
+
+    monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor)
+    monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill)
+
+    await _benchmark_async(
+        skill_path=str(skill_dir),
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=2,
+        tests_path=None,
+        executor_name="jobs",
+        artifact_repo="ns/repo",
+        wait=True,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        output_dir=str(config.runs_dir),
+        verbose=False,
+        max_parallel=4,
+    )
+
+    assert build_calls == ["jobs"]
+    assert eval_calls == [4, 4]  # two evaluate_skill calls, each with max_parallel=4
+    batch_folder = next(config.runs_dir.iterdir())  # first entry created under runs_dir
+    summary = load_batch_summary(batch_folder)
+    assert summary is not None
+    assert summary.total_runs == 2
+
+
+@pytest.mark.asyncio
+async def test_benchmark_uses_config_execution_defaults_when_cli_unset(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # executor, num_runs and max_parallel come from Config when passed as None.
+    config = Config(
+        runs_dir=tmp_path / "runs",
+        fastagent_config=tmp_path / "fastagent.config.yaml",
+        executor="jobs",
+        num_runs=2,
+        max_parallel=6,
+    )
+    skill_record = SkillRecord(
+        skill=Skill(
+            name="pull-request-descriptions",
+            description="Write good pull request descriptions.",
+            body="Use a clear structure.",
+        ),
+        state=SkillState(
+            tests=[
+                TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])),
+                TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])),
+            ]
+        ),
+    )
+    skill_dir = tmp_path / "skill"
+    skill_record.save(skill_dir)
+    fake_executor = object()
+    build_calls: list[str] = []
+    benchmark_calls: list[tuple[int, int]] = []  # (num_runs, max_parallel) per call
+
+    monkeypatch.setattr("upskill.cli.Config.load", lambda: config)
+
+    def fake_build_executor(name: str, **kwargs: object) -> object:
+        del kwargs
+        build_calls.append(name)
+        return fake_executor
+
+    async def fake_run_with_skill_benchmark(*args: object, **kwargs: object):
+        del args
+        assert kwargs["executor"] is fake_executor
+        num_runs = kwargs["num_runs"]
+        max_parallel = kwargs["max_parallel"]
+        assert isinstance(num_runs, int)
+        assert isinstance(max_parallel, int)
+        benchmark_calls.append((num_runs, max_parallel))
+        return {}, []  # empty placeholders; summary printing/writing is stubbed out below
+
+    monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor)
+    monkeypatch.setattr("upskill.cli._run_with_skill_benchmark", fake_run_with_skill_benchmark)
+    monkeypatch.setattr("upskill.cli._print_benchmark_summary", lambda _results: None)
+    monkeypatch.setattr("upskill.cli._write_benchmark_summary", lambda **_kwargs: None)
+
+    await _benchmark_async(
+        skill_path=str(skill_dir),
+        models=["haiku"],
+        test_gen_model=None,
+        num_runs=None,  # unset: Config.num_runs (2) is expected
+        tests_path=None,
+        executor_name=None,  # unset: Config.executor ("jobs") is expected
+        artifact_repo="ns/repo",
+        wait=True,
+        jobs_timeout="2h",
+        jobs_flavor="cpu-basic",
+        jobs_secrets="HF_TOKEN",
+        jobs_namespace=None,
+        output_dir=str(config.runs_dir),
+        verbose=False,
+        max_parallel=None,  # unset: Config.max_parallel (6) is expected
+    )
+
+    assert build_calls == ["jobs"]
+    assert benchmark_calls == [(2, 6)]
diff --git a/tests/test_config.py b/tests/test_config.py
index 2ec6e1f..56769f0 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -36,7 +36,14 @@ def test_config_save_uses_env_override_path_when_file_is_missing(tmp_path, monke
     monkeypatch.setenv(UPSKILL_CONFIG_ENV, str(override_path))
     monkeypatch.chdir(tmp_path)
 
-    config = Config(skill_generation_model="haiku")
+    config = Config(
+        skill_generation_model="haiku",
+        executor="jobs",
+        num_runs=4,
+        max_parallel=7,
+        jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY",
+        jobs_image="ghcr.io/example/custom:latest",
+    )
     config.save()
 
     assert override_path.exists()
@@ -46,3 +53,35 @@ def test_config_save_uses_env_override_path_when_file_is_missing(tmp_path, monke
         saved = yaml.safe_load(f) or {}
 
     assert saved["skill_generation_model"] == "haiku"
+    assert saved["executor"] == "jobs"
+    assert saved["num_runs"] == 4
+    assert saved["max_parallel"] == 7
+    assert saved["jobs_secrets"] == "HF_TOKEN,ANTHROPIC_API_KEY"
+    assert saved["jobs_image"] == "ghcr.io/example/custom:latest"
+
+
+def test_config_load_reads_execution_settings(tmp_path, monkeypatch) -> None:
+    config_path = tmp_path / "upskill.config.yaml"
+    config_path.write_text(
+        "\n".join(
+            [
+                "skill_generation_model: sonnet",
+                "executor: jobs",
+                "num_runs: 2",
+                "max_parallel: 6",
+                "jobs_secrets: HF_TOKEN,OPENAI_API_KEY",
+                "jobs_image: ghcr.io/example/custom:latest",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    monkeypatch.chdir(tmp_path)
+
+    config = Config.load()
+
+    assert config.skill_generation_model == "sonnet"
+    assert config.executor == "jobs"
+    assert config.num_runs == 2
+    assert config.max_parallel == 6
+    assert config.jobs_secrets == "HF_TOKEN,OPENAI_API_KEY"
+    assert config.jobs_image == "ghcr.io/example/custom:latest"
diff --git a/tests/test_cpd.py b/tests/test_cpd.py
new file mode 100644
index 0000000..a71322f
--- /dev/null
+++ b/tests/test_cpd.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from scripts.cpd import build_cpd_command, resolve_cli_exit_code, resolve_platform
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def test_resolve_platform_maps_common_linux_labels() -> None:
+    platform_config = resolve_platform(system="linux", arch="x86_64")
+
+    assert platform_config.system == "linux"
+    assert platform_config.arch == "x86_64"
+    assert platform_config.os_label == "linux"
+    assert platform_config.arch_label == "x64"
+    assert platform_config.java_name == "java"
+    assert platform_config.pmd_name == "pmd"
+
+
+def test_build_cpd_command_includes_expected_arguments(tmp_path: Path) -> None:
+    platform_config = resolve_platform(system="linux", arch="x86_64")
+    pmd_dir = tmp_path / "pmd-bin"  # only used to build paths; nothing is created on disk
+    src_dir = tmp_path / "src"
+    excluded_path = src_dir / "skip_me.py"
+
+    command = build_cpd_command(
+        platform_config=platform_config,
+        pmd_dir=pmd_dir,
+        src_dir=src_dir,
+        excluded_paths=[excluded_path],
+        min_tokens=120,
+        output_format="xml",
+    )
+
+    assert command == [  # exact argv, in order
+        str(pmd_dir / "bin" / "pmd"),  # launcher under <pmd_dir>/bin, named per pmd_name
+        "cpd",
+        "--language",
+        "python",
+        "--minimum-tokens",
+        "120",  # min_tokens rendered as a string
+        "--dir",
+        str(src_dir),
+        "--format",
+        "xml",
+        "--exclude",
+        str(excluded_path),
+    ]
+
+
+def test_resolve_cli_exit_code_honors_check_mode() -> None:
+    assert resolve_cli_exit_code(cpd_exit_code=0, check=False) == 0  # success stays success
+    assert resolve_cli_exit_code(cpd_exit_code=4, check=False) == 0  # 4 tolerated without check
+    assert resolve_cli_exit_code(cpd_exit_code=4, check=True) == 1  # 4 becomes 1 under check
+    assert resolve_cli_exit_code(cpd_exit_code=7, check=True) == 7  # 7 is returned as-is
diff --git a/tests/test_execution_backends.py b/tests/test_execution_backends.py
new file mode 100644
index 0000000..e14f960
--- /dev/null
+++ b/tests/test_execution_backends.py
@@ -0,0 +1,568 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import shutil
+import tarfile
+from dataclasses import replace
+from pathlib import Path
+
+import pytest
+from fast_agent.mcp.prompt_message_extended import PromptMessageExtended
+from fast_agent.mcp.prompt_serialization import save_json
+from mcp.types import TextContent
+
+from upskill.artifacts import materialize_workspace
+from upskill.evaluate import evaluate_skill, load_eval_results_from_artifact_root
+from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult
+from upskill.executors.local_fast_agent import LocalFastAgentExecutor
+from upskill.executors.remote_fast_agent import RemoteFastAgentExecutor
+from upskill.fast_agent_cli import build_fast_agent_command
+from upskill.hf_jobs import JobsConfig, SubmittedJob
+from upskill.models import ConversationStats, ExpectedSpec, Skill, TestCase, TestResult
+from upskill.result_parsing import parse_fast_agent_results
+
+
+def _write_result_history(path: Path, *, assistant_text: str) -> None:
+    messages = [
+        PromptMessageExtended(
+            role="user",
+            content=[TextContent(type="text", text="Do the task")],
+        ),
+        PromptMessageExtended(
+            role="assistant",
+            content=[TextContent(type="text", text=assistant_text)],
+        ),
+    ]
+    save_json(messages, str(path))
+
+
+def _build_request(tmp_path: Path) -> ExecutionRequest:
+    cards_dir = tmp_path / "cards-source"
+    cards_dir.mkdir()
+    (cards_dir / "evaluator.md").write_text("---\ndescription: evaluator\n---\n{{agentSkills}}\n")
+    (cards_dir / "skill_gen.md").write_text(
+        "---\ndescription: skill generator\n---\nGenerate skills\n"
+    )
+    (cards_dir / "test_gen.md").write_text(
+        "---\ndescription: test generator\n---\nGenerate tests\n"
+    )
+    config_path = tmp_path / "fastagent.config.yaml"
+    config_path.write_text("default_model: sonnet\n")
+    return ExecutionRequest(
+        prompt="Do the task",
+        model="haiku",
+        agent="evaluator",
+        fastagent_config_path=config_path,
+        artifact_dir=tmp_path / "artifacts" / "run_1",
+        cards_source_dir=cards_dir,
+        label="test run",
+        skill=Skill(
+            name="write-good-prs",
+            description="Write good pull request descriptions.",
+            body="Use a clear structure.",
+        ),
+        workspace_files={"context.txt": "hello"},
+    )
+
+
+def test_build_fast_agent_command_uses_explicit_contract(tmp_path: Path) -> None:
+    request = _build_request(tmp_path)
+    prompt_path = tmp_path / "bundle" / "prompt.txt"
+    prompt_path.parent.mkdir(parents=True)
+    prompt_path.write_text(request.prompt, encoding="utf-8")
+    command = build_fast_agent_command(
+        request,
+        config_path=request.fastagent_config_path,
+        cards_dir=tmp_path / "bundle" / "cards",
+        skills_dir=tmp_path / "bundle" / "skills",
+        prompt_path=prompt_path,
+        results_path=tmp_path / "bundle" / "results.json",
+        fast_agent_bin="fast-agent",
+    )
+
+    assert command[:2] == ["fast-agent", "go"]
+    assert "--config-path" in command
+    assert "--card" in command
+    assert "--agent" in command
+    assert "--model" in command
+    assert "--skills-dir" in command
+    assert "--prompt-file" in command
+    assert "--results" in command
+    assert "--quiet" in command
+
+
+def test_build_fast_agent_command_omits_missing_config_path(tmp_path: Path) -> None:
+    request = replace(_build_request(tmp_path), fastagent_config_path=tmp_path / "missing.yaml")
+    prompt_path = tmp_path / "bundle" / "prompt.txt"
+    prompt_path.parent.mkdir(parents=True)
+    prompt_path.write_text(request.prompt, encoding="utf-8")
+
+    command = build_fast_agent_command(
+        request,
+        config_path=None,
+        cards_dir=tmp_path / "bundle" / "cards",
+        skills_dir=tmp_path / "bundle" / "skills",
+        prompt_path=prompt_path,
+        results_path=tmp_path / "bundle" / "results.json",
+    )
+
+    assert "--config-path" not in command
+    assert "--prompt-file" in command
+
+
+def test_parse_fast_agent_results_extracts_output_text(tmp_path: Path) -> None:
+    results_path = tmp_path / "results.json"
+    _write_result_history(results_path, assistant_text="Structured answer")
+
+    parsed = parse_fast_agent_results(results_path)
+
+    assert parsed.output_text == "Structured answer"
+    assert parsed.stats.turns == 1
+
+
+@pytest.mark.asyncio
+async def test_local_fast_agent_executor_preserves_artifacts_and_parses_results(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Run the local executor against a faked subprocess and check result and artifacts.
+
+    ``asyncio.create_subprocess_exec`` is replaced with a fake that writes a result
+    history to the path following ``--results`` and returns a process with exit code 0.
+    """
+    request = _build_request(tmp_path)
+    executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent")
+
+    class FakeProcess:
+        # Stand-in for the subprocess object: exit code 0 plus canned stdout/stderr bytes.
+        returncode = 0
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            return (b"assistant output\n", b"")
+
+    async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess:
+        del kwargs
+        # The argument after ``--results`` is the file the history is written to.
+        results_index = args.index("--results") + 1
+        results_path = Path(args[results_index])
+        _write_result_history(results_path, assistant_text="Final answer")
+        return FakeProcess()
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec)
+
+    handle = await executor.execute(request)
+    result = await executor.collect(handle)
+
+    # The output text must come from the written history ("Final answer"), not from the
+    # fake stdout ("assistant output").
+    assert result.error is None
+    assert result.output_text == "Final answer"
+    assert result.raw_results_path == request.artifact_dir / "results.json"
+    assert (request.artifact_dir / "request.json").exists()
+    assert (request.artifact_dir / "stdout.txt").exists()
+    assert (request.artifact_dir / "stderr.txt").exists()
+    assert (request.artifact_dir / "workspace" / "context.txt").read_text() == "hello"
+    assert (request.artifact_dir / "workspace" / "fastagent.config.yaml").exists()
+    # Only the requested agent's card is expected; the other source cards must be absent.
+    assert (request.artifact_dir / "cards" / "evaluator.md").exists()
+    assert not (request.artifact_dir / "cards" / "skill_gen.md").exists()
+    assert not (request.artifact_dir / "cards" / "test_gen.md").exists()
+    assert (request.artifact_dir / "skills" / "write-good-prs" / "SKILL.md").exists()
+
+
+@pytest.mark.asyncio
+async def test_local_fast_agent_executor_fails_when_results_artifact_is_missing(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    request = _build_request(tmp_path)
+    executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent")
+
+    class FakeProcess:
+        returncode = 0
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            return (b"", b"")
+
+    async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess:
+        del args, kwargs
+        return FakeProcess()
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec)
+
+    handle = await executor.execute(request)
+    result = await executor.collect(handle)
+
+    assert result.error == "fast-agent run did not produce a results artifact."
+    assert result.raw_results_path is None
+
+
+@pytest.mark.asyncio
+async def test_local_fast_agent_executor_omits_missing_config_from_command_and_artifacts(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """A config path that does not exist must not appear in the command or the artifacts."""
+    # Point the request at a config file that is never created.
+    request = replace(_build_request(tmp_path), fastagent_config_path=tmp_path / "missing.yaml")
+    executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent")
+
+    class FakeProcess:
+        # Stand-in for the subprocess object: exit code 0 and empty output streams.
+        returncode = 0
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            return (b"", b"")
+
+    async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess:
+        del kwargs
+        # These assertions run inside the fake, i.e. against the actual spawn arguments.
+        assert "--config-path" not in args
+        assert "--prompt-file" in args
+        results_index = args.index("--results") + 1
+        _write_result_history(Path(args[results_index]), assistant_text="Final answer")
+        return FakeProcess()
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec)
+
+    handle = await executor.execute(request)
+    result = await executor.collect(handle)
+
+    assert result.error is None
+    # No config copy may exist at the artifact root or inside the workspace.
+    assert not (request.artifact_dir / "fastagent.config.yaml").exists()
+    assert not (request.artifact_dir / "workspace" / "fastagent.config.yaml").exists()
+
+
+@pytest.mark.asyncio
+async def test_remote_fast_agent_executor_preserves_artifacts_and_parses_results(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Run the remote executor with faked job submission and output download.
+
+    Submission, run-id generation, and output waiting in
+    ``upskill.executors.remote_fast_agent`` are all monkeypatched, so the test only
+    checks the returned result, the artifact layout, and the submitted job labels.
+    """
+    request = _build_request(tmp_path)
+    executor = RemoteFastAgentExecutor(jobs_config=JobsConfig(artifact_repo="ns/repo"))
+    submitted_labels: dict[str, str] = {}
+
+    def fake_submit_bundle_job(**kwargs: object) -> SubmittedJob:
+        # Capture the labels passed at submission so they can be asserted after the run.
+        nonlocal submitted_labels
+        labels = kwargs["labels"]
+        assert isinstance(labels, dict)
+        assert all(isinstance(key, str) and isinstance(value, str) for key, value in labels.items())
+        submitted_labels = {str(key): str(value) for key, value in labels.items()}
+        del kwargs
+        return SubmittedJob(
+            job_id="evalstate/job-123",
+            run_id="run-456",
+            artifact_repo="ns/repo",
+        )
+
+    def fake_wait_for_job_outputs(
+        job: SubmittedJob,
+        *,
+        destination_root: Path,
+        wait_timeout_seconds: float,
+        progress_callback: object = None,
+    ) -> Path:
+        del wait_timeout_seconds, progress_callback
+        # Build the downloaded output tree by hand: results, logs, status, and workspace
+        # entries for a single request named "request_1".
+        output_dir = destination_root / "outputs" / job.run_id
+        (output_dir / "results").mkdir(parents=True, exist_ok=True)
+        (output_dir / "logs").mkdir(parents=True, exist_ok=True)
+        (output_dir / "status").mkdir(parents=True, exist_ok=True)
+        (output_dir / "workspaces" / "request_1").mkdir(parents=True, exist_ok=True)
+        _write_result_history(
+            output_dir / "results" / "request_1.json", assistant_text="Remote answer"
+        )
+        (output_dir / "logs" / "request_1.out.txt").write_text("stdout\n", encoding="utf-8")
+        (output_dir / "logs" / "request_1.err.txt").write_text("", encoding="utf-8")
+        (output_dir / "status" / "request_1.exit_code.txt").write_text("0\n", encoding="utf-8")
+        (output_dir / "workspaces" / "request_1" / "context.txt").write_text(
+            "remote hello",
+            encoding="utf-8",
+        )
+        return output_dir
+
+    monkeypatch.setattr(
+        "upskill.executors.remote_fast_agent._submit_bundle_job",
+        fake_submit_bundle_job,
+    )
+    # Pin the run id so the "upskill-run-id" label assertion below is deterministic.
+    monkeypatch.setattr(
+        "upskill.executors.remote_fast_agent._make_run_id",
+        lambda *_args: "run-456",
+    )
+    monkeypatch.setattr(
+        "upskill.executors.remote_fast_agent.wait_for_job_outputs",
+        fake_wait_for_job_outputs,
+    )
+
+    handle = await executor.execute(request)
+    result = await executor.collect(handle)
+
+    assert result.error is None
+    assert result.output_text == "Remote answer"
+    assert result.raw_results_path == request.artifact_dir / "results.json"
+    assert result.metadata["job_id"] == "evalstate/job-123"
+    assert (request.artifact_dir / "stdout.txt").exists()
+    assert (request.artifact_dir / "stderr.txt").exists()
+    assert (request.artifact_dir / "remote_output" / "results" / "request_1.json").exists()
+    # The workspace content must be the remote version ("remote hello"), not the
+    # "hello" supplied in the request's workspace_files.
+    assert (request.artifact_dir / "workspace" / "context.txt").read_text() == "remote hello"
+    assert not (request.artifact_dir / "cards" / "skill_gen.md").exists()
+    assert not (request.artifact_dir / "cards" / "test_gen.md").exists()
+    # The request label "test run" is expected to arrive as "test-run".
+    assert submitted_labels == {
+        "upskill-agent": "evaluator",
+        "upskill-executor": "remote-fast-agent",
+        "upskill-model": "haiku",
+        "upskill-operation": "eval",
+        "upskill-request": "test-run",
+        "upskill-run-id": "run-456",
+        "upskill-skill": "write-good-prs",
+    }
+
+
+@pytest.mark.asyncio
+async def test_remote_fast_agent_executor_submit_preserves_artifacts(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    request = _build_request(tmp_path)
+    executor = RemoteFastAgentExecutor(jobs_config=JobsConfig(artifact_repo="ns/repo"))
+
+    def fake_submit_bundle_job(**kwargs: object) -> SubmittedJob:
+        del kwargs
+        return SubmittedJob(
+            job_id="evalstate/job-123",
+            run_id="run-456",
+            artifact_repo="ns/repo",
+        )
+
+    def fail_wait_for_job_outputs(*args: object, **kwargs: object) -> Path:
+        del args, kwargs
+        raise AssertionError("submit() should not wait for job outputs")
+
+    monkeypatch.setattr(
+        "upskill.executors.remote_fast_agent._submit_bundle_job",
+        fake_submit_bundle_job,
+    )
+    monkeypatch.setattr(
+        "upskill.executors.remote_fast_agent.wait_for_job_outputs",
+        fail_wait_for_job_outputs,
+    )
+
+    submission = await executor.submit(request)
+
+    assert submission == SubmittedJob(
+        job_id="evalstate/job-123",
+        run_id="run-456",
+        artifact_repo="ns/repo",
+    )
+    assert (request.artifact_dir / "request.json").exists()
+    assert (request.artifact_dir / "prompt.txt").exists()
+    assert (request.artifact_dir / "cards" / "evaluator.md").exists()
+    assert not (request.artifact_dir / "cards" / "skill_gen.md").exists()
+    assert not (request.artifact_dir / "cards" / "test_gen.md").exists()
+    assert (request.artifact_dir / "skills" / "write-good-prs" / "SKILL.md").exists()
+    submitted_job = json.loads((request.artifact_dir / "submitted_job.json").read_text())
+    assert submitted_job["job_id"] == "evalstate/job-123"
+    assert submitted_job["run_id"] == "run-456"
+
+
+def test_remote_fast_agent_executor_bundle_omits_missing_config(tmp_path: Path) -> None:
+    request = replace(_build_request(tmp_path), fastagent_config_path=tmp_path / "missing.yaml")
+    executor = RemoteFastAgentExecutor(jobs_config=JobsConfig(artifact_repo="ns/repo"))
+
+    temp_root, bundle_archive = executor._create_bundle_archive(request)
+    try:
+        with tarfile.open(bundle_archive, "r:gz") as archive:
+            assert "bundle/fastagent.config.yaml" not in archive.getnames()
+    finally:
+        shutil.rmtree(temp_root, ignore_errors=True)
+
+
+@pytest.mark.asyncio
+async def test_local_fast_agent_executor_normalizes_paths_and_preserves_file_context(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.chdir(tmp_path)
+
+    cards_dir = Path("cards-source")
+    cards_dir.mkdir()
+    (cards_dir / "evaluator.md").write_text("---\ndescription: evaluator\n---\n{{agentSkills}}\n")
+    config_path = Path("fastagent.config.yaml")
+    config_path.write_text("default_model: sonnet\n")
+    request = ExecutionRequest(
+        prompt="Base prompt\n\n```context.txt\nhello\n```",
+        model="haiku",
+        agent="evaluator",
+        fastagent_config_path=config_path,
+        artifact_dir=Path("artifacts") / "run_1",
+        cards_source_dir=cards_dir,
+        label="test run",
+        workspace_files={"context.txt": "hello"},
+    )
+    executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent")
+
+    class FakeProcess:
+        returncode = 0
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            return (b"assistant output\n", b"")
+
+    async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess:
+        cwd = kwargs["cwd"]
+        assert isinstance(cwd, Path)
+        assert cwd.is_absolute()
+        results_index = args.index("--results") + 1
+        prompt_index = args.index("--prompt-file") + 1
+        cards_index = args.index("--card") + 1
+        skills_index = args.index("--skills-dir") + 1
+        config_index = args.index("--config-path") + 1
+        agent_index = args.index("--agent") + 1
+        assert args[agent_index] == "evaluator"
+        for index in (results_index, cards_index, skills_index, config_index, prompt_index):
+            if index == prompt_index:
+                continue
+            assert Path(args[index]).is_absolute()
+        prompt_text = Path(args[prompt_index]).read_text(encoding="utf-8")
+        assert "```context.txt\nhello\n```" in prompt_text
+        _write_result_history(Path(args[results_index]), assistant_text="Final answer")
+        return FakeProcess()
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec)
+
+    handle = await executor.execute(request)
+    result = await executor.collect(handle)
+
+    assert result.error is None
+
+
+def test_materialize_workspace_rejects_paths_outside_workspace(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="must not traverse parents"):
+        materialize_workspace(tmp_path / "workspace", {"../pyproject.toml": "oops"})
+
+    with pytest.raises(ValueError, match="must be relative"):
+        materialize_workspace(tmp_path / "workspace", {"/tmp/pwned": "oops"})
+
+
+def test_load_eval_results_from_artifact_root_reconstructs_metrics(tmp_path: Path) -> None:
+    artifact_root = tmp_path / "eval"
+    with_skill_dir = artifact_root / "with-skill" / "test_1"
+    baseline_dir = artifact_root / "baseline" / "test_1"
+    with_skill_dir.mkdir(parents=True)
+    baseline_dir.mkdir(parents=True)
+
+    test_case = TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"]))
+    with_skill_result = TestResult(test_case=test_case, success=True, output="answer")
+    baseline_result = TestResult(test_case=test_case, success=False, output="miss")
+
+    (with_skill_dir / "test_result.json").write_text(
+        with_skill_result.model_dump_json(indent=2),
+        encoding="utf-8",
+    )
+    (baseline_dir / "test_result.json").write_text(
+        baseline_result.model_dump_json(indent=2),
+        encoding="utf-8",
+    )
+
+    reconstructed = load_eval_results_from_artifact_root(
+        skill_name="write-good-prs",
+        model="qwen35",
+        artifact_root=artifact_root,
+    )
+
+    assert reconstructed is not None
+    assert reconstructed.with_skill_success_rate == 1.0
+    assert reconstructed.baseline_success_rate == 0.0
+
+
+@pytest.mark.asyncio
+async def test_evaluate_skill_emits_per_test_progress_messages(tmp_path: Path) -> None:
+    """``evaluate_skill`` reports start and finish messages for each with-skill test."""
+    skill = Skill(
+        name="write-good-prs",
+        description="Write good pull request descriptions.",
+        body="Use a clear structure.",
+    )
+    test_case = TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"]))
+    # Progress messages are collected here through ``messages.append``.
+    messages: list[str] = []
+
+    class FakeExecutor:
+        # In-memory executor double: creates the artifact and workspace directories and
+        # returns a canned result whose output is "answer".
+        async def execute(self, request: ExecutionRequest) -> ExecutionHandle:
+            request.artifact_dir.mkdir(parents=True, exist_ok=True)
+            workspace_dir = request.artifact_dir / "workspace"
+            workspace_dir.mkdir(parents=True, exist_ok=True)
+            # asyncio.sleep(0, result=...) returns the given result when awaited, so the
+            # task resolves to the prepared ExecutionResult.
+            task = asyncio.create_task(
+                asyncio.sleep(
+                    0,
+                    result=ExecutionResult(
+                        output_text="answer",
+                        raw_results_path=None,
+                        stdout_path=request.artifact_dir / "stdout.txt",
+                        stderr_path=request.artifact_dir / "stderr.txt",
+                        artifact_dir=request.artifact_dir,
+                        workspace_dir=workspace_dir,
+                        stats=ConversationStats(),
+                    ),
+                )
+            )
+            return ExecutionHandle(request=request, task=task)
+
+        async def collect(self, handle: ExecutionHandle) -> ExecutionResult:
+            return await handle.task
+
+        async def cancel(self, handle: ExecutionHandle) -> None:
+            handle.task.cancel()
+
+    results = await evaluate_skill(
+        skill,
+        [test_case],
+        FakeExecutor(),
+        model="haiku",
+        fastagent_config_path=tmp_path / "fastagent.config.yaml",
+        cards_source_dir=tmp_path,
+        artifact_root=tmp_path / "eval",
+        progress_callback=messages.append,
+    )
+
+    assert results.with_skill_success_rate == 1.0
+    assert "starting with-skill test 1/1" in messages
+    assert "finished with-skill test 1/1 (ok)" in messages
+
+
+@pytest.mark.asyncio
+async def test_evaluate_skill_includes_job_id_in_execution_errors(tmp_path: Path) -> None:
+    """An executor error is reported with the job id from the result metadata appended."""
+    skill = Skill(
+        name="write-good-prs",
+        description="Write good pull request descriptions.",
+        body="Use a clear structure.",
+    )
+    test_case = TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"]))
+
+    class FakeExecutor:
+        # In-memory executor double: returns a failed result (no output text, an error
+        # message, and a "job_id" entry in its metadata).
+        async def execute(self, request: ExecutionRequest) -> ExecutionHandle:
+            request.artifact_dir.mkdir(parents=True, exist_ok=True)
+            workspace_dir = request.artifact_dir / "workspace"
+            workspace_dir.mkdir(parents=True, exist_ok=True)
+            # asyncio.sleep(0, result=...) returns the given result when awaited, so the
+            # task resolves to the prepared ExecutionResult.
+            task = asyncio.create_task(
+                asyncio.sleep(
+                    0,
+                    result=ExecutionResult(
+                        output_text=None,
+                        raw_results_path=None,
+                        stdout_path=request.artifact_dir / "stdout.txt",
+                        stderr_path=request.artifact_dir / "stderr.txt",
+                        artifact_dir=request.artifact_dir,
+                        workspace_dir=workspace_dir,
+                        stats=ConversationStats(),
+                        error="fast-agent exited with code 1.",
+                        metadata={"job_id": "evalstate/job-123"},
+                    ),
+                )
+            )
+            return ExecutionHandle(request=request, task=task)
+
+        async def collect(self, handle: ExecutionHandle) -> ExecutionResult:
+            return await handle.task
+
+        async def cancel(self, handle: ExecutionHandle) -> None:
+            handle.task.cancel()
+
+    results = await evaluate_skill(
+        skill,
+        [test_case],
+        FakeExecutor(),
+        model="haiku",
+        fastagent_config_path=tmp_path / "fastagent.config.yaml",
+        cards_source_dir=tmp_path,
+        artifact_root=tmp_path / "eval",
+        progress_callback=None,
+    )
+
+    # The reported error is the executor's message followed by "(job <job_id>)".
+    assert (
+        results.with_skill_results[0].error
+        == "fast-agent exited with code 1. (job evalstate/job-123)"
+    )
diff --git a/tests/test_hf_jobs.py b/tests/test_hf_jobs.py
new file mode 100644
index 0000000..2ea9088
--- /dev/null
+++ b/tests/test_hf_jobs.py
@@ -0,0 +1,523 @@
+from __future__ import annotations
+
+import subprocess
+from datetime import UTC, datetime
+from typing import TYPE_CHECKING
+
+import pytest
+
+import upskill.hf_jobs as hf_jobs
+from upskill.hf_jobs import (
+    JobsConfig,
+    SubmittedJob,
+    _build_hf_jobs_run_command,
+    _make_run_id,
+    _normalize_job_id,
+    _render_bundle_job_script,
+    _submit_bundle_job,
+    parse_duration_seconds,
+    verify_artifact_repo_access,
+    wait_for_job_outputs,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def test_parse_duration_seconds_supports_hf_style_suffixes() -> None:
+    assert parse_duration_seconds("45m") == 2700.0
+    assert parse_duration_seconds("2h") == 7200.0
+    assert parse_duration_seconds("30") == 30.0
+
+
+def test_make_run_id_adds_entropy_even_with_frozen_timestamp(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    class FrozenDateTime(datetime):
+        @classmethod
+        def now(cls, tz: object | None = None) -> FrozenDateTime:
+            del tz
+            return cls(2026, 3, 22, 12, 0, 0, tzinfo=UTC)
+
+    monkeypatch.setattr("upskill.hf_jobs.datetime", FrozenDateTime)
+
+    run_id_a = _make_run_id("with-skill", "qwen35", "pull-request-descriptions")
+    run_id_b = _make_run_id("with-skill", "qwen35", "pull-request-descriptions")
+
+    assert run_id_a != run_id_b
+    assert run_id_a.startswith("20260322T120000Z_with-skill-qwen35-pull-request-descriptions_")
+    assert run_id_b.startswith("20260322T120000Z_with-skill-qwen35-pull-request-descriptions_")
+
+
+def test_normalize_job_id_extracts_namespace_and_id_from_url() -> None:
+    assert (
+        _normalize_job_id("View at: https://huggingface.co/jobs/evalstate/69bd5e5f71691dc46f161e83")
+        == "evalstate/69bd5e5f71691dc46f161e83"
+    )
+
+
+def test_run_hf_command_uses_doubled_retry_backoff(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Two rate-limited attempts are retried with sleeps of 2.0 and then 4.0 seconds."""
+    # Arguments passed to the patched time.sleep are recorded here.
+    sleep_calls: list[float] = []
+    attempts = 0
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        nonlocal attempts
+        del kwargs
+        attempts += 1
+        # The first two attempts fail with a rate-limit message; the third succeeds.
+        if attempts < 3:
+            return subprocess.CompletedProcess(
+                args=args,
+                returncode=1,
+                stdout="",
+                stderr="rate limit for the /whoami-v2 endpoint\n",
+            )
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+    # Replace sleeping with recording so the test does not actually wait.
+    monkeypatch.setattr("upskill.hf_jobs.time.sleep", sleep_calls.append)
+
+    completed = hf_jobs._run_hf_command(["hf", "jobs", "run"])
+
+    assert completed.returncode == 0
+    assert sleep_calls == [2.0, 4.0]
+
+
+def test_wait_for_job_outputs_downloads_full_directory(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    """``wait_for_job_outputs`` returns the run's output directory and reports progress.
+
+    ``subprocess.run`` is replaced with a fake that records every command and always
+    succeeds after creating ``outputs/run-456/exit_code.txt`` under ``tmp_path``.
+    """
+    calls: list[list[str]] = []
+    messages: list[str] = []
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        calls.append(args)
+        # NOTE(review): both branches below create outputs/run-456/exit_code.txt with
+        # "0\n" and return success, so this condition does not change the fake's effect.
+        if args[2] == "ns/repo" and args[3].endswith("exit_code.txt"):
+            marker = tmp_path / "outputs" / "run-456" / "exit_code.txt"
+            marker.parent.mkdir(parents=True, exist_ok=True)
+            marker.write_text("0\n", encoding="utf-8")
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        output_dir = tmp_path / "outputs" / "run-456"
+        output_dir.mkdir(parents=True, exist_ok=True)
+        (output_dir / "exit_code.txt").write_text("0\n", encoding="utf-8")
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+
+    output_dir = wait_for_job_outputs(
+        SubmittedJob(job_id="job-123", run_id="run-456", artifact_repo="ns/repo"),
+        destination_root=tmp_path,
+        wait_timeout_seconds=1.0,
+        poll_interval_seconds=0.01,
+        progress_callback=messages.append,
+    )
+
+    assert output_dir == tmp_path / "outputs" / "run-456"
+    # Exactly three commands are expected, one of them an "hf jobs ps" listing.
+    assert len(calls) == 3
+    assert any(
+        call[:7] == ["hf", "jobs", "ps", "-a", "--format", "json", "--namespace"] for call in calls
+    )
+    assert messages[0] == "waiting for job job-123 (run_id=run-456)"
+    assert "completed; downloading artifacts" in messages[1]
+
+
+def test_wait_for_job_outputs_raises_when_job_enters_error_stage(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        if args[:4] == ["hf", "jobs", "ps", "-a"]:
+            return subprocess.CompletedProcess(
+                args=args,
+                returncode=0,
+                stdout=(
+                    '[{"id":"job-123","owner":{"name":"evalstate"},'
+                    '"status":{"stage":"ERROR","message":"boom"}}]'
+                ),
+                stderr="",
+            )
+        return subprocess.CompletedProcess(args=args, returncode=1, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+
+    with pytest.raises(RuntimeError, match="ended with stage ERROR"):
+        wait_for_job_outputs(
+            SubmittedJob(job_id="evalstate/job-123", run_id="run-456", artifact_repo="ns/repo"),
+            destination_root=tmp_path,
+            wait_timeout_seconds=1.0,
+            poll_interval_seconds=0.01,
+        )
+
+
+def test_wait_for_job_outputs_retries_auth_rate_limited_downloads(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    """Rate-limited marker and full downloads are each retried once and then succeed."""
+    # Every command passed to the fake is recorded; attempt numbers are derived from it.
+    calls: list[list[str]] = []
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        calls.append(args)
+        # Job listing: succeeds with an empty JSON list.
+        if args[:4] == ["hf", "jobs", "ps", "-a"]:
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="[]", stderr="")
+        # Marker download (exit_code.txt): the first attempt is rate limited, later
+        # attempts write the marker file and succeed.
+        if args[:2] == ["hf", "download"] and args[3].endswith("exit_code.txt"):
+            marker_attempt = sum(
+                1
+                for call in calls
+                if call[:2] == ["hf", "download"] and call[3].endswith("exit_code.txt")
+            )
+            if marker_attempt == 1:
+                return subprocess.CompletedProcess(
+                    args=args,
+                    returncode=1,
+                    stdout="",
+                    stderr=(
+                        "Error: You've hit the rate limit for the /whoami-v2 endpoint, "
+                        "which is intentionally strict for security reasons.\n"
+                    ),
+                )
+            marker = tmp_path / "outputs" / "run-456" / "exit_code.txt"
+            marker.parent.mkdir(parents=True, exist_ok=True)
+            marker.write_text("0\n", encoding="utf-8")
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        # Download using --include: the first attempt is rate limited, later attempts
+        # create the output directory and succeed.
+        if args[:2] == ["hf", "download"] and "--include" in args:
+            full_download_attempt = sum(
+                1 for call in calls if call[:2] == ["hf", "download"] and "--include" in call
+            )
+            if full_download_attempt == 1:
+                return subprocess.CompletedProcess(
+                    args=args,
+                    returncode=1,
+                    stdout="",
+                    stderr=(
+                        "Error: You've hit the rate limit for the /whoami-v2 endpoint, "
+                        "which is intentionally strict for security reasons.\n"
+                    ),
+                )
+            output_dir = tmp_path / "outputs" / "run-456"
+            output_dir.mkdir(parents=True, exist_ok=True)
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        # Any other command fails.
+        return subprocess.CompletedProcess(args=args, returncode=1, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+    # Make sleeping a no-op so retries do not slow the test down.
+    monkeypatch.setattr("upskill.hf_jobs.time.sleep", lambda *_args, **_kwargs: None)
+
+    output_dir = wait_for_job_outputs(
+        SubmittedJob(job_id="job-123", run_id="run-456", artifact_repo="ns/repo"),
+        destination_root=tmp_path,
+        wait_timeout_seconds=1.0,
+        poll_interval_seconds=0.01,
+    )
+
+    assert output_dir == tmp_path / "outputs" / "run-456"
+    # Both download kinds must have been attempted exactly twice (one failure, one retry).
+    assert sum(1 for call in calls if call[:2] == ["hf", "download"] and "--include" in call) == 2
+    assert (
+        sum(
+            1
+            for call in calls
+            if call[:2] == ["hf", "download"] and call[3].endswith("exit_code.txt")
+        )
+        == 2
+    )
+
+
+def test_submit_bundle_job_retries_conflict_upload_and_auth_rate_limit(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    calls: list[list[str]] = []
+    bundle_archive = tmp_path / "bundle.tar.gz"
+    bundle_archive.write_text("bundle", encoding="utf-8")
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        calls.append(args)
+        if args[:2] == ["hf", "upload"] and args[4].endswith("bundle.tar.gz"):
+            upload_attempt = sum(
+                1
+                for call in calls
+                if call[:2] == ["hf", "upload"] and call[4].endswith("bundle.tar.gz")
+            )
+            if upload_attempt == 1:
+                return subprocess.CompletedProcess(
+                    args=args,
+                    returncode=1,
+                    stdout="",
+                    stderr="412 Precondition Failed\nA commit has happened since. Please refresh and try again.\n",
+                )
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        if args[:3] == ["hf", "jobs", "run"]:
+            submit_attempt = sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"])
+            if submit_attempt == 1:
+                return subprocess.CompletedProcess(
+                    args=args,
+                    returncode=1,
+                    stdout="Set HF_DEBUG=1 as environment variable for full traceback.\n",
+                    stderr=(
+                        "Error: You've hit the rate limit for the /whoami-v2 endpoint, "
+                        "which is intentionally strict for security reasons.\n"
+                    ),
+                )
+            return subprocess.CompletedProcess(
+                args=args,
+                returncode=0,
+                stdout="View at: https://huggingface.co/jobs/evalstate/job-123\n",
+                stderr="",
+            )
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+    monkeypatch.setattr("upskill.hf_jobs.time.sleep", lambda *_args, **_kwargs: None)
+    monkeypatch.setattr("upskill.hf_jobs._VERIFIED_ARTIFACT_REPOS", set())
+
+    submission = _submit_bundle_job(
+        bundle_archive=bundle_archive,
+        jobs_config=JobsConfig(artifact_repo="ns/repo"),
+        run_id="run-456",
+        model="qwen35",
+    )
+
+    assert submission == SubmittedJob(
+        job_id="evalstate/job-123",
+        run_id="run-456",
+        artifact_repo="ns/repo",
+    )
+    assert sum(1 for call in calls if call[:2] == ["hf", "upload"]) == 2
+    assert sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"]) == 2
+
+
+def test_submit_bundle_job_retries_auth_rate_limit_during_upload(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    calls: list[list[str]] = []
+    bundle_archive = tmp_path / "bundle.tar.gz"
+    bundle_archive.write_text("bundle", encoding="utf-8")
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        calls.append(args)
+        if args[:2] == ["hf", "upload"] and args[4].endswith("bundle.tar.gz"):
+            upload_attempt = sum(
+                1
+                for call in calls
+                if call[:2] == ["hf", "upload"] and call[4].endswith("bundle.tar.gz")
+            )
+            if upload_attempt == 1:
+                return subprocess.CompletedProcess(
+                    args=args,
+                    returncode=1,
+                    stdout="",
+                    stderr=(
+                        "Error: You've hit the rate limit for the /whoami-v2 endpoint, "
+                        "which is intentionally strict for security reasons.\n"
+                    ),
+                )
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        if args[:3] == ["hf", "jobs", "run"]:
+            return subprocess.CompletedProcess(
+                args=args,
+                returncode=0,
+                stdout="View at: https://huggingface.co/jobs/evalstate/job-123\n",
+                stderr="",
+            )
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+    monkeypatch.setattr("upskill.hf_jobs.time.sleep", lambda *_args, **_kwargs: None)
+
+    submission = _submit_bundle_job(
+        bundle_archive=bundle_archive,
+        jobs_config=JobsConfig(artifact_repo="ns/repo"),
+        run_id="run-456",
+        model="qwen35",
+    )
+
+    assert submission.job_id == "evalstate/job-123"
+    assert sum(1 for call in calls if call[:2] == ["hf", "upload"]) == 2
+    assert sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"]) == 1
+
+
+def test_verify_artifact_repo_access_checks_artifact_repo_once_per_process(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    calls: list[list[str]] = []
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        calls.append(args)
+        if args[:2] == ["hf", "download"] and "--dry-run" in args:
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+    monkeypatch.setattr("upskill.hf_jobs._VERIFIED_ARTIFACT_REPOS", set())
+
+    verify_artifact_repo_access("ns/repo")
+    verify_artifact_repo_access("ns/repo")
+
+    assert sum(1 for call in calls if call[:2] == ["hf", "download"] and "--dry-run" in call) == 1
+    assert {"ns/repo"} == hf_jobs._VERIFIED_ARTIFACT_REPOS
+
+
+def test_submit_bundle_job_uses_prepared_artifact_repo(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    calls: list[list[str]] = []
+    bundle_archive = tmp_path / "bundle.tar.gz"
+    bundle_archive.write_text("bundle", encoding="utf-8")
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        calls.append(args)
+        if args[:2] == ["hf", "download"] and "--dry-run" in args:
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        if args[:2] == ["hf", "upload"]:
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        if args[:3] == ["hf", "jobs", "run"]:
+            run_number = sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"])
+            return subprocess.CompletedProcess(
+                args=args,
+                returncode=0,
+                stdout=f"View at: https://huggingface.co/jobs/evalstate/job-{run_number}\n",
+                stderr="",
+            )
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+    monkeypatch.setattr("upskill.hf_jobs._VERIFIED_ARTIFACT_REPOS", set())
+
+    verify_artifact_repo_access("ns/repo")
+
+    first = _submit_bundle_job(
+        bundle_archive=bundle_archive,
+        jobs_config=JobsConfig(artifact_repo="ns/repo"),
+        run_id="run-1",
+        model="qwen35",
+    )
+    second = _submit_bundle_job(
+        bundle_archive=bundle_archive,
+        jobs_config=JobsConfig(artifact_repo="ns/repo"),
+        run_id="run-2",
+        model="qwen35",
+    )
+
+    assert first.job_id == "evalstate/job-1"
+    assert second.job_id == "evalstate/job-2"
+    assert sum(1 for call in calls if call[:2] == ["hf", "download"] and "--dry-run" in call) == 1
+    assert {"ns/repo"} == hf_jobs._VERIFIED_ARTIFACT_REPOS
+    jobs_run_call = calls[-1]
+    assert "--namespace" in jobs_run_call
+    assert jobs_run_call[jobs_run_call.index("--namespace") + 1] == "ns"
+    assert "ghcr.io/astral-sh/uv:python3.13-bookworm" in jobs_run_call
+    assert any("huggingface_hub==1.7.2" in arg for arg in jobs_run_call)
+
+
+def test_build_hf_jobs_run_command_uses_configured_image() -> None:
+    command = _build_hf_jobs_run_command(
+        jobs_config=JobsConfig(
+            artifact_repo="ns/repo",
+            jobs_image="ghcr.io/example/custom:latest",
+        ),
+        run_id="run-123",
+        model="haiku",
+        labels=None,
+        job_script="echo hi",
+    )
+
+    assert command[-5:] == [
+        "--",
+        "ghcr.io/example/custom:latest",
+        "bash",
+        "-lc",
+        "echo hi",
+    ]
+
+
+def test_render_bundle_job_script_retries_auth_rate_limits_for_downloads_and_uploads() -> None:
+    script = _render_bundle_job_script()
+
+    assert "rate limit for the /whoami-v2 endpoint" in script
+    assert "local delay=2" in script
+    assert 'download_with_retries "$ARTIFACT_REPO" "inputs/$RUN_ID/bundle.tar.gz" "$WORK"' in script
+    assert 'run_hf_with_retries hf upload "$repo" "$src" "$dest"' in script
+
+
+def test_submit_bundle_job_passes_labels_to_hf_jobs_run(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    calls: list[list[str]] = []
+    bundle_archive = tmp_path / "bundle.tar.gz"
+    bundle_archive.write_text("bundle", encoding="utf-8")
+
+    def fake_run(
+        args: list[str],
+        **kwargs: object,
+    ) -> subprocess.CompletedProcess[str]:
+        del kwargs
+        calls.append(args)
+        if args[:2] == ["hf", "upload"]:
+            return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        if args[:3] == ["hf", "jobs", "run"]:
+            return subprocess.CompletedProcess(
+                args=args,
+                returncode=0,
+                stdout="View at: https://huggingface.co/jobs/evalstate/job-123\n",
+                stderr="",
+            )
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+
+    _submit_bundle_job(
+        bundle_archive=bundle_archive,
+        jobs_config=JobsConfig(artifact_repo="ns/repo"),
+        run_id="run-123",
+        model="qwen35",
+        labels={
+            "upskill-model": "qwen35",
+            "upskill-operation": "eval",
+            "upskill-request": "eval-with-skill-test-1",
+        },
+    )
+
+    jobs_run_call = next(call for call in calls if call[:3] == ["hf", "jobs", "run"])
+    label_values = [
+        jobs_run_call[index + 1]
+        for index, token in enumerate(jobs_run_call[:-1])
+        if token == "--label"
+    ]
+    assert label_values == [
+        "upskill-model=qwen35",
+        "upskill-operation=eval",
+        "upskill-request=eval-with-skill-test-1",
+    ]
diff --git a/tests/test_model_resolution.py b/tests/test_model_resolution.py
index c0b330c..9657040 100644
--- a/tests/test_model_resolution.py
+++ b/tests/test_model_resolution.py
@@ -3,7 +3,7 @@
 import pytest
 
 from upskill.config import Config
-from upskill.model_resolution import resolve_models
+from upskill.model_resolution import build_fastagent_model_references, resolve_models
 
 
 def test_resolve_generate_uses_generation_model_for_test_gen_by_default() -> None:
@@ -42,6 +42,26 @@ def test_resolve_generate_cli_test_gen_model_overrides_config() -> None:
     assert resolved.test_generation_model == "opus"
 
 
+def test_build_fastagent_model_references_for_generate_uses_resolved_models() -> None:
+    config = Config(skill_generation_model="sonnet", test_gen_model="haiku")
+
+    resolved = resolve_models(
+        "generate",
+        config=config,
+        cli_model="opus",
+        cli_test_gen_model="kimi",
+    )
+    references = build_fastagent_model_references(config=config, resolved=resolved)
+
+    assert references == {
+        "system": {
+            "default": "opus",
+            "skill_gen": "opus",
+            "test_gen": "kimi",
+        }
+    }
+
+
 def test_resolve_eval_defaults_and_simple_mode() -> None:
     config = Config(skill_generation_model="sonnet", eval_model="haiku", test_gen_model=None)
 
@@ -67,6 +87,26 @@ def test_resolve_eval_cli_test_gen_model_overrides_config() -> None:
     assert resolved.test_generation_model == "opus"
 
 
+def test_build_fastagent_model_references_for_eval_keeps_configured_skill_generator() -> None:
+    config = Config(skill_generation_model="sonnet", test_gen_model="haiku")
+
+    resolved = resolve_models(
+        "eval",
+        config=config,
+        cli_models=["kimi"],
+        cli_test_gen_model="opus",
+    )
+    references = build_fastagent_model_references(config=config, resolved=resolved)
+
+    assert references == {
+        "system": {
+            "default": "sonnet",
+            "skill_gen": "sonnet",
+            "test_gen": "opus",
+        }
+    }
+
+
 def test_resolve_eval_benchmark_mode_disables_baseline() -> None:
     config = Config(skill_generation_model="sonnet", eval_model="haiku")
 
@@ -149,7 +189,7 @@ def test_resolve_unsupported_command_raises() -> None:
 
 
 def test_config_legacy_model_key_maps_to_skill_generation_model() -> None:
-    config = Config(model="haiku")
+    config = Config.model_validate({"model": "haiku"})
 
     assert config.skill_generation_model == "haiku"
     assert config.model == "haiku"
diff --git a/upskill.config.yaml b/upskill.config.yaml
index 3ddf1e2..190161a 100644
--- a/upskill.config.yaml
+++ b/upskill.config.yaml
@@ -1,7 +1,9 @@
 # upskill project configuration
 
 # Default model for skill generation.
-model: sonnet
+skill_generation_model: sonnet
+test_gen_model: opus
+eval_model: sonnet
 
 # Optional separate model for evaluation. If omitted, uses `model`.
 # eval_model: haiku
@@ -13,5 +15,19 @@ runs_dir: ./runs
 # Number of refinement passes during `upskill generate`.
 max_refine_attempts: 2
 
+# Default execution settings for eval/benchmark/refinement.
+executor: jobs        # local | jobs; override with --executor
+num_runs: 1            # Override with --runs
+max_parallel: 5        # Override with --max-parallel
+
+# HF Jobs secrets to forward when executor=jobs.
+# Use a comma-separated list of environment variable names, not secret values.
+# Example: HF_TOKEN,ANTHROPIC_API_KEY
+jobs_secrets: HF_TOKEN,ANTHROPIC_API_KEY  # Override with --jobs-secrets
+
+
+# Container image used for HF Jobs submissions.
+jobs_image: ghcr.io/astral-sh/uv:python3.13-bookworm
+
 # Optional override for fast-agent config file.
 # fastagent_config: ./fastagent.config.yaml
diff --git a/uv.lock b/uv.lock
index 9caa95c..7932959 100644
--- a/uv.lock
+++ b/uv.lock
@@ -30,6 +30,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4b/f3/219eeca0ad4a20843d4b9eaac5532f87018b9d25730a62a16f54f6c52d1a/agent_client_protocol-0.8.1-py3-none-any.whl", hash = "sha256:9421a11fd435b4831660272d169c3812d553bb7247049c138c3ca127e4b8af8e", size = 54529, upload-time = "2026-02-13T15:34:53.344Z" },
 ]
 
+[[package]]
+name = "aiofile"
+version = "3.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "caio" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/67/e2/d7cb819de8df6b5c1968a2756c3cb4122d4fa2b8fc768b53b7c9e5edb646/aiofile-3.9.0.tar.gz", hash = "sha256:e5ad718bb148b265b6df1b3752c4d1d83024b93da9bd599df74b9d9ffcf7919b", size = 17943, upload-time = "2024-10-08T10:39:35.846Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/50/25/da1f0b4dd970e52bf5a36c204c107e11a0c6d3ed195eba0bfbc664c312b2/aiofile-3.9.0-py3-none-any.whl", hash = "sha256:ce2f6c1571538cbdfa0143b04e16b208ecb0e9cb4148e528af8a640ed51cc8aa", size = 19539, upload-time = "2024-10-08T10:39:32.955Z" },
+]
+
 [[package]]
 name = "aiohappyeyeballs"
 version = "2.6.1"
@@ -105,7 +117,7 @@ wheels = [
 
 [[package]]
 name = "anthropic"
-version = "0.79.0"
+version = "0.86.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -117,9 +129,9 @@ dependencies = [
     { name = "sniffio" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/15/b1/91aea3f8fd180d01d133d931a167a78a3737b3fd39ccef2ae8d6619c24fd/anthropic-0.79.0.tar.gz", hash = "sha256:8707aafb3b1176ed6c13e2b1c9fb3efddce90d17aee5d8b83a86c70dcdcca871", size = 509825, upload-time = "2026-02-07T18:06:18.388Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/37/7a/8b390dc47945d3169875d342847431e5f7d5fa716b2e37494d57cfc1db10/anthropic-0.86.0.tar.gz", hash = "sha256:60023a7e879aa4fbb1fed99d487fe407b2ebf6569603e5047cfe304cebdaa0e5", size = 583820, upload-time = "2026-03-18T18:43:08.017Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/95/b2/cc0b8e874a18d7da50b0fda8c99e4ac123f23bf47b471827c5f6f3e4a767/anthropic-0.79.0-py3-none-any.whl", hash = "sha256:04cbd473b6bbda4ca2e41dd670fe2f829a911530f01697d0a1e37321eb75f3cf", size = 405918, upload-time = "2026-02-07T18:06:20.246Z" },
+    { url = "https://files.pythonhosted.org/packages/63/5f/67db29c6e5d16c8c9c4652d3efb934d89cb750cad201539141781d8eae14/anthropic-0.86.0-py3-none-any.whl", hash = "sha256:9d2bbd339446acce98858c5627d33056efe01f70435b22b63546fe7edae0cd57", size = 469400, upload-time = "2026-03-18T18:43:06.526Z" },
 ]
 
 [[package]]
@@ -143,6 +155,49 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" },
 ]
 
+[[package]]
+name = "authlib"
+version = "1.6.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cryptography" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/af/98/00d3dd826d46959ad8e32af2dbb2398868fd9fd0683c26e56d0789bd0e68/authlib-1.6.9.tar.gz", hash = "sha256:d8f2421e7e5980cc1ddb4e32d3f5fa659cfaf60d8eaf3281ebed192e4ab74f04", size = 165134, upload-time = "2026-03-02T07:44:01.998Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/53/23/b65f568ed0c22f1efacb744d2db1a33c8068f384b8c9b482b52ebdbc3ef6/authlib-1.6.9-py2.py3-none-any.whl", hash = "sha256:f08b4c14e08f0861dc18a32357b33fbcfd2ea86cfe3fe149484b4d764c4a0ac3", size = 244197, upload-time = "2026-03-02T07:44:00.307Z" },
+]
+
+[[package]]
+name = "beartype"
+version = "0.22.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/94/1009e248bbfbab11397abca7193bea6626806be9a327d399810d523a07cb/beartype-0.22.9.tar.gz", hash = "sha256:8f82b54aa723a2848a56008d18875f91c1db02c32ef6a62319a002e3e25a975f", size = 1608866, upload-time = "2025-12-13T06:50:30.72Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl", hash = "sha256:d16c9bbc61ea14637596c5f6fbff2ee99cbe3573e46a716401734ef50c3060c2", size = 1333658, upload-time = "2025-12-13T06:50:28.266Z" },
+]
+
+[[package]]
+name = "cachetools"
+version = "7.0.5"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/af/dd/57fe3fdb6e65b25a5987fd2cdc7e22db0aef508b91634d2e57d22928d41b/cachetools-7.0.5.tar.gz", hash = "sha256:0cd042c24377200c1dcd225f8b7b12b0ca53cc2c961b43757e774ebe190fd990", size = 37367, upload-time = "2026-03-09T20:51:29.451Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/f3/39cf3367b8107baa44f861dc802cbf16263c945b62d8265d36034fc07bea/cachetools-7.0.5-py3-none-any.whl", hash = "sha256:46bc8ebefbe485407621d0a4264b23c080cedd913921bad7ac3ed2f26c183114", size = 13918, upload-time = "2026-03-09T20:51:27.33Z" },
+]
+
+[[package]]
+name = "caio"
+version = "0.9.25"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/92/88/b8527e1b00c1811db339a1df8bd1ae49d146fcea9d6a5c40e3a80aaeb38d/caio-0.9.25.tar.gz", hash = "sha256:16498e7f81d1d0f5a4c0ad3f2540e65fe25691376e0a5bd367f558067113ed10", size = 26781, upload-time = "2025-12-26T15:21:36.501Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/31/57/5e6ff127e6f62c9f15d989560435c642144aa4210882f9494204bc892305/caio-0.9.25-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d6c2a3411af97762a2b03840c3cec2f7f728921ff8adda53d7ea2315a8563451", size = 36979, upload-time = "2025-12-26T15:21:35.484Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/9f/f21af50e72117eb528c422d4276cbac11fb941b1b812b182e0a9c70d19c5/caio-0.9.25-cp313-cp313-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0998210a4d5cd5cb565b32ccfe4e53d67303f868a76f212e002a8554692870e6", size = 81900, upload-time = "2025-12-26T15:22:21.919Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/12/c39ae2a4037cb10ad5eb3578eb4d5f8c1a2575c62bba675f3406b7ef0824/caio-0.9.25-cp313-cp313-manylinux_2_34_aarch64.whl", hash = "sha256:1a177d4777141b96f175fe2c37a3d96dec7911ed9ad5f02bac38aaa1c936611f", size = 81523, upload-time = "2026-03-04T22:08:25.187Z" },
+    { url = "https://files.pythonhosted.org/packages/22/59/f8f2e950eb4f1a5a3883e198dca514b9d475415cb6cd7b78b9213a0dd45a/caio-0.9.25-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:9ed3cfb28c0e99fec5e208c934e5c157d0866aa9c32aa4dc5e9b6034af6286b7", size = 80243, upload-time = "2026-03-04T22:08:26.449Z" },
+    { url = "https://files.pythonhosted.org/packages/86/93/1f76c8d1bafe3b0614e06b2195784a3765bbf7b0a067661af9e2dd47fc33/caio-0.9.25-py3-none-any.whl", hash = "sha256:06c0bb02d6b929119b1cfbe1ca403c768b2013a369e2db46bfa2a5761cf82e40", size = 19087, upload-time = "2025-12-26T15:22:00.221Z" },
+]
+
 [[package]]
 name = "cattrs"
 version = "25.3.0"
@@ -275,6 +330,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" },
 ]
 
+[[package]]
+name = "cyclopts"
+version = "4.10.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "attrs" },
+    { name = "docstring-parser" },
+    { name = "rich" },
+    { name = "rich-rst" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2c/e7/3e26855c046ac527cf94d890f6698e703980337f22ea7097e02b35b910f9/cyclopts-4.10.0.tar.gz", hash = "sha256:0ae04a53274e200ef3477c8b54de63b019bc6cd0162d75c718bf40c9c3fb5268", size = 166394, upload-time = "2026-03-14T14:09:31.043Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/06/d68a5d5d292c2ad2bc6a02e5ca2cb1bb9c15e941ab02f004a06a342d7f0f/cyclopts-4.10.0-py3-none-any.whl", hash = "sha256:50f333382a60df8d40ec14aa2e627316b361c4f478598ada1f4169d959bf9ea7", size = 204097, upload-time = "2026-03-14T14:09:32.504Z" },
+]
+
 [[package]]
 name = "deprecated"
 version = "1.3.1"
@@ -327,6 +397,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/56/7b/af3d0da15bed3a8665419bb3a630585756920f4ad67abfdfef26240ebcc0/docstring_to_markdown-0.17-py3-none-any.whl", hash = "sha256:fd7d5094aa83943bf5f9e1a13701866b7c452eac19765380dead666e36d3711c", size = 23479, upload-time = "2025-05-02T15:09:06.676Z" },
 ]
 
+[[package]]
+name = "docutils"
+version = "0.22.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" },
+]
+
 [[package]]
 name = "email-validator"
 version = "2.3.0"
@@ -340,9 +419,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/de/15/545e2b6cf2e3be84bc1ed85613edd75b8aea69807a71c26f4ca6a9258e82/email_validator-2.3.0-py3-none-any.whl", hash = "sha256:80f13f623413e6b197ae73bb10bf4eb0908faf509ad8362c5edeb0be7fd450b4", size = 35604, upload-time = "2025-08-26T13:09:05.858Z" },
 ]
 
+[[package]]
+name = "exceptiongroup"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" },
+]
+
 [[package]]
 name = "fast-agent-mcp"
-version = "0.4.53"
+version = "0.6.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "a2a-sdk" },
@@ -352,6 +440,7 @@ dependencies = [
     { name = "deprecated" },
     { name = "email-validator" },
     { name = "fastapi" },
+    { name = "fastmcp" },
     { name = "google-genai" },
     { name = "keyring" },
     { name = "mcp" },
@@ -376,9 +465,9 @@ dependencies = [
     { name = "uvloop", marker = "sys_platform != 'win32'" },
     { name = "watchfiles" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/89/48/027760d3e271299ad71b4baef77f0edb509fcf1ad1e0b6e38367fabf622f/fast_agent_mcp-0.4.53.tar.gz", hash = "sha256:bada3c4ec8be873e2b0fa844524df9da0c0492ca67270ec2b826e7e319f95dda", size = 1688537, upload-time = "2026-02-15T23:09:31.809Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/de/b4/abee6994b9d72a6b8763b0f1ac6273a54e6529e26277146df88d58366754/fast_agent_mcp-0.6.7.tar.gz", hash = "sha256:307148c04c3a8817a46e873137d7bd36f03feff53b90e3ec98c8798d6ccb49f8", size = 1992547, upload-time = "2026-03-22T20:50:48.563Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7c/3b/c385a276521033ce1dec729feb9b7760a7d6f7ff15641e51c51b6d27301d/fast_agent_mcp-0.4.53-py3-none-any.whl", hash = "sha256:9dac6fe59e552b3ba56d19e225bbc59a9e3ec20ac7b8cfe1760c62cb54384a23", size = 1130674, upload-time = "2026-02-15T23:09:26.314Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/ea/a99a6859316172cbbb419ddc7923a450c7f36dd1f8a121d0bbc8e47932c9/fast_agent_mcp-0.6.7-py3-none-any.whl", hash = "sha256:c59ac2f24c677fa3966da0662b9387f64aaf491d76d617fb697563783ed239b3", size = 1461755, upload-time = "2026-03-22T20:50:46.885Z" },
 ]
 
 [[package]]
@@ -396,6 +485,38 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5c/05/5cbb59154b093548acd0f4c7c474a118eda06da25aa75c616b72d8fcd92a/fastapi-0.128.0-py3-none-any.whl", hash = "sha256:aebd93f9716ee3b4f4fcfe13ffb7cf308d99c9f3ab5622d8877441072561582d", size = 103094, upload-time = "2025-12-27T15:21:12.154Z" },
 ]
 
+[[package]]
+name = "fastmcp"
+version = "3.1.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "authlib" },
+    { name = "cyclopts" },
+    { name = "exceptiongroup" },
+    { name = "httpx" },
+    { name = "jsonref" },
+    { name = "jsonschema-path" },
+    { name = "mcp" },
+    { name = "openapi-pydantic" },
+    { name = "opentelemetry-api" },
+    { name = "packaging" },
+    { name = "platformdirs" },
+    { name = "py-key-value-aio", extra = ["filetree", "keyring", "memory"] },
+    { name = "pydantic", extra = ["email"] },
+    { name = "pyperclip" },
+    { name = "python-dotenv" },
+    { name = "pyyaml" },
+    { name = "rich" },
+    { name = "uncalled-for" },
+    { name = "uvicorn" },
+    { name = "watchfiles" },
+    { name = "websockets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/25/83/c95d3bf717698a693eccb43e137a32939d2549876e884e246028bff6ecce/fastmcp-3.1.1.tar.gz", hash = "sha256:db184b5391a31199323766a3abf3a8bfbb8010479f77eca84c0e554f18655c48", size = 17347644, upload-time = "2026-03-14T19:12:20.235Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/70/ea/570122de7e24f72138d006f799768e14cc1ccf7fcb22b7750b2bd276c711/fastmcp-3.1.1-py3-none-any.whl", hash = "sha256:8132ba069d89f14566b3266919d6d72e2ec23dd45d8944622dca407e9beda7eb", size = 633754, upload-time = "2026-03-14T19:12:22.736Z" },
+]
+
 [[package]]
 name = "frozenlist"
 version = "1.8.0"
@@ -473,7 +594,7 @@ requests = [
 
 [[package]]
 name = "google-genai"
-version = "1.60.0"
+version = "1.68.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -487,9 +608,9 @@ dependencies = [
     { name = "typing-extensions" },
     { name = "websockets" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/0a/3f/a753be0dcee352b7d63bc6d1ba14a72591d63b6391dac0cdff7ac168c530/google_genai-1.60.0.tar.gz", hash = "sha256:9768061775fddfaecfefb0d6d7a6cabefb3952ebd246cd5f65247151c07d33d1", size = 487721, upload-time = "2026-01-21T22:17:30.398Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/9c/2c/f059982dbcb658cc535c81bbcbe7e2c040d675f4b563b03cdb01018a4bc3/google_genai-1.68.0.tar.gz", hash = "sha256:ac30c0b8bc630f9372993a97e4a11dae0e36f2e10d7c55eacdca95a9fa14ca96", size = 511285, upload-time = "2026-03-18T01:03:18.243Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/31/e5/384b1f383917b5f0ae92e28f47bc27b16e3d26cd9bacb25e9f8ecab3c8fe/google_genai-1.60.0-py3-none-any.whl", hash = "sha256:967338378ffecebec19a8ed90cf8797b26818bacbefd7846a9280beb1099f7f3", size = 719431, upload-time = "2026-01-21T22:17:28.086Z" },
+    { url = "https://files.pythonhosted.org/packages/84/de/7d3ee9c94b74c3578ea4f88d45e8de9405902f857932334d81e89bce3dfa/google_genai-1.68.0-py3-none-any.whl", hash = "sha256:a1bc9919c0e2ea2907d1e319b65471d3d6d58c54822039a249fe1323e4178d15", size = 750912, upload-time = "2026-03-18T01:03:15.983Z" },
 ]
 
 [[package]]
@@ -689,6 +810,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d9/32/63cb1d9f1c5c6632a783c0052cde9ef7ba82688f7065e2f0d5f10a7e3edb/jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3", size = 185628, upload-time = "2025-11-09T20:48:09.572Z" },
 ]
 
+[[package]]
+name = "jsonref"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" },
+]
+
 [[package]]
 name = "jsonschema"
 version = "4.26.0"
@@ -704,6 +834,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" },
 ]
 
+[[package]]
+name = "jsonschema-path"
+version = "0.4.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pathable" },
+    { name = "pyyaml" },
+    { name = "referencing" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/8a/7e6102f2b8bdc6705a9eb5294f8f6f9ccd3a8420e8e8e19671d1dd773251/jsonschema_path-0.4.5.tar.gz", hash = "sha256:c6cd7d577ae290c7defd4f4029e86fdb248ca1bd41a07557795b3c95e5144918", size = 15113, upload-time = "2026-03-03T09:56:46.87Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/d5/4e96c44f6c1ea3d812cf5391d81a4f5abaa540abf8d04ecd7f66e0ed11df/jsonschema_path-0.4.5-py3-none-any.whl", hash = "sha256:7d77a2c3f3ec569a40efe5c5f942c44c1af2a6f96fe0866794c9ef5b8f87fd65", size = 19368, upload-time = "2026-03-03T09:56:45.39Z" },
+]
+
 [[package]]
 name = "jsonschema-specifications"
 version = "2025.9.1"
@@ -863,7 +1007,7 @@ wheels = [
 
 [[package]]
 name = "openai"
-version = "2.21.0"
+version = "2.29.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -875,9 +1019,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/92/e5/3d197a0947a166649f566706d7a4c8f7fe38f1fa7b24c9bcffe4c7591d44/openai-2.21.0.tar.gz", hash = "sha256:81b48ce4b8bbb2cc3af02047ceb19561f7b1dc0d4e52d1de7f02abfd15aa59b7", size = 644374, upload-time = "2026-02-14T00:12:01.577Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b4/15/203d537e58986b5673e7f232453a2a2f110f22757b15921cbdeea392e520/openai-2.29.0.tar.gz", hash = "sha256:32d09eb2f661b38d3edd7d7e1a2943d1633f572596febe64c0cd370c86d52bec", size = 671128, upload-time = "2026-03-17T17:53:49.599Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/56/0a89092a453bb2c676d66abee44f863e742b2110d4dbb1dbcca3f7e5fc33/openai-2.21.0-py3-none-any.whl", hash = "sha256:0bc1c775e5b1536c294eded39ee08f8407656537ccc71b1004104fe1602e267c", size = 1103065, upload-time = "2026-02-14T00:11:59.603Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/b1/35b6f9c8cf9318e3dbb7146cc82dab4cf61182a8d5406fc9b50864362895/openai-2.29.0-py3-none-any.whl", hash = "sha256:b7c5de513c3286d17c5e29b92c4c98ceaf0d775244ac8159aeb1bddf840eb42a", size = 1141533, upload-time = "2026-03-17T17:53:47.348Z" },
 ]
 
 [package.optional-dependencies]
@@ -886,6 +1030,18 @@ aiohttp = [
     { name = "httpx-aiohttp" },
 ]
 
+[[package]]
+name = "openapi-pydantic"
+version = "0.5.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/02/2e/58d83848dd1a79cb92ed8e63f6ba901ca282c5f09d04af9423ec26c56fd7/openapi_pydantic-0.5.1.tar.gz", hash = "sha256:ff6835af6bde7a459fb93eb93bb92b8749b754fc6e51b2f1590a19dc3005ee0d", size = 60892, upload-time = "2025-01-08T19:29:27.083Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/12/cf/03675d8bd8ecbf4445504d8071adab19f5f993676795708e36402ab38263/openapi_pydantic-0.5.1-py3-none-any.whl", hash = "sha256:a3a09ef4586f5bd760a8df7f43028b60cafb6d9f61de2acba9574766255ab146", size = 96381, upload-time = "2025-01-08T19:29:25.275Z" },
+]
+
 [[package]]
 name = "opentelemetry-api"
 version = "1.39.1"
@@ -1098,6 +1254,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" },
 ]
 
+[[package]]
+name = "pathable"
+version = "0.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/55/b748445cb4ea6b125626f15379be7c96d1035d4fa3e8fee362fa92298abf/pathable-0.5.0.tar.gz", hash = "sha256:d81938348a1cacb525e7c75166270644782c0fb9c8cecc16be033e71427e0ef1", size = 16655, upload-time = "2026-02-20T08:47:00.748Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/96/5a770e5c461462575474468e5af931cff9de036e7c2b4fea23c1c58d2cbe/pathable-0.5.0-py3-none-any.whl", hash = "sha256:646e3d09491a6351a0c82632a09c02cdf70a252e73196b36d8a15ba0a114f0a6", size = 16867, upload-time = "2026-02-20T08:46:59.536Z" },
+]
+
+[[package]]
+name = "platformdirs"
+version = "4.9.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/19/56/8d4c30c8a1d07013911a8fdbd8f89440ef9f08d07a1b50ab8ca8be5a20f9/platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934", size = 28737, upload-time = "2026-03-05T18:34:13.271Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" },
+]
+
 [[package]]
 name = "pluggy"
 version = "1.6.0"
@@ -1207,6 +1381,31 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3e/73/2ce007f4198c80fcf2cb24c169884f833fe93fbc03d55d302627b094ee91/psutil-7.2.1-cp37-abi3-win_arm64.whl", hash = "sha256:0d67c1822c355aa6f7314d92018fb4268a76668a536f133599b91edd48759442", size = 133836, upload-time = "2025-12-29T08:26:43.086Z" },
 ]
 
+[[package]]
+name = "py-key-value-aio"
+version = "0.4.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "beartype" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/04/3c/0397c072a38d4bc580994b42e0c90c5f44f679303489e4376289534735e5/py_key_value_aio-0.4.4.tar.gz", hash = "sha256:e3012e6243ed7cc09bb05457bd4d03b1ba5c2b1ca8700096b3927db79ffbbe55", size = 92300, upload-time = "2026-02-16T21:21:43.245Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/69/f1b537ee70b7def42d63124a539ed3026a11a3ffc3086947a1ca6e861868/py_key_value_aio-0.4.4-py3-none-any.whl", hash = "sha256:18e17564ecae61b987f909fc2cd41ee2012c84b4b1dcb8c055cf8b4bc1bf3f5d", size = 152291, upload-time = "2026-02-16T21:21:44.241Z" },
+]
+
+[package.optional-dependencies]
+filetree = [
+    { name = "aiofile" },
+    { name = "anyio" },
+]
+keyring = [
+    { name = "keyring" },
+]
+memory = [
+    { name = "cachetools" },
+]
+
 [[package]]
 name = "pyasn1"
 version = "0.6.2"
@@ -1252,6 +1451,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" },
 ]
 
+[package.optional-dependencies]
+email = [
+    { name = "email-validator" },
+]
+
 [[package]]
 name = "pydantic-core"
 version = "2.41.5"
@@ -1501,15 +1705,28 @@ wheels = [
 
 [[package]]
 name = "rich"
-version = "14.3.1"
+version = "14.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "markdown-it-py" },
     { name = "pygments" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a1/84/4831f881aa6ff3c976f6d6809b58cdfa350593ffc0dc3c58f5f6586780fb/rich-14.3.1.tar.gz", hash = "sha256:b8c5f568a3a749f9290ec6bddedf835cec33696bfc1e48bcfecb276c7386e4b8", size = 230125, upload-time = "2026-01-24T21:40:44.847Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/87/2a/a1810c8627b9ec8c57ec5ec325d306701ae7be50235e8fd81266e002a3cc/rich-14.3.1-py3-none-any.whl", hash = "sha256:da750b1aebbff0b372557426fb3f35ba56de8ef954b3190315eb64076d6fb54e", size = 309952, upload-time = "2026-01-24T21:40:42.969Z" },
+    { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" },
+]
+
+[[package]]
+name = "rich-rst"
+version = "1.3.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "docutils" },
+    { name = "rich" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bc/6d/a506aaa4a9eaa945ed8ab2b7347859f53593864289853c5d6d62b77246e0/rich_rst-1.3.2.tar.gz", hash = "sha256:a1196fdddf1e364b02ec68a05e8ff8f6914fee10fbca2e6b6735f166bb0da8d4", size = 14936, upload-time = "2025-10-14T16:49:45.332Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/13/2f/b4530fbf948867702d0a3f27de4a6aab1d156f406d72852ab902c4d04de9/rich_rst-1.3.2-py3-none-any.whl", hash = "sha256:a99b4907cbe118cf9d18b0b44de272efa61f15117c61e39ebdc431baf5df722a", size = 12567, upload-time = "2025-10-14T16:49:42.953Z" },
 ]
 
 [[package]]
@@ -1699,19 +1916,43 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
 ]
 
+[[package]]
+name = "ty"
+version = "0.0.24"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7a/96/652a425030f95dc2c9548d9019e52502e17079e1daeefbc4036f1c0905b4/ty-0.0.24.tar.gz", hash = "sha256:9fe42f6b98207bdaef51f71487d6d087f2cb02555ee3939884d779b2b3cc8bfc", size = 5354286, upload-time = "2026-03-19T16:55:57.035Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/da/e5/34457ee11708e734ba81ad65723af83030e484f961e281d57d1eecf08951/ty-0.0.24-py3-none-linux_armv6l.whl", hash = "sha256:1ab4f1f61334d533a3fdf5d9772b51b1300ac5da4f3cdb0be9657a3ccb2ce3e7", size = 10394877, upload-time = "2026-03-19T16:55:54.246Z" },
+    { url = "https://files.pythonhosted.org/packages/44/81/bc9a1b1a87f43db15ab64ad781a4f999734ec3b470ad042624fa875b20e6/ty-0.0.24-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:facbf2c4aaa6985229e08f8f9bf152215eb078212f22b5c2411f35386688ab42", size = 10211109, upload-time = "2026-03-19T16:55:28.554Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/63/cfc805adeaa61d63ba3ea71127efa7d97c40ba36d97ee7bd957341d05107/ty-0.0.24-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b6d2a3b6d4470c483552a31e9b368c86f154dcc964bccb5406159dc9cd362246", size = 9694769, upload-time = "2026-03-19T16:55:34.309Z" },
+    { url = "https://files.pythonhosted.org/packages/33/09/edc220726b6ec44a58900401f6b27140997ef15026b791e26b69a6e69eb5/ty-0.0.24-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c94c25d0500939fd5f8f16ce41cbed5b20528702c1d649bf80300253813f0a2", size = 10176287, upload-time = "2026-03-19T16:55:37.17Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/bf/cbe2227be711e65017655d8ee4d050f4c92b113fb4dc4c3bd6a19d3a86d8/ty-0.0.24-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:89cbe7bc7df0fab02dbd8cda79b737df83f1ef7fb573b08c0ee043dc68cffb08", size = 10214832, upload-time = "2026-03-19T16:56:08.518Z" },
+    { url = "https://files.pythonhosted.org/packages/af/1d/d15803ee47e9143d10e10bd81ccc14761d08758082bda402950685f0ddfe/ty-0.0.24-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2c5d269bcc9b764850c99f457b5018a79b3ef40ecfbc03344e65effd6cf743", size = 10709892, upload-time = "2026-03-19T16:56:05.727Z" },
+    { url = "https://files.pythonhosted.org/packages/36/12/6db0d86c477147f67b9052de209421d76c3e855197b000c25fcbbe86b3a2/ty-0.0.24-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba44512db5b97c3bbd59d93e11296e8548d0c9a3bdd1280de36d7ff22d351896", size = 11280872, upload-time = "2026-03-19T16:56:02.899Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/fc/155fe83a97c06d33ccc9e0f428258b32df2e08a428300c715d34757f0111/ty-0.0.24-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a52b7f589c3205512a9c50ba5b2b1e8c0698b72e51b8b9285c90420c06f1cae8", size = 11060520, upload-time = "2026-03-19T16:55:59.956Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/f1/32c05a1c4c3c2a95c5b7361dee03a9bf1231d4ad096b161c838b45bce5a0/ty-0.0.24-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7981df5c709c054da4ac5d7c93f8feb8f45e69e829e4461df4d5f0988fe67d04", size = 10791455, upload-time = "2026-03-19T16:55:25.728Z" },
+    { url = "https://files.pythonhosted.org/packages/17/2c/53c1ea6bedfa4d4ab64d4de262d8f5e405ecbffefd364459c628c0310d33/ty-0.0.24-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b2860151ad95a00d0f0280b8fef79900d08dcd63276b57e6e5774f2c055979c5", size = 10156708, upload-time = "2026-03-19T16:55:45.563Z" },
+    { url = "https://files.pythonhosted.org/packages/45/39/7d2919cf194707169474d80720a5f3d793e983416f25e7ffcf80504c9df2/ty-0.0.24-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:5674a1146d927ab77ff198a88e0c4505134ced342a0e7d1beb4a076a728b7496", size = 10236263, upload-time = "2026-03-19T16:55:31.474Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/7f/48eac722f2fd12a5b7aae0effdcb75c46053f94b783d989e3ef0d7380082/ty-0.0.24-py3-none-musllinux_1_2_i686.whl", hash = "sha256:438ecbf1608a9b16dd84502f3f1b23ef2ef32bbd0ab3e0ca5a82f0e0d1cd41ea", size = 10402559, upload-time = "2026-03-19T16:55:39.602Z" },
+    { url = "https://files.pythonhosted.org/packages/75/e0/8cf868b9749ce1e5166462759545964e95b02353243594062b927d8bff2a/ty-0.0.24-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ddeed3098dd92a83964e7aa7b41e509ba3530eb539fc4cd8322ff64a09daf1f5", size = 10893684, upload-time = "2026-03-19T16:55:51.439Z" },
+    { url = "https://files.pythonhosted.org/packages/17/9f/f54bf3be01d2c2ed731d10a5afa3324dc66f987a6ae0a4a6cbfa2323d080/ty-0.0.24-py3-none-win32.whl", hash = "sha256:83013fb3a4764a8f8bcc6ca11ff8bdfd8c5f719fc249241cb2b8916e80778eb1", size = 9781542, upload-time = "2026-03-19T16:56:11.588Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/49/c004c5cc258b10b3a145666e9a9c28ae7678bc958c8926e8078d5d769081/ty-0.0.24-py3-none-win_amd64.whl", hash = "sha256:748a60eb6912d1cf27aaab105ffadb6f4d2e458a3fcadfbd3cf26db0d8062eeb", size = 10764801, upload-time = "2026-03-19T16:55:42.752Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/59/006a074e185bfccf5e4c026015245ab4fcd2362b13a8d24cf37a277909a9/ty-0.0.24-py3-none-win_arm64.whl", hash = "sha256:280a3d31e86d0721947238f17030c33f0911cae851d108ea9f4e3ab12a5ed01f", size = 10194093, upload-time = "2026-03-19T16:55:48.303Z" },
+]
+
 [[package]]
 name = "typer"
-version = "0.21.1"
+version = "0.24.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "annotated-doc" },
     { name = "click" },
     { name = "rich" },
     { name = "shellingham" },
-    { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/36/bf/8825b5929afd84d0dabd606c67cd57b8388cb3ec385f7ef19c5cc2202069/typer-0.21.1.tar.gz", hash = "sha256:ea835607cd752343b6b2b7ce676893e5a0324082268b48f27aa058bdb7d2145d", size = 110371, upload-time = "2026-01-06T11:21:10.989Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload-time = "2026-02-21T16:54:40.609Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl", hash = "sha256:7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01", size = 47381, upload-time = "2026-01-06T11:21:09.824Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" },
 ]
 
 [[package]]
@@ -1735,6 +1976,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
 ]
 
+[[package]]
+name = "uncalled-for"
+version = "0.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/02/7c/b5b7d8136f872e3f13b0584e576886de0489d7213a12de6bebf29ff6ebfc/uncalled_for-0.2.0.tar.gz", hash = "sha256:b4f8fdbcec328c5a113807d653e041c5094473dd4afa7c34599ace69ccb7e69f", size = 49488, upload-time = "2026-02-27T17:40:58.137Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ff/7f/4320d9ce3be404e6310b915c3629fe27bf1e2f438a1a7a3cb0396e32e9a9/uncalled_for-0.2.0-py3-none-any.whl", hash = "sha256:2c0bd338faff5f930918f79e7eb9ff48290df2cb05fcc0b40a7f334e55d4d85f", size = 11351, upload-time = "2026-02-27T17:40:56.804Z" },
+]
+
 [[package]]
 name = "upskill"
 version = "0.2.1"
@@ -1753,19 +2003,21 @@ dev = [
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "ruff" },
+    { name = "ty" },
 ]
 
 [package.metadata]
 requires-dist = [
     { name = "click", specifier = ">=8.1" },
-    { name = "fast-agent-mcp", specifier = ">=0.4.53" },
+    { name = "fast-agent-mcp", specifier = ">=0.6.7" },
     { name = "pydantic", specifier = ">=2.0" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
     { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" },
     { name = "python-dotenv", specifier = ">=1.0" },
     { name = "pyyaml", specifier = ">=6.0" },
     { name = "rich", specifier = ">=13.0" },
-    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4" },
+    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.11" },
+    { name = "ty", marker = "extra == 'dev'", specifier = ">=0.0.23" },
 ]
 provides-extras = ["dev"]