diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81945b4..3fa9e69 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,27 @@ on: branches: [main] jobs: + format: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13.5" + + - name: Install dependencies + run: uv sync --frozen --extra dev + + - name: Check formatting + run: uv run scripts/format.py + lint: runs-on: ubuntu-latest steps: @@ -14,15 +35,61 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v5 + with: + enable-cache: true - name: Set up Python - run: uv python install 3.13.5 + uses: actions/setup-python@v5 + with: + python-version: "3.13.5" - name: Install dependencies - run: uv sync --extra dev + run: uv sync --frozen --extra dev - name: Lint with ruff - run: uv run ruff check src/ + run: uv run scripts/lint.py + + cpd: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13.5" + + - name: Install dependencies + run: uv sync --frozen --extra dev + + - name: Check for duplicated code + run: uv run scripts/cpd.py --check + + typecheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13.5" + + - name: Install dependencies + run: uv sync --frozen --extra dev + + - name: Run ty + run: uv run scripts/typecheck.py test: runs-on: ubuntu-latest @@ -31,12 +98,16 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v5 + with: + enable-cache: true - name: Set up Python - run: uv python install 3.13.5 + uses: 
actions/setup-python@v5 + with: + python-version: "3.13.5" - name: Install dependencies - run: uv sync --extra dev + run: uv sync --frozen --extra dev - name: Run tests run: uv run pytest -v diff --git a/.gitignore b/.gitignore index 6ed73f9..6e1c571 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,7 @@ Thumbs.db *.tmp *.temp *.txt +.smoke-test/ +.fast-agent/ +.fast-agent-old/ +skills/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..240d2e4 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,61 @@ +# Agent guidance + +## Code quality baseline + +- Target Python `3.13`. +- Keep changes compatible with the repo's `pyproject.toml` settings. +- Prefer small, typed, composable functions over large command-style blocks. +- Avoid introducing new complexity suppressions unless there is a strong reason. + +## Required local checks + +Before finishing a change, run: + +```bash +uv sync --extra dev +uv run scripts/format.py +uv run scripts/lint.py +uv run scripts/typecheck.py +uv run --extra dev pytest -v +``` + +CI enforces the same flow in `.github/workflows/ci.yml`. + +## Ruff rules + +Formatting and linting are enforced with Ruff. + +- Line length: `100` +- Target version: `py313` +- Enabled lint families: + - `B` - bugbear + - `C90` - cyclomatic complexity + - `E` - pycodestyle errors + - `F` - pyflakes + - `I` - import sorting + - `RUF` - Ruff-specific rules + - `SIM` - simplifications + - `TCH` - type-checking import hygiene + - `UP` - pyupgrade +- `E501` is ignored; let `ruff format` own line wrapping. +- Cyclomatic complexity limit: `15` + +## Type checking + +- `ty` is required for `src`, `tests`, and `scripts`. +- Add or improve annotations when touching code that is ambiguous to the type checker. +- Prefer explicit protocols / typed helper structures over `object` when wiring dynamic APIs. +- Keep type-only imports behind `TYPE_CHECKING` when appropriate. + +## Tests + +- Add or update tests for behavior changes. 
+- Keep the test suite passing with `pytest`. +- Use focused unit-style tests for logic changes when possible. + +## Practical authoring guidance + +- Prefer refactoring over adding broad ignores. +- If a function is nearing the complexity limit, split it before adding more branches. +- Keep CLI orchestration, model resolution, and persistence logic separated when possible. +- When adding developer tooling, update the README and CI together. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..75f563d --- /dev/null +++ b/Makefile @@ -0,0 +1,91 @@ +SHELL := /usr/bin/env bash + +# Common +ARTIFACT_REPO ?= +FLAVOR ?= cpu-basic +TIMEOUT ?= 45m +SECRETS ?= HF_TOKEN,OPENAI_API_KEY + +# Fast-agent mode (lean) +SKILLS_DIR ?= +CARD_DIR ?= +FAST_AGENT ?= +FAST_MODEL ?= haiku +MESSAGE ?= Write a concise conventional commit message for: add password reset endpoint with tests. +PROMPT_FILE ?= +PROMPTS_JSONL ?= + +.PHONY: \ + format format-write lint typecheck test check \ + hf-go-check hf-go-smoke hf-go-prompt hf-go-batch + +format: + uv run --extra dev scripts/format.py + +format-write: + uv run --extra dev scripts/format.py --write + +lint: + uv run --extra dev scripts/lint.py + +typecheck: + uv run --extra dev scripts/typecheck.py + +test: + uv run --extra dev pytest -v + +check: format lint typecheck test + +hf-go-check: + @test -n "$(ARTIFACT_REPO)" || (echo "ARTIFACT_REPO is required" && exit 1) + @test -n "$(SKILLS_DIR)" || (echo "SKILLS_DIR is required" && exit 1) + @test -d "$(SKILLS_DIR)" || (echo "SKILLS_DIR not found: $(SKILLS_DIR)" && exit 1) + @test -x scripts/hf/submit_hf_job.sh || (echo "scripts/hf/submit_hf_job.sh missing or not executable" && exit 1) + @test -x scripts/hf/job_entrypoint_fast_agent.sh || (echo "scripts/hf/job_entrypoint_fast_agent.sh missing or not executable" && exit 1) + @hf auth whoami >/dev/null || (echo "hf auth required: run 'hf auth login'" && exit 1) + +hf-go-smoke: hf-go-check + @cmd=(scripts/hf/submit_hf_job.sh \ + 
--artifact-repo "$(ARTIFACT_REPO)" \ + --skills-dir "$(SKILLS_DIR)" \ + --model "$(FAST_MODEL)" \ + --message "$(MESSAGE)" \ + --flavor "$(FLAVOR)" \ + --timeout "$(TIMEOUT)" \ + --secrets "$(SECRETS)"); \ + if [[ -n "$(CARD_DIR)" ]]; then cmd+=(--card-dir "$(CARD_DIR)"); fi; \ + if [[ -n "$(FAST_AGENT)" ]]; then cmd+=(--agent "$(FAST_AGENT)"); fi; \ + echo "Running: $${cmd[*]}"; \ + "$${cmd[@]}" + +hf-go-prompt: hf-go-check + @test -n "$(PROMPT_FILE)" || (echo "PROMPT_FILE is required" && exit 1) + @test -f "$(PROMPT_FILE)" || (echo "PROMPT_FILE not found: $(PROMPT_FILE)" && exit 1) + @cmd=(scripts/hf/submit_hf_job.sh \ + --artifact-repo "$(ARTIFACT_REPO)" \ + --skills-dir "$(SKILLS_DIR)" \ + --model "$(FAST_MODEL)" \ + --prompt-file "$(PROMPT_FILE)" \ + --flavor "$(FLAVOR)" \ + --timeout "$(TIMEOUT)" \ + --secrets "$(SECRETS)"); \ + if [[ -n "$(CARD_DIR)" ]]; then cmd+=(--card-dir "$(CARD_DIR)"); fi; \ + if [[ -n "$(FAST_AGENT)" ]]; then cmd+=(--agent "$(FAST_AGENT)"); fi; \ + echo "Running: $${cmd[*]}"; \ + "$${cmd[@]}" + +hf-go-batch: hf-go-check + @test -n "$(PROMPTS_JSONL)" || (echo "PROMPTS_JSONL is required" && exit 1) + @test -f "$(PROMPTS_JSONL)" || (echo "PROMPTS_JSONL not found: $(PROMPTS_JSONL)" && exit 1) + @cmd=(scripts/hf/submit_hf_job.sh \ + --artifact-repo "$(ARTIFACT_REPO)" \ + --skills-dir "$(SKILLS_DIR)" \ + --model "$(FAST_MODEL)" \ + --prompts-jsonl "$(PROMPTS_JSONL)" \ + --flavor "$(FLAVOR)" \ + --timeout "$(TIMEOUT)" \ + --secrets "$(SECRETS)"); \ + if [[ -n "$(CARD_DIR)" ]]; then cmd+=(--card-dir "$(CARD_DIR)"); fi; \ + if [[ -n "$(FAST_AGENT)" ]]; then cmd+=(--agent "$(FAST_AGENT)"); fi; \ + echo "Running: $${cmd[*]}"; \ + "$${cmd[@]}" diff --git a/README.md b/README.md index 24e9088..e10eb97 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,18 @@ Generate and evaluate agent skills based on traces with agents. Create skills with teacher models (expensive/slow) that student models (cheap/fast) can use to perform harder tasks reliably. 
+ +> [!TIP] +> +> UPskill v2 - recommended default config file now runs evaluations on Hugging Face Jobs. Make sure +> to set your `HF_TOKEN` and use `--artifact-repo <username>/<dataset-repo>` for job creation and result capture. + ## Quick Start Install upskill: ```bash -pip install upskill +uv pip install upskill # or just use uv uvx upskill ``` @@ -42,6 +48,53 @@ View the results later. upskill runs --skill git-commit-messages ``` +## Development checks + +This repo uses a CI flow inspired by `fast-agent` with separate format, lint, typecheck, and test +stages. + +Install dev dependencies: + +```bash +uv sync --extra dev +``` + +Run the quality gates locally: + +```bash +uv run scripts/format.py +uv run scripts/lint.py +uv run scripts/typecheck.py +uv run scripts/cpd.py --check +uv run --extra dev pytest -v +``` + +Or use the helper script to run the whole sequence: + +```bash +uv run scripts/check.py +``` + +Add `--sync` to include `uv sync --extra dev`, or `--skip-tests` for a faster static-only pass. + +To auto-format before re-running checks: + +```bash +uv run --extra dev scripts/format.py --write +``` + +Current enforced standards: + +- `ruff format --check` for formatting +- `ruff check` for style, imports, modernization, bugbear, simplify, and import-hygiene rules +- cyclomatic complexity via Ruff `C90` with `max-complexity = 15` +- `ty check` across `src`, `tests`, and `scripts` +- `pmd cpd` via `scripts/cpd.py --check` to flag duplicated code in `src/` +- `pytest` for the test suite + +CI enforcement lives in `.github/workflows/ci.yml` and runs on pushes and pull requests targeting +`main`. 
+ ## Model Handling Overview upskill uses distinct phases with explicit model roles: @@ -82,13 +135,15 @@ upskill generate TASK [OPTIONS] **Options:** - `-e, --example` - Input -> output example (can be repeated) -- `--tool` - Generate from MCP tool schema (path#tool_name) - `-f, --from PATH` - Improve from existing skill dir or agent trace file (auto-detected) - `-m, --model MODEL` - Skill generation model (e.g., 'sonnet', 'haiku', 'anthropic.claude-sonnet-4-20250514') - `--test-gen-model MODEL` - Override test generation model for this run - `-o, --output PATH` - Output directory for skill - `--no-eval` - Skip evaluation and refinement - `--eval-model MODEL` - Different model to evaluate skill on +- `--executor [local|jobs]` - Execution backend for evaluation/refinement; overrides config +- `--artifact-repo TEXT` - Dataset repo for remote fast-agent job artifacts (required with `--executor jobs`) +- `--max-parallel N` - Max concurrent evaluation executions; overrides config - `--runs-dir PATH` - Directory for run logs (default: ./runs) - `--log-runs / --no-log-runs` - Log run data (default: enabled) @@ -101,6 +156,9 @@ upskill generate "parse JSON Schema files" # Make and evaluate skills for less powerful models upskill generate "write git commits" --model sonnet --eval-model haiku +# Remote execution on Hugging Face Jobs +upskill generate "parse invoices" --executor jobs --artifact-repo <username>/upskill-tests + # Improve an existing skill (auto-detected as directory) upskill generate "add more error handling examples" --from ./skills/api-errors/ @@ -147,9 +205,11 @@ upskill eval SKILL_PATH [OPTIONS] - `-t, --tests PATH` - Test cases JSON file - `-m, --model MODEL` - Model(s) to evaluate against (repeatable for multi-model benchmarking) - `--test-gen-model MODEL` - Override test generation model when tests must be generated -- `--runs N` - Number of runs per model (default: 1) +- `--runs N` - Number of runs per model; overrides config - `--no-baseline` - Skip baseline 
comparison (simple eval mode only; ignored in benchmark mode) - `-v, --verbose` - Show per-test results +- `--executor [local|jobs]` - Execution backend for evaluation; overrides config +- `--max-parallel N` - Max concurrent evaluation executions; overrides config - `--log-runs / --no-log-runs` - Log run data (default: enabled) - `--runs-dir PATH` - Directory for run logs @@ -403,7 +463,13 @@ eval_model: haiku # Default evaluation model (optional) test_gen_model: null # Optional test generation model skills_dir: ./skills # Where to save skills runs_dir: ./runs # Where to save run logs -max_refine_attempts: 3 # Refinement iterations +max_refine_attempts: 2 # Refinement iterations +executor: local # Default execution backend +num_runs: 1 # Default eval/benchmark runs when --runs is omitted +max_parallel: 5 # Default concurrent evaluation executions +jobs_secrets: HF_TOKEN # Comma-separated HF Jobs env var names to forward +jobs_image: ghcr.io/astral-sh/uv:python3.13-bookworm # HF Jobs container image +# fastagent_config: ./fastagent.config.yaml # Optional FastAgent config override ``` `test_gen_model` fallback behavior: @@ -417,6 +483,22 @@ max_refine_attempts: 3 # Refinement iterations Backward compatibility: `model` is still accepted in config files as a legacy alias for `skill_generation_model`. +CLI flags override config values for execution settings: + +- `--executor` overrides `executor` +- `--runs` overrides `num_runs` +- `--max-parallel` overrides `max_parallel` +- `--jobs-secrets` overrides `jobs_secrets` + +If you set `executor: jobs`, you still need the required jobs-specific CLI inputs such as +`--artifact-repo`. + +`jobs_secrets` is a comma-separated list of environment variable names to forward into +remote HF Jobs runs. It should contain secret names such as `HF_TOKEN` or +`ANTHROPIC_API_KEY`, not literal secret values. + +`jobs_image` controls which container image HF Jobs uses for remote execution. + Config lookup order: 1. 
`UPSKILL_CONFIG` environment variable (path) diff --git a/fastagent.config.yaml b/fastagent.config.yaml index 67747f3..d7b381d 100644 --- a/fastagent.config.yaml +++ b/fastagent.config.yaml @@ -6,22 +6,7 @@ # Examples: anthropic.claude-sonnet-4-20250514, openai.gpt-4.1 # Aliases: haiku, sonnet, opus (Anthropic), gpt-4.1, o3-mini (OpenAI) # Local models: generic. (e.g., generic.llama3.2:latest) -default_model: kimi - -# Generic provider for local OpenAI-compatible endpoints (Ollama, llama.cpp, etc.) -# Override with GENERIC_BASE_URL and GENERIC_API_KEY environment variables -generic: - api_key: "local" - base_url: "http://localhost:11434/v1" - -# MCP timeline display settings -mcp_timeline: - steps: 20 - step_seconds: 15 - -# Shell execution settings -shell_execution: - show_bash: true +#default_model: kimi # Logging and Console Configuration logger: @@ -29,4 +14,5 @@ logger: show_chat: false show_tools: false truncate_tools: true - streaming: none + streaming: markdown + diff --git a/pyproject.toml b/pyproject.toml index 160bcec..fb2f0c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.13.5,<3.14" dependencies = [ "click>=8.1", - "fast-agent-mcp>=0.4.53", + "fast-agent-mcp>=0.6.7", "pydantic>=2.0", "python-dotenv>=1.0", "pyyaml>=6.0", @@ -21,7 +21,8 @@ Repository = "https://github.com/huggingface/upskill" dev = [ "pytest>=8.0", "pytest-asyncio>=0.23", - "ruff>=0.4", + "ruff>=0.11", + "ty>=0.0.23", ] [project.scripts] @@ -39,4 +40,21 @@ line-length = 100 target-version = "py313" [tool.ruff.lint] -select = ["E", "F", "I", "UP"] +select = [ + "B", + "C90", + "E", + "F", + "I", + "RUF", + "SIM", + "TCH", + "UP", +] +ignore = ["E501"] + +[tool.ruff.lint.mccabe] +max-complexity = 15 + +[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..902e2a6 --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +"""Developer scripts for 
the upskill repository.""" diff --git a/scripts/check.py b/scripts/check.py new file mode 100644 index 0000000..a59ba13 --- /dev/null +++ b/scripts/check.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import argparse +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Final + +PROJECT_ROOT: Final = Path(__file__).resolve().parent.parent + + +@dataclass(frozen=True) +class CheckStep: + """A named local quality-gate command.""" + + name: str + command: tuple[str, ...] + + +def build_check_steps(*, skip_tests: bool = False) -> list[CheckStep]: + """Build the local quality-gate command sequence.""" + python_executable = sys.executable + steps = [ + CheckStep("format", (python_executable, str(PROJECT_ROOT / "scripts" / "format.py"))), + CheckStep("lint", (python_executable, str(PROJECT_ROOT / "scripts" / "lint.py"))), + CheckStep("typecheck", (python_executable, str(PROJECT_ROOT / "scripts" / "typecheck.py"))), + CheckStep( + "cpd", + (python_executable, str(PROJECT_ROOT / "scripts" / "cpd.py"), "--check"), + ), + ] + if not skip_tests: + steps.append(CheckStep("pytest", (python_executable, "-m", "pytest", "-v"))) + return steps + + +def run_step(step: CheckStep) -> int: + """Run a single quality-gate step.""" + print(f"\n==> {step.name}: {' '.join(step.command)}", flush=True) + try: + completed = subprocess.run(step.command, cwd=PROJECT_ROOT, check=False) + except FileNotFoundError as error: + print(f"Error: failed to execute {step.name}: {error}", file=sys.stderr) + return 1 + return completed.returncode + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run the local quality-gate sequence.") + parser.add_argument( + "--skip-tests", + action="store_true", + help="Skip pytest after running the static checks.", + ) + parser.add_argument( + "--sync", + action="store_true", + help="Run `uv sync --extra dev` before the quality gates.", + ) + args = parser.parse_args() + + if 
args.sync: + sync_step = CheckStep("sync", ("uv", "sync", "--extra", "dev")) + if run_step(sync_step) != 0: + return 1 + + for step in build_check_steps(skip_tests=args.skip_tests): + if run_step(step) != 0: + return 1 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/cpd.py b/scripts/cpd.py new file mode 100644 index 0000000..286a828 --- /dev/null +++ b/scripts/cpd.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +"""Copy/Paste Detector (CPD) runner for upskill. + +Uses PMD's CPD tool to detect duplicated code in the Python source tree. +If Java or PMD are not already available, the script downloads them into +``~/tools`` and reuses them on later runs. + +Usage: + uv run scripts/cpd.py [--min-tokens N] [--format FORMAT] [--report FILE] + +Options: + --min-tokens N Minimum token count for duplication (default: 100) + --format FORMAT Output format: text, csv, xml (default: text) + --report FILE Write report to file (default: stdout) + --check Exit with error code if duplications are found +""" + +from __future__ import annotations + +import argparse +import os +import platform +import shutil +import subprocess +import sys +import tarfile +import urllib.request +import zipfile +from dataclasses import dataclass +from pathlib import Path +from typing import Final + +JRE_VERSION: Final = "17.0.9+9" +PMD_VERSION: Final = "7.9.0" + +TOOLS_DIR: Final = Path.home() / "tools" +JRE_DIR: Final = TOOLS_DIR / f"jdk-{JRE_VERSION}-jre" +PMD_DIR: Final = TOOLS_DIR / f"pmd-bin-{PMD_VERSION}" + +JRE_URL_TEMPLATE: Final = ( + "https://github.com/adoptium/temurin17-binaries/releases/download/" + "jdk-{version}/OpenJDK17U-jre_{arch}_{os}_hotspot_{archive_version}.tar.gz" +) +PMD_URL: Final = ( + "https://github.com/pmd/pmd/releases/download/pmd_releases%2F" + f"{PMD_VERSION}/pmd-dist-{PMD_VERSION}-bin.zip" +) + +CPD_EXCLUSIONS: Final[dict[str, str]] = {} + + +@dataclass(frozen=True) +class PlatformConfig: + """Resolved platform labels for tool 
downloads.""" + + system: str + arch: str + os_label: str + arch_label: str + + @property + def archive_version(self) -> str: + return JRE_VERSION.replace("+", "_") + + @property + def version_label(self) -> str: + return JRE_VERSION.replace("+", "%2B") + + @property + def java_name(self) -> str: + return "java.exe" if self.system == "windows" else "java" + + @property + def pmd_name(self) -> str: + return "pmd.bat" if self.system == "windows" else "pmd" + + @property + def jre_filename(self) -> str: + return f"OpenJDK17U-jre_{self.arch_label}_{self.os_label}_hotspot_{self.archive_version}" + + @property + def jre_url(self) -> str: + return JRE_URL_TEMPLATE.format( + version=self.version_label, + arch=self.arch_label, + os=self.os_label, + archive_version=self.archive_version, + ) + + +def resolve_platform(*, system: str | None = None, arch: str | None = None) -> PlatformConfig: + """Resolve download labels for the current platform.""" + normalized_system = (system or platform.system()).lower() + normalized_arch = (arch or platform.machine()).lower() + + arch_label = { + "x86_64": "x64", + "amd64": "x64", + "aarch64": "aarch64", + "arm64": "aarch64", + }.get(normalized_arch, normalized_arch) + + os_label = { + "darwin": "mac", + "linux": "linux", + "windows": "windows", + }.get(normalized_system, normalized_system) + + return PlatformConfig( + system=normalized_system, + arch=normalized_arch, + os_label=os_label, + arch_label=arch_label, + ) + + +def download_file(url: str, destination: Path, description: str) -> None: + """Download a file with simple progress reporting.""" + print(f"Downloading {description}...") + try: + urllib.request.urlretrieve(url, destination) + except Exception as error: # pragma: no cover - network failures are environment-specific + print(f" Failed to download: {error}", file=sys.stderr) + raise SystemExit(1) from error + print(f" Downloaded to {destination}") + + +def extract_tar_archive(archive_path: Path, destination: Path) -> None: + 
"""Extract a tar.gz archive while guarding against path traversal.""" + destination_root = destination.resolve() + with tarfile.open(archive_path, "r:gz") as archive: + for member in archive.getmembers(): + member_path = (destination_root / member.name).resolve() + try: + member_path.relative_to(destination_root) + except ValueError as error: + message = f"Unsafe path in archive {archive_path}: {member.name}" + raise RuntimeError(message) from error + archive.extractall(destination_root) + + +def ensure_jre(platform_config: PlatformConfig) -> Path: + """Ensure Java is available, downloading a JRE if required.""" + java_bin = JRE_DIR / "bin" / platform_config.java_name + if java_bin.exists(): + return JRE_DIR + + system_java = shutil.which("java") + if system_java: + try: + result = subprocess.run( + [system_java, "-version"], + capture_output=True, + check=False, + text=True, + ) + except OSError: + pass + else: + version_output = result.stderr + result.stdout + if "17" in version_output or "21" in version_output: + print(f"Using system Java: {system_java}") + return Path(system_java).resolve().parent.parent + + TOOLS_DIR.mkdir(parents=True, exist_ok=True) + archive_path = TOOLS_DIR / f"{platform_config.jre_filename}.tar.gz" + if not archive_path.exists(): + download_file(platform_config.jre_url, archive_path, f"Java JRE {JRE_VERSION}") + + print("Extracting Java JRE...") + extract_tar_archive(archive_path, TOOLS_DIR) + + if JRE_DIR.exists(): + return JRE_DIR + + for candidate in TOOLS_DIR.iterdir(): + if candidate.is_dir() and candidate.name.startswith("jdk-17"): + return candidate + + message = f"Unable to locate extracted Java runtime under {TOOLS_DIR}" + raise RuntimeError(message) + + +def ensure_pmd(platform_config: PlatformConfig) -> Path: + """Ensure PMD is available, downloading it if required.""" + pmd_bin = PMD_DIR / "bin" / platform_config.pmd_name + if pmd_bin.exists(): + return PMD_DIR + + TOOLS_DIR.mkdir(parents=True, exist_ok=True) + archive_path = 
TOOLS_DIR / f"pmd-{PMD_VERSION}.zip" + if not archive_path.exists(): + download_file(PMD_URL, archive_path, f"PMD {PMD_VERSION}") + + print("Extracting PMD...") + with zipfile.ZipFile(archive_path, "r") as archive: + archive.extractall(TOOLS_DIR) + + if platform_config.system != "windows": + pmd_bin.chmod(0o755) + + return PMD_DIR + + +def build_cpd_command( + *, + platform_config: PlatformConfig, + pmd_dir: Path, + src_dir: Path, + excluded_paths: list[Path], + min_tokens: int, + output_format: str, +) -> list[str]: + """Build the PMD CPD command line.""" + pmd_bin = pmd_dir / "bin" / platform_config.pmd_name + command = [ + str(pmd_bin), + "cpd", + "--language", + "python", + "--minimum-tokens", + str(min_tokens), + "--dir", + str(src_dir), + "--format", + output_format, + ] + for excluded_path in excluded_paths: + command.extend(["--exclude", str(excluded_path)]) + return command + + +def run_cpd( + *, + platform_config: PlatformConfig, + java_home: Path, + pmd_dir: Path, + src_dir: Path, + excluded_paths: list[Path], + min_tokens: int = 100, + output_format: str = "text", +) -> tuple[int, str]: + """Run CPD and return its exit code and combined output.""" + env = os.environ.copy() + env["JAVA_HOME"] = str(java_home) + env["PATH"] = f"{java_home / 'bin'}{os.pathsep}{env.get('PATH', '')}" + + command = build_cpd_command( + platform_config=platform_config, + pmd_dir=pmd_dir, + src_dir=src_dir, + excluded_paths=excluded_paths, + min_tokens=min_tokens, + output_format=output_format, + ) + result = subprocess.run(command, capture_output=True, check=False, text=True, env=env) + return result.returncode, result.stdout + result.stderr + + +def resolve_cli_exit_code(*, cpd_exit_code: int, check: bool) -> int: + """Map PMD CPD exit codes to the script's CLI exit codes.""" + if cpd_exit_code == 4: + return 1 if check else 0 + return cpd_exit_code + + +def main() -> int: + parser = argparse.ArgumentParser(description="Detect duplicated code in upskill source") + 
parser.add_argument( + "--min-tokens", + type=int, + default=100, + help="Minimum token count for duplication (default: 100)", + ) + parser.add_argument( + "--format", + choices=["text", "csv", "xml"], + default="text", + help="Output format (default: text)", + ) + parser.add_argument( + "--report", + type=Path, + help="Write report to file (default: stdout)", + ) + parser.add_argument( + "--check", + action="store_true", + help="Exit with error code if duplications are found", + ) + args = parser.parse_args() + + script_dir = Path(__file__).resolve().parent + project_root = script_dir.parent + src_dir = project_root / "src" + if not src_dir.exists(): + print(f"Source directory not found: {src_dir}", file=sys.stderr) + return 1 + + excluded_paths = [project_root / relative_path for relative_path in CPD_EXCLUSIONS] + platform_config = resolve_platform() + + print("Checking dependencies...") + java_home = ensure_jre(platform_config) + pmd_dir = ensure_pmd(platform_config) + print() + + print(f"Running CPD on {src_dir} (min-tokens={args.min_tokens})...") + if excluded_paths: + print("Excluding intentional duplicates:") + for relative_path, reason in CPD_EXCLUSIONS.items(): + print(f" - {relative_path}: {reason}") + print() + + cpd_exit_code, output = run_cpd( + platform_config=platform_config, + java_home=java_home, + pmd_dir=pmd_dir, + src_dir=src_dir, + excluded_paths=excluded_paths, + min_tokens=args.min_tokens, + output_format=args.format, + ) + + if args.report: + args.report.write_text(output, encoding="utf-8") + print(f"Report written to {args.report}") + else: + print(output) + + if cpd_exit_code == 4: + print("\n⚠️ Duplicated code detected!") + elif cpd_exit_code == 0: + print("\n✅ No duplicated code found.") + else: + print(f"\n❌ CPD failed with exit code {cpd_exit_code}", file=sys.stderr) + + return resolve_cli_exit_code(cpd_exit_code=cpd_exit_code, check=args.check) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/format.py 
b/scripts/format.py new file mode 100644 index 0000000..39fa78f --- /dev/null +++ b/scripts/format.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import argparse +import subprocess +import sys + +DEFAULT_PATHS = ["src", "tests", "scripts"] + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run ruff format for the repo.") + parser.add_argument( + "--write", + action="store_true", + help="Apply formatting changes instead of checking only.", + ) + parser.add_argument("paths", nargs="*", default=DEFAULT_PATHS, help="Optional paths to format.") + args = parser.parse_args() + + command = ["ruff", "format", *args.paths] + if not args.write: + command.insert(2, "--check") + + try: + completed = subprocess.run(command, check=False) + except FileNotFoundError: + print("Error: `ruff` is not installed in the current environment.", file=sys.stderr) + return 1 + + return completed.returncode + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/hf/job_entrypoint_eval_fast_agent.sh b/scripts/hf/job_entrypoint_eval_fast_agent.sh new file mode 100644 index 0000000..98e757f --- /dev/null +++ b/scripts/hf/job_entrypoint_eval_fast_agent.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +set -euo pipefail + +BUNDLE_DIR="${1:?bundle_dir required}" +OUT_DIR="${2:?out_dir required}" + +mkdir -p "$OUT_DIR/results" "$OUT_DIR/logs" "$OUT_DIR/status" "$OUT_DIR/workspaces" +cp -f "$BUNDLE_DIR/manifest.json" "$OUT_DIR/manifest.json" || true + +COMMON=(fast-agent go --skills-dir "$BUNDLE_DIR/skills" --quiet) +if [[ -f "$BUNDLE_DIR/fastagent.config.yaml" ]]; then + COMMON+=(--config-path "$BUNDLE_DIR/fastagent.config.yaml") +fi +if [[ -d "$BUNDLE_DIR/cards" ]]; then + COMMON+=(--card "$BUNDLE_DIR/cards") +fi +if [[ -f "$BUNDLE_DIR/agent.txt" ]]; then + COMMON+=(--agent "$(cat "$BUNDLE_DIR/agent.txt")") +fi + +FAST_MODEL="${FAST_MODEL:?FAST_MODEL is required}" +overall_status=0 + +for request_dir in "$BUNDLE_DIR"/requests/*; do + [[ -d 
"$request_dir" ]] || continue + request_id="$(basename "$request_dir")" + prompt_path="$request_dir/prompt.txt" + workspace_src="$request_dir/workspace" + + workspace_tmp="$(mktemp -d)" + if [[ -d "$workspace_src" ]]; then + cp -a "$workspace_src/." "$workspace_tmp/" 2>/dev/null || true + fi + if [[ -f "$BUNDLE_DIR/fastagent.config.yaml" ]]; then + cp -f "$BUNDLE_DIR/fastagent.config.yaml" "$workspace_tmp/fastagent.config.yaml" + fi + + cmd=("${COMMON[@]}" --model "$FAST_MODEL" --prompt-file "$prompt_path" --results "$OUT_DIR/results/$request_id.json") + + printf '%s\n' "${cmd[*]}" > "$OUT_DIR/logs/$request_id.command.txt" + + set +e + ( + cd "$workspace_tmp" + "${cmd[@]}" >"$OUT_DIR/logs/$request_id.out.txt" 2>"$OUT_DIR/logs/$request_id.err.txt" + ) + status=$? + set -e + + printf '%s\n' "$status" > "$OUT_DIR/status/$request_id.exit_code.txt" + mkdir -p "$OUT_DIR/workspaces/$request_id" + cp -a "$workspace_tmp/." "$OUT_DIR/workspaces/$request_id/" 2>/dev/null || true + rm -rf "$workspace_tmp" + + if [[ "$status" -ne 0 ]]; then + overall_status="$status" + fi + if [[ ! 
-f "$OUT_DIR/results/$request_id.json" ]]; then + overall_status=1 + fi +done + +exit "$overall_status" diff --git a/scripts/hf/job_entrypoint_fast_agent.sh b/scripts/hf/job_entrypoint_fast_agent.sh new file mode 100755 index 0000000..26e1980 --- /dev/null +++ b/scripts/hf/job_entrypoint_fast_agent.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +set -euo pipefail + +BUNDLE_DIR="${1:?bundle_dir required}" +OUT_DIR="${2:?out_dir required}" + +mkdir -p "$OUT_DIR/results" "$OUT_DIR/logs" +cp -f "$BUNDLE_DIR/manifest.json" "$OUT_DIR/manifest.json" || true + +COMMON=(fast-agent go) +if [[ -f "$BUNDLE_DIR/fastagent.config.yaml" ]]; then + COMMON+=(--config-path "$BUNDLE_DIR/fastagent.config.yaml") +fi +COMMON+=(--skills-dir "$BUNDLE_DIR/skills") +if [[ -d "$BUNDLE_DIR/cards" ]]; then + COMMON+=(--card "$BUNDLE_DIR/cards") +fi +if [[ -n "${FAST_AGENT:-}" ]]; then + COMMON+=(--agent "$FAST_AGENT") +fi + +if [[ -f "$BUNDLE_DIR/prompts.jsonl" ]]; then + export BUNDLE_DIR OUT_DIR + python - <<'PY' +import json, os, subprocess, pathlib, sys +bundle = pathlib.Path(os.environ["BUNDLE_DIR"]) +out = pathlib.Path(os.environ["OUT_DIR"]) +base = ["fast-agent", "go"] +if (bundle / "fastagent.config.yaml").exists(): + base += ["--config-path", str(bundle / "fastagent.config.yaml")] +base += ["--skills-dir", str(bundle / "skills")] +if (bundle / "cards").exists(): + base += ["--card", str(bundle / "cards")] +agent = os.environ.get("FAST_AGENT", "") +if agent: + base += ["--agent", agent] +default_model = os.environ.get("FAST_MODEL", "") + +failures = 0 +summary = [] +for idx, line in enumerate((bundle / "prompts.jsonl").read_text(encoding="utf-8").splitlines(), start=1): + line = line.strip() + if not line: + continue + rec = json.loads(line) + rid = rec.get("id") or f"case_{idx:03d}" + msg = rec.get("message") + if not msg: + raise SystemExit(f"missing message at line {idx}") + model = rec.get("model") or default_model + result_path = out / "results" / f"{rid}.json" + cmd = base + 
["--message", msg, "--results", str(result_path)] + if model: + cmd += ["--model", model] + stdout_path = out / "logs" / f"{rid}.out.txt" + stderr_path = out / "logs" / f"{rid}.err.txt" + with stdout_path.open("w", encoding="utf-8") as so, stderr_path.open("w", encoding="utf-8") as se: + proc = subprocess.run(cmd, stdout=so, stderr=se) + summary.append({"id": rid, "exit_code": proc.returncode, "model": model}) + if proc.returncode != 0: + failures += 1 + +(out / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") +if failures: + sys.exit(1) +PY +else + CMD=("${COMMON[@]}" --results "$OUT_DIR/results/default.json") + if [[ -n "${FAST_MODEL:-}" ]]; then + CMD+=(--model "$FAST_MODEL") + fi + + if [[ -n "${FAST_MESSAGE:-}" ]]; then + CMD+=(--message "$FAST_MESSAGE") + elif [[ -f "$BUNDLE_DIR/prompt.txt" ]]; then + CMD+=(--prompt-file "$BUNDLE_DIR/prompt.txt") + else + echo "No message/prompt input found" >&2 + exit 2 + fi + + echo "Running: ${CMD[*]}" | tee "$OUT_DIR/command.txt" + ("${CMD[@]}") 2>&1 | tee "$OUT_DIR/logs/default.out.txt" +fi diff --git a/scripts/hf/submit_hf_job.sh b/scripts/hf/submit_hf_job.sh new file mode 100755 index 0000000..468f1e4 --- /dev/null +++ b/scripts/hf/submit_hf_job.sh @@ -0,0 +1,256 @@ +#!/usr/bin/env bash +set -euo pipefail + +readonly DEFAULT_IMAGE="ghcr.io/astral-sh/uv:python3.13-bookworm" + +usage() { + cat <<'USAGE' +Usage: + submit_hf_job.sh \ + --artifact-repo \ + --skills-dir \ + [--card-dir ] \ + [--agent ] \ + [--model ] \ + [--message | --prompt-file | --prompts-jsonl ] \ + [--flavor cpu-basic] \ + [--timeout 45m] \ + [--image ghcr.io/astral-sh/uv:python3.13-bookworm] \ + [--secrets HF_TOKEN,OPENAI_API_KEY] \ + [--namespace my-org] \ + [--yes] \ + [--json] + +Notes: + - Artifacts are stored in dataset repo under inputs// and outputs// + - prompts-jsonl mode expects one JSON object per line: + {"id":"case1","message":"...","model":"haiku"} +USAGE +} + +fail() { + echo "$*" >&2 + exit 1 +} + +trim() { + 
xargs <<<"$1" +} + +prepare_secret_flags() { + IFS=',' read -r -a secret_keys <<< "$SECRETS" + secret_flags=() + echo "Secrets to forward:" + for raw_key in "${secret_keys[@]}"; do + key="$(trim "$raw_key")" + [[ -n "$key" ]] || continue + if [[ -n "${!key:-}" ]]; then + echo " - $key (present locally)" + else + echo " - $key (NOT set locally)" + fi + secret_flags+=(--secrets "$key") + done +} + +check_artifact_repo() { + hf download "$ARTIFACT_REPO" --repo-type dataset --dry-run --quiet >/dev/null || \ + fail "Artifact repo $ARTIFACT_REPO is not accessible. Create it first and ensure your current Hugging Face credentials can access it." +} + +submit_bundle_job() { + check_artifact_repo + + tar -czf "$tmpdir/bundle.tar.gz" -C "$tmpdir" bundle + hf upload "$ARTIFACT_REPO" "$tmpdir/bundle.tar.gz" "inputs/$RUN_ID/bundle.tar.gz" \ + --repo-type dataset \ + --commit-message "inputs: $RUN_ID" >/dev/null + + prepare_secret_flags + + if [[ "$AUTO_CONFIRM" != "1" ]]; then + read -r -p "Proceed with HF Job submission? [y/N] " confirm + [[ "$confirm" =~ ^[Yy]$ ]] || fail "Cancelled." 
+ fi + + ns_flags=() + if [[ -n "$NAMESPACE" ]]; then + ns_flags+=(--namespace "$NAMESPACE") + fi + + job_id="$( + hf jobs run \ + --detach \ + --flavor "$FLAVOR" \ + --timeout "$TIMEOUT" \ + "${ns_flags[@]}" \ + "${secret_flags[@]}" \ + "${env_flags[@]}" \ + -- \ + "$IMAGE" \ + bash -lc "$job_cmd" + )" + + job_id="$(echo "$job_id" | tail -n 1 | xargs)" + + if [[ "$JSON_OUTPUT" == "1" ]]; then + cat < "$bundle_dir/manifest.json" < int: + parser = argparse.ArgumentParser(description="Run ruff lint checks for the repo.") + parser.add_argument("--fix", action="store_true", help="Apply safe ruff fixes.") + parser.add_argument("paths", nargs="*", default=DEFAULT_PATHS, help="Optional paths to lint.") + args = parser.parse_args() + + command = ["ruff", "check", *args.paths] + if args.fix: + command.insert(2, "--fix") + + try: + completed = subprocess.run(command, check=False) + except FileNotFoundError: + print("Error: `ruff` is not installed in the current environment.", file=sys.stderr) + return 1 + + return completed.returncode + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/typecheck.py b/scripts/typecheck.py new file mode 100644 index 0000000..775e753 --- /dev/null +++ b/scripts/typecheck.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import argparse +import subprocess +import sys + +DEFAULT_PATHS = ["src", "tests", "scripts"] + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run ty type checks for the repo.") + parser.add_argument( + "paths", + nargs="*", + default=DEFAULT_PATHS, + help="Optional paths to type check.", + ) + args = parser.parse_args() + + command = ["ty", "check", *args.paths] + + try: + completed = subprocess.run(command, check=False) + except FileNotFoundError: + print("Error: `ty` is not installed in the current environment.", file=sys.stderr) + return 1 + + return completed.returncode + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/smoke_test.sh 
b/smoke_test.sh new file mode 100755 index 0000000..fed7d08 --- /dev/null +++ b/smoke_test.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$ROOT_DIR" + +TASK="${TASK:-write a good pull request description}" +MODEL="${MODEL:-qwen35}" +GENERATE_MODEL="${GENERATE_MODEL:-$MODEL}" +TEST_GEN_MODEL="${TEST_GEN_MODEL:-opus}" +START_AT="${START_AT:-prepare}" +ARTIFACT_REPO="${ARTIFACT_REPO:?Set ARTIFACT_REPO to /upskill-evals}" +JOBS_SECRETS="${JOBS_SECRETS:?Set JOBS_SECRETS, e.g. HF_TOKEN,OPENROUTER_API_KEY}" +JOBS_TIMEOUT="${JOBS_TIMEOUT:-45m}" +JOBS_FLAVOR="${JOBS_FLAVOR:-cpu-basic}" +OUT_ROOT="${OUT_ROOT:-$ROOT_DIR/.smoke-test}" +SKILL_OUTPUT="${SKILL_OUTPUT:-$OUT_ROOT/generated-skill}" +LOCAL_RUNS_DIR="${LOCAL_RUNS_DIR:-$OUT_ROOT/local-runs}" +REMOTE_RUNS_DIR="${REMOTE_RUNS_DIR:-$OUT_ROOT/remote-runs}" + +mkdir -p "$OUT_ROOT" + +if [[ "$START_AT" != "prepare" && "$START_AT" != "remote" && "$START_AT" != "local" ]]; then + echo "START_AT must be one of: prepare, remote, local" >&2 + exit 1 +fi + +has_prepared_skill=0 +if [[ -f "$SKILL_OUTPUT/SKILL.md" && -f "$SKILL_OUTPUT/skill_meta.json" ]]; then + has_prepared_skill=1 +fi + +echo "== Model secret check ==" +fast-agent check models --for-model "$MODEL" --json || true + +if [[ "$START_AT" == "prepare" ]]; then + echo + echo "== Prepare skill + tests (no eval) ==" + rm -rf "$SKILL_OUTPUT" + mkdir -p "$(dirname "$SKILL_OUTPUT")" + export SMOKE_TASK="$TASK" + export SMOKE_GENERATE_MODEL="$GENERATE_MODEL" + export SMOKE_TEST_GEN_MODEL="$TEST_GEN_MODEL" + export SMOKE_SKILL_OUTPUT="$SKILL_OUTPUT" + uv run python - <<'PY' +import asyncio +import os +from pathlib import Path + +from upskill.cli import _fast_agent_context, _set_agent_model +from upskill.config import Config +from upskill.generate import generate_skill, generate_tests + + +async def main() -> None: + task = os.environ["SMOKE_TASK"] + generate_model = os.environ["SMOKE_GENERATE_MODEL"] 
+ test_gen_model = os.environ["SMOKE_TEST_GEN_MODEL"] + output_path = Path(os.environ["SMOKE_SKILL_OUTPUT"]) + config = Config.load() + + async with _fast_agent_context(config) as agent: + await _set_agent_model(agent.skill_gen, generate_model) + skill = await generate_skill( + task=task, + generator=agent.skill_gen, + model=generate_model, + ) + await _set_agent_model(agent.test_gen, test_gen_model) + tests = await generate_tests( + task=task, + generator=agent.test_gen, + model=test_gen_model, + ) + skill.save(output_path, tests=tests) + print(f"Prepared skill with tests at {output_path}") + + +asyncio.run(main()) +PY + has_prepared_skill=1 +else + echo + echo "== Reusing prepared skill ==" + if [[ "$has_prepared_skill" != "1" ]]; then + echo "Prepared skill not found at $SKILL_OUTPUT" >&2 + echo "Run with START_AT=prepare first." >&2 + exit 1 + fi + echo "Using $SKILL_OUTPUT" +fi + +if [[ "$START_AT" == "prepare" || "$START_AT" == "remote" ]]; then + echo + echo "== Remote eval via HF Jobs ==" + uv run upskill eval "$SKILL_OUTPUT" \ + --executor jobs \ + --artifact-repo "$ARTIFACT_REPO" \ + -m "$MODEL" \ + --wait \ + --jobs-timeout "$JOBS_TIMEOUT" \ + --jobs-flavor "$JOBS_FLAVOR" \ + --jobs-secrets "$JOBS_SECRETS" \ + --runs-dir "$REMOTE_RUNS_DIR" +fi + +if [[ "$START_AT" == "prepare" || "$START_AT" == "remote" || "$START_AT" == "local" ]]; then + echo + echo "== Local eval via local shell-out executor ==" + uv run upskill eval "$SKILL_OUTPUT" \ + --executor local \ + -m "$GENERATE_MODEL" \ + --runs-dir "$LOCAL_RUNS_DIR" +fi + +echo +echo "Smoke test complete." 
+echo " Skill output: $SKILL_OUTPUT" +echo " Local runs: $LOCAL_RUNS_DIR" +echo " Remote runs: $REMOTE_RUNS_DIR" diff --git a/src/upskill/__init__.py b/src/upskill/__init__.py index 40b0c50..088a9c6 100644 --- a/src/upskill/__init__.py +++ b/src/upskill/__init__.py @@ -22,33 +22,32 @@ RunResult, Skill, SkillMetadata, + SkillRecord, + SkillState, TestCase, TestResult, ) __all__ = [ - # Config + "BatchSummary", "Config", - # Models + "ConversationStats", + "EvalResults", + "RunMetadata", + "RunResult", "Skill", "SkillMetadata", + "SkillRecord", + "SkillState", "TestCase", "TestResult", - "EvalResults", - "RunMetadata", - "RunResult", - "ConversationStats", - "BatchSummary", - # Generation - "generate_skill", - "generate_tests", - "refine_skill", - # Evaluation - "evaluate_skill", - # Logging "create_batch_folder", "create_run_folder", + "evaluate_skill", "extract_stats_from_summary", + "generate_skill", + "generate_tests", + "refine_skill", "summarize_runs_to_csv", "write_batch_summary", "write_run_metadata", diff --git a/src/upskill/agent_cards/evaluator.md b/src/upskill/agent_cards/evaluator.md index ecc2cd6..e321bcc 100644 --- a/src/upskill/agent_cards/evaluator.md +++ b/src/upskill/agent_cards/evaluator.md @@ -1,8 +1,24 @@ --- +# This file describes the Agent that is used to evaluate the skill. +# The system prompt used for the Agent is below the frontmatter. +# Content included with {{file:}}, {{fileSilent:}} or {{url:https://....}} (good for remote control) description: Evaluate skill performance against test cases. -skills: ["./skills"] # you can add mcp servers in here if needed. (reference name from config file) + + +#mcp_connect: +# - target: "https://huggingface.co/mcp" +# headers: +# Authorization: "Bearer ${TOKEN}" + +# Note: MCP Servers hosted on Hugging Face get HF_TOKEN handling automatically +# Target can include npx/uvx package names, or a shell command to start STDIO + --- -You are an evaluator of skills. You are given a skill and a test case. 
You need to evaluate the skill on the test case and return a score. +You are an evaluator of skills. You are given a skill and a test case. + +You need to evaluate the skill on the test case and return a score. + +{{agentSkills}} -{{agentSkills}} \ No newline at end of file +{{env}} diff --git a/src/upskill/agent_cards/skill_gen.md b/src/upskill/agent_cards/skill_gen.md index b0ab3b0..a595a93 100644 --- a/src/upskill/agent_cards/skill_gen.md +++ b/src/upskill/agent_cards/skill_gen.md @@ -1,6 +1,9 @@ --- type: agent description: Generate skill documents from task descriptions. +skills: [] +shell: false +model: $system.skill_gen --- You generate "skills" - instruction documents that teach AI coding agents how to perform tasks. diff --git a/src/upskill/agent_cards/test_gen.md b/src/upskill/agent_cards/test_gen.md index 3de8ebe..607a6a1 100644 --- a/src/upskill/agent_cards/test_gen.md +++ b/src/upskill/agent_cards/test_gen.md @@ -1,5 +1,6 @@ --- type: agent description: Generate test cases for evaluating skills. +model: $system.test_gen --- You generate test cases for evaluating AI agent skills. Output only valid JSON. 
diff --git a/src/upskill/artifacts.py b/src/upskill/artifacts.py new file mode 100644 index 0000000..b80da7f --- /dev/null +++ b/src/upskill/artifacts.py @@ -0,0 +1,132 @@ +"""Helpers for evaluation artifact materialization.""" + +from __future__ import annotations + +import json +import re +import shutil +from dataclasses import asdict +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from upskill.executors.contracts import ExecutionRequest + +_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+") +_AGENT_CARD_FILE_EXTENSIONS = { + ".json", + ".markdown", + ".md", + ".yaml", + ".yml", +} + + +def sanitize_artifact_name(value: str) -> str: + """Convert a human-facing label into a filesystem-friendly name.""" + normalized = _NON_ALNUM_RE.sub("-", value.strip().lower()).strip("-") + return normalized or "execution" + + +def ensure_directory(path: Path) -> Path: + """Create a directory and return it.""" + path.mkdir(parents=True, exist_ok=True) + return path + + +def validate_workspace_relative_path(relative_path: str) -> Path: + """Validate that a workspace file path stays within the workspace root.""" + normalized = Path(relative_path) + if normalized.is_absolute(): + raise ValueError(f"Workspace file path must be relative: {relative_path}") + if any(part == ".." 
for part in normalized.parts): + raise ValueError(f"Workspace file path must not traverse parents: {relative_path}") + return normalized + + +def materialize_workspace(workspace_dir: Path, workspace_files: dict[str, str]) -> None: + """Write test workspace files into a preserved workspace directory.""" + ensure_directory(workspace_dir) + for relative_path, content in workspace_files.items(): + file_path = workspace_dir / validate_workspace_relative_path(relative_path) + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text(content, encoding="utf-8") + + +def bundle_cards( + source_dir: Path, + destination_dir: Path, +) -> Path: + """Copy the agent card bundle into the artifact directory.""" + if destination_dir.exists(): + shutil.rmtree(destination_dir) + shutil.copytree(source_dir, destination_dir) + return destination_dir + + +def bundle_agent_card( + source_dir: Path, + destination_dir: Path, + *, + agent_name: str, +) -> Path: + """Copy only the selected agent card plus shared non-card resources.""" + if destination_dir.exists(): + shutil.rmtree(destination_dir) + ensure_directory(destination_dir) + + if source_dir.is_file(): + if source_dir.stem != agent_name: + raise FileNotFoundError( + f"Requested agent card {agent_name!r} does not match source file {source_dir.name!r}." + ) + shutil.copy2(source_dir, destination_dir / source_dir.name) + return destination_dir + + matched_card = False + for item in source_dir.iterdir(): + destination = destination_dir / item.name + if item.is_dir(): + shutil.copytree(item, destination) + continue + if item.stem == agent_name and item.suffix in _AGENT_CARD_FILE_EXTENSIONS: + shutil.copy2(item, destination) + matched_card = True + continue + if item.suffix not in _AGENT_CARD_FILE_EXTENSIONS: + shutil.copy2(item, destination) + + if not matched_card: + raise FileNotFoundError( + f"Could not find an agent card named {agent_name!r} in {source_dir}." 
+ ) + return destination_dir + + +def materialize_skill_bundle( + destination_dir: Path, + request: ExecutionRequest, +) -> Path: + """Create the explicit skills root for a run.""" + ensure_directory(destination_dir) + if request.skill is not None: + request.skill.save(destination_dir / request.skill.name) + return destination_dir + + +def write_request_file(path: Path, request: ExecutionRequest) -> None: + """Persist request metadata for debugging and provenance.""" + payload = asdict(request) + if request.skill is not None: + payload["skill"] = request.skill.model_dump(mode="json") + path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8") + + +def copy_config_file(source: Path, destination: Path) -> Path | None: + """Preserve the fast-agent config used for a run when one exists.""" + if not source.exists(): + return None + + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, destination) + return destination diff --git a/src/upskill/cli.py b/src/upskill/cli.py index 1a9d63c..d38396a 100644 --- a/src/upskill/cli.py +++ b/src/upskill/cli.py @@ -1,15 +1,15 @@ """CLI interface for upskill.""" + from __future__ import annotations import asyncio -import inspect import json import sys -from collections.abc import AsyncIterator +from collections.abc import Callable, Mapping from contextlib import asynccontextmanager from importlib import resources from pathlib import Path -from typing import TypedDict +from typing import TYPE_CHECKING, Literal, Protocol, TypedDict, TypeVar, cast import click from dotenv import load_dotenv @@ -20,8 +20,11 @@ from rich.tree import Tree from upskill.config import Config, resolve_upskill_config_path -from upskill.evaluate import evaluate_skill, get_failure_descriptions +from upskill.evaluate import build_eval_requests, evaluate_skill, get_failure_descriptions +from upskill.executors.local_fast_agent import LocalFastAgentExecutor +from upskill.executors.remote_fast_agent import 
RemoteFastAgentExecutor from upskill.generate import generate_skill, generate_tests, improve_skill, refine_skill +from upskill.hf_jobs import JobsConfig, verify_artifact_repo_access from upskill.logging import ( aggregate_conversation_stats, create_batch_folder, @@ -33,23 +36,144 @@ write_run_metadata, write_run_result, ) -from upskill.model_resolution import ResolvedModels, resolve_models +from upskill.model_resolution import ( + ResolvedModels, + build_fastagent_model_references, + resolve_models, +) from upskill.models import ( BatchSummary, + EvalResults, RunMetadata, RunResult, Skill, + SkillRecord, TestCase, TestResult, ) +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from fast_agent.agents.llm_agent import LlmAgent + from fast_agent.interfaces import AgentProtocol + + from upskill.executors.base import Executor + load_dotenv() console = Console() +class FastAgentSession(Protocol): + """Typed view of the loaded fast-agent session used by upskill.""" + + skill_gen: AgentProtocol + test_gen: AgentProtocol + evaluator: LlmAgent + + +class FastAgentConfig(Protocol): + """Typed view of the fast-agent settings object used by upskill.""" + + model_references: Mapping[str, Mapping[str, str]] + + +class FastAgentApp(Protocol): + """Typed view of the fast-agent app container used by upskill.""" + + _config_or_path: FastAgentConfig + + +class FastAgentWithConfig(Protocol): + """Typed view of FastAgent for model-reference injection.""" + + app: FastAgentApp + + +EvalPlotLabelField = Literal["model", "skill_name"] +ExecutorName = Literal["local", "jobs"] +CommandFunction = TypeVar("CommandFunction", bound=Callable[..., object]) + + +def _jobs_execution_options( + *, + executor_help: str, + runs_dir_help: str, +) -> Callable[[CommandFunction], CommandFunction]: + """Attach the shared remote-execution CLI options to a command.""" + # TODO: add a resumable remote-job collection flow before revisiting the wait-by-default + # behavior for generate/eval 
jobs. + options = ( + click.option( + "--executor", + type=click.Choice(["local", "jobs"]), + default=None, + help=f"{executor_help}. Overrides `executor` in upskill.config.yaml.", + ), + click.option( + "--artifact-repo", + help="Dataset repo for remote job artifacts (required with --executor jobs)", + ), + click.option( + "--wait/--no-wait", + default=True, + help="Wait for remote jobs and download results (default: wait)", + ), + click.option( + "--jobs-timeout", + default="2h", + show_default=True, + help="HF Jobs timeout for remote runs", + ), + click.option( + "--jobs-flavor", + default="cpu-basic", + show_default=True, + help="HF Jobs hardware flavor for remote runs", + ), + click.option( + "--jobs-secrets", + default=None, + help=( + "Comma-separated HF Job secret names to forward (environment variables). Overrides " + "`jobs_secrets` in upskill.config.yaml." + ), + ), + click.option( + "--jobs-namespace", + help="Hugging Face Jobs namespace (recommended for remote jobs)", + ), + click.option( + "--max-parallel", + type=click.IntRange(min=1), + default=None, + help=( + "Maximum concurrent evaluation executions per phase. Overrides " + "`max_parallel` in upskill.config.yaml." 
+ ), + ), + click.option("--runs-dir", type=click.Path(), help=runs_dir_help), + click.option( + "--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)" + ), + ) + + def decorator(function: CommandFunction) -> CommandFunction: + wrapped = function + for option in reversed(options): + wrapped = option(wrapped) + return wrapped + + return decorator + + @asynccontextmanager -async def _fast_agent_context(config: Config | None = None) -> AsyncIterator[object]: +async def _fast_agent_context( + config: Config | None = None, + *, + model_references: Mapping[str, Mapping[str, str]] | None = None, +) -> AsyncIterator[FastAgentSession]: config = config or Config.load() fast = FastAgent( "upskill", @@ -59,27 +183,41 @@ async def _fast_agent_context(config: Config | None = None) -> AsyncIterator[obj ) @fast.agent() - async def empty(): + async def empty() -> None: pass cards = resources.files("upskill").joinpath("agent_cards") with resources.as_file(cards) as cards_path: fast.load_agents(cards_path) + _install_fast_agent_model_references( + cast("FastAgentWithConfig", fast), + model_references=model_references, + ) + async with fast.run() as agent: - yield agent + yield cast("FastAgentSession", agent) -async def _set_agent_model(agent: object, model: str | None) -> None: - """Best-effort model assignment for a fast-agent instance.""" - if not model: - return - set_model = getattr(agent, "set_model", None) - if not callable(set_model): +def _install_fast_agent_model_references( + fast: FastAgentWithConfig, + *, + model_references: Mapping[str, Mapping[str, str]] | None, +) -> None: + """Merge upskill's model slots into the fast-agent config before agent creation.""" + if not model_references: return - result = set_model(model) - if inspect.isawaitable(result): - await result + + fast_config = fast.app._config_or_path + merged_references = { + namespace: dict(entries) for namespace, entries in fast_config.model_references.items() + } + + for namespace, 
entries in model_references.items(): + namespace_references = merged_references.setdefault(namespace, {}) + namespace_references.update(entries) + + fast_config.model_references = merged_references def _require_resolved_model(value: str | None, *, field: str, command: str) -> str: @@ -100,6 +238,106 @@ def _require_resolved_models(values: list[str], *, field: str, command: str) -> return values +def _require_path(value: Path | None, *, field: str, command: str) -> Path: + """Require a resolved filesystem path for logging flows.""" + if value is None: + raise RuntimeError(f"Internal bug: `{command}` requires `{field}` to be set.") + return value + + +def _build_executor( + name: ExecutorName, + *, + jobs_config: JobsConfig | None = None, + progress_callback: Callable[[str], None] | None = None, +) -> Executor: + """Construct an evaluation executor from a user-facing executor name.""" + if name == "local": + return LocalFastAgentExecutor() + if jobs_config is None: + raise click.ClickException("The jobs executor requires jobs configuration.") + _ensure_jobs_artifact_repo_access(jobs_config) + return RemoteFastAgentExecutor( + jobs_config=jobs_config, + progress_callback=progress_callback, + ) + + +def _resolve_executor_name(config: Config, cli_executor_name: ExecutorName | None) -> ExecutorName: + """Resolve the effective execution backend from CLI override or config.""" + return cli_executor_name or config.executor + + +def _resolve_num_runs( + config: Config, + cli_num_runs: int | None, + *, + command: Literal["eval", "benchmark"], +) -> int: + """Resolve the effective run count from CLI override or config.""" + if cli_num_runs is not None: + return cli_num_runs + return config.effective_num_runs(command) + + +def _resolve_max_parallel(config: Config, cli_max_parallel: int | None) -> int: + """Resolve the effective concurrency from CLI override or config.""" + if cli_max_parallel is not None: + return cli_max_parallel + return config.max_parallel + + +def 
_resolve_jobs_secrets(config: Config, cli_jobs_secrets: str | None) -> str: + """Resolve the effective HF Jobs secret list from CLI override or config.""" + if cli_jobs_secrets is not None: + return cli_jobs_secrets + return config.jobs_secrets + + +def _require_jobs_config( + *, + executor_name: ExecutorName, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str, + jobs_namespace: str | None, + jobs_image: str, +) -> JobsConfig | None: + """Build jobs config when the jobs executor is selected.""" + if executor_name != "jobs": + return None + if not artifact_repo: + raise click.ClickException("--artifact-repo is required when using --executor jobs.") + return JobsConfig( + artifact_repo=artifact_repo, + wait=wait, + jobs_timeout=jobs_timeout, + jobs_flavor=jobs_flavor, + jobs_secrets=jobs_secrets, + jobs_namespace=jobs_namespace, + jobs_image=jobs_image, + ) + + +def _ensure_jobs_artifact_repo_access(jobs_config: JobsConfig) -> None: + """Preflight the jobs artifact repo and surface a CLI-friendly failure.""" + try: + verify_artifact_repo_access(jobs_config.artifact_repo) + except RuntimeError as exc: + message = ( + "Artifact repo is not accessible.\n" + f"Repo: {jobs_config.artifact_repo}\n" + "Create it before submitting jobs and ensure the current Hugging Face " + "credentials can access it." + ) + detail = str(exc) + if "Repository Not Found" in detail or "404 Not Found" in detail: + message += "\nThe dataset repo does not exist or the name is wrong." 
+ raise click.ClickException(message) from exc + + def _print_model_plan(command: str, resolved: ResolvedModels, runs: int | None = None) -> None: """Print resolved model plan for command execution.""" console.print("[dim]Resolved model plan:[/dim]") @@ -117,12 +355,12 @@ def _print_model_plan(command: str, resolved: ResolvedModels, runs: int | None = console.print(f" Evaluation Model(s): {models}") if runs is not None: console.print(f" Runs per model: {runs}") - baseline_state = "off (benchmark mode)" if resolved.is_benchmark_mode else ( - "on" if resolved.run_baseline else "off" - ) - console.print( - f" Baseline: {baseline_state}" + baseline_state = ( + "off (benchmark mode)" + if resolved.is_benchmark_mode + else ("on" if resolved.run_baseline else "off") ) + console.print(f" Baseline: {baseline_state}") console.print(f" Test Generation Model: {resolved.test_generation_model}") @@ -131,11 +369,16 @@ def _render_bar(value: float, width: int = 20) -> str: if width <= 0: return "" clamped = max(0.0, min(1.0, value)) - filled = int(round(clamped * width)) + filled = round(clamped * width) empty = width - filled return "█" * filled + "░" * empty +def _print_eval_progress(message: str) -> None: + """Render a lightweight evaluation progress line.""" + console.print(f"[dim]{message}[/dim]") + + class EvalPlotResult(TypedDict): """Structured plot data for eval runs.""" @@ -169,6 +412,562 @@ def _select_baseline_run( return baseline_runs[-1] +def _build_logged_run_result( + *, + model: str, + task: str, + batch_id: str, + run_number: int, + test_results: list[TestResult], + assertions_total: int, + passed: bool, + run_type: str, + skill_name: str, +) -> RunResult: + """Construct a persisted run summary from reconstructed test results.""" + assertions_passed = 0 + computed_assertions_total = 0 + for result in test_results: + if result.validation_result is not None: + assertions_passed += result.validation_result.assertions_passed + computed_assertions_total += 
result.validation_result.assertions_total + continue + + assertions_passed += int(result.success) + computed_assertions_total += 1 + + return RunResult( + metadata=RunMetadata( + model=model, + task=task, + batch_id=batch_id, + run_number=run_number, + ), + stats=aggregate_conversation_stats(test_results), + passed=passed, + assertions_passed=assertions_passed, + assertions_total=computed_assertions_total or assertions_total, + run_type=run_type, + skill_name=skill_name, + ) + + +def _persist_logged_run(run_folder: Path, run_result: RunResult) -> None: + """Write the standard metadata and result files for a run.""" + write_run_metadata(run_folder, run_result.metadata) + write_run_result(run_folder, run_result) + + +def _persist_comparison_run_results( + *, + batch_folder: Path, + model: str, + task: str, + batch_id: str, + first_run_number: int, + results: EvalResults, + assertions_total: int, + run_baseline: bool, + with_skill_passed: bool, + skill_name: str, +) -> list[RunResult]: + """Persist baseline/with-skill summaries for one evaluation pass.""" + persisted_results: list[RunResult] = [] + run_number = first_run_number + + if run_baseline: + baseline_result = _build_logged_run_result( + model=model, + task=task, + batch_id=batch_id, + run_number=run_number, + test_results=results.baseline_results, + assertions_total=assertions_total, + passed=results.baseline_success_rate > 0.5, + run_type="baseline", + skill_name=skill_name, + ) + _persist_logged_run(create_run_folder(batch_folder, run_number), baseline_result) + persisted_results.append(baseline_result) + run_number += 1 + + with_skill_result = _build_logged_run_result( + model=model, + task=task, + batch_id=batch_id, + run_number=run_number, + test_results=results.with_skill_results, + assertions_total=assertions_total, + passed=with_skill_passed, + run_type="with_skill", + skill_name=skill_name, + ) + _persist_logged_run(create_run_folder(batch_folder, run_number), with_skill_result) + 
persisted_results.append(with_skill_result) + return persisted_results + + +def _load_test_cases_from_payload(data: object) -> list[TestCase]: + """Normalize test case JSON payloads into ``TestCase`` objects.""" + payload: object + if isinstance(data, dict): + mapping = cast("dict[object, object]", data) + payload = mapping.get("cases", data) + else: + payload = data + if not isinstance(payload, list): + raise click.ClickException("Test payload must be a list or an object with `cases`.") + return [TestCase.model_validate(test_case) for test_case in payload] + + +async def _load_test_cases( + *, + config: Config, + skill_record: SkillRecord, + tests_path: str | None, + test_gen_model: str, + model_references: Mapping[str, Mapping[str, str]], +) -> tuple[list[TestCase], str]: + """Load explicit, persisted, or generated test cases for a skill.""" + if tests_path: + with open(tests_path, encoding="utf-8") as file_obj: + data = json.load(file_obj) + return _load_test_cases_from_payload(data), f"tests file: {tests_path}" + + if skill_record.state.tests: + return skill_record.state.tests, "skill_meta.json" + + async with _fast_agent_context(config, model_references=model_references) as agent: + console.print(f"Generating test cases from skill with {test_gen_model}...", style="dim") + test_cases = await generate_tests( + skill_record.skill.description, + generator=agent.test_gen, + ) + return test_cases, "generated" + + +def _count_invalid_expected_cases(test_cases: list[TestCase]) -> int: + """Count generated or loaded tests missing enough expected strings.""" + invalid_expected = 0 + for test_case in test_cases: + expected_values = [value.strip() for value in test_case.expected.contains if value.strip()] + if len(expected_values) < 2: + invalid_expected += 1 + return invalid_expected + + +def _raise_on_execution_errors(results: EvalResults, *, context: str) -> None: + """Raise a CLI-friendly error when evaluation batches contain execution failures.""" + execution_errors: 
list[str] = [] + for phase_label, phase_results in ( + ("with-skill", results.with_skill_results), + ("baseline", results.baseline_results), + ): + for index, result in enumerate(phase_results, start=1): + if result.error is None: + continue + execution_errors.append(f"{phase_label} test {index}: {result.error}") + + if not execution_errors: + return + + preview = "\n".join(f" - {message}" for message in execution_errors[:3]) + remaining = len(execution_errors) - 3 + remainder = f"\n ... and {remaining} more" if remaining > 0 else "" + raise click.ClickException(f"{context} encountered execution errors:\n{preview}{remainder}") + + +def _load_trace_context(trace_path: Path) -> str: + """Load a trace file into a prompt-sized context snippet.""" + trace_content = trace_path.read_text(encoding="utf-8") + if trace_path.suffix.lower() != ".json": + return trace_content[:4000] + + try: + trace_data = json.loads(trace_content) + except json.JSONDecodeError: + return trace_content[:4000] + return json.dumps(trace_data, indent=2)[:4000] + + +async def _create_generate_skill_record( + *, + task: str, + examples: list[str] | None, + from_skill: str | None, + from_trace: str | None, + agent: FastAgentSession, + skill_gen_model: str, +) -> tuple[SkillRecord, str]: + """Create or improve the skill record used by ``generate``.""" + if from_trace: + trace_path = Path(from_trace) + console.print(f"Generating skill from trace: {from_trace}", style="dim") + task_with_trace = ( + f"{task}\n\nBased on this agent trace:\n\n{_load_trace_context(trace_path)}" + ) + console.print(f"Generating skill with {skill_gen_model}...", style="dim") + return ( + await generate_skill( + task=task_with_trace, + examples=examples, + generator=agent.skill_gen, + model=skill_gen_model, + ), + task_with_trace, + ) + + if from_skill: + existing_skill = SkillRecord.load(Path(from_skill)) + console.print( + f"Improving [bold]{existing_skill.skill.name}[/bold] with {skill_gen_model}...", + style="dim", + ) + 
return ( + await improve_skill( + existing_skill, + instructions=task, + generator=agent.skill_gen, + model=skill_gen_model, + ), + task, + ) + + console.print(f"Generating skill with {skill_gen_model}...", style="dim") + return ( + await generate_skill( + task=task, + examples=examples, + generator=agent.skill_gen, + model=skill_gen_model, + ), + task, + ) + + +async def _submit_remote_eval_jobs( + *, + skill: Skill, + test_cases: list[TestCase], + model: str, + jobs_config: JobsConfig, + fastagent_config_path: Path, + cards_path: Path, + artifact_root: Path, + run_baseline: bool, + operation: str, +) -> list[str]: + """Submit remote fast-agent requests for an evaluation batch.""" + _ensure_jobs_artifact_repo_access(jobs_config) + remote_executor = RemoteFastAgentExecutor( + jobs_config=jobs_config, + progress_callback=_print_eval_progress, + ) + requests = build_eval_requests( + skill=skill, + test_cases=test_cases, + model=model, + fastagent_config_path=fastagent_config_path, + cards_source_dir=cards_path, + artifact_root=artifact_root, + run_baseline=run_baseline, + operation=operation, + ) + job_refs: list[str] = [] + for pending_request in requests: + submission = await remote_executor.submit(pending_request.request) + job_refs.append(submission.job_id) + return job_refs + + +async def _submit_generate_jobs_eval( + *, + skill: Skill, + test_cases: list[TestCase], + model: str, + jobs_config: JobsConfig, + config: Config, + cards_path: Path, + batch_folder: Path, +) -> list[str]: + """Submit generate-time remote fast-agent requests without waiting for results.""" + return await _submit_remote_eval_jobs( + skill=skill, + test_cases=test_cases, + model=model, + jobs_config=jobs_config, + fastagent_config_path=config.effective_fastagent_config, + cards_path=cards_path, + artifact_root=batch_folder / "remote_downloads" / "attempt_1", + run_baseline=True, + operation="generate", + ) + + +async def _run_generate_refinement_loop( + *, + skill_record: SkillRecord, + 
task: str, + test_cases: list[TestCase], + executor: Executor, + config: Config, + cards_path: Path, + batch_id: str, + batch_folder: Path, + skill_gen_model: str, + log_runs: bool, + max_parallel: int, + agent: FastAgentSession, +) -> tuple[SkillRecord, EvalResults | None, list[RunResult]]: + """Run generate-time eval/refinement attempts on the main model.""" + run_results: list[RunResult] = [] + prev_success_rate = 0.0 + results: EvalResults | None = None + attempts = max(1, config.max_refine_attempts) + + for attempt in range(attempts): + attempt_number = attempt + 1 + console.print(f"Evaluating on {skill_gen_model}... (attempt {attempt_number})", style="dim") + console.print("[dim]Starting evaluation run...[/dim]") + + results = await evaluate_skill( + skill_record.skill, + test_cases=test_cases, + executor=executor, + model=skill_gen_model, + fastagent_config_path=config.effective_fastagent_config, + cards_source_dir=cards_path, + artifact_root=batch_folder / f"attempt_{attempt_number}", + max_parallel=max_parallel, + progress_callback=_print_eval_progress, + operation="generate", + ) + _raise_on_execution_errors(results, context="Generate refinement") + + if log_runs: + run_results.extend( + _persist_comparison_run_results( + batch_folder=batch_folder, + model=skill_gen_model, + task=task, + batch_id=batch_id, + first_run_number=attempt * 2 + 1, + results=results, + assertions_total=len(test_cases), + run_baseline=True, + with_skill_passed=results.is_beneficial, + skill_name=skill_record.skill.name, + ) + ) + + lift = results.skill_lift + lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" + + if results.is_beneficial: + console.print( + f" {results.baseline_success_rate:.0%} -> " + f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]" + ) + break + + console.print( + f" {results.baseline_success_rate:.0%} -> " + f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough" + ) + + if abs(results.with_skill_success_rate - 
prev_success_rate) < 0.05: + console.print(" [yellow]Plateaued, stopping[/yellow]") + break + + prev_success_rate = results.with_skill_success_rate + if attempt >= attempts - 1: + continue + + console.print("Refining...", style="dim") + failures = get_failure_descriptions(results) + skill_record = await refine_skill( + skill_record, + failures, + generator=agent.skill_gen, + model=skill_gen_model, + ) + + return skill_record, results, run_results + + +async def _run_generate_extra_eval( + *, + skill_record: SkillRecord, + task: str, + test_cases: list[TestCase], + executor: Executor, + config: Config, + cards_path: Path, + batch_id: str, + batch_folder: Path, + model: str, + log_runs: bool, + max_parallel: int, + first_run_number: int, +) -> tuple[EvalResults, list[RunResult]]: + """Run the optional cross-model eval pass for ``generate``.""" + console.print(f"Evaluating on {model}...", style="dim") + results = await evaluate_skill( + skill_record.skill, + test_cases, + executor=executor, + model=model, + fastagent_config_path=config.effective_fastagent_config, + cards_source_dir=cards_path, + artifact_root=batch_folder / f"eval_{model}", + max_parallel=max_parallel, + progress_callback=_print_eval_progress, + operation="generate", + ) + _raise_on_execution_errors(results, context=f"Generate eval on {model}") + + run_results: list[RunResult] = [] + if log_runs: + run_results = _persist_comparison_run_results( + batch_folder=batch_folder, + model=model, + task=task, + batch_id=batch_id, + first_run_number=first_run_number, + results=results, + assertions_total=len(test_cases), + run_baseline=True, + with_skill_passed=results.is_beneficial, + skill_name=skill_record.skill.name, + ) + + lift = results.skill_lift + lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" + console.print( + f" {results.baseline_success_rate:.0%} -> " + f"{results.with_skill_success_rate:.0%} ({lift_str})" + ) + return results, run_results + + +async def _run_with_skill_benchmark( + *, + 
skill_record: SkillRecord, + evaluation_models: list[str], + num_runs: int, + test_cases: list[TestCase], + executor: Executor, + config: Config, + cards_path: Path, + batch_id: str, + batch_folder: Path, + verbose: bool, + log_runs: bool, + max_parallel: int, +) -> tuple[dict[str, list[RunResult]], list[RunResult]]: + """Run a with-skill-only benchmark matrix across models and runs.""" + skill = skill_record.skill + model_results: dict[str, list[RunResult]] = {model: [] for model in evaluation_models} + all_run_results: list[RunResult] = [] + + for model in evaluation_models: + console.print(f"[bold]{model}[/bold]") + + for run_num in range(1, num_runs + 1): + run_folder = create_run_folder(batch_folder, len(all_run_results) + 1) + results = await evaluate_skill( + skill, + test_cases, + executor=executor, + model=model, + fastagent_config_path=config.effective_fastagent_config, + cards_source_dir=cards_path, + artifact_root=run_folder / "eval", + run_baseline=False, + max_parallel=max_parallel, + progress_callback=_print_eval_progress if verbose else None, + operation="benchmark", + ) + _raise_on_execution_errors(results, context=f"Benchmark run on {model}") + run_result = _build_logged_run_result( + model=model, + task=skill.description, + batch_id=batch_id, + run_number=run_num, + test_results=results.with_skill_results, + assertions_total=len(test_cases), + passed=results.with_skill_success_rate > 0.5, + run_type="with_skill", + skill_name=skill.name, + ) + + if log_runs: + _persist_logged_run(run_folder, run_result) + + model_results[model].append(run_result) + all_run_results.append(run_result) + + if verbose: + status = "[green]PASS[/green]" if run_result.passed else "[red]FAIL[/red]" + console.print( + f" Run {run_num}: {status} " + f"({run_result.assertions_passed}/{run_result.assertions_total} " + "assertions passed)" + ) + + console.print() + + return model_results, all_run_results + + +def _print_benchmark_summary(model_results: dict[str, 
list[RunResult]]) -> None: + """Render the standard per-model benchmark summary.""" + console.print("\n[bold]Summary[/bold]\n") + for model, results in model_results.items(): + total_runs = len(results) + passed_runs = sum(1 for result in results if result.passed) + avg_tokens = ( + sum(result.stats.total_tokens for result in results) / total_runs if total_runs else 0 + ) + avg_turns = sum(result.stats.turns for result in results) / total_runs if total_runs else 0 + pass_rate = passed_runs / total_runs if total_runs else 0 + if pass_rate > 0.5: + pass_rate_style = "green" + elif pass_rate > 0: + pass_rate_style = "yellow" + else: + pass_rate_style = "red" + + console.print(f"[bold]{model}[/bold]") + console.print( + " Runs: " + f"{total_runs} | Passed: {passed_runs} ([{pass_rate_style}]" + f"{pass_rate:.0%}[/{pass_rate_style}])" + ) + console.print(f" Avg tokens: {avg_tokens:.0f} | Avg turns: {avg_turns:.1f}") + console.print() + + +def _write_benchmark_summary( + *, + batch_folder: Path, + batch_id: str, + evaluation_models: list[str], + task: str, + all_run_results: list[RunResult], +) -> None: + """Persist the standard benchmark batch summary.""" + summary = BatchSummary( + batch_id=batch_id, + model=", ".join(evaluation_models), + task=task, + total_runs=len(all_run_results), + passed_runs=sum(1 for result in all_run_results if result.passed), + results=all_run_results, + ) + write_batch_summary(batch_folder, summary) + + def _load_eval_results(runs_path: Path) -> list[EvalPlotResult]: """Load eval results from batch summaries or run folders.""" results: list[EvalPlotResult] = [] @@ -238,7 +1037,6 @@ def main(): @main.command() @click.argument("task") @click.option("-e", "--example", multiple=True, help="Input -> output example") -@click.option("--tool", help="Generate from MCP tool schema (path#tool_name)") @click.option( "-f", "--from", @@ -258,18 +1056,27 @@ def main(): @click.option("-o", "--output", type=click.Path(), help="Output directory for skill") 
@click.option("--no-eval", is_flag=True, help="Skip eval and refinement") @click.option("--eval-model", help="Optional extra cross-model eval pass after generation") -@click.option("--runs-dir", type=click.Path(), help="Directory for run logs (default: ./runs)") -@click.option("--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)") +@_jobs_execution_options( + executor_help="Execution backend for evaluation/refinement runs", + runs_dir_help="Directory for run logs (default: ./runs)", +) def generate( task: str, example: tuple[str, ...], - tool: str | None, # noqa: ARG001 from_source: str | None, model: str | None, test_gen_model: str | None, output: str | None, no_eval: bool, eval_model: str | None, + executor: ExecutorName | None, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str | None, + jobs_namespace: str | None, + max_parallel: int | None, runs_dir: str | None, log_runs: bool, ): @@ -287,6 +1094,10 @@ def generate( upskill generate "validate forms" -o ./my-skills/validation + # Remote execution on Hugging Face Jobs: + + upskill generate "parse invoices" --executor jobs --artifact-repo /upskill-tests + # Improve an existing skill (auto-detects directory): upskill generate "add more error handling examples" --from ./skills/api-errors/ @@ -324,6 +1135,14 @@ def generate( output, no_eval, eval_model, + executor, + artifact_repo, + wait, + jobs_timeout, + jobs_flavor, + jobs_secrets, + jobs_namespace, + max_parallel, runs_dir, log_runs, ) @@ -340,11 +1159,32 @@ async def _generate_async( output: str | None, no_eval: bool, eval_model: str | None, + executor_name: ExecutorName | None, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str | None, + jobs_namespace: str | None, + max_parallel: int | None, runs_dir: str | None, log_runs: bool, ): """Async implementation of generate command.""" config = Config.load() + executor_name = 
_resolve_executor_name(config, executor_name) + max_parallel = _resolve_max_parallel(config, max_parallel) + jobs_secrets = _resolve_jobs_secrets(config, jobs_secrets) + jobs_config = _require_jobs_config( + executor_name=executor_name, + artifact_repo=artifact_repo, + wait=wait, + jobs_timeout=jobs_timeout, + jobs_flavor=jobs_flavor, + jobs_secrets=jobs_secrets, + jobs_namespace=jobs_namespace, + jobs_image=config.jobs_image, + ) resolved = resolve_models( "generate", config=config, @@ -363,310 +1203,148 @@ async def _generate_async( command="generate", ) extra_eval_model = resolved.extra_eval_model + model_references = build_fastagent_model_references(config=config, resolved=resolved) _print_model_plan("generate", resolved) - # Setup run logging if enabled - batch_id = None - batch_folder = None + # Setup artifact storage and optional run logging + runs_path = Path(runs_dir) if runs_dir else config.runs_dir + batch_id, batch_folder = create_batch_folder(runs_path) run_results: list[RunResult] = [] - + console.print(f"Artifacts saved to: {batch_folder}", style="dim") if log_runs: - runs_path = Path(runs_dir) if runs_dir else config.runs_dir - batch_id, batch_folder = create_batch_folder(runs_path) console.print(f"Logging runs to: {batch_folder}", style="dim") - async with _fast_agent_context(config) as agent: - # Generate from trace file - if from_trace: - console.print(f"Generating skill from trace: {from_trace}", style="dim") - trace_path = Path(from_trace) - with open(trace_path, encoding="utf-8") as f: - trace_content = f.read() - - # Try to parse as JSON, otherwise use as plain text - if trace_path.suffix.lower() == ".json": - try: - trace_data = json.loads(trace_content) - trace_context = json.dumps(trace_data, indent=2)[:4000] - except json.JSONDecodeError: - trace_context = trace_content[:4000] - else: - # Plain text, markdown, etc. 
- trace_context = trace_content[:4000] - - task = f"{task}\n\nBased on this agent trace:\n\n{trace_context}" - console.print(f"Generating skill with {skill_gen_model}...", style="dim") - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await generate_skill( + async with _fast_agent_context(config, model_references=model_references) as agent: + cards = resources.files("upskill").joinpath("agent_cards") + with resources.as_file(cards) as cards_path: + skill_record, eval_task = await _create_generate_skill_record( task=task, examples=examples, - generator=agent.skill_gen, - model=skill_gen_model, - ) - # Improve existing skill - elif from_skill: - existing_skill = Skill.load(Path(from_skill)) - console.print( - f"Improving [bold]{existing_skill.name}[/bold] with {skill_gen_model}...", - style="dim", - ) - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await improve_skill( - existing_skill, - instructions=task, - generator=agent.skill_gen, - model=skill_gen_model, - ) - else: - console.print(f"Generating skill with {skill_gen_model}...", style="dim") - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await generate_skill( - task=task, - examples=examples, - generator=agent.skill_gen, - model=skill_gen_model, - ) - if no_eval: - _save_and_display(skill, output, config) - return - - console.print("Generating test cases...", style="dim") - await _set_agent_model(agent.test_gen, test_gen_model) - test_cases = await generate_tests(task, generator=agent.test_gen, model=test_gen_model) - - # Eval loop with refinement (on skill generation model) - prev_success_rate = 0.0 - results = None - attempts = max(1, config.max_refine_attempts) - for attempt in range(attempts): - console.print( - f"Evaluating on {skill_gen_model}... 
(attempt {attempt + 1})", - style="dim", + from_skill=from_skill, + from_trace=from_trace, + agent=agent, + skill_gen_model=skill_gen_model, ) - # Create run folder for logging (2 folders per attempt: baseline + with_skill) - run_folder = None - if log_runs and batch_folder: - baseline_run_num = attempt * 2 + 1 - run_folder = create_run_folder(batch_folder, baseline_run_num) - write_run_metadata( - run_folder, - RunMetadata( - model=skill_gen_model, - task=task, - batch_id=batch_id or "", - run_number=baseline_run_num, - ), - ) - - console.print("[dim]Starting evaluation run...[/dim]") - - results = await evaluate_skill( - skill, - test_cases=test_cases, - evaluator=agent.evaluator, - model=skill_gen_model, - show_baseline_progress=False, + console.print(f"Generating test cases with {test_gen_model}...", style="dim") + test_cases = await generate_tests( + eval_task, + generator=agent.test_gen, ) - - # Log run results (both baseline and with-skill for plot command) - if log_runs and run_folder: - # Log baseline result - baseline_result = RunResult( - metadata=RunMetadata( - model=skill_gen_model, - task=task, - batch_id=batch_id or "", - run_number=baseline_run_num, - ), - stats=aggregate_conversation_stats(results.baseline_results), - passed=results.baseline_success_rate > 0.5, - assertions_passed=int(results.baseline_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="baseline", - skill_name=skill.name, - ) - write_run_result(run_folder, baseline_result) - run_results.append(baseline_result) - - # Log with-skill result (in a separate folder) - with_skill_folder = create_run_folder(batch_folder, attempt * 2 + 2) - with_skill_result = RunResult( - metadata=RunMetadata( - model=skill_gen_model, - task=task, - batch_id=batch_id or "", - run_number=attempt * 2 + 2, - ), - stats=aggregate_conversation_stats(results.with_skill_results), - passed=results.is_beneficial, - assertions_passed=int(results.with_skill_success_rate * len(test_cases)), 
- assertions_total=len(test_cases), - run_type="with_skill", - skill_name=skill.name, + skill_record.state.tests = list(test_cases) + + if no_eval: + _save_and_display(skill_record, output, config, artifact_path=batch_folder) + return + + if executor_name == "jobs" and not wait: + if jobs_config is None: + raise RuntimeError("Jobs config was not initialized.") + job_refs = await _submit_generate_jobs_eval( + skill=skill_record.skill, + test_cases=test_cases, + model=skill_gen_model, + jobs_config=jobs_config, + config=config, + cards_path=cards_path, + batch_folder=batch_folder, ) - write_run_metadata(with_skill_folder, with_skill_result.metadata) - write_run_result(with_skill_folder, with_skill_result) - run_results.append(with_skill_result) - - lift = results.skill_lift - lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" - - if results.is_beneficial: console.print( - f" {results.baseline_success_rate:.0%} -> " - f"{results.with_skill_success_rate:.0%} ({lift_str}) [green]OK[/green]" + "[yellow]Remote fast-agent requests submitted without --wait; " + "refinement is skipped for this run.[/yellow]" ) - break - - console.print( - f" {results.baseline_success_rate:.0%} -> " - f"{results.with_skill_success_rate:.0%} ({lift_str}) not good enough" + console.print(f"Remote fast-agent job id(s): {', '.join(job_refs)}") + _save_and_display(skill_record, output, config, artifact_path=batch_folder) + return + + executor = _build_executor( + executor_name, + jobs_config=jobs_config, + progress_callback=_print_eval_progress, ) - if abs(results.with_skill_success_rate - prev_success_rate) < 0.05: - console.print(" [yellow]Plateaued, stopping[/yellow]") - break - - prev_success_rate = results.with_skill_success_rate - - if attempt < attempts - 1: - console.print("Refining...", style="dim") - failures = get_failure_descriptions(results) - await _set_agent_model(agent.skill_gen, skill_gen_model) - skill = await refine_skill( - skill, - failures, - generator=agent.skill_gen, 
- model=skill_gen_model, - ) - - # If eval_model specified, also eval on that model - eval_results = None - if extra_eval_model: - console.print(f"Evaluating on {extra_eval_model}...", style="dim") - - # Create run folder for eval model - run_folder = None - if log_runs and batch_folder: - run_number = len(run_results) + 1 - run_folder = create_run_folder(batch_folder, run_number) - write_run_metadata( - run_folder, - RunMetadata( - model=extra_eval_model, - task=task, - batch_id=batch_id or "", - run_number=run_number, - ), - ) - - eval_results = await evaluate_skill( - skill, - test_cases, - evaluator=agent.evaluator, - model=extra_eval_model, - show_baseline_progress=False, + skill_record, results, run_results = await _run_generate_refinement_loop( + skill_record=skill_record, + task=eval_task, + test_cases=test_cases, + executor=executor, + config=config, + cards_path=cards_path, + batch_id=batch_id, + batch_folder=batch_folder, + skill_gen_model=skill_gen_model, + log_runs=log_runs, + max_parallel=max_parallel, + agent=agent, ) - # Log eval run results (both baseline and with-skill) - if log_runs and run_folder: - # Log baseline result - baseline_result = RunResult( - metadata=RunMetadata( - model=extra_eval_model, - task=task, - batch_id=batch_id or "", - run_number=run_number, - ), - stats=aggregate_conversation_stats(eval_results.baseline_results), - passed=eval_results.baseline_success_rate > 0.5, - assertions_passed=int(eval_results.baseline_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="baseline", - skill_name=skill.name, - ) - write_run_result(run_folder, baseline_result) - run_results.append(baseline_result) - - # Log with-skill result - with_skill_folder = create_run_folder(batch_folder, run_number + 1) - with_skill_result = RunResult( - metadata=RunMetadata( - model=extra_eval_model, - task=task, - batch_id=batch_id or "", - run_number=run_number + 1, - ), - 
stats=aggregate_conversation_stats(eval_results.with_skill_results), - passed=eval_results.is_beneficial, - assertions_passed=int(eval_results.with_skill_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="with_skill", - skill_name=skill.name, + # If eval_model specified, also eval on that model + eval_results = None + if extra_eval_model: + eval_results, extra_run_results = await _run_generate_extra_eval( + skill_record=skill_record, + task=eval_task, + test_cases=test_cases, + executor=executor, + config=config, + cards_path=cards_path, + batch_id=batch_id, + batch_folder=batch_folder, + model=extra_eval_model, + log_runs=log_runs, + max_parallel=max_parallel, + first_run_number=len(run_results) + 1, ) - write_run_metadata(with_skill_folder, with_skill_result.metadata) - write_run_result(with_skill_folder, with_skill_result) - run_results.append(with_skill_result) - - lift = eval_results.skill_lift - lift_str = f"+{lift:.0%}" if lift > 0 else f"{lift:.0%}" - console.print( - f" {eval_results.baseline_success_rate:.0%} -> " - f"{eval_results.with_skill_success_rate:.0%} ({lift_str})" - ) + run_results.extend(extra_run_results) - # Write batch summary - if log_runs and batch_folder and batch_id: - summary = BatchSummary( - batch_id=batch_id, - model=skill_gen_model, - task=task, - total_runs=len(run_results), - passed_runs=sum(1 for r in run_results if r.passed), - results=run_results, - ) - write_batch_summary(batch_folder, summary) + # Write batch summary + if log_runs: + summary = BatchSummary( + batch_id=batch_id, + model=skill_gen_model, + task=eval_task, + total_runs=len(run_results), + passed_runs=sum(1 for r in run_results if r.passed), + results=run_results, + ) + write_batch_summary(batch_folder, summary) - if not no_eval and skill is not None: + if not no_eval: if results: - skill.metadata.test_pass_rate = results.with_skill_success_rate + skill_record.state.metadata.test_pass_rate = results.with_skill_success_rate else: 
console.print( "[yellow]No evaluation results available; skipping report output.[/yellow]" ) _save_and_display( - skill, + skill_record, output, config, results, eval_results, skill_gen_model, extra_eval_model, + batch_folder, ) - - - def _save_and_display( - skill: Skill, + skill_record: SkillRecord, output: str | None, config: Config, - results=None, - eval_results=None, + results: EvalResults | None = None, + eval_results: EvalResults | None = None, skill_gen_model: str | None = None, eval_model: str | None = None, + artifact_path: Path | None = None, ): """Save skill and display summary.""" - if output: - output_path = Path(output) - else: - output_path = config.skills_dir / skill.name + skill = skill_record.skill + output_path = Path(output) if output else config.skills_dir / skill.name - skill.save(output_path) + skill_record.save(output_path) console.print("[dim]Rendering report output...[/dim]") @@ -733,6 +1411,8 @@ def _save_and_display( console.print() console.print(f"Saved to {output_path}") + if artifact_path is not None: + console.print(f"Artifacts saved to {artifact_path}") @main.command("eval") @@ -749,23 +1429,39 @@ def _save_and_display( "--test-gen-model", help="Override test generation model when tests must be generated", ) -@click.option("--runs", "num_runs", type=int, default=1, help="Number of runs per model") +@click.option( + "--runs", + "num_runs", + type=click.IntRange(min=1), + default=None, + help="Number of runs per model. 
Overrides `num_runs` in upskill.config.yaml.", +) @click.option( "--no-baseline", is_flag=True, help="Skip baseline comparison in simple eval mode (ignored in benchmark mode)", ) @click.option("-v", "--verbose", is_flag=True, help="Show per-test results") -@click.option("--log-runs/--no-log-runs", default=True, help="Log run data (default: enabled)") -@click.option("--runs-dir", type=click.Path(), help="Directory for run logs") +@_jobs_execution_options( + executor_help="Execution backend for evaluation runs", + runs_dir_help="Directory for run logs", +) def eval_cmd( skill_path: str, tests: str | None, models: tuple[str, ...], test_gen_model: str | None, - num_runs: int, + num_runs: int | None, no_baseline: bool, verbose: bool, + executor: ExecutorName | None, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str | None, + jobs_namespace: str | None, + max_parallel: int | None, log_runs: bool, runs_dir: str | None, ): @@ -803,27 +1499,62 @@ def eval_cmd( num_runs, no_baseline, verbose, + executor, + artifact_repo, + wait, + jobs_timeout, + jobs_flavor, + jobs_secrets, + jobs_namespace, + max_parallel, log_runs, runs_dir, ) ) -async def _eval_async( +async def _eval_async( # noqa: C901 skill_path: str, tests: str | None, models: list[str] | None, test_gen_model: str | None, - num_runs: int, + num_runs: int | None, no_baseline: bool, verbose: bool, + executor_name: ExecutorName | None, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str | None, + jobs_namespace: str | None, + max_parallel: int | None, log_runs: bool, runs_dir: str | None, ): """Async implementation of eval command.""" - from upskill.evaluate import run_test - config = Config.load() + executor_name = _resolve_executor_name(config, executor_name) + num_runs = _resolve_num_runs(config, num_runs, command="eval") + max_parallel = _resolve_max_parallel(config, max_parallel) + jobs_secrets = 
_resolve_jobs_secrets(config, jobs_secrets) + jobs_config = _require_jobs_config( + executor_name=executor_name, + artifact_repo=artifact_repo, + wait=wait, + jobs_timeout=jobs_timeout, + jobs_flavor=jobs_flavor, + jobs_secrets=jobs_secrets, + jobs_namespace=jobs_namespace, + jobs_image=config.jobs_image, + ) + executor = None + if executor_name == "local" or wait: + executor = _build_executor( + executor_name, + jobs_config=jobs_config, + progress_callback=_print_eval_progress, + ) resolved = resolve_models( "eval", config=config, @@ -842,6 +1573,7 @@ async def _eval_async( field="test_generation_model", command="eval", ) + model_references = build_fastagent_model_references(config=config, resolved=resolved) _print_model_plan("eval", resolved, runs=num_runs) if resolved.is_benchmark_mode and no_baseline: @@ -852,202 +1584,110 @@ async def _eval_async( skill_dir = Path(skill_path) try: - skill = Skill.load(skill_dir) + skill_record = SkillRecord.load(skill_dir) except FileNotFoundError: console.print(f"[red]No SKILL.md found in {skill_dir}[/red]") sys.exit(1) + skill = skill_record.skill - async with _fast_agent_context(config) as agent: - # Load test cases - test_cases: list[TestCase] = [] - if tests: - with open(tests, encoding="utf-8") as f: - data = json.load(f) - if "cases" in data: - test_cases = [TestCase(**tc) for tc in data["cases"]] - else: - test_cases = [TestCase(**tc) for tc in data] - test_source = f"tests file: {tests}" - elif skill.tests: - test_cases = skill.tests - test_source = "skill_meta.json" - else: - console.print("Generating test cases from skill...", style="dim") - await _set_agent_model(agent.test_gen, test_gen_model) - test_cases = await generate_tests( - skill.description, - generator=agent.test_gen, - model=test_gen_model, - ) - test_source = "generated" + test_cases, test_source = await _load_test_cases( + config=config, + skill_record=skill_record, + tests_path=tests, + test_gen_model=test_gen_model, + 
model_references=model_references, + ) - invalid_expected = 0 - for tc in test_cases: - expected_values = [value.strip() for value in tc.expected.contains if value.strip()] - if len(expected_values) < 2: - invalid_expected += 1 - console.print( - f"[dim]Loaded {len(test_cases)} test case(s) from {test_source}[/dim]" - ) - if invalid_expected: - console.print( - f"[yellow]{invalid_expected} test case(s) missing expected strings[/yellow]" + invalid_expected = _count_invalid_expected_cases(test_cases) + console.print(f"[dim]Loaded {len(test_cases)} test case(s) from {test_source}[/dim]") + if invalid_expected: + console.print(f"[yellow]{invalid_expected} test case(s) missing expected strings[/yellow]") + + runs_path = Path(runs_dir) if runs_dir else config.runs_dir + batch_id, batch_folder = create_batch_folder(runs_path) + console.print(f"Artifacts saved to: {batch_folder}", style="dim") + if log_runs: + console.print(f"Logging to: {batch_folder}", style="dim") + + if executor_name == "jobs" and not wait: + if jobs_config is None: + raise RuntimeError("Jobs config was not initialized.") + cards = resources.files("upskill").joinpath("agent_cards") + with resources.as_file(cards) as cards_path: + if resolved.is_benchmark_mode: + submitted_job_refs: list[str] = [] + + for model in evaluation_models: + console.print(f"[bold]{model}[/bold]") + for run_num in range(1, num_runs + 1): + job_refs = await _submit_remote_eval_jobs( + skill=skill, + test_cases=test_cases, + model=model, + jobs_config=jobs_config, + fastagent_config_path=config.effective_fastagent_config, + cards_path=cards_path, + artifact_root=batch_folder + / "remote_downloads" + / model + / f"run_{run_num}", + run_baseline=False, + operation="benchmark", + ) + submitted_job_refs.extend(job_refs) + console.print(f"Remote fast-agent job id(s): {', '.join(job_refs)}") + console.print( + f"Submitted remote fast-agent job id(s): {', '.join(submitted_job_refs)}" + ) + return + + job_refs = await 
_submit_remote_eval_jobs( + skill=skill, + test_cases=test_cases, + model=evaluation_models[0], + jobs_config=jobs_config, + fastagent_config_path=config.effective_fastagent_config, + cards_path=cards_path, + artifact_root=batch_folder / "remote_downloads", + run_baseline=resolved.run_baseline, + operation="eval", ) + console.print(f"Remote fast-agent job id(s): {', '.join(job_refs)}") + return - # Setup run logging - batch_id = None - batch_folder = None - if log_runs: - runs_path = Path(runs_dir) if runs_dir else config.runs_dir - batch_id, batch_folder = create_batch_folder(runs_path) - console.print(f"Logging to: {batch_folder}", style="dim") + if executor is None: + raise RuntimeError("Local executor was not initialized.") + cards = resources.files("upskill").joinpath("agent_cards") + with resources.as_file(cards) as cards_path: if resolved.is_benchmark_mode: - # Benchmark mode: multiple models and/or runs console.print( f"\nEvaluating [bold]{skill.name}[/bold] across {len(evaluation_models)} model(s)" ) - console.print( - f" {len(test_cases)} test case(s), " - f"{num_runs} run(s) per model\n" + console.print(f" {len(test_cases)} test case(s), {num_runs} run(s) per model\n") + model_results, all_run_results = await _run_with_skill_benchmark( + skill_record=skill_record, + evaluation_models=evaluation_models, + num_runs=num_runs, + test_cases=test_cases, + executor=executor, + config=config, + cards_path=cards_path, + batch_id=batch_id, + batch_folder=batch_folder, + verbose=verbose, + log_runs=log_runs, + max_parallel=max_parallel, ) - - model_results: dict[str, list[RunResult]] = {m: [] for m in evaluation_models} - all_run_results: list[RunResult] = [] - - for model in evaluation_models: - console.print(f"[bold]{model}[/bold]") - - for run_num in range(1, num_runs + 1): - run_folder = None - if log_runs and batch_folder: - run_folder = create_run_folder( - batch_folder, len(all_run_results) + 1 - ) - - # Run each test case - total_assertions_passed = 0 - 
total_assertions = 0 - all_passed = True - run_test_results: list[TestResult] = [] - - for tc_idx, tc in enumerate(test_cases, 1): - if verbose: - console.print( - f" Running test {tc_idx}/{len(test_cases)}...", - style="dim", - ) - - try: - result = await run_test( - tc, - evaluator=agent.evaluator, - skill=skill, - model=model, - instance_name=( - f"eval ({model} run {run_num} test {tc_idx})" - ), - ) - except Exception as e: - console.print(f" [red]Test error: {e}[/red]") - result = TestResult(test_case=tc, success=False, error=str(e)) - - # Extract assertion counts - if result.validation_result: - total_assertions_passed += result.validation_result.assertions_passed - total_assertions += result.validation_result.assertions_total - if verbose and result.validation_result.error_message: - console.print( - f" Validation: {result.validation_result.error_message}", - style="dim", - ) - elif result.error: - if verbose: - console.print(f" Error: {result.error}", style="dim") - total_assertions += 1 - else: - total_assertions += 1 - if result.success: - total_assertions_passed += 1 - - run_test_results.append(result) - if not result.success: - all_passed = False - - aggregated_stats = aggregate_conversation_stats(run_test_results) - - run_result = RunResult( - metadata=RunMetadata( - model=model, - task=skill.description, - batch_id=batch_id or "", - run_number=run_num, - ), - stats=aggregated_stats, - passed=all_passed, - assertions_passed=total_assertions_passed, - assertions_total=total_assertions, - run_type="with_skill", - skill_name=skill.name, - ) - - if run_folder: - write_run_metadata(run_folder, run_result.metadata) - write_run_result(run_folder, run_result) - - model_results[model].append(run_result) - all_run_results.append(run_result) - - # Display progress - status = "[green]PASS[/green]" if all_passed else "[red]FAIL[/red]" - if verbose: - console.print( - f" Run {run_num}: {status} " - f"({total_assertions_passed}/{total_assertions} assertions passed)" 
- ) - - console.print() - - # Summary report - console.print("\n[bold]Summary[/bold]\n") - - for model, results in model_results.items(): - total_runs = len(results) - passed_runs = sum(1 for r in results if r.passed) - avg_tokens = ( - sum(r.stats.total_tokens for r in results) / total_runs if total_runs else 0 - ) - avg_turns = sum(r.stats.turns for r in results) / total_runs if total_runs else 0 - - pass_rate = passed_runs / total_runs if total_runs else 0 - pass_rate_str = f"{pass_rate:.0%}" - if pass_rate > 0.5: - pass_rate_style = "green" - elif pass_rate > 0: - pass_rate_style = "yellow" - else: - pass_rate_style = "red" - - console.print(f"[bold]{model}[/bold]") - console.print( - " Runs: " - f"{total_runs} | Passed: {passed_runs} ([{pass_rate_style}]" - f"{pass_rate_str}[/{pass_rate_style}])" - ) - console.print(f" Avg tokens: {avg_tokens:.0f} | Avg turns: {avg_turns:.1f}") - console.print() - - # Write batch summary - if log_runs and batch_folder and batch_id: - summary = BatchSummary( + _print_benchmark_summary(model_results) + if log_runs: + _write_benchmark_summary( + batch_folder=batch_folder, batch_id=batch_id, - model=", ".join(evaluation_models), + evaluation_models=evaluation_models, task=skill.description, - total_runs=len(all_run_results), - passed_runs=sum(1 for r in all_run_results if r.passed), - results=all_run_results, + all_run_results=all_run_results, ) - write_batch_summary(batch_folder, summary) else: # Simple eval mode: single model, single run @@ -1057,64 +1697,41 @@ async def _eval_async( results = await evaluate_skill( skill, test_cases, - evaluator=agent.evaluator, + executor=executor, model=model, + fastagent_config_path=config.effective_fastagent_config, + cards_source_dir=cards_path, + artifact_root=batch_folder / "eval", run_baseline=resolved.run_baseline, - show_baseline_progress=verbose, + max_parallel=max_parallel, + progress_callback=_print_eval_progress, + operation="eval", ) + _raise_on_execution_errors(results, 
context=f"Evaluation on {model}") # Log results (both baseline and with-skill) run_results: list[RunResult] = [] - if log_runs and batch_folder: - # Log baseline result - if resolved.run_baseline: - baseline_folder = create_run_folder(batch_folder, 1) - baseline_result = RunResult( - metadata=RunMetadata( - model=model, - task=skill.description, - batch_id=batch_id or "", - run_number=1, - ), - stats=aggregate_conversation_stats(results.baseline_results), - passed=results.baseline_success_rate > 0.5, - assertions_passed=int(results.baseline_success_rate * len(test_cases)), - assertions_total=len(test_cases), - run_type="baseline", - skill_name=skill.name, - ) - write_run_metadata(baseline_folder, baseline_result.metadata) - write_run_result(baseline_folder, baseline_result) - run_results.append(baseline_result) - - # Log with-skill result - with_skill_folder = create_run_folder( - batch_folder, - 2 if resolved.run_baseline else 1, - ) - with_skill_result = RunResult( - metadata=RunMetadata( - model=model, - task=skill.description, - batch_id=batch_id or "", - run_number=2 if resolved.run_baseline else 1, - ), - stats=aggregate_conversation_stats(results.with_skill_results), - passed=results.is_beneficial - if resolved.run_baseline - else results.with_skill_success_rate > 0.5, - assertions_passed=int(results.with_skill_success_rate * len(test_cases)), + if log_runs: + run_results = _persist_comparison_run_results( + batch_folder=batch_folder, + model=model, + task=skill.description, + batch_id=batch_id, + first_run_number=1, + results=results, assertions_total=len(test_cases), - run_type="with_skill", + run_baseline=resolved.run_baseline, + with_skill_passed=( + results.is_beneficial + if resolved.run_baseline + else results.with_skill_success_rate > 0.5 + ), skill_name=skill.name, ) - write_run_metadata(with_skill_folder, with_skill_result.metadata) - write_run_result(with_skill_folder, with_skill_result) - run_results.append(with_skill_result) # Write batch 
summary summary = BatchSummary( - batch_id=batch_id or "", + batch_id=batch_id, model=model, task=skill.description, total_runs=len(run_results), @@ -1126,7 +1743,8 @@ async def _eval_async( if verbose and resolved.run_baseline: console.print() for i, (with_r, base_r) in enumerate( - zip(results.with_skill_results, results.baseline_results), 1 + zip(results.with_skill_results, results.baseline_results, strict=True), + 1, ): base_icon = "[green]OK[/green]" if base_r.success else "[red]FAIL[/red]" skill_icon = "[green]OK[/green]" if with_r.success else "[red]FAIL[/red]" @@ -1171,6 +1789,7 @@ async def _eval_async( console.print(f" with skill {with_skill_bar} {with_skill_rate:>5.0%}") console.print(f" tokens: {results.with_skill_total_tokens}") + console.print(f"\nArtifacts saved to: {batch_folder}") if resolved.run_baseline: if results.is_beneficial: console.print("\n[green]Recommendation: keep skill[/green]") @@ -1184,10 +1803,7 @@ async def _eval_async( def list_cmd(skills_dir: str | None, verbose: bool): """List generated skills.""" config = Config.load() - if skills_dir: - path = Path(skills_dir) - else: - path = config.skills_dir + path = Path(skills_dir) if skills_dir else config.skills_dir if not path.exists(): console.print(f"No skills directory found at {path}") @@ -1266,22 +1882,76 @@ def list_cmd(skills_dir: str | None, verbose: bool): required=True, help="Evaluation model(s) to benchmark (repeatable)", ) -@click.option("--runs", "num_runs", type=int, default=3, help="Runs per model (default: 3)") +@click.option( + "--runs", + "num_runs", + type=click.IntRange(min=1), + default=None, + help="Runs per model. 
Overrides `num_runs` in upskill.config.yaml.", +) @click.option("-t", "--tests", type=click.Path(exists=True), help="Test cases JSON file") @click.option( "--test-gen-model", help="Override test generation model when tests must be generated", ) +@click.option( + "--executor", + type=click.Choice(["local", "jobs"]), + default=None, + help="Execution backend for benchmark runs. Overrides `executor` in upskill.config.yaml.", +) +@click.option("--artifact-repo", help="Dataset repo for remote fast-agent job artifacts") +@click.option( + "--wait/--no-wait", default=True, help="Wait for remote fast-agent jobs and download results" +) +@click.option( + "--jobs-timeout", + default="2h", + show_default=True, + help="HF Jobs timeout for remote fast-agent runs", +) +@click.option( + "--jobs-flavor", + default="cpu-basic", + show_default=True, + help="HF Jobs hardware flavor for remote fast-agent runs", +) +@click.option( + "--jobs-secrets", + default=None, + help=( + "Comma-separated HF Job secret names to forward. Overrides " + "`jobs_secrets` in upskill.config.yaml." + ), +) +@click.option("--jobs-namespace", help="Optional Hugging Face Jobs namespace") @click.option("-o", "--output", type=click.Path(), help="Output directory for results") @click.option("-v", "--verbose", is_flag=True, help="Show per-run details") +@click.option( + "--max-parallel", + type=click.IntRange(min=1), + default=None, + help=( + "Maximum concurrent evaluation executions per phase. Overrides " + "`max_parallel` in upskill.config.yaml." + ), +) def benchmark_cmd( skill_path: str, models: tuple[str, ...], - num_runs: int, + num_runs: int | None, tests: str | None, test_gen_model: str | None, + executor: ExecutorName | None, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str | None, + jobs_namespace: str | None, output: str | None, verbose: bool, + max_parallel: int | None, ): """Benchmark a skill across multiple models. 
@@ -1306,8 +1976,16 @@ def benchmark_cmd( test_gen_model, num_runs, tests, + executor, + artifact_repo, + wait, + jobs_timeout, + jobs_flavor, + jobs_secrets, + jobs_namespace, output, verbose, + max_parallel, ) ) @@ -1316,15 +1994,45 @@ async def _benchmark_async( skill_path: str, models: list[str], test_gen_model: str | None, - num_runs: int, + num_runs: int | None, tests_path: str | None, + executor_name: ExecutorName | None, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str | None, + jobs_namespace: str | None, output_dir: str | None, verbose: bool, + max_parallel: int | None, ): """Async implementation of benchmark command.""" - from upskill.evaluate import run_test - config = Config.load() + executor_name = _resolve_executor_name(config, executor_name) + num_runs = _resolve_num_runs(config, num_runs, command="benchmark") + max_parallel = _resolve_max_parallel(config, max_parallel) + jobs_secrets = _resolve_jobs_secrets(config, jobs_secrets) + jobs_config = _require_jobs_config( + executor_name=executor_name, + artifact_repo=artifact_repo, + wait=wait, + jobs_timeout=jobs_timeout, + jobs_flavor=jobs_flavor, + jobs_secrets=jobs_secrets, + jobs_namespace=jobs_namespace, + jobs_image=config.jobs_image, + ) + if executor_name == "jobs" and not wait: + raise click.ClickException( + "`benchmark --executor jobs` currently requires `--wait` to assemble results from " + "downloaded fast-agent artifacts." 
+ ) + executor = _build_executor( + executor_name, + jobs_config=jobs_config, + progress_callback=_print_eval_progress, + ) resolved = resolve_models( "benchmark", config=config, @@ -1342,169 +2050,56 @@ async def _benchmark_async( field="test_generation_model", command="benchmark", ) + model_references = build_fastagent_model_references(config=config, resolved=resolved) _print_model_plan("benchmark", resolved, runs=num_runs) - skill = Skill.load(Path(skill_path)) + skill_record = SkillRecord.load(Path(skill_path)) + skill = skill_record.skill - async with _fast_agent_context(config) as agent: - # Load test cases - if tests_path: - with open(tests_path, encoding="utf-8") as f: - data = json.load(f) - if "cases" in data: - test_cases = [TestCase(**tc) for tc in data["cases"]] - else: - test_cases = [TestCase(**tc) for tc in data] - elif skill.tests: - test_cases = skill.tests - else: - console.print("Generating test cases from skill...", style="dim") - await _set_agent_model(agent.test_gen, test_gen_model) - test_cases = await generate_tests( - skill.description, - generator=agent.test_gen, - model=test_gen_model, - ) + cards = resources.files("upskill").joinpath("agent_cards") + with resources.as_file(cards) as cards_path: + test_cases, _ = await _load_test_cases( + config=config, + skill_record=skill_record, + tests_path=tests_path, + test_gen_model=test_gen_model, + model_references=model_references, + ) # Setup output directory - if output_dir: - out_path = Path(output_dir) - else: - out_path = config.runs_dir + out_path = Path(output_dir) if output_dir else config.runs_dir batch_id, batch_folder = create_batch_folder(out_path) console.print(f"Results will be saved to: {batch_folder}", style="dim") - # Track results per model - model_results: dict[str, list[RunResult]] = {m: [] for m in evaluation_models} - all_run_results: list[RunResult] = [] - console.print( f"\nBenchmarking [bold]{skill.name}[/bold] across {len(evaluation_models)} model(s)" ) 
console.print(f" {len(test_cases)} test case(s), {num_runs} run(s) per model\n") - - for model in evaluation_models: - console.print(f"[bold]{model}[/bold]") - - for run_num in range(1, num_runs + 1): - run_folder = create_run_folder(batch_folder, len(all_run_results) + 1) - - # Run each test case - total_assertions_passed = 0 - total_assertions = 0 - all_passed = True - run_results: list[TestResult] = [] - - for tc_idx, tc in enumerate(test_cases, 1): - if verbose: - console.print(f" Running test {tc_idx}/{len(test_cases)}...", style="dim") - - try: - result = await run_test( - tc, - evaluator=agent.evaluator, - skill=skill, - model=model, - instance_name=( - f"benchmark ({model} run {run_num} test {tc_idx})" - ), - ) - except Exception as e: - console.print(f" [red]Test error: {e}[/red]") - result = TestResult(test_case=tc, success=False, error=str(e)) - - # Extract assertion counts from validation result - if result.validation_result: - total_assertions_passed += result.validation_result.assertions_passed - total_assertions += result.validation_result.assertions_total - if verbose and result.validation_result.error_message: - console.print( - f" Validation: {result.validation_result.error_message}", - style="dim", - ) - elif result.error: - if verbose: - console.print(f" Error: {result.error}", style="dim") - # Legacy: count as 1 assertion (failed) - total_assertions += 1 - else: - # Legacy: count as 1 assertion - total_assertions += 1 - if result.success: - total_assertions_passed += 1 - - run_results.append(result) - - if not result.success: - all_passed = False - - aggregated_stats = aggregate_conversation_stats(run_results) - - # Create run result - run_result = RunResult( - metadata=RunMetadata( - model=model, - task=skill.description, - batch_id=batch_id, - run_number=run_num, - ), - stats=aggregated_stats, - passed=all_passed, - assertions_passed=total_assertions_passed, - assertions_total=total_assertions, - run_type="with_skill", - 
skill_name=skill.name, - ) - - write_run_metadata(run_folder, run_result.metadata) - write_run_result(run_folder, run_result) - model_results[model].append(run_result) - all_run_results.append(run_result) - - # Display progress - status = "[green]PASS[/green]" if all_passed else "[red]FAIL[/red]" - if verbose: - console.print( - f" Run {run_num}: {status} " - f"({total_assertions_passed}/{total_assertions} assertions passed)" - ) - - console.print("\n[bold]Summary[/bold]\n") - - for model, results in model_results.items(): - total_runs = len(results) - passed_runs = sum(1 for r in results if r.passed) - avg_tokens = ( - sum(r.stats.total_tokens for r in results) / total_runs if total_runs else 0 - ) - avg_turns = ( - sum(r.stats.turns for r in results) / total_runs if total_runs else 0 - ) - - pass_rate = passed_runs / total_runs if total_runs else 0 - pass_rate_str = f"{pass_rate:.0%}" - pass_rate_style = "green" if pass_rate > 0.5 else "yellow" if pass_rate > 0 else "red" - - console.print(f"[bold]{model}[/bold]") - console.print( - " Runs: " - f"{total_runs} | Passed: {passed_runs} ([{pass_rate_style}]" - f"{pass_rate_str}[/{pass_rate_style}])" - ) - console.print(f" Avg tokens: {avg_tokens:.0f} | Avg turns: {avg_turns:.1f}") - console.print() - - summary = BatchSummary( + model_results, all_run_results = await _run_with_skill_benchmark( + skill_record=skill_record, + evaluation_models=evaluation_models, + num_runs=num_runs, + test_cases=test_cases, + executor=executor, + config=config, + cards_path=cards_path, + batch_id=batch_id, + batch_folder=batch_folder, + verbose=verbose, + log_runs=True, + max_parallel=max_parallel, + ) + _print_benchmark_summary(model_results) + _write_benchmark_summary( + batch_folder=batch_folder, batch_id=batch_id, - model=", ".join(evaluation_models), + evaluation_models=evaluation_models, task=skill.description, - total_runs=len(all_run_results), - passed_runs=sum(1 for r in all_run_results if r.passed), - results=all_run_results, 
+ all_run_results=all_run_results, ) - write_batch_summary(batch_folder, summary) + @main.command("runs") @click.option("-d", "--dir", "runs_dir", type=click.Path(exists=True), help="Runs directory") @@ -1582,13 +2177,13 @@ def runs_cmd( sys.exit(0) # Aggregate by model and skill (take most recent / highest) - aggregated: dict[tuple[str, str], dict] = {} + aggregated: dict[tuple[str, str], EvalPlotResult] = {} for r in all_results: key = (r["model"], r["skill_name"]) if key not in aggregated or r["with_skill_rate"] > aggregated[key]["with_skill_rate"]: aggregated[key] = r - results_list = list(aggregated.values()) + results_list: list[EvalPlotResult] = list(aggregated.values()) # Determine display mode unique_skills = set(r["skill_name"] for r in results_list) @@ -1598,7 +2193,7 @@ def runs_cmd( if len(unique_skills) == 1 and len(unique_models) >= 1: # Single skill, multiple models - use Panel - skill_name = list(unique_skills)[0] + skill_name = next(iter(unique_skills)) # Build content for panel content_lines = [] @@ -1611,7 +2206,7 @@ def runs_cmd( elif len(unique_models) == 1 and len(unique_skills) >= 1: # Single model, multiple skills - use Panel - model_name = list(unique_models)[0] + model_name = next(iter(unique_models)) content_lines = [] for r in sorted(results_list, key=lambda x: x["skill_name"]): @@ -1667,10 +2262,10 @@ def plot_cmd( def _format_comparison_bars( result: EvalPlotResult, metric: str, - label_field: str = "model", + label_field: EvalPlotLabelField = "model", ) -> str: """Format baseline vs with-skill comparison bars for a single result as string.""" - label = result[label_field] + label = result["skill_name"] if label_field == "skill_name" else result["model"] has_baseline = result["has_baseline"] lines = [f"[bold]{label}[/bold]"] @@ -1694,8 +2289,7 @@ def _format_comparison_bars( ) else: lines.append( - " with skill " - f"{with_skill_bar} {with_skill_val:>5.0%} [dim](no baseline)[/dim]" + f" with skill {with_skill_bar} 
{with_skill_val:>5.0%} [dim](no baseline)[/dim]" ) else: # tokens with_skill_val = result["with_skill_tokens"] @@ -1720,8 +2314,7 @@ def _format_comparison_bars( else: with_skill_bar = _render_bar(1.0 if with_skill_val > 0 else 0) lines.append( - " with skill " - f"{with_skill_bar} {with_skill_val:>6} [dim](no baseline)[/dim]" + f" with skill {with_skill_bar} {with_skill_val:>6} [dim](no baseline)[/dim]" ) return "\n".join(lines) @@ -1730,7 +2323,7 @@ def _format_comparison_bars( def _print_comparison_bars( result: EvalPlotResult, metric: str, - label_field: str = "model", + label_field: EvalPlotLabelField = "model", ) -> None: """Print baseline vs with-skill comparison bars for a single result.""" console.print(_format_comparison_bars(result, metric, label_field)) diff --git a/src/upskill/config.py b/src/upskill/config.py index 01f1f5c..9b86b56 100644 --- a/src/upskill/config.py +++ b/src/upskill/config.py @@ -5,6 +5,7 @@ import os from dataclasses import dataclass from pathlib import Path +from typing import Literal import yaml from pydantic import AliasChoices, BaseModel, ConfigDict, Field @@ -130,9 +131,32 @@ class Config(BaseModel): ) # Generation settings - auto_eval: bool = Field(default=True, description="Run eval after generation") max_refine_attempts: int = Field(default=2, description="Max refinement iterations") + # Execution settings + executor: Literal["local", "jobs"] = Field( + default="local", + description="Default execution backend for evaluation and refinement", + ) + num_runs: int | None = Field( + default=None, + ge=1, + description="Default runs per model for eval/benchmark when CLI --runs is omitted", + ) + max_parallel: int = Field( + default=5, + ge=1, + description="Maximum concurrent evaluation executions per phase", + ) + jobs_secrets: str = Field( + default="HF_TOKEN", + description="Comma-separated env var names to forward to HF Jobs when using executor=jobs", + ) + jobs_image: str = Field( + 
default="ghcr.io/astral-sh/uv:python3.13-bookworm", + description="Container image used for HF Jobs when using executor=jobs", + ) + # FastAgent settings fastagent_config: Path | None = Field(default=None, description="Path to fastagent.config.yaml") @@ -176,6 +200,14 @@ def effective_eval_model(self) -> str: """Get the model to use for evaluation.""" return self.eval_model or self.skill_generation_model + def effective_num_runs(self, command: Literal["eval", "benchmark"]) -> int: + """Get the number of runs to use when CLI ``--runs`` is omitted.""" + if self.num_runs is not None: + return self.num_runs + if command == "benchmark": + return 3 + return 1 + @property def model(self) -> str: """Backward-compatible alias for ``skill_generation_model``.""" diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py index fdc07c4..b5412ca 100644 --- a/src/upskill/evaluate.py +++ b/src/upskill/evaluate.py @@ -1,29 +1,22 @@ -"""Skill evaluation - compare agent performance with and without skills using FastAgent.""" +"""Skill evaluation orchestration backed by an execution backend.""" from __future__ import annotations import asyncio +import json import logging -import shutil -import tempfile -from collections.abc import Generator -from contextlib import contextmanager, nullcontext -from pathlib import Path - -from fast_agent import ConversationSummary -from fast_agent.agents.llm_agent import LlmAgent - -try: - from fast_agent.ui.rich_progress import progress_display -except Exception: # pragma: no cover - defensive import for older fast-agent versions - progress_display = None - -from upskill.fastagent_integration import ( - compose_instruction, -) -from upskill.logging import extract_stats_from_summary +from dataclasses import dataclass +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Callable + from pathlib import Path + + from upskill.executors.base import Executor + +from upskill.artifacts import ensure_directory, 
sanitize_artifact_name +from upskill.executors.contracts import ExecutionRequest from upskill.models import ( - ConversationStats, EvalResults, ExpectedSpec, Skill, @@ -33,49 +26,117 @@ ) from upskill.validators import get_validator +logger = logging.getLogger(__name__) -def _hide_progress_task(task_name: str | None) -> None: - """Best-effort hide of a completed task from the shared progress display.""" - if not task_name or progress_display is None: - return - hide_task = getattr(progress_display, "hide_task", None) - if not callable(hide_task): - return - try: - hide_task(task_name) - except Exception: - # Progress cleanup is best-effort and should never fail evaluations. - return -logger = logging.getLogger(__name__) +@dataclass(frozen=True, slots=True) +class PendingEvaluationRequest: + """A single evaluation request prepared for backend submission.""" -PROMPT = ( - "You are an evaluator of skills. You are given a skill and a test case. " - "You need to evaluate the skill on the test case and return a score." -) + phase_label: str + test_index: int + request: ExecutionRequest -@contextmanager -def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Generator[Path]: - """Create an isolated workspace for a test run. 
+def _format_execution_error( + error: str, + *, + metadata: dict[str, str | int | float | bool | None] | None, +) -> str: + """Append useful execution identifiers to surfaced backend errors.""" + if metadata is None: + return error + + job_id = metadata.get("job_id") + if isinstance(job_id, str) and job_id: + return f"{error} (job {job_id})" + return error + + +def _write_test_result_summary(path: Path, result: TestResult) -> None: + """Persist a per-test result summary alongside raw artifacts.""" + path.write_text( + json.dumps(result.model_dump(mode="json"), indent=2), + encoding="utf-8", + ) - Args: - base_dir: Optional parent directory for the workspace - cleanup: Whether to clean up the workspace after (default True) - Yields: - Path to the temporary workspace directory - """ - workspace = tempfile.mkdtemp(dir=base_dir, prefix="upskill_run_") - workspace_path = Path(workspace) +def _load_test_result_summary(path: Path) -> TestResult | None: + """Load a persisted per-test result summary.""" + if not path.exists(): + return None try: - yield workspace_path - finally: - if cleanup: - try: - shutil.rmtree(workspace_path, ignore_errors=True) - except Exception: - pass # Ignore cleanup errors + return TestResult.model_validate_json(path.read_text(encoding="utf-8")) + except ValueError: + return None + + +def apply_eval_metrics(results: EvalResults, test_cases: list[TestCase]) -> EvalResults: + """Populate aggregate metrics on an ``EvalResults`` instance.""" + successes = sum(1 for r in results.with_skill_results if r.success) + results.with_skill_success_rate = successes / len(test_cases) if test_cases else 0 + results.with_skill_total_tokens = sum(r.stats.total_tokens for r in results.with_skill_results) + results.with_skill_avg_turns = ( + sum(r.stats.turns for r in results.with_skill_results) / len(test_cases) + if test_cases + else 0 + ) + + if results.baseline_results: + successes = sum(1 for r in results.baseline_results if r.success) + 
results.baseline_success_rate = successes / len(test_cases) if test_cases else 0 + results.baseline_total_tokens = sum(r.stats.total_tokens for r in results.baseline_results) + results.baseline_avg_turns = ( + sum(r.stats.turns for r in results.baseline_results) / len(test_cases) + if test_cases + else 0 + ) + + return results + + +def load_eval_results_from_artifact_root( + *, + skill_name: str, + model: str, + artifact_root: Path, +) -> EvalResults | None: + """Reconstruct eval results from persisted per-test summaries.""" + if not artifact_root.exists(): + return None + + with_skill_results = [ + loaded + for loaded in ( + _load_test_result_summary(summary_path) + for summary_path in sorted( + (artifact_root / "with-skill").glob("test_*/test_result.json") + ) + ) + if loaded is not None + ] + baseline_results = [ + loaded + for loaded in ( + _load_test_result_summary(summary_path) + for summary_path in sorted((artifact_root / "baseline").glob("test_*/test_result.json")) + ) + if loaded is not None + ] + + if not with_skill_results and not baseline_results: + return None + + reconstructed = EvalResults( + skill_name=skill_name, + model=model, + with_skill_results=with_skill_results, + baseline_results=baseline_results, + ) + test_cases = [result.test_case for result in with_skill_results] + if not test_cases: + test_cases = [result.test_case for result in baseline_results] + return apply_eval_metrics(reconstructed, test_cases) def check_expected( @@ -115,208 +176,285 @@ def check_expected( return True, None +def format_test_prompt(test_case: TestCase) -> str: + """Build the evaluator prompt, preserving legacy inline file context.""" + prompt = test_case.input + if test_case.context and test_case.context.files: + for filename, content in test_case.context.files.items(): + prompt += f"\n\n```{filename}\n{content}\n```" + return prompt + + +def build_eval_execution_request( + test_case: TestCase, + *, + skill: Skill | None, + model: str, + fastagent_config_path: 
Path, + cards_source_dir: Path, + artifact_dir: Path, + agent_name: str = "evaluator", + instance_name: str | None = None, + operation: str = "eval", +) -> ExecutionRequest: + """Build the normalized execution request for a single evaluation test.""" + workspace_files = ( + dict(test_case.context.files) if test_case.context and test_case.context.files else {} + ) + normalized_artifact_dir = artifact_dir.resolve() + return ExecutionRequest( + prompt=format_test_prompt(test_case), + model=model, + agent=agent_name, + fastagent_config_path=fastagent_config_path.resolve(), + artifact_dir=normalized_artifact_dir, + cards_source_dir=cards_source_dir.resolve(), + label=instance_name or (skill.name if skill else "baseline"), + skill=skill, + workspace_files=workspace_files, + metadata={ + "instance_name": instance_name, + "operation": operation, + "skill_name": skill.name if skill else None, + "has_validator": bool(test_case.validator), + }, + ) + + +def build_eval_requests( + *, + skill: Skill, + test_cases: list[TestCase], + model: str, + fastagent_config_path: Path, + cards_source_dir: Path, + artifact_root: Path, + run_baseline: bool = True, + operation: str = "eval", +) -> list[PendingEvaluationRequest]: + """Build all execution requests needed for an evaluation run.""" + requests: list[PendingEvaluationRequest] = [] + + for phase_label, batch_skill in _iter_evaluation_phases(skill, run_baseline): + batch_root = ensure_directory(artifact_root / sanitize_artifact_name(phase_label)) + for index, test_case in enumerate(test_cases, start=1): + instance_name = f"eval ({phase_label} test {index})" + requests.append( + PendingEvaluationRequest( + phase_label=phase_label, + test_index=index, + request=build_eval_execution_request( + test_case, + skill=batch_skill, + model=model, + fastagent_config_path=fastagent_config_path, + cards_source_dir=cards_source_dir, + artifact_dir=batch_root / f"test_{index}", + instance_name=instance_name, + operation=operation, + ), + ) + ) + + 
return requests + + +def _iter_evaluation_phases( + skill: Skill, + run_baseline: bool, +) -> list[tuple[str, Skill | None]]: + phases: list[tuple[str, Skill | None]] = [("with-skill", skill)] + if run_baseline: + phases.append(("baseline", None)) + return phases + + async def _run_test_with_evaluator( test_case: TestCase, - evaluator: LlmAgent, - instruction: str | None, + executor: Executor, *, - use_workspace: bool | None = None, + skill: Skill | None, + model: str, + fastagent_config_path: Path, + cards_source_dir: Path, + artifact_dir: Path, + agent_name: str = "evaluator", instance_name: str | None = None, + operation: str = "eval", ) -> TestResult: - """Run a single test case using a provided evaluator agent.""" - user_content = test_case.input - if test_case.context and test_case.context.files: - for filename, content in test_case.context.files.items(): - user_content += f"\n\n```{filename}\n{content}\n```" - - # Determine if we need workspace isolation - needs_workspace = use_workspace if use_workspace is not None else bool(test_case.validator) - - async def _run_in_workspace(workspace: Path | None) -> TestResult: - clone: LlmAgent | None = None - try: - clone = await evaluator.spawn_detached_instance(name=instance_name) - if workspace is not None: - enable_shell = getattr(clone, "enable_shell", None) - shell_enabled = getattr(clone, "shell_runtime_enabled", False) - if shell_enabled and callable(enable_shell): - enable_shell(working_directory=workspace) - - if instruction is None: - clone.set_instruction("") - else: - clone.set_instruction(instruction) - output = await clone.send(user_content) - stats = ConversationStats() - - # Extract stats from agent history - try: - history = clone.message_history - summary = ConversationSummary(messages=history) - stats = extract_stats_from_summary(summary) - except Exception as exc: - logger.exception("Failed to extract stats from evaluator history", exc_info=exc) - - # Check expected with custom validator support - 
if workspace and test_case.validator: - success, validation_result = check_expected( - output or "", - test_case.expected, - workspace, - test_case, - ) - else: - success, validation_result = check_expected( - output or "", - test_case.expected, - None, - test_case, - ) + """Run a single test case through the configured executor.""" + request = build_eval_execution_request( + test_case, + skill=skill, + model=model, + fastagent_config_path=fastagent_config_path, + cards_source_dir=cards_source_dir, + artifact_dir=artifact_dir, + agent_name=agent_name, + instance_name=instance_name, + operation=operation, + ) + normalized_artifact_dir = request.artifact_dir - return TestResult( - test_case=test_case, - success=success, - output=output, - tokens_used=stats.total_tokens, - turns=stats.turns, - stats=stats, - validation_result=validation_result, - ) - except Exception as exc: - return TestResult(test_case=test_case, success=False, error=str(exc)) - finally: - if clone is not None: - try: - await clone.shutdown() - except Exception as exc: - logger.exception("Failed to shutdown evaluator clone", exc_info=exc) - _hide_progress_task(instance_name) - - if needs_workspace: - with isolated_workspace() as workspace: - return await _run_in_workspace(workspace) - return await _run_in_workspace(None) + try: + handle = await executor.execute(request) + execution_result = await executor.collect(handle) + except Exception as exc: + logger.exception("Evaluation execution failed", exc_info=exc) + result = TestResult(test_case=test_case, success=False, error=str(exc)) + _write_test_result_summary(normalized_artifact_dir / "test_result.json", result) + return result + + if execution_result.error is not None: + result = TestResult( + test_case=test_case, + success=False, + output=execution_result.output_text, + tokens_used=execution_result.stats.total_tokens, + turns=execution_result.stats.turns, + error=_format_execution_error( + execution_result.error, + 
metadata=execution_result.metadata, + ), + stats=execution_result.stats, + ) + _write_test_result_summary(normalized_artifact_dir / "test_result.json", result) + return result + + success, validation_result = check_expected( + execution_result.output_text or "", + test_case.expected, + execution_result.workspace_dir, + test_case, + ) + result = TestResult( + test_case=test_case, + success=success, + output=execution_result.output_text, + tokens_used=execution_result.stats.total_tokens, + turns=execution_result.stats.turns, + stats=execution_result.stats, + validation_result=validation_result, + ) + _write_test_result_summary(normalized_artifact_dir / "test_result.json", result) + return result async def run_test( test_case: TestCase, - evaluator: LlmAgent, + executor: Executor, skill: Skill | None, - use_workspace: bool | None = None, - model: str | None = None, + *, + model: str, + fastagent_config_path: Path, + cards_source_dir: Path, + artifact_dir: Path, instance_name: str | None = None, + operation: str = "eval", ) -> TestResult: - """Run a single test case using an evaluator agent. + """Run a single test case via the execution backend. 
Args: test_case: The test case to run - evaluator: Evaluator agent to run the test case + executor: Execution backend to use skill: Optional skill to inject (None for baseline) - use_workspace: Force workspace isolation (auto-detected from test_case.validator) model: Model to evaluate with for this test case + fastagent_config_path: Fast-agent config to pass through to execution + cards_source_dir: Source directory for bundled agent cards + artifact_dir: Output directory for raw execution artifacts instance_name: Optional evaluator instance display name + operation: High-level command family for labeling submitted jobs """ - - try: - if model is not None: - await evaluator.set_model(model) - instruction = compose_instruction(evaluator.instruction, skill) if skill else None - return await _run_test_with_evaluator( - test_case, - evaluator, - instruction, - use_workspace=use_workspace, - instance_name=instance_name, - ) - except Exception as exc: - return TestResult(test_case=test_case, success=False, error=str(exc)) + return await _run_test_with_evaluator( + test_case, + executor, + skill=skill, + model=model, + fastagent_config_path=fastagent_config_path, + cards_source_dir=cards_source_dir, + artifact_dir=artifact_dir, + instance_name=instance_name, + operation=operation, + ) async def evaluate_skill( skill: Skill, test_cases: list[TestCase], - evaluator: LlmAgent, - model: str | None = None, + executor: Executor, + *, + model: str, + fastagent_config_path: Path, + cards_source_dir: Path, + artifact_root: Path, run_baseline: bool = True, - show_baseline_progress: bool = False, + max_parallel: int = 5, + progress_callback: Callable[[str], None] | None = None, + operation: str = "eval", ) -> EvalResults: """Evaluate a skill against test cases using FastAgent. 
Args: skill: The skill to evaluate test_cases: Test cases to run - evaluator: Evaluator agent to run the test cases - model: Model to evaluate on (defaults to config.eval_model) + executor: Execution backend to use + model: Model to evaluate on + fastagent_config_path: Fast-agent config path to propagate + cards_source_dir: Source directory for evaluator cards + artifact_root: Artifact root for preserved raw execution outputs run_baseline: Whether to also run without the skill - show_baseline_progress: Whether to render baseline progress output + max_parallel: Maximum number of concurrent test executions + progress_callback: Optional callback for lightweight progress updates + operation: High-level command family for labeling submitted jobs Returns: EvalResults comparing skill vs baseline """ - results = EvalResults(skill_name=skill.name, model=model) - - base_instruction = evaluator.instruction + semaphore = asyncio.Semaphore(max_parallel) + ensure_directory(artifact_root) async def _run_batch( - instruction: str | None, + batch_skill: Skill | None, label: str, ) -> list[TestResult]: - tasks = [] - for index, tc in enumerate(test_cases, start=1): + batch_root = ensure_directory(artifact_root / sanitize_artifact_name(label)) + + async def _run_single(index: int, test_case: TestCase) -> TestResult: instance_name = f"eval ({label} test {index})" - tasks.append( - _run_test_with_evaluator( - tc, - evaluator, - instruction, + test_artifact_dir = batch_root / f"test_{index}" + if progress_callback is not None: + progress_callback(f"starting {label} test {index}/{len(test_cases)}") + async with semaphore: + result = await run_test( + test_case, + executor, + batch_skill, + model=model, + fastagent_config_path=fastagent_config_path, + cards_source_dir=cards_source_dir, + artifact_dir=test_artifact_dir, instance_name=instance_name, + operation=operation, ) - ) + if progress_callback is not None: + status = "ok" if result.success else "failed" + progress_callback(f"finished 
{label} test {index}/{len(test_cases)} ({status})") + return result + + tasks = [ + asyncio.create_task(_run_single(index, test_case)) + for index, test_case in enumerate(test_cases, start=1) + ] return await asyncio.gather(*tasks) - if model is not None: - await evaluator.set_model(model) - # Run with skill - skill_instruction = compose_instruction(base_instruction, skill) - results.with_skill_results = await _run_batch(skill_instruction, "with-skill") - - # Calculate with-skill metrics - successes = sum(1 for r in results.with_skill_results if r.success) - results.with_skill_success_rate = successes / len(test_cases) if test_cases else 0 - results.with_skill_total_tokens = sum( - r.stats.total_tokens for r in results.with_skill_results - ) - results.with_skill_avg_turns = ( - sum(r.stats.turns for r in results.with_skill_results) / len(test_cases) - if test_cases - else 0 - ) + results.with_skill_results = await _run_batch(skill, "with-skill") # Run baseline if requested if run_baseline: - pause_cm = nullcontext() - if not show_baseline_progress and progress_display is not None: - paused = getattr(progress_display, "paused", None) - if callable(paused): - pause_cm = paused() - - with pause_cm: - results.baseline_results = await _run_batch(None, "baseline") - - successes = sum(1 for r in results.baseline_results if r.success) - results.baseline_success_rate = successes / len(test_cases) if test_cases else 0 - results.baseline_total_tokens = sum( - r.stats.total_tokens for r in results.baseline_results - ) - results.baseline_avg_turns = ( - sum(r.stats.turns for r in results.baseline_results) / len(test_cases) - if test_cases - else 0 - ) - - return results + results.baseline_results = await _run_batch(None, "baseline") + return apply_eval_metrics(results, test_cases) def get_failure_descriptions(results: EvalResults) -> list[str]: diff --git a/src/upskill/executors/__init__.py b/src/upskill/executors/__init__.py new file mode 100644 index 0000000..916d7ba --- 
/dev/null +++ b/src/upskill/executors/__init__.py @@ -0,0 +1 @@ +"""Execution backends for evaluation flows.""" diff --git a/src/upskill/executors/base.py b/src/upskill/executors/base.py new file mode 100644 index 0000000..d3bad5e --- /dev/null +++ b/src/upskill/executors/base.py @@ -0,0 +1,21 @@ +"""Internal executor protocol for evaluation runs.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Protocol + +if TYPE_CHECKING: + from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult + + +class Executor(Protocol): + """Internal execution interface used by evaluation orchestration.""" + + async def execute(self, request: ExecutionRequest) -> ExecutionHandle: + """Start execution for a single request.""" + + async def collect(self, handle: ExecutionHandle) -> ExecutionResult: + """Wait for a previously started execution and collect artifacts/results.""" + + async def cancel(self, handle: ExecutionHandle) -> None: + """Cancel a previously started execution.""" diff --git a/src/upskill/executors/contracts.py b/src/upskill/executors/contracts.py new file mode 100644 index 0000000..2c7f680 --- /dev/null +++ b/src/upskill/executors/contracts.py @@ -0,0 +1,53 @@ +"""Execution request and result contracts for evaluation.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +from upskill.models import ConversationStats, Skill + +if TYPE_CHECKING: + import asyncio + from pathlib import Path + +ExecutionMetadataValue = str | int | float | bool | None + + +@dataclass(slots=True) +class ExecutionRequest: + """Semantic execution request for a single evaluation run.""" + + prompt: str + model: str + agent: str + fastagent_config_path: Path + artifact_dir: Path + cards_source_dir: Path + label: str + skill: Skill | None = None + workspace_files: dict[str, str] = field(default_factory=dict) + metadata: dict[str, ExecutionMetadataValue] = 
field(default_factory=dict) + + +@dataclass(slots=True) +class ExecutionResult: + """Collected execution result plus preserved artifact paths.""" + + output_text: str | None + raw_results_path: Path | None + stdout_path: Path + stderr_path: Path + artifact_dir: Path + workspace_dir: Path + stats: ConversationStats = field(default_factory=ConversationStats) + error: str | None = None + metadata: dict[str, ExecutionMetadataValue] = field(default_factory=dict) + + +@dataclass(slots=True) +class ExecutionHandle: + """In-flight execution handle.""" + + request: ExecutionRequest + task: asyncio.Task[ExecutionResult] diff --git a/src/upskill/executors/local_fast_agent.py b/src/upskill/executors/local_fast_agent.py new file mode 100644 index 0000000..29ec02d --- /dev/null +++ b/src/upskill/executors/local_fast_agent.py @@ -0,0 +1,142 @@ +"""Local shell-out executor for fast-agent-backed evaluation.""" + +from __future__ import annotations + +import asyncio +import json +from dataclasses import replace + +from upskill.artifacts import ( + bundle_agent_card, + copy_config_file, + ensure_directory, + materialize_skill_bundle, + materialize_workspace, + write_request_file, +) +from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult +from upskill.fast_agent_cli import build_fast_agent_command +from upskill.models import ConversationStats +from upskill.result_parsing import parse_fast_agent_results + + +class LocalFastAgentExecutor: + """Execute evaluation requests by shelling out to ``fast-agent`` locally.""" + + def __init__(self, *, fast_agent_bin: str = "fast-agent") -> None: + self._fast_agent_bin = fast_agent_bin + + async def execute(self, request: ExecutionRequest) -> ExecutionHandle: + """Start a local subprocess execution.""" + task = asyncio.create_task(self._run_request(request)) + return ExecutionHandle(request=request, task=task) + + async def collect(self, handle: ExecutionHandle) -> ExecutionResult: + """Collect a previously 
started subprocess execution.""" + return await handle.task + + async def cancel(self, handle: ExecutionHandle) -> None: + """Cancel a previously started subprocess execution.""" + handle.task.cancel() + try: + await handle.task + except asyncio.CancelledError: + return + + async def _run_request(self, request: ExecutionRequest) -> ExecutionResult: + artifact_dir = ensure_directory(request.artifact_dir.resolve()) + normalized_request = replace( + request, + fastagent_config_path=request.fastagent_config_path.resolve(), + artifact_dir=artifact_dir, + cards_source_dir=request.cards_source_dir.resolve(), + ) + workspace_dir = ensure_directory(artifact_dir / "workspace") + materialize_workspace(workspace_dir, normalized_request.workspace_files) + + cards_dir = bundle_agent_card( + normalized_request.cards_source_dir, + artifact_dir / "cards", + agent_name=normalized_request.agent, + ) + skills_dir = materialize_skill_bundle(artifact_dir / "skills", normalized_request) + preserved_config_path = copy_config_file( + normalized_request.fastagent_config_path, + artifact_dir / "fastagent.config.yaml", + ) + workspace_config_path = copy_config_file( + normalized_request.fastagent_config_path, + workspace_dir / "fastagent.config.yaml", + ) + del preserved_config_path, workspace_config_path + + request_path = artifact_dir / "request.json" + write_request_file(request_path, normalized_request) + + prompt_path = artifact_dir / "prompt.txt" + prompt_path.write_text(normalized_request.prompt, encoding="utf-8") + + results_path = artifact_dir / "results.json" + stdout_path = artifact_dir / "stdout.txt" + stderr_path = artifact_dir / "stderr.txt" + command = build_fast_agent_command( + normalized_request, + config_path=normalized_request.fastagent_config_path + if normalized_request.fastagent_config_path.exists() + else None, + cards_dir=cards_dir, + skills_dir=skills_dir, + prompt_path=prompt_path, + results_path=results_path, + fast_agent_bin=self._fast_agent_bin, + ) + 
command_path = artifact_dir / "command.json" + command_path.write_text(json.dumps(command, indent=2), encoding="utf-8") + + process = await asyncio.create_subprocess_exec( + *command, + cwd=workspace_dir, + stdin=asyncio.subprocess.DEVNULL, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout_bytes, stderr_bytes = await process.communicate() + stdout_text = stdout_bytes.decode("utf-8", errors="replace") + stderr_text = stderr_bytes.decode("utf-8", errors="replace") + stdout_path.write_text(stdout_text, encoding="utf-8") + stderr_path.write_text(stderr_text, encoding="utf-8") + + error: str | None = None + parsed_output: str | None = None + parsed_stats = None + + if not results_path.exists(): + error = "fast-agent run did not produce a results artifact." + else: + try: + parsed = parse_fast_agent_results(results_path) + except Exception as exc: + error = f"Failed to parse fast-agent results: {exc}" + else: + parsed_output = parsed.output_text + parsed_stats = parsed.stats + + if process.returncode != 0: + exit_error = f"fast-agent exited with code {process.returncode}." 
+ error = f"{error} {exit_error}".strip() if error else exit_error + + result = ExecutionResult( + output_text=parsed_output, + raw_results_path=results_path if results_path.exists() else None, + stdout_path=stdout_path, + stderr_path=stderr_path, + artifact_dir=artifact_dir, + workspace_dir=workspace_dir, + stats=parsed_stats or ConversationStats(), + error=error, + metadata={ + **normalized_request.metadata, + "return_code": process.returncode, + }, + ) + return result diff --git a/src/upskill/executors/remote_fast_agent.py b/src/upskill/executors/remote_fast_agent.py new file mode 100644 index 0000000..fef26be --- /dev/null +++ b/src/upskill/executors/remote_fast_agent.py @@ -0,0 +1,322 @@ +"""Remote HF Jobs-backed executor for fast-agent evaluation.""" + +from __future__ import annotations + +import asyncio +import json +import shutil +import tarfile +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING + +from upskill.artifacts import ( + bundle_agent_card, + copy_config_file, + ensure_directory, + materialize_skill_bundle, + materialize_workspace, + write_request_file, +) +from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult +from upskill.hf_jobs import ( + JobsConfig, + SubmittedJob, + _make_run_id, + _sanitize_hf_job_label_value, + _submit_bundle_job, + parse_duration_seconds, + wait_for_job_outputs, +) +from upskill.models import ConversationStats +from upskill.result_parsing import parse_fast_agent_results + +if TYPE_CHECKING: + from collections.abc import Callable + + +class RemoteFastAgentExecutor: + """Execute evaluation requests by submitting one HF job per request.""" + + def __init__( + self, + *, + jobs_config: JobsConfig, + progress_callback: Callable[[str], None] | None = None, + ) -> None: + self._jobs_config = jobs_config + self._progress_callback = progress_callback + + async def execute(self, request: ExecutionRequest) -> ExecutionHandle: + """Start a remote job-backed execution.""" 
+ task = asyncio.create_task(asyncio.to_thread(self._run_request_sync, request)) + return ExecutionHandle(request=request, task=task) + + async def submit(self, request: ExecutionRequest) -> SubmittedJob: + """Submit a remote execution request without waiting for results.""" + return await asyncio.to_thread(self._submit_request_sync, request) + + async def collect(self, handle: ExecutionHandle) -> ExecutionResult: + """Collect a previously started remote execution.""" + return await handle.task + + async def cancel(self, handle: ExecutionHandle) -> None: + """Cancel a previously started remote execution.""" + handle.task.cancel() + try: + await handle.task + except asyncio.CancelledError: + return + + def _submit_request_sync(self, request: ExecutionRequest) -> SubmittedJob: + normalized_request, artifact_dir = self._prepare_request(request) + return self._submit_prepared_request(normalized_request, artifact_dir) + + def _submit_prepared_request( + self, + request: ExecutionRequest, + artifact_dir: Path, + ) -> SubmittedJob: + temp_root, bundle_archive = self._create_bundle_archive(request) + try: + submission = self._submit_bundle(request, bundle_archive) + finally: + shutil.rmtree(temp_root, ignore_errors=True) + + request_path = artifact_dir / "submitted_job.json" + request_path.write_text( + json.dumps( + { + "job_id": submission.job_id, + "run_id": submission.run_id, + "artifact_repo": submission.artifact_repo, + }, + indent=2, + ), + encoding="utf-8", + ) + return submission + + def _run_request_sync(self, request: ExecutionRequest) -> ExecutionResult: + normalized_request, artifact_dir = self._prepare_request(request) + workspace_dir = artifact_dir / "workspace" + submission = self._submit_prepared_request(normalized_request, artifact_dir) + + remote_output_dir = wait_for_job_outputs( + submission, + destination_root=artifact_dir / "remote_download", + wait_timeout_seconds=parse_duration_seconds(self._jobs_config.jobs_timeout), + 
progress_callback=self._progress_callback, + ) + + stdout_path = artifact_dir / "stdout.txt" + stderr_path = artifact_dir / "stderr.txt" + results_path = artifact_dir / "results.json" + self._materialize_remote_outputs( + remote_output_dir=remote_output_dir, + artifact_dir=artifact_dir, + workspace_dir=workspace_dir, + stdout_path=stdout_path, + stderr_path=stderr_path, + results_path=results_path, + ) + + exit_code = self._read_exit_code(remote_output_dir) + error: str | None = None + parsed_output: str | None = None + parsed_stats = ConversationStats() + + if not results_path.exists(): + error = "fast-agent run did not produce a results artifact." + else: + try: + parsed = parse_fast_agent_results(results_path) + except Exception as exc: + error = f"Failed to parse fast-agent results: {exc}" + else: + parsed_output = parsed.output_text + parsed_stats = parsed.stats + + if exit_code not in {"", "0"}: + exit_error = f"fast-agent exited with code {exit_code}." + error = f"{error} {exit_error}".strip() if error else exit_error + + metadata = { + **normalized_request.metadata, + "job_id": submission.job_id, + "run_id": submission.run_id, + "return_code": int(exit_code) if exit_code else None, + } + return ExecutionResult( + output_text=parsed_output, + raw_results_path=results_path if results_path.exists() else None, + stdout_path=stdout_path, + stderr_path=stderr_path, + artifact_dir=artifact_dir, + workspace_dir=workspace_dir, + stats=parsed_stats, + error=error, + metadata=metadata, + ) + + def _prepare_request(self, request: ExecutionRequest) -> tuple[ExecutionRequest, Path]: + artifact_dir = ensure_directory(request.artifact_dir.resolve()) + normalized_request = ExecutionRequest( + prompt=request.prompt, + model=request.model, + agent=request.agent, + fastagent_config_path=request.fastagent_config_path.resolve(), + artifact_dir=artifact_dir, + cards_source_dir=request.cards_source_dir.resolve(), + label=request.label, + skill=request.skill, + 
workspace_files=dict(request.workspace_files), + metadata=dict(request.metadata), + ) + workspace_dir = ensure_directory(artifact_dir / "workspace") + materialize_workspace(workspace_dir, normalized_request.workspace_files) + + bundle_agent_card( + normalized_request.cards_source_dir, + artifact_dir / "cards", + agent_name=normalized_request.agent, + ) + materialize_skill_bundle(artifact_dir / "skills", normalized_request) + copy_config_file( + normalized_request.fastagent_config_path, + artifact_dir / "fastagent.config.yaml", + ) + copy_config_file( + normalized_request.fastagent_config_path, + workspace_dir / "fastagent.config.yaml", + ) + + request_path = artifact_dir / "request.json" + write_request_file(request_path, normalized_request) + (artifact_dir / "prompt.txt").write_text(normalized_request.prompt, encoding="utf-8") + return normalized_request, artifact_dir + + def _submit_bundle(self, request: ExecutionRequest, bundle_archive: Path) -> SubmittedJob: + run_id = _make_run_id("request", request.model, request.label) + labels = self._build_job_labels(request, run_id=run_id) + submission = _submit_bundle_job( + bundle_archive=bundle_archive, + jobs_config=self._jobs_config, + run_id=run_id, + model=request.model, + labels=labels, + ) + if self._progress_callback is not None: + self._progress_callback( + f"submitted remote request {request.label} as job " + f"{submission.job_id} (run_id={submission.run_id})" + ) + return submission + + def _build_job_labels(self, request: ExecutionRequest, *, run_id: str) -> dict[str, str]: + operation = request.metadata.get("operation") + labels = { + "upskill-agent": _sanitize_hf_job_label_value(request.agent, default="agent"), + "upskill-executor": "remote-fast-agent", + "upskill-model": _sanitize_hf_job_label_value(request.model, default="model"), + "upskill-operation": _sanitize_hf_job_label_value( + operation if isinstance(operation, str) else "eval", + default="eval", + ), + "upskill-request": 
_sanitize_hf_job_label_value(request.label, default="request"), + "upskill-run-id": _sanitize_hf_job_label_value(run_id, default="run"), + } + if request.skill is not None: + labels["upskill-skill"] = _sanitize_hf_job_label_value( + request.skill.name, + default="skill", + ) + return labels + + def _create_bundle_archive(self, request: ExecutionRequest) -> tuple[Path, Path]: + temp_root = Path(tempfile.mkdtemp(prefix="upskill_hf_request_")) + bundle_root = temp_root / "bundle" + ensure_directory(bundle_root) + ensure_directory(bundle_root / "skills") + if request.skill is not None: + request.skill.save(bundle_root / "skills" / request.skill.name) + bundle_agent_card( + request.cards_source_dir, + bundle_root / "cards", + agent_name=request.agent, + ) + copy_config_file(request.fastagent_config_path, bundle_root / "fastagent.config.yaml") + (bundle_root / "agent.txt").write_text(request.agent, encoding="utf-8") + + request_dir = ensure_directory(bundle_root / "requests" / "request_1") + (request_dir / "prompt.txt").write_text(request.prompt, encoding="utf-8") + request_workspace_dir = ensure_directory(request_dir / "workspace") + materialize_workspace(request_workspace_dir, request.workspace_files) + (bundle_root / "manifest.json").write_text( + json.dumps( + { + "request_count": 1, + "requests": [ + { + "id": "request_1", + "index": 1, + "has_workspace_files": bool(request.workspace_files), + } + ], + }, + indent=2, + ), + encoding="utf-8", + ) + + entrypoint_source = ( + Path(__file__).resolve().parents[3] + / "scripts" + / "hf" + / "job_entrypoint_eval_fast_agent.sh" + ) + shutil.copy2(entrypoint_source, bundle_root / "job_entrypoint.sh") + bundle_archive = temp_root / "bundle.tar.gz" + with tarfile.open(bundle_archive, "w:gz") as archive: + archive.add(bundle_root, arcname="bundle") + return temp_root, bundle_archive + + def _materialize_remote_outputs( + self, + *, + remote_output_dir: Path, + artifact_dir: Path, + workspace_dir: Path, + stdout_path: Path, + 
stderr_path: Path, + results_path: Path, + ) -> None: + preserved_output_dir = artifact_dir / "remote_output" + if preserved_output_dir.exists(): + shutil.rmtree(preserved_output_dir) + shutil.copytree(remote_output_dir, preserved_output_dir) + + remote_stdout = remote_output_dir / "logs" / "request_1.out.txt" + remote_stderr = remote_output_dir / "logs" / "request_1.err.txt" + remote_results = remote_output_dir / "results" / "request_1.json" + remote_workspace = remote_output_dir / "workspaces" / "request_1" + + if remote_stdout.exists(): + shutil.copy2(remote_stdout, stdout_path) + else: + stdout_path.write_text("", encoding="utf-8") + if remote_stderr.exists(): + shutil.copy2(remote_stderr, stderr_path) + else: + stderr_path.write_text("", encoding="utf-8") + if remote_results.exists(): + shutil.copy2(remote_results, results_path) + if remote_workspace.exists(): + shutil.rmtree(workspace_dir, ignore_errors=True) + shutil.copytree(remote_workspace, workspace_dir) + + def _read_exit_code(self, remote_output_dir: Path) -> str: + status_path = remote_output_dir / "status" / "request_1.exit_code.txt" + if not status_path.exists(): + return "" + return status_path.read_text(encoding="utf-8").strip() diff --git a/src/upskill/fast_agent_cli.py b/src/upskill/fast_agent_cli.py new file mode 100644 index 0000000..208ad3e --- /dev/null +++ b/src/upskill/fast_agent_cli.py @@ -0,0 +1,44 @@ +"""Helpers for building fast-agent CLI invocations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + from upskill.executors.contracts import ExecutionRequest + + +def build_fast_agent_command( + request: ExecutionRequest, + *, + config_path: Path | None, + cards_dir: Path, + skills_dir: Path, + prompt_path: Path, + results_path: Path, + fast_agent_bin: str = "fast-agent", +) -> list[str]: + """Build the canonical fast-agent automation command for a request.""" + command = [fast_agent_bin, "go"] + if config_path 
is not None: + command.extend(["--config-path", str(config_path)]) + command.extend( + [ + "--card", + str(cards_dir), + "--agent", + request.agent, + "--model", + request.model, + "--skills-dir", + str(skills_dir), + "--prompt-file", + str(prompt_path), + "--results", + str(results_path), + "--quiet", + ] + ) + return command diff --git a/src/upskill/fastagent_integration.py b/src/upskill/fastagent_integration.py index 32f8349..a7357aa 100644 --- a/src/upskill/fastagent_integration.py +++ b/src/upskill/fastagent_integration.py @@ -13,4 +13,3 @@ def compose_instruction(instruction: str, skill: Skill | None) -> str: if not skill: return instruction return f"{instruction}\n\n## Skill: {skill.name}\n\n{skill.body}" - diff --git a/src/upskill/generate.py b/src/upskill/generate.py index 1cca065..8abc943 100644 --- a/src/upskill/generate.py +++ b/src/upskill/generate.py @@ -1,14 +1,16 @@ -"""Skill generation from task descriptions using FastAgent.""" +"""Skill generation from task descriptions using fast-agent.""" from __future__ import annotations from datetime import UTC, datetime - -from fast_agent.interfaces import AgentProtocol -from fast_agent.skills.registry import SkillManifest +from typing import TYPE_CHECKING from upskill.manifest_utils import parse_skill_manifest_text -from upskill.models import Skill, SkillMetadata, TestCase, TestCaseSuite +from upskill.models import Skill, SkillMetadata, SkillRecord, SkillState, TestCase, TestCaseSuite + +if TYPE_CHECKING: + from fast_agent.interfaces import AgentProtocol + from fast_agent.skills.registry import SkillManifest # Few-shot examples for test generation TEST_EXAMPLES = """ @@ -72,31 +74,36 @@ "## Your Task\n\n" f"Task: {TASK_PLACEHOLDER}\n\n" "Generate test cases that verify the agent can apply the skill correctly.\n\n" - "Each TestCase MUST include at least a list of expected strings in the expected field.\n" "Focus on practical scenarios that test understanding of the core concepts." 
) + def _build_skill_from_manifest( manifest: SkillManifest, *, model: str | None, - source_task: str, - base_skill: Skill | None = None, -) -> Skill: - references = base_skill.references if base_skill else {} - scripts = base_skill.scripts if base_skill else {} - return Skill( - name=manifest.name, - description=manifest.description, - body=manifest.body, - ## treating these as future for now as skill generator doesn't generate additional artifacts - references=references, - scripts=scripts, - metadata=SkillMetadata( - generated_by=model, - generated_at=datetime.now(UTC), - source_task=source_task, + source_task: str | None, + base_skill: SkillRecord | None = None, +) -> SkillRecord: + references = base_skill.skill.references if base_skill else {} + scripts = base_skill.skill.scripts if base_skill else {} + return SkillRecord( + skill=Skill( + name=manifest.name, + description=manifest.description, + body=manifest.body, + ## treating these as future for now as skill generator doesn't generate additional artifacts + references=references, + scripts=scripts, + ), + state=SkillState( + metadata=SkillMetadata( + generated_by=model, + generated_at=datetime.now(UTC), + source_task=source_task, + ), + tests=list(base_skill.state.tests) if base_skill else [], ), ) @@ -106,8 +113,8 @@ async def generate_skill( generator: AgentProtocol, examples: list[str] | None = None, model: str | None = None, -) -> Skill: - """Generate a skill from a task description using FastAgent.""" +) -> SkillRecord: + """Generate a skill from a task description using fast-agent.""" prompt = f"Create a skill document that teaches an AI agent how to: {task}" if examples: @@ -115,7 +122,6 @@ async def generate_skill( f"- {ex}" for ex in examples ) - skill_text = await generator.send(prompt) manifest, error = parse_skill_manifest_text(skill_text) if manifest is None: @@ -131,9 +137,8 @@ async def generate_skill( async def generate_tests( task: str, generator: AgentProtocol, - model: str | None = None, 
) -> list[TestCase]: - """Generate synthetic test cases from a task description using FastAgent.""" + """Generate synthetic test cases from a task description using fast-agent.""" prompt = TEST_GENERATION_PROMPT.replace(TASK_PLACEHOLDER, task) @@ -162,18 +167,18 @@ async def generate_tests( async def refine_skill( - skill: Skill, + skill: SkillRecord, failures: list[str], generator: AgentProtocol, model: str | None = None, -) -> Skill: - """Refine a skill based on evaluation failures using FastAgent.""" +) -> SkillRecord: + """Refine a skill based on evaluation failures using fast-agent.""" prompt = f"""Improve this skill based on failures: -Name: {skill.name} -Description: {skill.description} -Body: {skill.body[:500]}... +Name: {skill.skill.name} +Description: {skill.skill.description} +Body: {skill.skill.body[:500]}... Failures: {chr(10).join(f"- {f}" for f in failures[:3])} @@ -190,7 +195,7 @@ async def refine_skill( return _build_skill_from_manifest( manifest, model=model, - source_task=skill.metadata.source_task, + source_task=skill.state.metadata.source_task, base_skill=skill, ) @@ -225,11 +230,11 @@ async def refine_skill( async def improve_skill( - skill: Skill, + skill: SkillRecord, instructions: str, generator: AgentProtocol, model: str | None = None, -) -> Skill: +) -> SkillRecord: """Improve an existing skill based on instructions. 
Args: @@ -245,13 +250,12 @@ async def improve_skill( # model = model or config.skill_generation_model prompt = IMPROVE_PROMPT.format( - name=skill.name, - description=skill.description, - body=skill.body, + name=skill.skill.name, + description=skill.skill.description, + body=skill.skill.body, instructions=instructions, ) - skill_text = await generator.send(prompt) manifest, error = parse_skill_manifest_text(skill_text) if manifest is None: @@ -260,6 +264,6 @@ async def improve_skill( return _build_skill_from_manifest( manifest, model=model, - source_task=f"Improved from {skill.name}: {instructions}", + source_task=f"Improved from {skill.skill.name}: {instructions}", base_skill=skill, ) diff --git a/src/upskill/hf_jobs.py b/src/upskill/hf_jobs.py new file mode 100644 index 0000000..aba5766 --- /dev/null +++ b/src/upskill/hf_jobs.py @@ -0,0 +1,618 @@ +"""Helpers for submitting and collecting Hugging Face Jobs-based eval runs.""" + +from __future__ import annotations + +import json +import re +import subprocess +import threading +import time +import uuid +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Callable, Mapping + + +@dataclass(frozen=True) +class JobsConfig: + """Configuration for remote Jobs-backed execution.""" + + artifact_repo: str + wait: bool = True + jobs_timeout: str = "2h" + jobs_flavor: str = "cpu-basic" + jobs_secrets: str = "HF_TOKEN" + jobs_namespace: str | None = None + jobs_image: str = "ghcr.io/astral-sh/uv:python3.13-bookworm" + + +@dataclass(frozen=True) +class SubmittedJob: + """A submitted Hugging Face Job plus its artifact identifiers.""" + + job_id: str + run_id: str + artifact_repo: str + + +_JOB_URL_RE = re.compile(r"https://huggingface\.co/jobs/(?P<namespace>[^/]+)/(?P<job_id>[^/\s]+)") +_HF_UPLOAD_CONFLICT_MARKERS = ( + "412 Precondition Failed", + "A commit has happened since.
Please refresh and try again.", +) +_HF_AUTH_RATE_LIMIT_MARKERS = ( + "rate limit for the /whoami-v2 endpoint", + "whoami-v2", +) +_HF_SUBMISSION_LOCK = threading.RLock() +_VERIFIED_ARTIFACT_REPOS: set[str] = set() +_HF_HUB_CLI_SPEC = "huggingface_hub==1.7.2" +_FAST_AGENT_SPEC = "fast-agent-mcp==0.6.8" +_MAX_HF_JOB_LABEL_VALUE_LENGTH = 63 +_HF_RETRY_ATTEMPTS = 5 +_HF_INITIAL_RETRY_DELAY_SECONDS = 2.0 + + +def _normalize_job_id(value: str) -> str: + """Normalize a raw job reference into ``job_id`` or ``namespace/job_id`` form.""" + raw = value.strip() + match = _JOB_URL_RE.search(raw) + if match: + return f"{match.group('namespace')}/{match.group('job_id')}" + if raw.startswith("View at:"): + return _normalize_job_id(raw.removeprefix("View at:")) + return raw + + +def _split_job_reference(value: str) -> tuple[str | None, str]: + normalized = _normalize_job_id(value) + if "/" in normalized: + namespace, job_id = normalized.rsplit("/", 1) + return namespace, job_id + return None, normalized + + +def _namespace_from_repo_id(repo_id: str) -> str | None: + if "/" not in repo_id: + return None + namespace, _repo_name = repo_id.split("/", 1) + normalized = namespace.strip() + return normalized or None + + +def _resolve_jobs_namespace( + *, + job_id: str | None = None, + artifact_repo: str | None = None, + configured_namespace: str | None = None, +) -> str | None: + if configured_namespace: + return configured_namespace + if job_id is not None: + namespace, _bare_job_id = _split_job_reference(job_id) + if namespace is not None: + return namespace + if artifact_repo is not None: + return _namespace_from_repo_id(artifact_repo) + return None + + +def _lookup_job_stage(job_id: str, *, namespace: str | None = None) -> str | None: + """Best-effort lookup of an HF job stage from ``hf jobs ps --format json``.""" + bare_namespace, bare_job_id = _split_job_reference(job_id) + resolved_namespace = _resolve_jobs_namespace( + job_id=job_id, + configured_namespace=namespace or 
bare_namespace, + ) + command = ["hf", "jobs", "ps", "-a", "--format", "json"] + if resolved_namespace is not None: + command.extend(["--namespace", resolved_namespace]) + completed = _run_hf_command(command) + if completed.returncode != 0: + return None + try: + payload = json.loads(completed.stdout) + except json.JSONDecodeError: + return None + if not isinstance(payload, list): + return None + + for entry in payload: + if not isinstance(entry, dict): + continue + if str(entry.get("id", "")) != bare_job_id: + continue + owner = entry.get("owner") + owner_name = owner.get("name") if isinstance(owner, dict) else None + if resolved_namespace is not None and owner_name != resolved_namespace: + continue + status = entry.get("status") + if isinstance(status, dict): + stage = status.get("stage") + if isinstance(stage, str): + return stage + return None + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[2] + + +def _hf_command_output(completed: subprocess.CompletedProcess[str]) -> str: + """Return combined stdout/stderr for retry classification.""" + return f"{completed.stdout}\n{completed.stderr}" + + +def _has_retryable_hf_failure( + completed: subprocess.CompletedProcess[str], + *, + markers: tuple[str, ...], +) -> bool: + """Return whether a failed HF CLI call matches any retryable marker.""" + if completed.returncode == 0: + return False + output = _hf_command_output(completed) + return any(marker in output for marker in markers) + + +def _is_retryable_hf_upload_failure(completed: subprocess.CompletedProcess[str]) -> bool: + """Return whether a failed ``hf upload`` can be retried safely.""" + return _has_retryable_hf_failure(completed, markers=_HF_UPLOAD_CONFLICT_MARKERS) + + +def _is_retryable_hf_auth_failure(completed: subprocess.CompletedProcess[str]) -> bool: + """Return whether a failed HF CLI call hit auth-related rate limiting.""" + return _has_retryable_hf_failure(completed, markers=_HF_AUTH_RATE_LIMIT_MARKERS) + + +def 
_is_retryable_hf_failure( + completed: subprocess.CompletedProcess[str], + *, + retry_auth_rate_limit: bool, + retry_upload_conflicts: bool, +) -> bool: + """Return whether a failed HF CLI call should be retried.""" + auth_retry = retry_auth_rate_limit and _is_retryable_hf_auth_failure(completed) + upload_retry = retry_upload_conflicts and _is_retryable_hf_upload_failure(completed) + return auth_retry or upload_retry + + +def _retry_exhausted_hf_failure_message(completed: subprocess.CompletedProcess[str]) -> str: + """Return extra context when a retryable HF failure still exhausted retries.""" + if _is_retryable_hf_auth_failure(completed): + return ( + "The Hugging Face CLI continued hitting the /whoami-v2 auth rate limit after " + "retrying.\n" + ) + if _is_retryable_hf_upload_failure(completed): + return "The Hugging Face CLI continued hitting a retryable upload conflict.\n" + return "" + + +def _run_hf_command_with_retry( + command: list[str], + *, + retryable: Callable[[subprocess.CompletedProcess[str]], bool], + attempts: int = _HF_RETRY_ATTEMPTS, + initial_delay_seconds: float = _HF_INITIAL_RETRY_DELAY_SECONDS, +) -> subprocess.CompletedProcess[str]: + """Run an HF CLI command with retry/backoff for known transient failures.""" + delay_seconds = initial_delay_seconds + last_completed: subprocess.CompletedProcess[str] | None = None + + for attempt in range(1, attempts + 1): + completed = subprocess.run( + command, + cwd=_repo_root(), + check=False, + capture_output=True, + text=True, + ) + last_completed = completed + if completed.returncode == 0: + return completed + if attempt >= attempts or not retryable(completed): + return completed + time.sleep(delay_seconds) + delay_seconds *= 2 + + if last_completed is None: + raise RuntimeError("HF CLI retry loop completed without executing a command.") + return last_completed + + +def _run_hf_command( + command: list[str], + *, + retry_auth_rate_limit: bool = True, + retry_upload_conflicts: bool = False, + attempts: 
int = _HF_RETRY_ATTEMPTS, + initial_delay_seconds: float = _HF_INITIAL_RETRY_DELAY_SECONDS, +) -> subprocess.CompletedProcess[str]: + """Run an HF CLI command through the shared retry policy.""" + return _run_hf_command_with_retry( + command, + retryable=lambda completed: _is_retryable_hf_failure( + completed, + retry_auth_rate_limit=retry_auth_rate_limit, + retry_upload_conflicts=retry_upload_conflicts, + ), + attempts=attempts, + initial_delay_seconds=initial_delay_seconds, + ) + + +def _verify_artifact_repo_access(artifact_repo: str) -> None: + """Verify that the configured artifact dataset repo exists and is accessible.""" + with _HF_SUBMISSION_LOCK: + if artifact_repo in _VERIFIED_ARTIFACT_REPOS: + return + + completed = _run_hf_command( + [ + "hf", + "download", + artifact_repo, + "--repo-type", + "dataset", + "--dry-run", + "--quiet", + ] + ) + if completed.returncode != 0: + raise RuntimeError( + "Artifact repo is not accessible. Create it before submitting jobs and " + "ensure the current Hugging Face credentials can access it:\n" + f"repo: {artifact_repo}\n" + f"{_retry_exhausted_hf_failure_message(completed)}" + f"stdout:\n{completed.stdout}\n" + f"stderr:\n{completed.stderr}" + ) + + _VERIFIED_ARTIFACT_REPOS.add(artifact_repo) + + +def verify_artifact_repo_access(artifact_repo: str) -> None: + """Validate that the artifact dataset repo exists and is accessible.""" + _verify_artifact_repo_access(artifact_repo) + + +def parse_duration_seconds(value: str) -> float: + """Parse a simple HF-style duration like ``45m`` or ``2h``.""" + if not value: + raise ValueError("Duration value must not be empty.") + suffix = value[-1] + multiplier = { + "s": 1.0, + "m": 60.0, + "h": 3600.0, + "d": 86400.0, + }.get(suffix) + if multiplier is None: + suffix = "s" + multiplier = 1.0 + number = value + else: + number = value[:-1] + try: + return float(number) * multiplier + except ValueError as exc: + raise ValueError(f"Invalid duration value: {value}") from exc + + +def 
_sanitize_label(value: str) -> str: + sanitized = re.sub(r"[^a-z0-9]+", "-", value.strip().lower()).strip("-") + return sanitized or "eval" + + +def _sanitize_hf_job_label_value(value: str, *, default: str) -> str: + sanitized = re.sub(r"[^a-z0-9]+", "-", value.strip().lower()).strip("-") + truncated = sanitized[:_MAX_HF_JOB_LABEL_VALUE_LENGTH].strip("-") + return truncated or default + + +def _make_run_id(*parts: str) -> str: + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + suffix = "-".join(_sanitize_label(part) for part in parts if part) + entropy = uuid.uuid4().hex[:8] + prefix = f"{timestamp}_{suffix}" if suffix else timestamp + return f"{prefix}_{entropy}" + + +def wait_for_job_outputs( + job: SubmittedJob, + *, + destination_root: Path, + wait_timeout_seconds: float, + poll_interval_seconds: float = 15.0, + progress_callback: Callable[[str], None] | None = None, +) -> Path: + """Wait until a job uploads its exit marker, then download full outputs.""" + deadline = time.monotonic() + wait_timeout_seconds + marker_path = f"outputs/{job.run_id}/exit_code.txt" + poll_count = 0 + + if progress_callback is not None: + progress_callback(f"waiting for job {job.job_id} (run_id={job.run_id})") + + while time.monotonic() < deadline: + poll_count += 1 + stage = _lookup_job_stage( + job.job_id, + namespace=_resolve_jobs_namespace( + job_id=job.job_id, + artifact_repo=job.artifact_repo, + ), + ) + marker_download = _run_hf_command( + [ + "hf", + "download", + job.artifact_repo, + marker_path, + "--repo-type", + "dataset", + "--local-dir", + str(destination_root), + "--quiet", + ], + ) + if marker_download.returncode == 0: + if progress_callback is not None: + progress_callback(f"job {job.job_id} completed; downloading artifacts") + full_download = _run_hf_command( + [ + "hf", + "download", + job.artifact_repo, + "--repo-type", + "dataset", + "--include", + f"outputs/{job.run_id}/**", + "--local-dir", + str(destination_root), + ], + ) + if 
full_download.returncode != 0: + raise RuntimeError( + "HF job finished but artifacts could not be downloaded:\n" + f"{_retry_exhausted_hf_failure_message(full_download)}" + f"stdout:\n{full_download.stdout}\n" + f"stderr:\n{full_download.stderr}" + ) + if progress_callback is not None: + progress_callback(f"downloaded artifacts for job {job.job_id}") + return destination_root / "outputs" / job.run_id + if _is_retryable_hf_auth_failure(marker_download): + raise RuntimeError( + "Failed to check remote fast-agent job outputs after repeated Hugging Face " + "auth retries:\n" + f"stdout:\n{marker_download.stdout}\n" + f"stderr:\n{marker_download.stderr}" + ) + if stage in {"ERROR", "CANCELED", "DELETED"}: + raise RuntimeError( + f"HF job {job.job_id} ended with stage {stage}. " + f"Inspect logs with `hf jobs logs {job.job_id}`." + ) + if progress_callback is not None: + stage_suffix = f" ({stage.lower()})" if stage else "" + progress_callback(f"poll {poll_count}: job {job.job_id} still running{stage_suffix}") + time.sleep(poll_interval_seconds) + + raise TimeoutError( + f"Timed out waiting for HF job artifacts for job {job.job_id} (run_id={job.run_id})." 
+ ) + + +def _hf_secret_flags(secrets: str) -> list[str]: + flags: list[str] = [] + for secret in (item.strip() for item in secrets.split(",")): + if not secret: + continue + flags.extend(["--secrets", secret]) + return flags + + +def _hf_label_flags(labels: Mapping[str, str] | None) -> list[str]: + flags: list[str] = [] + if not labels: + return flags + for key, value in sorted(labels.items()): + flags.extend(["--label", f"{key}={value}"]) + return flags + + +def _upload_bundle_input( + *, + bundle_archive: Path, + artifact_repo: str, + run_id: str, +) -> subprocess.CompletedProcess[str]: + """Upload a prepared request bundle into the artifact dataset.""" + with _HF_SUBMISSION_LOCK: + return _run_hf_command( + [ + "hf", + "upload", + artifact_repo, + str(bundle_archive), + f"inputs/{run_id}/bundle.tar.gz", + "--repo-type", + "dataset", + "--commit-message", + f"inputs: {run_id}", + ], + retry_upload_conflicts=True, + ) + + +def _render_bundle_job_script() -> str: + """Render the shell script executed inside the remote HF job container.""" + return "\n".join( + [ + "set -euo pipefail", + "run_hf_with_retries() {", + " local delay=2", + " local attempt", + f" for attempt in $(seq 1 {_HF_RETRY_ATTEMPTS}); do", + ' local log_file="$(mktemp)"', + ' if "$@" >"$log_file" 2>&1; then', + ' cat "$log_file"', + ' rm -f "$log_file"', + " return 0", + " fi", + f' if [[ "$attempt" -lt {_HF_RETRY_ATTEMPTS} ]] && (', + ' grep -q "rate limit for the /whoami-v2 endpoint" "$log_file" ||', + ' grep -q "whoami-v2" "$log_file" ||', + ' grep -q "412 Precondition Failed" "$log_file" ||', + ' grep -q "A commit has happened since" "$log_file"', + " ); then", + ' cat "$log_file" >&2', + ' rm -f "$log_file"', + ' sleep "$delay"', + " delay=$((delay * 2))", + " continue", + " fi", + ' cat "$log_file" >&2', + ' rm -f "$log_file"', + " return 1", + " done", + " return 1", + "}", + "download_with_retries() {", + ' local repo="$1"', + ' local path="$2"', + ' local local_dir="$3"', + ' 
run_hf_with_retries hf download "$repo" "$path" --repo-type dataset --local-dir "$local_dir"', + "}", + "upload_with_retries() {", + ' local repo="$1"', + ' local src="$2"', + ' local dest="$3"', + ' local message="$4"', + ' run_hf_with_retries hf upload "$repo" "$src" "$dest" --repo-type dataset --commit-message "$message"', + "}", + "WORK=/workspace", + 'mkdir -p "$WORK/out"', + 'cd "$WORK"', + f'uv pip install --system "{_HF_HUB_CLI_SPEC}" "{_FAST_AGENT_SPEC}"', + 'download_with_retries "$ARTIFACT_REPO" "inputs/$RUN_ID/bundle.tar.gz" "$WORK"', + 'tar -xzf "$WORK/inputs/$RUN_ID/bundle.tar.gz" -C "$WORK"', + "set +e", + 'bash "$WORK/bundle/job_entrypoint.sh" "$WORK/bundle" "$WORK/out"', + "status=$?", + "set -e", + 'echo "$status" > "$WORK/out/exit_code.txt"', + 'upload_with_retries "$ARTIFACT_REPO" "$WORK/out" "outputs/$RUN_ID" ' + '"outputs: $RUN_ID (exit=$status)"', + 'exit "$status"', + "", + ] + ) + + +def _build_hf_jobs_run_command( + *, + jobs_config: JobsConfig, + run_id: str, + model: str, + labels: Mapping[str, str] | None, + job_script: str, +) -> list[str]: + """Build the ``hf jobs run`` command for a prepared bundle submission.""" + namespace = _resolve_jobs_namespace( + artifact_repo=jobs_config.artifact_repo, + configured_namespace=jobs_config.jobs_namespace, + ) + command = [ + "hf", + "jobs", + "run", + "--detach", + "--flavor", + jobs_config.jobs_flavor, + "--timeout", + jobs_config.jobs_timeout, + *_hf_secret_flags(jobs_config.jobs_secrets), + *_hf_label_flags(labels), + "--env", + f"ARTIFACT_REPO={jobs_config.artifact_repo}", + "--env", + f"RUN_ID={run_id}", + "--env", + f"FAST_MODEL={model}", + ] + if namespace is not None: + command.extend(["--namespace", namespace]) + command.extend( + [ + "--", + jobs_config.jobs_image, + "bash", + "-lc", + job_script, + ] + ) + return command + + +def _submit_prepared_bundle_job( + *, + jobs_config: JobsConfig, + run_id: str, + model: str, + labels: Mapping[str, str] | None = None, +) -> SubmittedJob: + 
"""Submit a remote job for a bundle that is already present in the dataset.""" + job_script = _render_bundle_job_script() + command = _build_hf_jobs_run_command( + jobs_config=jobs_config, + run_id=run_id, + model=model, + labels=labels, + job_script=job_script, + ) + with _HF_SUBMISSION_LOCK: + completed = _run_hf_command(command) + if completed.returncode != 0: + raise RuntimeError( + "Failed to submit remote fast-agent job:\n" + f"{_retry_exhausted_hf_failure_message(completed)}" + f"stdout:\n{completed.stdout}\n" + f"stderr:\n{completed.stderr}" + ) + job_ref = _normalize_job_id(completed.stdout.strip().splitlines()[-1]) + return SubmittedJob(job_id=job_ref, run_id=run_id, artifact_repo=jobs_config.artifact_repo) + + +def _submit_bundle_job( + *, + bundle_archive: Path, + jobs_config: JobsConfig, + run_id: str, + model: str, + labels: Mapping[str, str] | None = None, +) -> SubmittedJob: + upload = _upload_bundle_input( + bundle_archive=bundle_archive, + artifact_repo=jobs_config.artifact_repo, + run_id=run_id, + ) + if upload.returncode != 0: + raise RuntimeError( + "Failed to upload remote fast-agent bundle:\n" + f"{_retry_exhausted_hf_failure_message(upload)}" + f"stdout:\n{upload.stdout}\n" + f"stderr:\n{upload.stderr}" + ) + return _submit_prepared_bundle_job( + jobs_config=jobs_config, + run_id=run_id, + model=model, + labels=labels, + ) diff --git a/src/upskill/logging.py b/src/upskill/logging.py index 59c723e..705c330 100644 --- a/src/upskill/logging.py +++ b/src/upskill/logging.py @@ -5,14 +5,18 @@ import csv import json from datetime import datetime -from pathlib import Path +from typing import TYPE_CHECKING -from fast_agent import ConversationSummary from fast_agent.constants import FAST_AGENT_TIMING, FAST_AGENT_USAGE from fast_agent.mcp.helpers.content_helpers import get_text from upskill.models import BatchSummary, ConversationStats, RunMetadata, RunResult, TestResult +if TYPE_CHECKING: + from pathlib import Path + + from fast_agent import 
ConversationSummary + # CSV field names for run summaries (matching skills-test format) FIELDNAMES = [ "batch_id", @@ -174,9 +178,6 @@ def extract_tokens_from_messages( return input_tokens, output_tokens, total_tokens, usage_summaries - - - def extract_timing_from_messages(messages: list) -> list[dict[str, object]]: """Extract timing payloads from message channels.""" timings: list[dict[str, object]] = [] @@ -250,8 +251,6 @@ def extract_stats_from_summary(summary: ConversationSummary) -> ConversationStat ) - - def aggregate_conversation_stats(results: list[TestResult]) -> ConversationStats: """Aggregate ConversationStats across multiple test results.""" aggregate = ConversationStats() diff --git a/src/upskill/manifest_utils.py b/src/upskill/manifest_utils.py index c84c6f2..9d8aede 100644 --- a/src/upskill/manifest_utils.py +++ b/src/upskill/manifest_utils.py @@ -2,9 +2,14 @@ from __future__ import annotations -from pathlib import Path +from typing import TYPE_CHECKING -from fast_agent.skills.registry import SkillManifest, SkillRegistry +from fast_agent.skills.registry import SkillRegistry + +if TYPE_CHECKING: + from pathlib import Path + + from fast_agent.skills.registry import SkillManifest def parse_skill_manifest_text( diff --git a/src/upskill/model_resolution.py b/src/upskill/model_resolution.py index e7ae7f3..6501d03 100644 --- a/src/upskill/model_resolution.py +++ b/src/upskill/model_resolution.py @@ -6,9 +6,10 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Literal +from typing import TYPE_CHECKING, Literal -from upskill.config import Config +if TYPE_CHECKING: + from upskill.config import Config CommandName = Literal["generate", "eval", "benchmark"] @@ -25,6 +26,26 @@ class ResolvedModels: run_baseline: bool = True +def build_fastagent_model_references( + *, + config: Config, + resolved: ResolvedModels, +) -> dict[str, dict[str, str]]: + """Build fast-agent model references for the standard upskill card 
slots.""" + + skill_generation_model = resolved.skill_generation_model or config.skill_generation_model + test_generation_model = ( + resolved.test_generation_model or config.test_gen_model or skill_generation_model + ) + return { + "system": { + "default": skill_generation_model, + "skill_gen": skill_generation_model, + "test_gen": test_generation_model, + } + } + + def resolve_models( command: CommandName, *, diff --git a/src/upskill/models.py b/src/upskill/models.py index cf0a6ea..c4de86f 100644 --- a/src/upskill/models.py +++ b/src/upskill/models.py @@ -5,10 +5,13 @@ import json import re from datetime import datetime -from pathlib import Path +from typing import TYPE_CHECKING from pydantic import BaseModel, ConfigDict, Field, field_validator +if TYPE_CHECKING: + from pathlib import Path + class SkillMetadata(BaseModel): """Metadata about how a skill was generated (stored in skill_meta.json).""" @@ -59,6 +62,7 @@ class TestCaseContext(BaseModel): class TestCase(BaseModel): """A test case for skill evaluation.""" + __test__ = False model_config = ConfigDict(extra="forbid") input: str # Task/prompt to give the agent @@ -71,8 +75,6 @@ class TestCase(BaseModel): validator_config: dict[str, str | int | float | bool] | None = None - - class TestCaseSuite(BaseModel): """Structured container for a list of test cases.""" @@ -91,6 +93,112 @@ class SkillDraft(BaseModel): scripts: dict[str, str] | None = None +def _parse_skill_frontmatter( + content: str, + *, + default_name: str, +) -> tuple[str, str, list[str] | None, str | None, bool, bool, str]: + """Parse SKILL.md frontmatter and return normalized fields.""" + name = default_name + description = "" + allowed_tools: list[str] | None = None + argument_hint: str | None = None + user_invocable = True + disable_model_invocation = False + body = content + + if not content.startswith("---"): + return ( + name, + description, + allowed_tools, + argument_hint, + user_invocable, + disable_model_invocation, + body, + ) + + parts 
= content.split("---", 2) + if len(parts) < 3: + return ( + name, + description, + allowed_tools, + argument_hint, + user_invocable, + disable_model_invocation, + body, + ) + + frontmatter = parts[1].strip() + body = parts[2].strip() + + for line in frontmatter.splitlines(): + if ":" not in line: + continue + key, value = line.split(":", 1) + key = key.strip() + value = value.strip() + + if key == "name": + name = value + elif key == "description": + description = value + elif key == "allowed-tools": + allowed_tools = [tool.strip() for tool in value.split(",")] + elif key == "argument-hint": + argument_hint = value + elif key == "user-invocable": + user_invocable = value.lower() != "false" + elif key == "disable-model-invocation": + disable_model_invocation = value.lower() == "true" + + return ( + name, + description, + allowed_tools, + argument_hint, + user_invocable, + disable_model_invocation, + body, + ) + + +class SkillState(BaseModel): + """Upskill-managed state stored separately from ``SKILL.md``.""" + + metadata: SkillMetadata = Field(default_factory=SkillMetadata) + tests: list[TestCase] = Field(default_factory=list) + + +def _load_skill_state(path: Path) -> SkillState: + """Load optional upskill-managed state stored alongside a skill.""" + state = SkillState() + meta_path = path / "skill_meta.json" + if not meta_path.exists(): + return state + + meta_dict = json.loads(meta_path.read_text()) + if "metadata" in meta_dict: + state.metadata = SkillMetadata.model_validate(meta_dict["metadata"]) + if "tests" in meta_dict: + state.tests = [TestCase.model_validate(test_case) for test_case in meta_dict["tests"]] + return state + + +def _load_artifact_directory(path: Path, directory_name: str) -> dict[str, str]: + """Load filename-to-content mappings from a skill artifact directory.""" + directory = path / directory_name + if not directory.exists(): + return {} + + return { + file_path.name: file_path.read_text() + for file_path in directory.iterdir() + if 
file_path.is_file() + } + + class Skill(BaseModel): """A generated agent skill following the Claude Code SKILL.md spec.""" @@ -102,17 +210,11 @@ class Skill(BaseModel): user_invocable: bool = True disable_model_invocation: bool = False - # upskill metadata (persisted to skill_meta.json) - metadata: SkillMetadata = Field(default_factory=SkillMetadata) - # Content body: str # Main instructions markdown references: dict[str, str] = Field(default_factory=dict) # filename -> content scripts: dict[str, str] = Field(default_factory=dict) # filename -> code - # Test cases (persisted to skill_meta.json) - tests: list[TestCase] = Field(default_factory=list) - @field_validator("name") @classmethod def validate_name(cls, v: str) -> str: @@ -145,28 +247,13 @@ def render(self) -> str: return "\n".join(frontmatter_lines) + "\n\n" + self.body - def save(self, path: Path, tests: list[TestCase] | None = None) -> None: - """Write skill directory with all files. - - Args: - path: Directory to save skill to - tests: Optional test cases to persist (overrides self.tests if provided) - """ + def save(self, path: Path) -> None: + """Write the skill document and artifact files.""" path.mkdir(parents=True, exist_ok=True) # Write SKILL.md (Claude Code compatible) (path / "SKILL.md").write_text(self.render()) - # Write skill_meta.json (upskill-specific metadata + tests) - tests_to_save = tests if tests is not None else self.tests - meta_dict = { - "metadata": self.metadata.model_dump(mode="json"), - "tests": [t.model_dump(mode="json") for t in tests_to_save], - } - (path / "skill_meta.json").write_text( - json.dumps(meta_dict, indent=2, default=str) - ) - # Write references if self.references: refs_dir = path / "references" @@ -183,80 +270,23 @@ def save(self, path: Path, tests: list[TestCase] | None = None) -> None: @classmethod def load(cls, path: Path) -> Skill: - """Load a skill from a directory. 
- - Args: - path: Directory containing SKILL.md and optionally skill_meta.json - - Returns: - Loaded Skill instance - """ + """Load a skill document from a directory.""" skill_md_path = path / "SKILL.md" if not skill_md_path.exists(): raise FileNotFoundError(f"SKILL.md not found in {path}") content = skill_md_path.read_text() - - # Parse YAML frontmatter - name = path.name # Default to directory name - description = "" - allowed_tools: list[str] | None = None - argument_hint: str | None = None - user_invocable = True - disable_model_invocation = False - body = content - - if content.startswith("---"): - parts = content.split("---", 2) - if len(parts) >= 3: - frontmatter = parts[1].strip() - body = parts[2].strip() - - for line in frontmatter.split("\n"): - if ":" in line: - key, value = line.split(":", 1) - key = key.strip() - value = value.strip() - - if key == "name": - name = value - elif key == "description": - description = value - elif key == "allowed-tools": - allowed_tools = [t.strip() for t in value.split(",")] - elif key == "argument-hint": - argument_hint = value - elif key == "user-invocable": - user_invocable = value.lower() != "false" - elif key == "disable-model-invocation": - disable_model_invocation = value.lower() == "true" - - # Load metadata and tests from skill_meta.json if present - metadata = SkillMetadata() - tests: list[TestCase] = [] - meta_path = path / "skill_meta.json" - if meta_path.exists(): - meta_dict = json.loads(meta_path.read_text()) - if "metadata" in meta_dict: - metadata = SkillMetadata.model_validate(meta_dict["metadata"]) - if "tests" in meta_dict: - tests = [TestCase.model_validate(t) for t in meta_dict["tests"]] - - # Load references - references: dict[str, str] = {} - refs_dir = path / "references" - if refs_dir.exists(): - for ref_file in refs_dir.iterdir(): - if ref_file.is_file(): - references[ref_file.name] = ref_file.read_text() - - # Load scripts - scripts: dict[str, str] = {} - scripts_dir = path / "scripts" - if 
scripts_dir.exists(): - for script_file in scripts_dir.iterdir(): - if script_file.is_file(): - scripts[script_file.name] = script_file.read_text() + ( + name, + description, + allowed_tools, + argument_hint, + user_invocable, + disable_model_invocation, + body, + ) = _parse_skill_frontmatter(content, default_name=path.name) + references = _load_artifact_directory(path, "references") + scripts = _load_artifact_directory(path, "scripts") return cls( name=name, @@ -265,11 +295,34 @@ def load(cls, path: Path) -> Skill: argument_hint=argument_hint, user_invocable=user_invocable, disable_model_invocation=disable_model_invocation, - metadata=metadata, body=body, references=references, scripts=scripts, - tests=tests, + ) + + +class SkillRecord(BaseModel): + """Persisted skill document plus separately managed upskill state.""" + + skill: Skill + state: SkillState = Field(default_factory=SkillState) + + def save(self, path: Path) -> None: + """Write the skill document and managed metadata/tests.""" + path.mkdir(parents=True, exist_ok=True) + self.skill.save(path) + meta_dict = { + "metadata": self.state.metadata.model_dump(mode="json"), + "tests": [test.model_dump(mode="json") for test in self.state.tests], + } + (path / "skill_meta.json").write_text(json.dumps(meta_dict, indent=2, default=str)) + + @classmethod + def load(cls, path: Path) -> SkillRecord: + """Load a persisted skill record from disk.""" + return cls( + skill=Skill.load(path), + state=_load_skill_state(path), ) @@ -314,6 +367,7 @@ def tokens(self) -> int: class TestResult(BaseModel): """Result of running a single test case.""" + __test__ = False test_case: TestCase success: bool output: str | None = None diff --git a/src/upskill/result_parsing.py b/src/upskill/result_parsing.py new file mode 100644 index 0000000..908e3a0 --- /dev/null +++ b/src/upskill/result_parsing.py @@ -0,0 +1,48 @@ +"""Parse fast-agent result artifacts into upskill-friendly data.""" + +from __future__ import annotations + +from 
dataclasses import dataclass +from typing import TYPE_CHECKING + +from fast_agent import ConversationSummary +from fast_agent.mcp.prompt_serialization import load_messages + +from upskill.logging import extract_stats_from_summary + +if TYPE_CHECKING: + from collections.abc import Sequence + from pathlib import Path + + from upskill.models import ConversationStats + + +@dataclass(slots=True, frozen=True) +class ParsedExecutionResult: + """Parsed view of a fast-agent result export.""" + + output_text: str | None + stats: ConversationStats + + +def _extract_output_text(messages: Sequence[object]) -> str | None: + for message in reversed(messages): + role = getattr(message, "role", None) + if role != "assistant": + continue + last_text = getattr(message, "last_text", None) + if callable(last_text): + text = last_text() + if text: + return text + return None + + +def parse_fast_agent_results(results_path: Path) -> ParsedExecutionResult: + """Load and summarize a fast-agent JSON history export.""" + messages = load_messages(str(results_path)) + summary = ConversationSummary(messages=messages) + return ParsedExecutionResult( + output_text=_extract_output_text(messages), + stats=extract_stats_from_summary(summary), + ) diff --git a/tests/conftest.py b/tests/conftest.py index fdcbc1f..13941ea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,5 +5,6 @@ ROOT = Path(__file__).resolve().parents[1] SRC = ROOT / "src" -if str(SRC) not in sys.path: - sys.path.insert(0, str(SRC)) +for path in (ROOT, SRC): + if str(path) not in sys.path: + sys.path.insert(0, str(path)) diff --git a/tests/test_agent_card_guardrails.py b/tests/test_agent_card_guardrails.py index fe8dfe5..76ce772 100644 --- a/tests/test_agent_card_guardrails.py +++ b/tests/test_agent_card_guardrails.py @@ -2,12 +2,7 @@ from pathlib import Path -import pytest - AGENT_CARDS_DIR = Path("src/upskill/agent_cards") -GUARDED_CARDS = ("skill_gen.md", "test_gen.md") -# Intentional exceptions require both allowlist 
entry and frontmatter annotation. -ALLOWED_MODEL_PIN_OVERRIDES: dict[str, str] = {} def _parse_frontmatter(path: Path) -> dict[str, str]: @@ -36,29 +31,10 @@ def _parse_frontmatter(path: Path) -> dict[str, str]: return data -@pytest.mark.parametrize("card_name", GUARDED_CARDS) -def test_guarded_agent_cards_do_not_pin_model_unless_explicitly_allowed(card_name: str) -> None: - card_path = AGENT_CARDS_DIR / card_name - assert card_path.exists(), f"Missing guarded agent card: {card_path}" - - frontmatter = _parse_frontmatter(card_path) - if "model" not in frontmatter: - return - - assert card_name in ALLOWED_MODEL_PIN_OVERRIDES, ( - f"Unexpected model pin in {card_name}. Remove `model:` from frontmatter or add an " - "explicit temporary override in ALLOWED_MODEL_PIN_OVERRIDES with a justification." +def test_evaluator_card_does_not_pin_skills_dir() -> None: + """Evaluation skill loading should come from --skills-dir, not card frontmatter.""" + frontmatter = _parse_frontmatter(AGENT_CARDS_DIR / "evaluator.md") + assert "skills" not in frontmatter, ( + "evaluator.md should not define `skills:` in frontmatter. " + "Evaluation availability must be controlled by the executor's --skills-dir." ) - assert frontmatter.get("allow_model_pin", "").lower() == "true", ( - f"{card_name} is allowlisted but missing `allow_model_pin: true` annotation in frontmatter." - ) - - -def test_default_guarded_cards_have_no_model_pin() -> None: - """Regression guard: current default cards should not define a model pin.""" - for card_name in GUARDED_CARDS: - frontmatter = _parse_frontmatter(AGENT_CARDS_DIR / card_name) - assert "model" not in frontmatter, ( - f"Unexpected model pin in guarded card {card_name}. " - "Model selection should come from runtime resolution." 
- ) diff --git a/tests/test_check_script.py b/tests/test_check_script.py new file mode 100644 index 0000000..5a8cf4c --- /dev/null +++ b/tests/test_check_script.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from scripts.check import build_check_steps + + +def test_build_check_steps_includes_cpd_and_pytest() -> None: + steps = build_check_steps() + + assert [step.name for step in steps] == ["format", "lint", "typecheck", "cpd", "pytest"] + assert steps[3].command[-1] == "--check" + assert steps[4].command[1:] == ("-m", "pytest", "-v") + + +def test_build_check_steps_can_skip_pytest() -> None: + steps = build_check_steps(skip_tests=True) + + assert [step.name for step in steps] == ["format", "lint", "typecheck", "cpd"] diff --git a/tests/test_cli_eval_jobs.py b/tests/test_cli_eval_jobs.py new file mode 100644 index 0000000..48d8b68 --- /dev/null +++ b/tests/test_cli_eval_jobs.py @@ -0,0 +1,560 @@ +from __future__ import annotations + +from pathlib import Path + +import click +import pytest +from click.testing import CliRunner + +from upskill.cli import _eval_async, _jobs_execution_options, _raise_on_execution_errors +from upskill.config import Config +from upskill.evaluate import apply_eval_metrics +from upskill.hf_jobs import JobsConfig +from upskill.logging import load_batch_summary, load_run_result +from upskill.models import ( + ConversationStats, + EvalResults, + ExpectedSpec, + Skill, + SkillRecord, + SkillState, + TestCase, + TestResult, +) + + +def _make_eval_results( + *, + skill: Skill, + model: str, + test_cases: list[TestCase], + run_baseline: bool, +) -> EvalResults: + with_skill_results = [ + TestResult( + test_case=test_case, + success=True, + stats=ConversationStats(total_tokens=10, turns=1), + ) + for test_case in test_cases + ] + results = EvalResults( + skill_name=skill.name, + model=model, + with_skill_results=with_skill_results, + ) + if run_baseline: + results.baseline_results = [ + TestResult( + test_case=test_case, + 
success=False, + stats=ConversationStats(total_tokens=20, turns=1), + ) + for test_case in test_cases + ] + return apply_eval_metrics(results, test_cases) + + +def _write_skill_fixture(skill_dir: Path) -> SkillRecord: + record = SkillRecord( + skill=Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ), + state=SkillState( + tests=[ + TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])), + TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])), + ] + ), + ) + record.save(skill_dir) + return record + + +def test_jobs_execution_options_waits_by_default() -> None: + @click.command() + @_jobs_execution_options( + executor_help="Execution backend for tests", + runs_dir_help="Runs directory for tests", + ) + def command( + executor: str | None, + artifact_repo: str | None, + wait: bool, + jobs_timeout: str, + jobs_flavor: str, + jobs_secrets: str | None, + jobs_namespace: str | None, + max_parallel: int | None, + runs_dir: str | None, + log_runs: bool, + ) -> None: + del ( + executor, + artifact_repo, + jobs_timeout, + jobs_flavor, + jobs_secrets, + jobs_namespace, + max_parallel, + runs_dir, + log_runs, + ) + click.echo(f"wait={wait}") + + runner = CliRunner() + + default_result = runner.invoke(command) + assert default_result.exit_code == 0 + assert "wait=True" in default_result.output + + no_wait_result = runner.invoke(command, ["--no-wait"]) + assert no_wait_result.exit_code == 0 + assert "wait=False" in no_wait_result.output + + +def test_raise_on_execution_errors_surfaces_backend_failures() -> None: + test_case = TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])) + results = EvalResults( + skill_name="pull-request-descriptions", + model="haiku", + with_skill_results=[ + TestResult( + test_case=test_case, + success=False, + error="fast-agent exited with code 1.", + ) + ], + ) + + with pytest.raises(click.ClickException, 
match="execution errors") as exc_info: + _raise_on_execution_errors(results, context="Evaluation on haiku") + + assert "with-skill test 1" in str(exc_info.value) + assert "fast-agent exited with code 1." in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_eval_jobs_wait_persists_simple_run_summaries( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + skill_record = _write_skill_fixture(tmp_path / "skill") + skill = skill_record.skill + config = Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml") + fake_executor = object() + max_parallel_calls: list[int] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr("upskill.cli._build_executor", lambda *args, **kwargs: fake_executor) + + async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args + assert kwargs["executor"] is fake_executor + assert kwargs["operation"] == "eval" + max_parallel = kwargs["max_parallel"] + assert isinstance(max_parallel, int) + max_parallel_calls.append(max_parallel) + results = _make_eval_results( + skill=skill, + model=str(kwargs["model"]), + test_cases=skill_record.state.tests, + run_baseline=True, + ) + return results + + monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill) + + await _eval_async( + skill_path=str(tmp_path / "skill"), + tests=None, + models=["haiku"], + test_gen_model=None, + num_runs=1, + no_baseline=False, + verbose=False, + executor_name="jobs", + artifact_repo="ns/repo", + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=3, + log_runs=True, + runs_dir=str(config.runs_dir), + ) + + batch_folder = next(config.runs_dir.iterdir()) + summary = load_batch_summary(batch_folder) + assert summary is not None + assert summary.total_runs == 2 + assert summary.passed_runs == 1 + + baseline_result = load_run_result(batch_folder / "run_1") + with_skill_result 
= load_run_result(batch_folder / "run_2") + assert baseline_result is not None + assert with_skill_result is not None + assert baseline_result.run_type == "baseline" + assert with_skill_result.run_type == "with_skill" + assert max_parallel_calls == [3] + + +@pytest.mark.asyncio +async def test_eval_uses_config_execution_defaults_when_cli_unset( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + skill_record = _write_skill_fixture(tmp_path / "skill") + config = Config( + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + executor="jobs", + num_runs=2, + max_parallel=4, + jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY", + jobs_image="ghcr.io/example/custom:latest", + ) + fake_executor = object() + build_calls: list[str] = [] + calls: list[tuple[int, bool]] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + + def fake_build_executor(name: str, **kwargs: object) -> object: + build_calls.append(name) + jobs_config = kwargs["jobs_config"] + assert isinstance(jobs_config, JobsConfig) + assert jobs_config.jobs_secrets == "HF_TOKEN,ANTHROPIC_API_KEY" + assert jobs_config.jobs_image == "ghcr.io/example/custom:latest" + return fake_executor + + async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args + max_parallel = kwargs["max_parallel"] + run_baseline = kwargs["run_baseline"] + assert kwargs["executor"] is fake_executor + assert isinstance(max_parallel, int) + assert isinstance(run_baseline, bool) + calls.append((max_parallel, run_baseline)) + return _make_eval_results( + skill=skill_record.skill, + model=str(kwargs["model"]), + test_cases=skill_record.state.tests, + run_baseline=False, + ) + + monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor) + monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill) + + await _eval_async( + skill_path=str(tmp_path / "skill"), + tests=None, + models=["haiku"], + test_gen_model=None, + num_runs=None, + 
no_baseline=False, + verbose=False, + executor_name=None, + artifact_repo="ns/repo", + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets=None, + jobs_namespace=None, + max_parallel=None, + log_runs=True, + runs_dir=str(config.runs_dir), + ) + + assert build_calls == ["jobs"] + assert calls == [(4, False), (4, False)] + + +@pytest.mark.asyncio +async def test_eval_cli_execution_options_override_config_defaults( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + skill_record = _write_skill_fixture(tmp_path / "skill") + config = Config( + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + executor="jobs", + num_runs=2, + max_parallel=4, + jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY", + ) + fake_executor = object() + build_calls: list[str] = [] + calls: list[tuple[int, bool]] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + + def fake_build_executor(name: str, **kwargs: object) -> object: + del kwargs + build_calls.append(name) + return fake_executor + + async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args + max_parallel = kwargs["max_parallel"] + run_baseline = kwargs["run_baseline"] + assert kwargs["executor"] is fake_executor + assert isinstance(max_parallel, int) + assert isinstance(run_baseline, bool) + calls.append((max_parallel, run_baseline)) + return _make_eval_results( + skill=skill_record.skill, + model=str(kwargs["model"]), + test_cases=skill_record.state.tests, + run_baseline=True, + ) + + monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor) + monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill) + + await _eval_async( + skill_path=str(tmp_path / "skill"), + tests=None, + models=["haiku"], + test_gen_model=None, + num_runs=1, + no_baseline=False, + verbose=False, + executor_name="local", + artifact_repo=None, + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + 
jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=1, + log_runs=True, + runs_dir=str(config.runs_dir), + ) + + assert build_calls == ["local"] + assert calls == [(1, True)] + + +@pytest.mark.asyncio +async def test_eval_cli_jobs_secrets_override_config_defaults( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + skill_record = _write_skill_fixture(tmp_path / "skill") + config = Config( + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + executor="jobs", + jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY", + ) + fake_executor = object() + build_calls: list[str] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + + def fake_build_executor(name: str, **kwargs: object) -> object: + build_calls.append(name) + jobs_config = kwargs["jobs_config"] + assert isinstance(jobs_config, JobsConfig) + assert jobs_config.jobs_secrets == "HF_TOKEN,OPENAI_API_KEY" + return fake_executor + + async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args, kwargs + return _make_eval_results( + skill=skill_record.skill, + model="haiku", + test_cases=skill_record.state.tests, + run_baseline=True, + ) + + monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor) + monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill) + + await _eval_async( + skill_path=str(tmp_path / "skill"), + tests=None, + models=["haiku"], + test_gen_model=None, + num_runs=1, + no_baseline=False, + verbose=False, + executor_name="jobs", + artifact_repo="ns/repo", + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN,OPENAI_API_KEY", + jobs_namespace=None, + max_parallel=1, + log_runs=True, + runs_dir=str(config.runs_dir), + ) + + assert build_calls == ["jobs"] + + +@pytest.mark.asyncio +async def test_eval_jobs_wait_persists_benchmark_run_summaries( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + skill_record = 
_write_skill_fixture(tmp_path / "skill") + skill = skill_record.skill + config = Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml") + fake_executor = object() + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr("upskill.cli._build_executor", lambda *args, **kwargs: fake_executor) + + calls: list[tuple[int, bool]] = [] + + async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args + artifact_root = kwargs["artifact_root"] + assert isinstance(artifact_root, Path) + assert kwargs["operation"] == "benchmark" + max_parallel = kwargs["max_parallel"] + run_baseline = kwargs["run_baseline"] + assert isinstance(max_parallel, int) + assert isinstance(run_baseline, bool) + calls.append((max_parallel, run_baseline)) + results = _make_eval_results( + skill=skill, + model=str(kwargs["model"]), + test_cases=skill_record.state.tests, + run_baseline=False, + ) + return results + + monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill) + + await _eval_async( + skill_path=str(tmp_path / "skill"), + tests=None, + models=["haiku"], + test_gen_model=None, + num_runs=2, + no_baseline=True, + verbose=False, + executor_name="jobs", + artifact_repo="ns/repo", + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=4, + log_runs=True, + runs_dir=str(config.runs_dir), + ) + + batch_folder = next(config.runs_dir.iterdir()) + summary = load_batch_summary(batch_folder) + assert summary is not None + assert summary.total_runs == 2 + assert calls == [(4, False), (4, False)] + assert load_run_result(batch_folder / "run_1") is not None + assert load_run_result(batch_folder / "run_2") is not None + + +@pytest.mark.asyncio +async def test_eval_jobs_no_wait_submits_remote_requests( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _write_skill_fixture(tmp_path / "skill") + config = 
Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml") + submit_calls: list[tuple[str, bool, str]] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + + async def fake_submit_remote_eval_jobs(**kwargs: object) -> list[str]: + submit_calls.append( + ( + str(kwargs["model"]), + bool(kwargs["run_baseline"]), + str(kwargs["operation"]), + ) + ) + return ["evalstate/job-1", "evalstate/job-2"] + + def fail_build_executor(*args: object, **kwargs: object) -> object: + del args, kwargs + raise AssertionError("_build_executor should not be used for jobs --no-wait submission") + + async def fail_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args, kwargs + raise AssertionError("evaluate_skill should not be called for jobs --no-wait submission") + + monkeypatch.setattr("upskill.cli._submit_remote_eval_jobs", fake_submit_remote_eval_jobs) + monkeypatch.setattr("upskill.cli._build_executor", fail_build_executor) + monkeypatch.setattr("upskill.cli.evaluate_skill", fail_evaluate_skill) + + await _eval_async( + skill_path=str(tmp_path / "skill"), + tests=None, + models=["haiku"], + test_gen_model=None, + num_runs=1, + no_baseline=False, + verbose=False, + executor_name="jobs", + artifact_repo="ns/repo", + wait=False, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=3, + log_runs=True, + runs_dir=str(config.runs_dir), + ) + + assert submit_calls == [("haiku", True, "eval")] + + +@pytest.mark.asyncio +async def test_eval_jobs_wait_fails_cleanly_when_artifact_repo_is_inaccessible( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _write_skill_fixture(tmp_path / "skill") + config = Config(runs_dir=tmp_path / "runs", fastagent_config=tmp_path / "fastagent.config.yaml") + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr( + "upskill.cli.verify_artifact_repo_access", + lambda _repo: (_ for _ in 
()).throw( + RuntimeError("404 Not Found\nRepository Not Found for url") + ), + ) + + with pytest.raises(click.ClickException, match="Artifact repo is not accessible") as exc_info: + await _eval_async( + skill_path=str(tmp_path / "skill"), + tests=None, + models=["haiku"], + test_gen_model=None, + num_runs=1, + no_baseline=False, + verbose=False, + executor_name="jobs", + artifact_repo="evalstate/uskill-test", + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=3, + log_runs=True, + runs_dir=str(config.runs_dir), + ) + + assert "Repo: evalstate/uskill-test" in str(exc_info.value) + assert "name is wrong" in str(exc_info.value) diff --git a/tests/test_cli_generate_benchmark.py b/tests/test_cli_generate_benchmark.py new file mode 100644 index 0000000..9554543 --- /dev/null +++ b/tests/test_cli_generate_benchmark.py @@ -0,0 +1,637 @@ +from __future__ import annotations + +from types import SimpleNamespace +from typing import TYPE_CHECKING + +import pytest +from click.testing import CliRunner + +from upskill.cli import ( + _benchmark_async, + _build_logged_run_result, + _generate_async, + _install_fast_agent_model_references, + _submit_generate_jobs_eval, + main, +) +from upskill.config import Config +from upskill.evaluate import apply_eval_metrics +from upskill.hf_jobs import JobsConfig +from upskill.logging import load_batch_summary +from upskill.models import ( + ConversationStats, + EvalResults, + ExpectedSpec, + Skill, + SkillRecord, + SkillState, + TestCase, + TestResult, + ValidationResult, +) + +if TYPE_CHECKING: + from pathlib import Path + + +class _FakeAgentContext: + async def __aenter__(self) -> SimpleNamespace: + return SimpleNamespace(skill_gen=object(), test_gen=object()) + + async def __aexit__(self, exc_type: object, exc: object, tb: object) -> bool: + del exc_type, exc, tb + return False + + +def _make_eval_results( + *, + skill: Skill, + model: str, + test_cases: 
list[TestCase], + run_baseline: bool, +) -> EvalResults: + with_skill_results = [ + TestResult( + test_case=test_case, + success=True, + stats=ConversationStats(total_tokens=10, turns=1), + ) + for test_case in test_cases + ] + results = EvalResults( + skill_name=skill.name, + model=model, + with_skill_results=with_skill_results, + ) + if run_baseline: + results.baseline_results = [ + TestResult( + test_case=test_case, + success=False, + stats=ConversationStats(total_tokens=20, turns=1), + ) + for test_case in test_cases + ] + return apply_eval_metrics(results, test_cases) + + +def test_build_logged_run_result_preserves_validator_assertion_counts() -> None: + test_case = TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"])) + run_result = _build_logged_run_result( + model="haiku", + task="Write good pull request descriptions.", + batch_id="batch-1", + run_number=1, + test_results=[ + TestResult( + test_case=test_case, + success=True, + validation_result=ValidationResult( + passed=True, + assertions_passed=2, + assertions_total=3, + ), + stats=ConversationStats(total_tokens=10, turns=1), + ), + TestResult( + test_case=test_case, + success=True, + stats=ConversationStats(total_tokens=12, turns=1), + ), + ], + assertions_total=2, + passed=False, + run_type="with_skill", + skill_name="pull-request-descriptions", + ) + + assert run_result.assertions_passed == 3 + assert run_result.assertions_total == 4 + + +def test_generate_help_does_not_expose_removed_tool_option() -> None: + runner = CliRunner() + + result = runner.invoke(main, ["generate", "--help"]) + + assert result.exit_code == 0 + assert "--tool" not in result.output + assert "--from PATH" in result.output + assert "--artifact-repo TEXT" in result.output + assert 'upskill generate "parse invoices"' in result.output + assert "--artifact-repo" in result.output + assert "/upskill-tests" in result.output + + +def test_install_fast_agent_model_references_merges_existing_namespaces() -> None: + fast = 
SimpleNamespace( + app=SimpleNamespace( + _config_or_path=SimpleNamespace( + model_references={ + "custom": {"router": "haiku"}, + "system": {"existing": "keep"}, + } + ) + ) + ) + + _install_fast_agent_model_references( + fast, + model_references={"system": {"skill_gen": "sonnet", "test_gen": "opus"}}, + ) + + assert fast.app._config_or_path.model_references == { + "custom": {"router": "haiku"}, + "system": { + "existing": "keep", + "skill_gen": "sonnet", + "test_gen": "opus", + }, + } + + +@pytest.mark.asyncio +async def test_generate_persists_generated_tests_in_skill_meta( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config = Config( + skills_dir=tmp_path / "skills", + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + ) + test_cases = [ + TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])), + TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])), + ] + fake_executor = object() + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr( + "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext() + ) + + monkeypatch.setattr("upskill.cli._build_executor", lambda *args, **kwargs: fake_executor) + + async def fake_generate_skill(**kwargs: object) -> SkillRecord: + del kwargs + return SkillRecord( + skill=Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + ) + + async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]: + del args, kwargs + return test_cases + + async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + skill = args[0] + assert isinstance(skill, Skill) + assert kwargs["executor"] is fake_executor + assert kwargs["operation"] == "generate" + return _make_eval_results( + skill=skill, + model=str(kwargs["model"]), + test_cases=test_cases, + run_baseline=True, + ) + + 
monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill) + monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests) + monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill) + + await _generate_async( + task="write good pull request descriptions", + examples=None, + from_skill=None, + from_trace=None, + model="haiku", + test_gen_model=None, + output=None, + no_eval=False, + eval_model=None, + executor_name="local", + artifact_repo=None, + wait=False, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=2, + runs_dir=str(config.runs_dir), + log_runs=True, + ) + + saved = SkillRecord.load(config.skills_dir / "pull-request-descriptions") + assert len(saved.state.tests) == 2 + assert saved.state.tests[0].input == "prompt one" + + +@pytest.mark.asyncio +async def test_generate_no_eval_still_persists_generated_tests( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config = Config( + skills_dir=tmp_path / "skills", + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + ) + test_cases = [ + TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])), + TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])), + ] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr( + "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext() + ) + + async def fake_generate_skill(**kwargs: object) -> SkillRecord: + del kwargs + return SkillRecord( + skill=Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + ) + + async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]: + del args, kwargs + return test_cases + + async def fail_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args, kwargs + raise AssertionError("evaluate_skill 
should not be called when --no-eval is set") + + monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill) + monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests) + monkeypatch.setattr("upskill.cli.evaluate_skill", fail_evaluate_skill) + + await _generate_async( + task="write good pull request descriptions", + examples=None, + from_skill=None, + from_trace=None, + model="haiku", + test_gen_model=None, + output=None, + no_eval=True, + eval_model=None, + executor_name="local", + artifact_repo=None, + wait=False, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=2, + runs_dir=str(config.runs_dir), + log_runs=True, + ) + + saved = SkillRecord.load(config.skills_dir / "pull-request-descriptions") + assert len(saved.state.tests) == 2 + assert saved.state.tests[1].input == "prompt two" + + +@pytest.mark.asyncio +async def test_generate_prints_test_generation_model( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config = Config( + skills_dir=tmp_path / "skills", + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + ) + test_cases = [ + TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])), + ] + printed_messages: list[str] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr( + "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext() + ) + monkeypatch.setattr( + "upskill.cli.console.print", + lambda *args, **kwargs: printed_messages.append(" ".join(str(arg) for arg in args)), + ) + + async def fake_generate_skill(**kwargs: object) -> SkillRecord: + del kwargs + return SkillRecord( + skill=Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + ) + + async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]: + del args, kwargs + return test_cases 
+ + monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill) + monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests) + + await _generate_async( + task="write good pull request descriptions", + examples=None, + from_skill=None, + from_trace=None, + model="haiku", + test_gen_model="opus", + output=None, + no_eval=True, + eval_model=None, + executor_name="local", + artifact_repo=None, + wait=False, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=2, + runs_dir=str(config.runs_dir), + log_runs=True, + ) + + assert any("Generating test cases with opus..." in message for message in printed_messages) + + +@pytest.mark.asyncio +async def test_generate_jobs_no_wait_submits_remote_eval_requests( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config = Config( + skills_dir=tmp_path / "skills", + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + ) + test_cases = [ + TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])), + TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])), + ] + submit_models: list[str] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr( + "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext() + ) + + async def fake_generate_skill(**kwargs: object) -> SkillRecord: + del kwargs + return SkillRecord( + skill=Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + ) + + async def fake_generate_tests(*args: object, **kwargs: object) -> list[TestCase]: + del args, kwargs + return test_cases + + async def fake_submit_generate_jobs_eval(**kwargs: object) -> list[str]: + submit_models.append(str(kwargs["model"])) + assert kwargs["test_cases"] == test_cases + return ["evalstate/job-1", "evalstate/job-2"] + + def 
fail_build_executor(*args: object, **kwargs: object) -> object: + del args, kwargs + raise AssertionError("_build_executor should not be used for jobs --no-wait submission") + + async def fail_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + del args, kwargs + raise AssertionError("evaluate_skill should not be called for jobs --no-wait submission") + + monkeypatch.setattr("upskill.cli.generate_skill", fake_generate_skill) + monkeypatch.setattr("upskill.cli.generate_tests", fake_generate_tests) + monkeypatch.setattr("upskill.cli._submit_generate_jobs_eval", fake_submit_generate_jobs_eval) + monkeypatch.setattr("upskill.cli._build_executor", fail_build_executor) + monkeypatch.setattr("upskill.cli.evaluate_skill", fail_evaluate_skill) + + await _generate_async( + task="write good pull request descriptions", + examples=None, + from_skill=None, + from_trace=None, + model="haiku", + test_gen_model=None, + output=None, + no_eval=False, + eval_model=None, + executor_name="jobs", + artifact_repo="ns/repo", + wait=False, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + max_parallel=2, + runs_dir=str(config.runs_dir), + log_runs=True, + ) + + saved = SkillRecord.load(config.skills_dir / "pull-request-descriptions") + assert len(saved.state.tests) == 2 + assert submit_models == ["haiku"] + + +@pytest.mark.asyncio +async def test_submit_generate_jobs_eval_marks_operation_as_generate( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config = Config(fastagent_config=tmp_path / "fastagent.config.yaml") + test_cases = [TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"]))] + skill = Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + operation_calls: list[str] = [] + + async def fake_submit_remote_eval_jobs(**kwargs: object) -> list[str]: + operation_calls.append(str(kwargs["operation"])) + 
return ["evalstate/job-1"] + + monkeypatch.setattr("upskill.cli._submit_remote_eval_jobs", fake_submit_remote_eval_jobs) + + job_refs = await _submit_generate_jobs_eval( + skill=skill, + test_cases=test_cases, + model="haiku", + jobs_config=JobsConfig(artifact_repo="ns/repo"), + config=config, + cards_path=tmp_path / "cards", + batch_folder=tmp_path / "runs" / "batch_1", + ) + + assert job_refs == ["evalstate/job-1"] + assert operation_calls == ["generate"] + + +@pytest.mark.asyncio +async def test_benchmark_jobs_uses_remote_executor( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config = Config( + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + ) + skill_record = SkillRecord( + skill=Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ), + state=SkillState( + tests=[ + TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])), + TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])), + ] + ), + ) + skill_dir = tmp_path / "skill" + skill_record.save(skill_dir) + fake_executor = object() + build_calls: list[str] = [] + eval_calls: list[int] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + monkeypatch.setattr( + "upskill.cli._fast_agent_context", lambda *_args, **_kwargs: _FakeAgentContext() + ) + + def fake_build_executor(name: str, **kwargs: object) -> object: + del kwargs + build_calls.append(name) + return fake_executor + + async def fake_evaluate_skill(*args: object, **kwargs: object) -> EvalResults: + skill = args[0] + assert isinstance(skill, Skill) + assert kwargs["executor"] is fake_executor + assert kwargs["operation"] == "benchmark" + max_parallel = kwargs["max_parallel"] + assert isinstance(max_parallel, int) + eval_calls.append(max_parallel) + return _make_eval_results( + skill=skill, + model=str(kwargs["model"]), + test_cases=skill_record.state.tests, + 
run_baseline=False, + ) + + monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor) + monkeypatch.setattr("upskill.cli.evaluate_skill", fake_evaluate_skill) + + await _benchmark_async( + skill_path=str(skill_dir), + models=["haiku"], + test_gen_model=None, + num_runs=2, + tests_path=None, + executor_name="jobs", + artifact_repo="ns/repo", + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + output_dir=str(config.runs_dir), + verbose=False, + max_parallel=4, + ) + + assert build_calls == ["jobs"] + assert eval_calls == [4, 4] + batch_folder = next(config.runs_dir.iterdir()) + summary = load_batch_summary(batch_folder) + assert summary is not None + assert summary.total_runs == 2 + + +@pytest.mark.asyncio +async def test_benchmark_uses_config_execution_defaults_when_cli_unset( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config = Config( + runs_dir=tmp_path / "runs", + fastagent_config=tmp_path / "fastagent.config.yaml", + executor="jobs", + num_runs=2, + max_parallel=6, + ) + skill_record = SkillRecord( + skill=Skill( + name="pull-request-descriptions", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ), + state=SkillState( + tests=[ + TestCase(input="prompt one", expected=ExpectedSpec(contains=["answer"])), + TestCase(input="prompt two", expected=ExpectedSpec(contains=["answer"])), + ] + ), + ) + skill_dir = tmp_path / "skill" + skill_record.save(skill_dir) + fake_executor = object() + build_calls: list[str] = [] + benchmark_calls: list[tuple[int, int]] = [] + + monkeypatch.setattr("upskill.cli.Config.load", lambda: config) + + def fake_build_executor(name: str, **kwargs: object) -> object: + del kwargs + build_calls.append(name) + return fake_executor + + async def fake_run_with_skill_benchmark(*args: object, **kwargs: object): + del args + assert kwargs["executor"] is fake_executor + num_runs = kwargs["num_runs"] + 
max_parallel = kwargs["max_parallel"] + assert isinstance(num_runs, int) + assert isinstance(max_parallel, int) + benchmark_calls.append((num_runs, max_parallel)) + return {}, [] + + monkeypatch.setattr("upskill.cli._build_executor", fake_build_executor) + monkeypatch.setattr("upskill.cli._run_with_skill_benchmark", fake_run_with_skill_benchmark) + monkeypatch.setattr("upskill.cli._print_benchmark_summary", lambda _results: None) + monkeypatch.setattr("upskill.cli._write_benchmark_summary", lambda **_kwargs: None) + + await _benchmark_async( + skill_path=str(skill_dir), + models=["haiku"], + test_gen_model=None, + num_runs=None, + tests_path=None, + executor_name=None, + artifact_repo="ns/repo", + wait=True, + jobs_timeout="2h", + jobs_flavor="cpu-basic", + jobs_secrets="HF_TOKEN", + jobs_namespace=None, + output_dir=str(config.runs_dir), + verbose=False, + max_parallel=None, + ) + + assert build_calls == ["jobs"] + assert benchmark_calls == [(2, 6)] diff --git a/tests/test_config.py b/tests/test_config.py index 2ec6e1f..56769f0 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -36,7 +36,14 @@ def test_config_save_uses_env_override_path_when_file_is_missing(tmp_path, monke monkeypatch.setenv(UPSKILL_CONFIG_ENV, str(override_path)) monkeypatch.chdir(tmp_path) - config = Config(skill_generation_model="haiku") + config = Config( + skill_generation_model="haiku", + executor="jobs", + num_runs=4, + max_parallel=7, + jobs_secrets="HF_TOKEN,ANTHROPIC_API_KEY", + jobs_image="ghcr.io/example/custom:latest", + ) config.save() assert override_path.exists() @@ -46,3 +53,35 @@ def test_config_save_uses_env_override_path_when_file_is_missing(tmp_path, monke saved = yaml.safe_load(f) or {} assert saved["skill_generation_model"] == "haiku" + assert saved["executor"] == "jobs" + assert saved["num_runs"] == 4 + assert saved["max_parallel"] == 7 + assert saved["jobs_secrets"] == "HF_TOKEN,ANTHROPIC_API_KEY" + assert saved["jobs_image"] == "ghcr.io/example/custom:latest" 
+ + +def test_config_load_reads_execution_settings(tmp_path, monkeypatch) -> None: + config_path = tmp_path / "upskill.config.yaml" + config_path.write_text( + "\n".join( + [ + "skill_generation_model: sonnet", + "executor: jobs", + "num_runs: 2", + "max_parallel: 6", + "jobs_secrets: HF_TOKEN,OPENAI_API_KEY", + "jobs_image: ghcr.io/example/custom:latest", + ] + ), + encoding="utf-8", + ) + monkeypatch.chdir(tmp_path) + + config = Config.load() + + assert config.skill_generation_model == "sonnet" + assert config.executor == "jobs" + assert config.num_runs == 2 + assert config.max_parallel == 6 + assert config.jobs_secrets == "HF_TOKEN,OPENAI_API_KEY" + assert config.jobs_image == "ghcr.io/example/custom:latest" diff --git a/tests/test_cpd.py b/tests/test_cpd.py new file mode 100644 index 0000000..a71322f --- /dev/null +++ b/tests/test_cpd.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from scripts.cpd import build_cpd_command, resolve_cli_exit_code, resolve_platform + +if TYPE_CHECKING: + from pathlib import Path + + +def test_resolve_platform_maps_common_linux_labels() -> None: + platform_config = resolve_platform(system="linux", arch="x86_64") + + assert platform_config.system == "linux" + assert platform_config.arch == "x86_64" + assert platform_config.os_label == "linux" + assert platform_config.arch_label == "x64" + assert platform_config.java_name == "java" + assert platform_config.pmd_name == "pmd" + + +def test_build_cpd_command_includes_expected_arguments(tmp_path: Path) -> None: + platform_config = resolve_platform(system="linux", arch="x86_64") + pmd_dir = tmp_path / "pmd-bin" + src_dir = tmp_path / "src" + excluded_path = src_dir / "skip_me.py" + + command = build_cpd_command( + platform_config=platform_config, + pmd_dir=pmd_dir, + src_dir=src_dir, + excluded_paths=[excluded_path], + min_tokens=120, + output_format="xml", + ) + + assert command == [ + str(pmd_dir / "bin" / "pmd"), + "cpd", + "--language", 
+ "python", + "--minimum-tokens", + "120", + "--dir", + str(src_dir), + "--format", + "xml", + "--exclude", + str(excluded_path), + ] + + +def test_resolve_cli_exit_code_honors_check_mode() -> None: + assert resolve_cli_exit_code(cpd_exit_code=0, check=False) == 0 + assert resolve_cli_exit_code(cpd_exit_code=4, check=False) == 0 + assert resolve_cli_exit_code(cpd_exit_code=4, check=True) == 1 + assert resolve_cli_exit_code(cpd_exit_code=7, check=True) == 7 diff --git a/tests/test_execution_backends.py b/tests/test_execution_backends.py new file mode 100644 index 0000000..e14f960 --- /dev/null +++ b/tests/test_execution_backends.py @@ -0,0 +1,568 @@ +from __future__ import annotations + +import asyncio +import json +import shutil +import tarfile +from dataclasses import replace +from pathlib import Path + +import pytest +from fast_agent.mcp.prompt_message_extended import PromptMessageExtended +from fast_agent.mcp.prompt_serialization import save_json +from mcp.types import TextContent + +from upskill.artifacts import materialize_workspace +from upskill.evaluate import evaluate_skill, load_eval_results_from_artifact_root +from upskill.executors.contracts import ExecutionHandle, ExecutionRequest, ExecutionResult +from upskill.executors.local_fast_agent import LocalFastAgentExecutor +from upskill.executors.remote_fast_agent import RemoteFastAgentExecutor +from upskill.fast_agent_cli import build_fast_agent_command +from upskill.hf_jobs import JobsConfig, SubmittedJob +from upskill.models import ConversationStats, ExpectedSpec, Skill, TestCase, TestResult +from upskill.result_parsing import parse_fast_agent_results + + +def _write_result_history(path: Path, *, assistant_text: str) -> None: + messages = [ + PromptMessageExtended( + role="user", + content=[TextContent(type="text", text="Do the task")], + ), + PromptMessageExtended( + role="assistant", + content=[TextContent(type="text", text=assistant_text)], + ), + ] + save_json(messages, str(path)) + + +def 
_build_request(tmp_path: Path) -> ExecutionRequest: + cards_dir = tmp_path / "cards-source" + cards_dir.mkdir() + (cards_dir / "evaluator.md").write_text("---\ndescription: evaluator\n---\n{{agentSkills}}\n") + (cards_dir / "skill_gen.md").write_text( + "---\ndescription: skill generator\n---\nGenerate skills\n" + ) + (cards_dir / "test_gen.md").write_text( + "---\ndescription: test generator\n---\nGenerate tests\n" + ) + config_path = tmp_path / "fastagent.config.yaml" + config_path.write_text("default_model: sonnet\n") + return ExecutionRequest( + prompt="Do the task", + model="haiku", + agent="evaluator", + fastagent_config_path=config_path, + artifact_dir=tmp_path / "artifacts" / "run_1", + cards_source_dir=cards_dir, + label="test run", + skill=Skill( + name="write-good-prs", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ), + workspace_files={"context.txt": "hello"}, + ) + + +def test_build_fast_agent_command_uses_explicit_contract(tmp_path: Path) -> None: + request = _build_request(tmp_path) + prompt_path = tmp_path / "bundle" / "prompt.txt" + prompt_path.parent.mkdir(parents=True) + prompt_path.write_text(request.prompt, encoding="utf-8") + command = build_fast_agent_command( + request, + config_path=request.fastagent_config_path, + cards_dir=tmp_path / "bundle" / "cards", + skills_dir=tmp_path / "bundle" / "skills", + prompt_path=prompt_path, + results_path=tmp_path / "bundle" / "results.json", + fast_agent_bin="fast-agent", + ) + + assert command[:2] == ["fast-agent", "go"] + assert "--config-path" in command + assert "--card" in command + assert "--agent" in command + assert "--model" in command + assert "--skills-dir" in command + assert "--prompt-file" in command + assert "--results" in command + assert "--quiet" in command + + +def test_build_fast_agent_command_omits_missing_config_path(tmp_path: Path) -> None: + request = replace(_build_request(tmp_path), fastagent_config_path=tmp_path / "missing.yaml") + 
prompt_path = tmp_path / "bundle" / "prompt.txt" + prompt_path.parent.mkdir(parents=True) + prompt_path.write_text(request.prompt, encoding="utf-8") + + command = build_fast_agent_command( + request, + config_path=None, + cards_dir=tmp_path / "bundle" / "cards", + skills_dir=tmp_path / "bundle" / "skills", + prompt_path=prompt_path, + results_path=tmp_path / "bundle" / "results.json", + ) + + assert "--config-path" not in command + assert "--prompt-file" in command + + +def test_parse_fast_agent_results_extracts_output_text(tmp_path: Path) -> None: + results_path = tmp_path / "results.json" + _write_result_history(results_path, assistant_text="Structured answer") + + parsed = parse_fast_agent_results(results_path) + + assert parsed.output_text == "Structured answer" + assert parsed.stats.turns == 1 + + +@pytest.mark.asyncio +async def test_local_fast_agent_executor_preserves_artifacts_and_parses_results( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + request = _build_request(tmp_path) + executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent") + + class FakeProcess: + returncode = 0 + + async def communicate(self) -> tuple[bytes, bytes]: + return (b"assistant output\n", b"") + + async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess: + del kwargs + results_index = args.index("--results") + 1 + results_path = Path(args[results_index]) + _write_result_history(results_path, assistant_text="Final answer") + return FakeProcess() + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec) + + handle = await executor.execute(request) + result = await executor.collect(handle) + + assert result.error is None + assert result.output_text == "Final answer" + assert result.raw_results_path == request.artifact_dir / "results.json" + assert (request.artifact_dir / "request.json").exists() + assert (request.artifact_dir / "stdout.txt").exists() + assert (request.artifact_dir / "stderr.txt").exists() + 
assert (request.artifact_dir / "workspace" / "context.txt").read_text() == "hello" + assert (request.artifact_dir / "workspace" / "fastagent.config.yaml").exists() + assert (request.artifact_dir / "cards" / "evaluator.md").exists() + assert not (request.artifact_dir / "cards" / "skill_gen.md").exists() + assert not (request.artifact_dir / "cards" / "test_gen.md").exists() + assert (request.artifact_dir / "skills" / "write-good-prs" / "SKILL.md").exists() + + +@pytest.mark.asyncio +async def test_local_fast_agent_executor_fails_when_results_artifact_is_missing( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + request = _build_request(tmp_path) + executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent") + + class FakeProcess: + returncode = 0 + + async def communicate(self) -> tuple[bytes, bytes]: + return (b"", b"") + + async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess: + del args, kwargs + return FakeProcess() + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec) + + handle = await executor.execute(request) + result = await executor.collect(handle) + + assert result.error == "fast-agent run did not produce a results artifact." 
+ assert result.raw_results_path is None + + +@pytest.mark.asyncio +async def test_local_fast_agent_executor_omits_missing_config_from_command_and_artifacts( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + request = replace(_build_request(tmp_path), fastagent_config_path=tmp_path / "missing.yaml") + executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent") + + class FakeProcess: + returncode = 0 + + async def communicate(self) -> tuple[bytes, bytes]: + return (b"", b"") + + async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess: + del kwargs + assert "--config-path" not in args + assert "--prompt-file" in args + results_index = args.index("--results") + 1 + _write_result_history(Path(args[results_index]), assistant_text="Final answer") + return FakeProcess() + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec) + + handle = await executor.execute(request) + result = await executor.collect(handle) + + assert result.error is None + assert not (request.artifact_dir / "fastagent.config.yaml").exists() + assert not (request.artifact_dir / "workspace" / "fastagent.config.yaml").exists() + + +@pytest.mark.asyncio +async def test_remote_fast_agent_executor_preserves_artifacts_and_parses_results( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + request = _build_request(tmp_path) + executor = RemoteFastAgentExecutor(jobs_config=JobsConfig(artifact_repo="ns/repo")) + submitted_labels: dict[str, str] = {} + + def fake_submit_bundle_job(**kwargs: object) -> SubmittedJob: + nonlocal submitted_labels + labels = kwargs["labels"] + assert isinstance(labels, dict) + assert all(isinstance(key, str) and isinstance(value, str) for key, value in labels.items()) + submitted_labels = {str(key): str(value) for key, value in labels.items()} + del kwargs + return SubmittedJob( + job_id="evalstate/job-123", + run_id="run-456", + artifact_repo="ns/repo", + ) + + def 
fake_wait_for_job_outputs( + job: SubmittedJob, + *, + destination_root: Path, + wait_timeout_seconds: float, + progress_callback: object = None, + ) -> Path: + del wait_timeout_seconds, progress_callback + output_dir = destination_root / "outputs" / job.run_id + (output_dir / "results").mkdir(parents=True, exist_ok=True) + (output_dir / "logs").mkdir(parents=True, exist_ok=True) + (output_dir / "status").mkdir(parents=True, exist_ok=True) + (output_dir / "workspaces" / "request_1").mkdir(parents=True, exist_ok=True) + _write_result_history( + output_dir / "results" / "request_1.json", assistant_text="Remote answer" + ) + (output_dir / "logs" / "request_1.out.txt").write_text("stdout\n", encoding="utf-8") + (output_dir / "logs" / "request_1.err.txt").write_text("", encoding="utf-8") + (output_dir / "status" / "request_1.exit_code.txt").write_text("0\n", encoding="utf-8") + (output_dir / "workspaces" / "request_1" / "context.txt").write_text( + "remote hello", + encoding="utf-8", + ) + return output_dir + + monkeypatch.setattr( + "upskill.executors.remote_fast_agent._submit_bundle_job", + fake_submit_bundle_job, + ) + monkeypatch.setattr( + "upskill.executors.remote_fast_agent._make_run_id", + lambda *_args: "run-456", + ) + monkeypatch.setattr( + "upskill.executors.remote_fast_agent.wait_for_job_outputs", + fake_wait_for_job_outputs, + ) + + handle = await executor.execute(request) + result = await executor.collect(handle) + + assert result.error is None + assert result.output_text == "Remote answer" + assert result.raw_results_path == request.artifact_dir / "results.json" + assert result.metadata["job_id"] == "evalstate/job-123" + assert (request.artifact_dir / "stdout.txt").exists() + assert (request.artifact_dir / "stderr.txt").exists() + assert (request.artifact_dir / "remote_output" / "results" / "request_1.json").exists() + assert (request.artifact_dir / "workspace" / "context.txt").read_text() == "remote hello" + assert not (request.artifact_dir / "cards" / 
"skill_gen.md").exists() + assert not (request.artifact_dir / "cards" / "test_gen.md").exists() + assert submitted_labels == { + "upskill-agent": "evaluator", + "upskill-executor": "remote-fast-agent", + "upskill-model": "haiku", + "upskill-operation": "eval", + "upskill-request": "test-run", + "upskill-run-id": "run-456", + "upskill-skill": "write-good-prs", + } + + +@pytest.mark.asyncio +async def test_remote_fast_agent_executor_submit_preserves_artifacts( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + request = _build_request(tmp_path) + executor = RemoteFastAgentExecutor(jobs_config=JobsConfig(artifact_repo="ns/repo")) + + def fake_submit_bundle_job(**kwargs: object) -> SubmittedJob: + del kwargs + return SubmittedJob( + job_id="evalstate/job-123", + run_id="run-456", + artifact_repo="ns/repo", + ) + + def fail_wait_for_job_outputs(*args: object, **kwargs: object) -> Path: + del args, kwargs + raise AssertionError("submit() should not wait for job outputs") + + monkeypatch.setattr( + "upskill.executors.remote_fast_agent._submit_bundle_job", + fake_submit_bundle_job, + ) + monkeypatch.setattr( + "upskill.executors.remote_fast_agent.wait_for_job_outputs", + fail_wait_for_job_outputs, + ) + + submission = await executor.submit(request) + + assert submission == SubmittedJob( + job_id="evalstate/job-123", + run_id="run-456", + artifact_repo="ns/repo", + ) + assert (request.artifact_dir / "request.json").exists() + assert (request.artifact_dir / "prompt.txt").exists() + assert (request.artifact_dir / "cards" / "evaluator.md").exists() + assert not (request.artifact_dir / "cards" / "skill_gen.md").exists() + assert not (request.artifact_dir / "cards" / "test_gen.md").exists() + assert (request.artifact_dir / "skills" / "write-good-prs" / "SKILL.md").exists() + submitted_job = json.loads((request.artifact_dir / "submitted_job.json").read_text()) + assert submitted_job["job_id"] == "evalstate/job-123" + assert submitted_job["run_id"] == "run-456" + + 
+def test_remote_fast_agent_executor_bundle_omits_missing_config(tmp_path: Path) -> None: + request = replace(_build_request(tmp_path), fastagent_config_path=tmp_path / "missing.yaml") + executor = RemoteFastAgentExecutor(jobs_config=JobsConfig(artifact_repo="ns/repo")) + + temp_root, bundle_archive = executor._create_bundle_archive(request) + try: + with tarfile.open(bundle_archive, "r:gz") as archive: + assert "bundle/fastagent.config.yaml" not in archive.getnames() + finally: + shutil.rmtree(temp_root, ignore_errors=True) + + +@pytest.mark.asyncio +async def test_local_fast_agent_executor_normalizes_paths_and_preserves_file_context( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.chdir(tmp_path) + + cards_dir = Path("cards-source") + cards_dir.mkdir() + (cards_dir / "evaluator.md").write_text("---\ndescription: evaluator\n---\n{{agentSkills}}\n") + config_path = Path("fastagent.config.yaml") + config_path.write_text("default_model: sonnet\n") + request = ExecutionRequest( + prompt="Base prompt\n\n```context.txt\nhello\n```", + model="haiku", + agent="evaluator", + fastagent_config_path=config_path, + artifact_dir=Path("artifacts") / "run_1", + cards_source_dir=cards_dir, + label="test run", + workspace_files={"context.txt": "hello"}, + ) + executor = LocalFastAgentExecutor(fast_agent_bin="fast-agent") + + class FakeProcess: + returncode = 0 + + async def communicate(self) -> tuple[bytes, bytes]: + return (b"assistant output\n", b"") + + async def fake_create_subprocess_exec(*args: str, **kwargs: object) -> FakeProcess: + cwd = kwargs["cwd"] + assert isinstance(cwd, Path) + assert cwd.is_absolute() + results_index = args.index("--results") + 1 + prompt_index = args.index("--prompt-file") + 1 + cards_index = args.index("--card") + 1 + skills_index = args.index("--skills-dir") + 1 + config_index = args.index("--config-path") + 1 + agent_index = args.index("--agent") + 1 + assert args[agent_index] == "evaluator" + for index in 
(results_index, cards_index, skills_index, config_index, prompt_index): + if index == prompt_index: + continue + assert Path(args[index]).is_absolute() + prompt_text = Path(args[prompt_index]).read_text(encoding="utf-8") + assert "```context.txt\nhello\n```" in prompt_text + _write_result_history(Path(args[results_index]), assistant_text="Final answer") + return FakeProcess() + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec) + + handle = await executor.execute(request) + result = await executor.collect(handle) + + assert result.error is None + + +def test_materialize_workspace_rejects_paths_outside_workspace(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="must not traverse parents"): + materialize_workspace(tmp_path / "workspace", {"../pyproject.toml": "oops"}) + + with pytest.raises(ValueError, match="must be relative"): + materialize_workspace(tmp_path / "workspace", {"/tmp/pwned": "oops"}) + + +def test_load_eval_results_from_artifact_root_reconstructs_metrics(tmp_path: Path) -> None: + artifact_root = tmp_path / "eval" + with_skill_dir = artifact_root / "with-skill" / "test_1" + baseline_dir = artifact_root / "baseline" / "test_1" + with_skill_dir.mkdir(parents=True) + baseline_dir.mkdir(parents=True) + + test_case = TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"])) + with_skill_result = TestResult(test_case=test_case, success=True, output="answer") + baseline_result = TestResult(test_case=test_case, success=False, output="miss") + + (with_skill_dir / "test_result.json").write_text( + with_skill_result.model_dump_json(indent=2), + encoding="utf-8", + ) + (baseline_dir / "test_result.json").write_text( + baseline_result.model_dump_json(indent=2), + encoding="utf-8", + ) + + reconstructed = load_eval_results_from_artifact_root( + skill_name="write-good-prs", + model="qwen35", + artifact_root=artifact_root, + ) + + assert reconstructed is not None + assert 
reconstructed.with_skill_success_rate == 1.0 + assert reconstructed.baseline_success_rate == 0.0 + + +@pytest.mark.asyncio +async def test_evaluate_skill_emits_per_test_progress_messages(tmp_path: Path) -> None: + skill = Skill( + name="write-good-prs", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + test_case = TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"])) + messages: list[str] = [] + + class FakeExecutor: + async def execute(self, request: ExecutionRequest) -> ExecutionHandle: + request.artifact_dir.mkdir(parents=True, exist_ok=True) + workspace_dir = request.artifact_dir / "workspace" + workspace_dir.mkdir(parents=True, exist_ok=True) + task = asyncio.create_task( + asyncio.sleep( + 0, + result=ExecutionResult( + output_text="answer", + raw_results_path=None, + stdout_path=request.artifact_dir / "stdout.txt", + stderr_path=request.artifact_dir / "stderr.txt", + artifact_dir=request.artifact_dir, + workspace_dir=workspace_dir, + stats=ConversationStats(), + ), + ) + ) + return ExecutionHandle(request=request, task=task) + + async def collect(self, handle: ExecutionHandle) -> ExecutionResult: + return await handle.task + + async def cancel(self, handle: ExecutionHandle) -> None: + handle.task.cancel() + + results = await evaluate_skill( + skill, + [test_case], + FakeExecutor(), + model="haiku", + fastagent_config_path=tmp_path / "fastagent.config.yaml", + cards_source_dir=tmp_path, + artifact_root=tmp_path / "eval", + progress_callback=messages.append, + ) + + assert results.with_skill_success_rate == 1.0 + assert "starting with-skill test 1/1" in messages + assert "finished with-skill test 1/1 (ok)" in messages + + +@pytest.mark.asyncio +async def test_evaluate_skill_includes_job_id_in_execution_errors(tmp_path: Path) -> None: + skill = Skill( + name="write-good-prs", + description="Write good pull request descriptions.", + body="Use a clear structure.", + ) + test_case = 
TestCase(input="prompt", expected=ExpectedSpec(contains=["answer"])) + + class FakeExecutor: + async def execute(self, request: ExecutionRequest) -> ExecutionHandle: + request.artifact_dir.mkdir(parents=True, exist_ok=True) + workspace_dir = request.artifact_dir / "workspace" + workspace_dir.mkdir(parents=True, exist_ok=True) + task = asyncio.create_task( + asyncio.sleep( + 0, + result=ExecutionResult( + output_text=None, + raw_results_path=None, + stdout_path=request.artifact_dir / "stdout.txt", + stderr_path=request.artifact_dir / "stderr.txt", + artifact_dir=request.artifact_dir, + workspace_dir=workspace_dir, + stats=ConversationStats(), + error="fast-agent exited with code 1.", + metadata={"job_id": "evalstate/job-123"}, + ), + ) + ) + return ExecutionHandle(request=request, task=task) + + async def collect(self, handle: ExecutionHandle) -> ExecutionResult: + return await handle.task + + async def cancel(self, handle: ExecutionHandle) -> None: + handle.task.cancel() + + results = await evaluate_skill( + skill, + [test_case], + FakeExecutor(), + model="haiku", + fastagent_config_path=tmp_path / "fastagent.config.yaml", + cards_source_dir=tmp_path, + artifact_root=tmp_path / "eval", + progress_callback=None, + ) + + assert ( + results.with_skill_results[0].error + == "fast-agent exited with code 1. 
(job evalstate/job-123)" + ) diff --git a/tests/test_hf_jobs.py b/tests/test_hf_jobs.py new file mode 100644 index 0000000..2ea9088 --- /dev/null +++ b/tests/test_hf_jobs.py @@ -0,0 +1,523 @@ +from __future__ import annotations + +import subprocess +from datetime import UTC, datetime +from typing import TYPE_CHECKING + +import pytest + +import upskill.hf_jobs as hf_jobs +from upskill.hf_jobs import ( + JobsConfig, + SubmittedJob, + _build_hf_jobs_run_command, + _make_run_id, + _normalize_job_id, + _render_bundle_job_script, + _submit_bundle_job, + parse_duration_seconds, + verify_artifact_repo_access, + wait_for_job_outputs, +) + +if TYPE_CHECKING: + from pathlib import Path + + +def test_parse_duration_seconds_supports_hf_style_suffixes() -> None: + assert parse_duration_seconds("45m") == 2700.0 + assert parse_duration_seconds("2h") == 7200.0 + assert parse_duration_seconds("30") == 30.0 + + +def test_make_run_id_adds_entropy_even_with_frozen_timestamp( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FrozenDateTime(datetime): + @classmethod + def now(cls, tz: object | None = None) -> FrozenDateTime: + del tz + return cls(2026, 3, 22, 12, 0, 0, tzinfo=UTC) + + monkeypatch.setattr("upskill.hf_jobs.datetime", FrozenDateTime) + + run_id_a = _make_run_id("with-skill", "qwen35", "pull-request-descriptions") + run_id_b = _make_run_id("with-skill", "qwen35", "pull-request-descriptions") + + assert run_id_a != run_id_b + assert run_id_a.startswith("20260322T120000Z_with-skill-qwen35-pull-request-descriptions_") + assert run_id_b.startswith("20260322T120000Z_with-skill-qwen35-pull-request-descriptions_") + + +def test_normalize_job_id_extracts_namespace_and_id_from_url() -> None: + assert ( + _normalize_job_id("View at: https://huggingface.co/jobs/evalstate/69bd5e5f71691dc46f161e83") + == "evalstate/69bd5e5f71691dc46f161e83" + ) + + +def test_run_hf_command_uses_doubled_retry_backoff( + monkeypatch: pytest.MonkeyPatch, +) -> None: + sleep_calls: list[float] = [] + 
attempts = 0 + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + nonlocal attempts + del kwargs + attempts += 1 + if attempts < 3: + return subprocess.CompletedProcess( + args=args, + returncode=1, + stdout="", + stderr="rate limit for the /whoami-v2 endpoint\n", + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + monkeypatch.setattr("upskill.hf_jobs.time.sleep", sleep_calls.append) + + completed = hf_jobs._run_hf_command(["hf", "jobs", "run"]) + + assert completed.returncode == 0 + assert sleep_calls == [2.0, 4.0] + + +def test_wait_for_job_outputs_downloads_full_directory( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + calls: list[list[str]] = [] + messages: list[str] = [] + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + calls.append(args) + if args[2] == "ns/repo" and args[3].endswith("exit_code.txt"): + marker = tmp_path / "outputs" / "run-456" / "exit_code.txt" + marker.parent.mkdir(parents=True, exist_ok=True) + marker.write_text("0\n", encoding="utf-8") + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + output_dir = tmp_path / "outputs" / "run-456" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "exit_code.txt").write_text("0\n", encoding="utf-8") + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + + output_dir = wait_for_job_outputs( + SubmittedJob(job_id="job-123", run_id="run-456", artifact_repo="ns/repo"), + destination_root=tmp_path, + wait_timeout_seconds=1.0, + poll_interval_seconds=0.01, + progress_callback=messages.append, + ) + + assert output_dir == tmp_path / "outputs" / "run-456" + assert len(calls) == 3 + assert any( + call[:7] == ["hf", "jobs", "ps", "-a", "--format", "json", 
"--namespace"] for call in calls + ) + assert messages[0] == "waiting for job job-123 (run_id=run-456)" + assert "completed; downloading artifacts" in messages[1] + + +def test_wait_for_job_outputs_raises_when_job_enters_error_stage( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + if args[:4] == ["hf", "jobs", "ps", "-a"]: + return subprocess.CompletedProcess( + args=args, + returncode=0, + stdout=( + '[{"id":"job-123","owner":{"name":"evalstate"},' + '"status":{"stage":"ERROR","message":"boom"}}]' + ), + stderr="", + ) + return subprocess.CompletedProcess(args=args, returncode=1, stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + + with pytest.raises(RuntimeError, match="ended with stage ERROR"): + wait_for_job_outputs( + SubmittedJob(job_id="evalstate/job-123", run_id="run-456", artifact_repo="ns/repo"), + destination_root=tmp_path, + wait_timeout_seconds=1.0, + poll_interval_seconds=0.01, + ) + + +def test_wait_for_job_outputs_retries_auth_rate_limited_downloads( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + calls: list[list[str]] = [] + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + calls.append(args) + if args[:4] == ["hf", "jobs", "ps", "-a"]: + return subprocess.CompletedProcess(args=args, returncode=0, stdout="[]", stderr="") + if args[:2] == ["hf", "download"] and args[3].endswith("exit_code.txt"): + marker_attempt = sum( + 1 + for call in calls + if call[:2] == ["hf", "download"] and call[3].endswith("exit_code.txt") + ) + if marker_attempt == 1: + return subprocess.CompletedProcess( + args=args, + returncode=1, + stdout="", + stderr=( + "Error: You've hit the rate limit for the /whoami-v2 endpoint, " + "which is intentionally strict for security reasons.\n" + ), + ) + marker = tmp_path / "outputs" / "run-456" / 
"exit_code.txt" + marker.parent.mkdir(parents=True, exist_ok=True) + marker.write_text("0\n", encoding="utf-8") + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + if args[:2] == ["hf", "download"] and "--include" in args: + full_download_attempt = sum( + 1 for call in calls if call[:2] == ["hf", "download"] and "--include" in call + ) + if full_download_attempt == 1: + return subprocess.CompletedProcess( + args=args, + returncode=1, + stdout="", + stderr=( + "Error: You've hit the rate limit for the /whoami-v2 endpoint, " + "which is intentionally strict for security reasons.\n" + ), + ) + output_dir = tmp_path / "outputs" / "run-456" + output_dir.mkdir(parents=True, exist_ok=True) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + return subprocess.CompletedProcess(args=args, returncode=1, stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + monkeypatch.setattr("upskill.hf_jobs.time.sleep", lambda *_args, **_kwargs: None) + + output_dir = wait_for_job_outputs( + SubmittedJob(job_id="job-123", run_id="run-456", artifact_repo="ns/repo"), + destination_root=tmp_path, + wait_timeout_seconds=1.0, + poll_interval_seconds=0.01, + ) + + assert output_dir == tmp_path / "outputs" / "run-456" + assert sum(1 for call in calls if call[:2] == ["hf", "download"] and "--include" in call) == 2 + assert ( + sum( + 1 + for call in calls + if call[:2] == ["hf", "download"] and call[3].endswith("exit_code.txt") + ) + == 2 + ) + + +def test_submit_bundle_job_retries_conflict_upload_and_auth_rate_limit( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + calls: list[list[str]] = [] + bundle_archive = tmp_path / "bundle.tar.gz" + bundle_archive.write_text("bundle", encoding="utf-8") + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + calls.append(args) + if args[:2] == ["hf", "upload"] and 
args[4].endswith("bundle.tar.gz"): + upload_attempt = sum( + 1 + for call in calls + if call[:2] == ["hf", "upload"] and call[4].endswith("bundle.tar.gz") + ) + if upload_attempt == 1: + return subprocess.CompletedProcess( + args=args, + returncode=1, + stdout="", + stderr="412 Precondition Failed\nA commit has happened since. Please refresh and try again.\n", + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + if args[:3] == ["hf", "jobs", "run"]: + submit_attempt = sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"]) + if submit_attempt == 1: + return subprocess.CompletedProcess( + args=args, + returncode=1, + stdout="Set HF_DEBUG=1 as environment variable for full traceback.\n", + stderr=( + "Error: You've hit the rate limit for the /whoami-v2 endpoint, " + "which is intentionally strict for security reasons.\n" + ), + ) + return subprocess.CompletedProcess( + args=args, + returncode=0, + stdout="View at: https://huggingface.co/jobs/evalstate/job-123\n", + stderr="", + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + monkeypatch.setattr("upskill.hf_jobs.time.sleep", lambda *_args, **_kwargs: None) + monkeypatch.setattr("upskill.hf_jobs._VERIFIED_ARTIFACT_REPOS", set()) + + submission = _submit_bundle_job( + bundle_archive=bundle_archive, + jobs_config=JobsConfig(artifact_repo="ns/repo"), + run_id="run-456", + model="qwen35", + ) + + assert submission == SubmittedJob( + job_id="evalstate/job-123", + run_id="run-456", + artifact_repo="ns/repo", + ) + assert sum(1 for call in calls if call[:2] == ["hf", "upload"]) == 2 + assert sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"]) == 2 + + +def test_submit_bundle_job_retries_auth_rate_limit_during_upload( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + calls: list[list[str]] = [] + bundle_archive = tmp_path / "bundle.tar.gz" + 
bundle_archive.write_text("bundle", encoding="utf-8") + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + calls.append(args) + if args[:2] == ["hf", "upload"] and args[4].endswith("bundle.tar.gz"): + upload_attempt = sum( + 1 + for call in calls + if call[:2] == ["hf", "upload"] and call[4].endswith("bundle.tar.gz") + ) + if upload_attempt == 1: + return subprocess.CompletedProcess( + args=args, + returncode=1, + stdout="", + stderr=( + "Error: You've hit the rate limit for the /whoami-v2 endpoint, " + "which is intentionally strict for security reasons.\n" + ), + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + if args[:3] == ["hf", "jobs", "run"]: + return subprocess.CompletedProcess( + args=args, + returncode=0, + stdout="View at: https://huggingface.co/jobs/evalstate/job-123\n", + stderr="", + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + monkeypatch.setattr("upskill.hf_jobs.time.sleep", lambda *_args, **_kwargs: None) + + submission = _submit_bundle_job( + bundle_archive=bundle_archive, + jobs_config=JobsConfig(artifact_repo="ns/repo"), + run_id="run-456", + model="qwen35", + ) + + assert submission.job_id == "evalstate/job-123" + assert sum(1 for call in calls if call[:2] == ["hf", "upload"]) == 2 + assert sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"]) == 1 + + +def test_verify_artifact_repo_access_checks_artifact_repo_once_per_process( + monkeypatch: pytest.MonkeyPatch, +) -> None: + calls: list[list[str]] = [] + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + calls.append(args) + if args[:2] == ["hf", "download"] and "--dry-run" in args: + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + return subprocess.CompletedProcess(args=args, returncode=0, 
stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + monkeypatch.setattr("upskill.hf_jobs._VERIFIED_ARTIFACT_REPOS", set()) + + verify_artifact_repo_access("ns/repo") + verify_artifact_repo_access("ns/repo") + + assert sum(1 for call in calls if call[:2] == ["hf", "download"] and "--dry-run" in call) == 1 + assert {"ns/repo"} == hf_jobs._VERIFIED_ARTIFACT_REPOS + + +def test_submit_bundle_job_uses_prepared_artifact_repo( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + calls: list[list[str]] = [] + bundle_archive = tmp_path / "bundle.tar.gz" + bundle_archive.write_text("bundle", encoding="utf-8") + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + calls.append(args) + if args[:2] == ["hf", "download"] and "--dry-run" in args: + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + if args[:2] == ["hf", "upload"]: + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + if args[:3] == ["hf", "jobs", "run"]: + run_number = sum(1 for call in calls if call[:3] == ["hf", "jobs", "run"]) + return subprocess.CompletedProcess( + args=args, + returncode=0, + stdout=f"View at: https://huggingface.co/jobs/evalstate/job-{run_number}\n", + stderr="", + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + monkeypatch.setattr("upskill.hf_jobs._VERIFIED_ARTIFACT_REPOS", set()) + + verify_artifact_repo_access("ns/repo") + + first = _submit_bundle_job( + bundle_archive=bundle_archive, + jobs_config=JobsConfig(artifact_repo="ns/repo"), + run_id="run-1", + model="qwen35", + ) + second = _submit_bundle_job( + bundle_archive=bundle_archive, + jobs_config=JobsConfig(artifact_repo="ns/repo"), + run_id="run-2", + model="qwen35", + ) + + assert first.job_id == "evalstate/job-1" + assert second.job_id == "evalstate/job-2" + assert sum(1 for call 
in calls if call[:2] == ["hf", "download"] and "--dry-run" in call) == 1 + assert {"ns/repo"} == hf_jobs._VERIFIED_ARTIFACT_REPOS + jobs_run_call = calls[-1] + assert "--namespace" in jobs_run_call + assert jobs_run_call[jobs_run_call.index("--namespace") + 1] == "ns" + assert "ghcr.io/astral-sh/uv:python3.13-bookworm" in jobs_run_call + assert any("huggingface_hub==1.7.2" in arg for arg in jobs_run_call) + + +def test_build_hf_jobs_run_command_uses_configured_image() -> None: + command = _build_hf_jobs_run_command( + jobs_config=JobsConfig( + artifact_repo="ns/repo", + jobs_image="ghcr.io/example/custom:latest", + ), + run_id="run-123", + model="haiku", + labels=None, + job_script="echo hi", + ) + + assert command[-5:] == [ + "--", + "ghcr.io/example/custom:latest", + "bash", + "-lc", + "echo hi", + ] + + +def test_render_bundle_job_script_retries_auth_rate_limits_for_downloads_and_uploads() -> None: + script = _render_bundle_job_script() + + assert "rate limit for the /whoami-v2 endpoint" in script + assert "local delay=2" in script + assert 'download_with_retries "$ARTIFACT_REPO" "inputs/$RUN_ID/bundle.tar.gz" "$WORK"' in script + assert 'run_hf_with_retries hf upload "$repo" "$src" "$dest"' in script + + +def test_submit_bundle_job_passes_labels_to_hf_jobs_run( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + calls: list[list[str]] = [] + bundle_archive = tmp_path / "bundle.tar.gz" + bundle_archive.write_text("bundle", encoding="utf-8") + + def fake_run( + args: list[str], + **kwargs: object, + ) -> subprocess.CompletedProcess[str]: + del kwargs + calls.append(args) + if args[:2] == ["hf", "upload"]: + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + if args[:3] == ["hf", "jobs", "run"]: + return subprocess.CompletedProcess( + args=args, + returncode=0, + stdout="View at: https://huggingface.co/jobs/evalstate/job-123\n", + stderr="", + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", 
stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + + _submit_bundle_job( + bundle_archive=bundle_archive, + jobs_config=JobsConfig(artifact_repo="ns/repo"), + run_id="run-123", + model="qwen35", + labels={ + "upskill-model": "qwen35", + "upskill-operation": "eval", + "upskill-request": "eval-with-skill-test-1", + }, + ) + + jobs_run_call = next(call for call in calls if call[:3] == ["hf", "jobs", "run"]) + label_values = [ + jobs_run_call[index + 1] + for index, token in enumerate(jobs_run_call[:-1]) + if token == "--label" + ] + assert label_values == [ + "upskill-model=qwen35", + "upskill-operation=eval", + "upskill-request=eval-with-skill-test-1", + ] diff --git a/tests/test_model_resolution.py b/tests/test_model_resolution.py index c0b330c..9657040 100644 --- a/tests/test_model_resolution.py +++ b/tests/test_model_resolution.py @@ -3,7 +3,7 @@ import pytest from upskill.config import Config -from upskill.model_resolution import resolve_models +from upskill.model_resolution import build_fastagent_model_references, resolve_models def test_resolve_generate_uses_generation_model_for_test_gen_by_default() -> None: @@ -42,6 +42,26 @@ def test_resolve_generate_cli_test_gen_model_overrides_config() -> None: assert resolved.test_generation_model == "opus" +def test_build_fastagent_model_references_for_generate_uses_resolved_models() -> None: + config = Config(skill_generation_model="sonnet", test_gen_model="haiku") + + resolved = resolve_models( + "generate", + config=config, + cli_model="opus", + cli_test_gen_model="kimi", + ) + references = build_fastagent_model_references(config=config, resolved=resolved) + + assert references == { + "system": { + "default": "opus", + "skill_gen": "opus", + "test_gen": "kimi", + } + } + + def test_resolve_eval_defaults_and_simple_mode() -> None: config = Config(skill_generation_model="sonnet", eval_model="haiku", test_gen_model=None) @@ -67,6 +87,26 @@ def test_resolve_eval_cli_test_gen_model_overrides_config() -> 
None: assert resolved.test_generation_model == "opus" +def test_build_fastagent_model_references_for_eval_keeps_configured_skill_generator() -> None: + config = Config(skill_generation_model="sonnet", test_gen_model="haiku") + + resolved = resolve_models( + "eval", + config=config, + cli_models=["kimi"], + cli_test_gen_model="opus", + ) + references = build_fastagent_model_references(config=config, resolved=resolved) + + assert references == { + "system": { + "default": "sonnet", + "skill_gen": "sonnet", + "test_gen": "opus", + } + } + + def test_resolve_eval_benchmark_mode_disables_baseline() -> None: config = Config(skill_generation_model="sonnet", eval_model="haiku") @@ -149,7 +189,7 @@ def test_resolve_unsupported_command_raises() -> None: def test_config_legacy_model_key_maps_to_skill_generation_model() -> None: - config = Config(model="haiku") + config = Config.model_validate({"model": "haiku"}) assert config.skill_generation_model == "haiku" assert config.model == "haiku" diff --git a/upskill.config.yaml b/upskill.config.yaml index 3ddf1e2..190161a 100644 --- a/upskill.config.yaml +++ b/upskill.config.yaml @@ -1,7 +1,9 @@ # upskill project configuration # Default model for skill generation. -model: sonnet +skill_generation_model: sonnet +test_gen_model: opus +eval_model: sonnet # Optional separate model for evaluation. If omitted, uses `model`. # eval_model: haiku @@ -13,5 +15,19 @@ runs_dir: ./runs # Number of refinement passes during `upskill generate`. max_refine_attempts: 2 +# Default execution settings for eval/benchmark/refinement. +executor: jobs # local | jobs; override with --executor +num_runs: 1 # Override with --runs +max_parallel: 5 # Override with --max-parallel + +# HF Jobs secrets to forward when executor=jobs. +# Use a comma-separated list of environment variable names, not secret values. 
+# Example: HF_TOKEN,ANTHROPIC_API_KEY +jobs_secrets: HF_TOKEN,ANTHROPIC_API_KEY # Override with --jobs-secrets + + +# Container image used for HF Jobs submissions. +jobs_image: ghcr.io/astral-sh/uv:python3.13-bookworm + # Optional override for fast-agent config file. # fastagent_config: ./fastagent.config.yaml diff --git a/uv.lock b/uv.lock index 9caa95c..7932959 100644 --- a/uv.lock +++ b/uv.lock @@ -30,6 +30,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/f3/219eeca0ad4a20843d4b9eaac5532f87018b9d25730a62a16f54f6c52d1a/agent_client_protocol-0.8.1-py3-none-any.whl", hash = "sha256:9421a11fd435b4831660272d169c3812d553bb7247049c138c3ca127e4b8af8e", size = 54529, upload-time = "2026-02-13T15:34:53.344Z" }, ] +[[package]] +name = "aiofile" +version = "3.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "caio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/67/e2/d7cb819de8df6b5c1968a2756c3cb4122d4fa2b8fc768b53b7c9e5edb646/aiofile-3.9.0.tar.gz", hash = "sha256:e5ad718bb148b265b6df1b3752c4d1d83024b93da9bd599df74b9d9ffcf7919b", size = 17943, upload-time = "2024-10-08T10:39:35.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/25/da1f0b4dd970e52bf5a36c204c107e11a0c6d3ed195eba0bfbc664c312b2/aiofile-3.9.0-py3-none-any.whl", hash = "sha256:ce2f6c1571538cbdfa0143b04e16b208ecb0e9cb4148e528af8a640ed51cc8aa", size = 19539, upload-time = "2024-10-08T10:39:32.955Z" }, +] + [[package]] name = "aiohappyeyeballs" version = "2.6.1" @@ -105,7 +117,7 @@ wheels = [ [[package]] name = "anthropic" -version = "0.79.0" +version = "0.86.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -117,9 +129,9 @@ dependencies = [ { name = "sniffio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/15/b1/91aea3f8fd180d01d133d931a167a78a3737b3fd39ccef2ae8d6619c24fd/anthropic-0.79.0.tar.gz", hash = 
"sha256:8707aafb3b1176ed6c13e2b1c9fb3efddce90d17aee5d8b83a86c70dcdcca871", size = 509825, upload-time = "2026-02-07T18:06:18.388Z" } +sdist = { url = "https://files.pythonhosted.org/packages/37/7a/8b390dc47945d3169875d342847431e5f7d5fa716b2e37494d57cfc1db10/anthropic-0.86.0.tar.gz", hash = "sha256:60023a7e879aa4fbb1fed99d487fe407b2ebf6569603e5047cfe304cebdaa0e5", size = 583820, upload-time = "2026-03-18T18:43:08.017Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/95/b2/cc0b8e874a18d7da50b0fda8c99e4ac123f23bf47b471827c5f6f3e4a767/anthropic-0.79.0-py3-none-any.whl", hash = "sha256:04cbd473b6bbda4ca2e41dd670fe2f829a911530f01697d0a1e37321eb75f3cf", size = 405918, upload-time = "2026-02-07T18:06:20.246Z" }, + { url = "https://files.pythonhosted.org/packages/63/5f/67db29c6e5d16c8c9c4652d3efb934d89cb750cad201539141781d8eae14/anthropic-0.86.0-py3-none-any.whl", hash = "sha256:9d2bbd339446acce98858c5627d33056efe01f70435b22b63546fe7edae0cd57", size = 469400, upload-time = "2026-03-18T18:43:06.526Z" }, ] [[package]] @@ -143,6 +155,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, ] +[[package]] +name = "authlib" +version = "1.6.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/af/98/00d3dd826d46959ad8e32af2dbb2398868fd9fd0683c26e56d0789bd0e68/authlib-1.6.9.tar.gz", hash = "sha256:d8f2421e7e5980cc1ddb4e32d3f5fa659cfaf60d8eaf3281ebed192e4ab74f04", size = 165134, upload-time = "2026-03-02T07:44:01.998Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/23/b65f568ed0c22f1efacb744d2db1a33c8068f384b8c9b482b52ebdbc3ef6/authlib-1.6.9-py2.py3-none-any.whl", hash = 
"sha256:f08b4c14e08f0861dc18a32357b33fbcfd2ea86cfe3fe149484b4d764c4a0ac3", size = 244197, upload-time = "2026-03-02T07:44:00.307Z" }, +] + +[[package]] +name = "beartype" +version = "0.22.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/94/1009e248bbfbab11397abca7193bea6626806be9a327d399810d523a07cb/beartype-0.22.9.tar.gz", hash = "sha256:8f82b54aa723a2848a56008d18875f91c1db02c32ef6a62319a002e3e25a975f", size = 1608866, upload-time = "2025-12-13T06:50:30.72Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl", hash = "sha256:d16c9bbc61ea14637596c5f6fbff2ee99cbe3573e46a716401734ef50c3060c2", size = 1333658, upload-time = "2025-12-13T06:50:28.266Z" }, +] + +[[package]] +name = "cachetools" +version = "7.0.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/dd/57fe3fdb6e65b25a5987fd2cdc7e22db0aef508b91634d2e57d22928d41b/cachetools-7.0.5.tar.gz", hash = "sha256:0cd042c24377200c1dcd225f8b7b12b0ca53cc2c961b43757e774ebe190fd990", size = 37367, upload-time = "2026-03-09T20:51:29.451Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/f3/39cf3367b8107baa44f861dc802cbf16263c945b62d8265d36034fc07bea/cachetools-7.0.5-py3-none-any.whl", hash = "sha256:46bc8ebefbe485407621d0a4264b23c080cedd913921bad7ac3ed2f26c183114", size = 13918, upload-time = "2026-03-09T20:51:27.33Z" }, +] + +[[package]] +name = "caio" +version = "0.9.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/92/88/b8527e1b00c1811db339a1df8bd1ae49d146fcea9d6a5c40e3a80aaeb38d/caio-0.9.25.tar.gz", hash = "sha256:16498e7f81d1d0f5a4c0ad3f2540e65fe25691376e0a5bd367f558067113ed10", size = 26781, upload-time = "2025-12-26T15:21:36.501Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/31/57/5e6ff127e6f62c9f15d989560435c642144aa4210882f9494204bc892305/caio-0.9.25-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d6c2a3411af97762a2b03840c3cec2f7f728921ff8adda53d7ea2315a8563451", size = 36979, upload-time = "2025-12-26T15:21:35.484Z" }, + { url = "https://files.pythonhosted.org/packages/a3/9f/f21af50e72117eb528c422d4276cbac11fb941b1b812b182e0a9c70d19c5/caio-0.9.25-cp313-cp313-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0998210a4d5cd5cb565b32ccfe4e53d67303f868a76f212e002a8554692870e6", size = 81900, upload-time = "2025-12-26T15:22:21.919Z" }, + { url = "https://files.pythonhosted.org/packages/9c/12/c39ae2a4037cb10ad5eb3578eb4d5f8c1a2575c62bba675f3406b7ef0824/caio-0.9.25-cp313-cp313-manylinux_2_34_aarch64.whl", hash = "sha256:1a177d4777141b96f175fe2c37a3d96dec7911ed9ad5f02bac38aaa1c936611f", size = 81523, upload-time = "2026-03-04T22:08:25.187Z" }, + { url = "https://files.pythonhosted.org/packages/22/59/f8f2e950eb4f1a5a3883e198dca514b9d475415cb6cd7b78b9213a0dd45a/caio-0.9.25-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:9ed3cfb28c0e99fec5e208c934e5c157d0866aa9c32aa4dc5e9b6034af6286b7", size = 80243, upload-time = "2026-03-04T22:08:26.449Z" }, + { url = "https://files.pythonhosted.org/packages/86/93/1f76c8d1bafe3b0614e06b2195784a3765bbf7b0a067661af9e2dd47fc33/caio-0.9.25-py3-none-any.whl", hash = "sha256:06c0bb02d6b929119b1cfbe1ca403c768b2013a369e2db46bfa2a5761cf82e40", size = 19087, upload-time = "2025-12-26T15:22:00.221Z" }, +] + [[package]] name = "cattrs" version = "25.3.0" @@ -275,6 +330,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, ] +[[package]] +name = 
"cyclopts" +version = "4.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "docstring-parser" }, + { name = "rich" }, + { name = "rich-rst" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2c/e7/3e26855c046ac527cf94d890f6698e703980337f22ea7097e02b35b910f9/cyclopts-4.10.0.tar.gz", hash = "sha256:0ae04a53274e200ef3477c8b54de63b019bc6cd0162d75c718bf40c9c3fb5268", size = 166394, upload-time = "2026-03-14T14:09:31.043Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/06/d68a5d5d292c2ad2bc6a02e5ca2cb1bb9c15e941ab02f004a06a342d7f0f/cyclopts-4.10.0-py3-none-any.whl", hash = "sha256:50f333382a60df8d40ec14aa2e627316b361c4f478598ada1f4169d959bf9ea7", size = 204097, upload-time = "2026-03-14T14:09:32.504Z" }, +] + [[package]] name = "deprecated" version = "1.3.1" @@ -327,6 +397,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/56/7b/af3d0da15bed3a8665419bb3a630585756920f4ad67abfdfef26240ebcc0/docstring_to_markdown-0.17-py3-none-any.whl", hash = "sha256:fd7d5094aa83943bf5f9e1a13701866b7c452eac19765380dead666e36d3711c", size = 23479, upload-time = "2025-05-02T15:09:06.676Z" }, ] +[[package]] +name = "docutils" +version = "0.22.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" }, +] + [[package]] name = "email-validator" version = "2.3.0" @@ -340,9 +419,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/de/15/545e2b6cf2e3be84bc1ed85613edd75b8aea69807a71c26f4ca6a9258e82/email_validator-2.3.0-py3-none-any.whl", hash = "sha256:80f13f623413e6b197ae73bb10bf4eb0908faf509ad8362c5edeb0be7fd450b4", size = 35604, upload-time = "2025-08-26T13:09:05.858Z" }, ] +[[package]] +name = "exceptiongroup" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, +] + [[package]] name = "fast-agent-mcp" -version = "0.4.53" +version = "0.6.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "a2a-sdk" }, @@ -352,6 +440,7 @@ dependencies = [ { name = "deprecated" }, { name = "email-validator" }, { name = "fastapi" }, + { name = "fastmcp" }, { name = "google-genai" }, { name = "keyring" }, { name = "mcp" }, @@ -376,9 +465,9 @@ dependencies = [ { name = "uvloop", marker = "sys_platform != 'win32'" }, { name = "watchfiles" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/89/48/027760d3e271299ad71b4baef77f0edb509fcf1ad1e0b6e38367fabf622f/fast_agent_mcp-0.4.53.tar.gz", hash = "sha256:bada3c4ec8be873e2b0fa844524df9da0c0492ca67270ec2b826e7e319f95dda", size = 1688537, upload-time = "2026-02-15T23:09:31.809Z" } +sdist = { url = "https://files.pythonhosted.org/packages/de/b4/abee6994b9d72a6b8763b0f1ac6273a54e6529e26277146df88d58366754/fast_agent_mcp-0.6.7.tar.gz", hash = 
"sha256:307148c04c3a8817a46e873137d7bd36f03feff53b90e3ec98c8798d6ccb49f8", size = 1992547, upload-time = "2026-03-22T20:50:48.563Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/3b/c385a276521033ce1dec729feb9b7760a7d6f7ff15641e51c51b6d27301d/fast_agent_mcp-0.4.53-py3-none-any.whl", hash = "sha256:9dac6fe59e552b3ba56d19e225bbc59a9e3ec20ac7b8cfe1760c62cb54384a23", size = 1130674, upload-time = "2026-02-15T23:09:26.314Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/a99a6859316172cbbb419ddc7923a450c7f36dd1f8a121d0bbc8e47932c9/fast_agent_mcp-0.6.7-py3-none-any.whl", hash = "sha256:c59ac2f24c677fa3966da0662b9387f64aaf491d76d617fb697563783ed239b3", size = 1461755, upload-time = "2026-03-22T20:50:46.885Z" }, ] [[package]] @@ -396,6 +485,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/05/5cbb59154b093548acd0f4c7c474a118eda06da25aa75c616b72d8fcd92a/fastapi-0.128.0-py3-none-any.whl", hash = "sha256:aebd93f9716ee3b4f4fcfe13ffb7cf308d99c9f3ab5622d8877441072561582d", size = 103094, upload-time = "2025-12-27T15:21:12.154Z" }, ] +[[package]] +name = "fastmcp" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "authlib" }, + { name = "cyclopts" }, + { name = "exceptiongroup" }, + { name = "httpx" }, + { name = "jsonref" }, + { name = "jsonschema-path" }, + { name = "mcp" }, + { name = "openapi-pydantic" }, + { name = "opentelemetry-api" }, + { name = "packaging" }, + { name = "platformdirs" }, + { name = "py-key-value-aio", extra = ["filetree", "keyring", "memory"] }, + { name = "pydantic", extra = ["email"] }, + { name = "pyperclip" }, + { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "rich" }, + { name = "uncalled-for" }, + { name = "uvicorn" }, + { name = "watchfiles" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/83/c95d3bf717698a693eccb43e137a32939d2549876e884e246028bff6ecce/fastmcp-3.1.1.tar.gz", hash = 
"sha256:db184b5391a31199323766a3abf3a8bfbb8010479f77eca84c0e554f18655c48", size = 17347644, upload-time = "2026-03-14T19:12:20.235Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/70/ea/570122de7e24f72138d006f799768e14cc1ccf7fcb22b7750b2bd276c711/fastmcp-3.1.1-py3-none-any.whl", hash = "sha256:8132ba069d89f14566b3266919d6d72e2ec23dd45d8944622dca407e9beda7eb", size = 633754, upload-time = "2026-03-14T19:12:22.736Z" }, +] + [[package]] name = "frozenlist" version = "1.8.0" @@ -473,7 +594,7 @@ requests = [ [[package]] name = "google-genai" -version = "1.60.0" +version = "1.68.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -487,9 +608,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0a/3f/a753be0dcee352b7d63bc6d1ba14a72591d63b6391dac0cdff7ac168c530/google_genai-1.60.0.tar.gz", hash = "sha256:9768061775fddfaecfefb0d6d7a6cabefb3952ebd246cd5f65247151c07d33d1", size = 487721, upload-time = "2026-01-21T22:17:30.398Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/2c/f059982dbcb658cc535c81bbcbe7e2c040d675f4b563b03cdb01018a4bc3/google_genai-1.68.0.tar.gz", hash = "sha256:ac30c0b8bc630f9372993a97e4a11dae0e36f2e10d7c55eacdca95a9fa14ca96", size = 511285, upload-time = "2026-03-18T01:03:18.243Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/31/e5/384b1f383917b5f0ae92e28f47bc27b16e3d26cd9bacb25e9f8ecab3c8fe/google_genai-1.60.0-py3-none-any.whl", hash = "sha256:967338378ffecebec19a8ed90cf8797b26818bacbefd7846a9280beb1099f7f3", size = 719431, upload-time = "2026-01-21T22:17:28.086Z" }, + { url = "https://files.pythonhosted.org/packages/84/de/7d3ee9c94b74c3578ea4f88d45e8de9405902f857932334d81e89bce3dfa/google_genai-1.68.0-py3-none-any.whl", hash = "sha256:a1bc9919c0e2ea2907d1e319b65471d3d6d58c54822039a249fe1323e4178d15", size = 750912, upload-time = "2026-03-18T01:03:15.983Z" }, ] [[package]] @@ 
-689,6 +810,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/32/63cb1d9f1c5c6632a783c0052cde9ef7ba82688f7065e2f0d5f10a7e3edb/jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3", size = 185628, upload-time = "2025-11-09T20:48:09.572Z" }, ] +[[package]] +name = "jsonref" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" }, +] + [[package]] name = "jsonschema" version = "4.26.0" @@ -704,6 +834,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, ] +[[package]] +name = "jsonschema-path" +version = "0.4.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pathable" }, + { name = "pyyaml" }, + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/8a/7e6102f2b8bdc6705a9eb5294f8f6f9ccd3a8420e8e8e19671d1dd773251/jsonschema_path-0.4.5.tar.gz", hash = "sha256:c6cd7d577ae290c7defd4f4029e86fdb248ca1bd41a07557795b3c95e5144918", size = 15113, upload-time = "2026-03-03T09:56:46.87Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/04/d5/4e96c44f6c1ea3d812cf5391d81a4f5abaa540abf8d04ecd7f66e0ed11df/jsonschema_path-0.4.5-py3-none-any.whl", hash = "sha256:7d77a2c3f3ec569a40efe5c5f942c44c1af2a6f96fe0866794c9ef5b8f87fd65", size = 19368, upload-time = "2026-03-03T09:56:45.39Z" }, +] + [[package]] name = "jsonschema-specifications" version = "2025.9.1" @@ -863,7 +1007,7 @@ wheels = [ [[package]] name = "openai" -version = "2.21.0" +version = "2.29.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -875,9 +1019,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/92/e5/3d197a0947a166649f566706d7a4c8f7fe38f1fa7b24c9bcffe4c7591d44/openai-2.21.0.tar.gz", hash = "sha256:81b48ce4b8bbb2cc3af02047ceb19561f7b1dc0d4e52d1de7f02abfd15aa59b7", size = 644374, upload-time = "2026-02-14T00:12:01.577Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/15/203d537e58986b5673e7f232453a2a2f110f22757b15921cbdeea392e520/openai-2.29.0.tar.gz", hash = "sha256:32d09eb2f661b38d3edd7d7e1a2943d1633f572596febe64c0cd370c86d52bec", size = 671128, upload-time = "2026-03-17T17:53:49.599Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/56/0a89092a453bb2c676d66abee44f863e742b2110d4dbb1dbcca3f7e5fc33/openai-2.21.0-py3-none-any.whl", hash = "sha256:0bc1c775e5b1536c294eded39ee08f8407656537ccc71b1004104fe1602e267c", size = 1103065, upload-time = "2026-02-14T00:11:59.603Z" }, + { url = "https://files.pythonhosted.org/packages/d0/b1/35b6f9c8cf9318e3dbb7146cc82dab4cf61182a8d5406fc9b50864362895/openai-2.29.0-py3-none-any.whl", hash = "sha256:b7c5de513c3286d17c5e29b92c4c98ceaf0d775244ac8159aeb1bddf840eb42a", size = 1141533, upload-time = "2026-03-17T17:53:47.348Z" }, ] [package.optional-dependencies] @@ -886,6 +1030,18 @@ aiohttp = [ { name = "httpx-aiohttp" }, ] +[[package]] +name = "openapi-pydantic" +version = "0.5.1" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/2e/58d83848dd1a79cb92ed8e63f6ba901ca282c5f09d04af9423ec26c56fd7/openapi_pydantic-0.5.1.tar.gz", hash = "sha256:ff6835af6bde7a459fb93eb93bb92b8749b754fc6e51b2f1590a19dc3005ee0d", size = 60892, upload-time = "2025-01-08T19:29:27.083Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/cf/03675d8bd8ecbf4445504d8071adab19f5f993676795708e36402ab38263/openapi_pydantic-0.5.1-py3-none-any.whl", hash = "sha256:a3a09ef4586f5bd760a8df7f43028b60cafb6d9f61de2acba9574766255ab146", size = 96381, upload-time = "2025-01-08T19:29:25.275Z" }, +] + [[package]] name = "opentelemetry-api" version = "1.39.1" @@ -1098,6 +1254,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" }, ] +[[package]] +name = "pathable" +version = "0.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/55/b748445cb4ea6b125626f15379be7c96d1035d4fa3e8fee362fa92298abf/pathable-0.5.0.tar.gz", hash = "sha256:d81938348a1cacb525e7c75166270644782c0fb9c8cecc16be033e71427e0ef1", size = 16655, upload-time = "2026-02-20T08:47:00.748Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/96/5a770e5c461462575474468e5af931cff9de036e7c2b4fea23c1c58d2cbe/pathable-0.5.0-py3-none-any.whl", hash = "sha256:646e3d09491a6351a0c82632a09c02cdf70a252e73196b36d8a15ba0a114f0a6", size = 16867, upload-time = "2026-02-20T08:46:59.536Z" }, +] + +[[package]] +name = "platformdirs" +version = "4.9.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/19/56/8d4c30c8a1d07013911a8fdbd8f89440ef9f08d07a1b50ab8ca8be5a20f9/platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934", size = 28737, upload-time = "2026-03-05T18:34:13.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -1207,6 +1381,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3e/73/2ce007f4198c80fcf2cb24c169884f833fe93fbc03d55d302627b094ee91/psutil-7.2.1-cp37-abi3-win_arm64.whl", hash = "sha256:0d67c1822c355aa6f7314d92018fb4268a76668a536f133599b91edd48759442", size = 133836, upload-time = "2025-12-29T08:26:43.086Z" }, ] +[[package]] +name = "py-key-value-aio" +version = "0.4.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beartype" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/3c/0397c072a38d4bc580994b42e0c90c5f44f679303489e4376289534735e5/py_key_value_aio-0.4.4.tar.gz", hash = "sha256:e3012e6243ed7cc09bb05457bd4d03b1ba5c2b1ca8700096b3927db79ffbbe55", size = 92300, upload-time = "2026-02-16T21:21:43.245Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/69/f1b537ee70b7def42d63124a539ed3026a11a3ffc3086947a1ca6e861868/py_key_value_aio-0.4.4-py3-none-any.whl", hash = "sha256:18e17564ecae61b987f909fc2cd41ee2012c84b4b1dcb8c055cf8b4bc1bf3f5d", size = 152291, upload-time = "2026-02-16T21:21:44.241Z" }, +] + +[package.optional-dependencies] +filetree = [ + { name = "aiofile" }, + { name = "anyio" }, +] +keyring = [ + { name = "keyring" }, +] +memory = [ + { name = "cachetools" }, +] + [[package]] name = "pyasn1" version = "0.6.2" @@ 
-1252,6 +1451,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, ] +[package.optional-dependencies] +email = [ + { name = "email-validator" }, +] + [[package]] name = "pydantic-core" version = "2.41.5" @@ -1501,15 +1705,28 @@ wheels = [ [[package]] name = "rich" -version = "14.3.1" +version = "14.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a1/84/4831f881aa6ff3c976f6d6809b58cdfa350593ffc0dc3c58f5f6586780fb/rich-14.3.1.tar.gz", hash = "sha256:b8c5f568a3a749f9290ec6bddedf835cec33696bfc1e48bcfecb276c7386e4b8", size = 230125, upload-time = "2026-01-24T21:40:44.847Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/87/2a/a1810c8627b9ec8c57ec5ec325d306701ae7be50235e8fd81266e002a3cc/rich-14.3.1-py3-none-any.whl", hash = "sha256:da750b1aebbff0b372557426fb3f35ba56de8ef954b3190315eb64076d6fb54e", size = 309952, upload-time = "2026-01-24T21:40:42.969Z" }, + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, +] + +[[package]] +name = "rich-rst" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docutils" }, + { 
name = "rich" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bc/6d/a506aaa4a9eaa945ed8ab2b7347859f53593864289853c5d6d62b77246e0/rich_rst-1.3.2.tar.gz", hash = "sha256:a1196fdddf1e364b02ec68a05e8ff8f6914fee10fbca2e6b6735f166bb0da8d4", size = 14936, upload-time = "2025-10-14T16:49:45.332Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/13/2f/b4530fbf948867702d0a3f27de4a6aab1d156f406d72852ab902c4d04de9/rich_rst-1.3.2-py3-none-any.whl", hash = "sha256:a99b4907cbe118cf9d18b0b44de272efa61f15117c61e39ebdc431baf5df722a", size = 12567, upload-time = "2025-10-14T16:49:42.953Z" }, ] [[package]] @@ -1699,19 +1916,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "ty" +version = "0.0.24" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7a/96/652a425030f95dc2c9548d9019e52502e17079e1daeefbc4036f1c0905b4/ty-0.0.24.tar.gz", hash = "sha256:9fe42f6b98207bdaef51f71487d6d087f2cb02555ee3939884d779b2b3cc8bfc", size = 5354286, upload-time = "2026-03-19T16:55:57.035Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/e5/34457ee11708e734ba81ad65723af83030e484f961e281d57d1eecf08951/ty-0.0.24-py3-none-linux_armv6l.whl", hash = "sha256:1ab4f1f61334d533a3fdf5d9772b51b1300ac5da4f3cdb0be9657a3ccb2ce3e7", size = 10394877, upload-time = "2026-03-19T16:55:54.246Z" }, + { url = "https://files.pythonhosted.org/packages/44/81/bc9a1b1a87f43db15ab64ad781a4f999734ec3b470ad042624fa875b20e6/ty-0.0.24-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:facbf2c4aaa6985229e08f8f9bf152215eb078212f22b5c2411f35386688ab42", size = 10211109, upload-time = "2026-03-19T16:55:28.554Z" }, + { url = 
"https://files.pythonhosted.org/packages/e4/63/cfc805adeaa61d63ba3ea71127efa7d97c40ba36d97ee7bd957341d05107/ty-0.0.24-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b6d2a3b6d4470c483552a31e9b368c86f154dcc964bccb5406159dc9cd362246", size = 9694769, upload-time = "2026-03-19T16:55:34.309Z" }, + { url = "https://files.pythonhosted.org/packages/33/09/edc220726b6ec44a58900401f6b27140997ef15026b791e26b69a6e69eb5/ty-0.0.24-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c94c25d0500939fd5f8f16ce41cbed5b20528702c1d649bf80300253813f0a2", size = 10176287, upload-time = "2026-03-19T16:55:37.17Z" }, + { url = "https://files.pythonhosted.org/packages/f8/bf/cbe2227be711e65017655d8ee4d050f4c92b113fb4dc4c3bd6a19d3a86d8/ty-0.0.24-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:89cbe7bc7df0fab02dbd8cda79b737df83f1ef7fb573b08c0ee043dc68cffb08", size = 10214832, upload-time = "2026-03-19T16:56:08.518Z" }, + { url = "https://files.pythonhosted.org/packages/af/1d/d15803ee47e9143d10e10bd81ccc14761d08758082bda402950685f0ddfe/ty-0.0.24-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2c5d269bcc9b764850c99f457b5018a79b3ef40ecfbc03344e65effd6cf743", size = 10709892, upload-time = "2026-03-19T16:56:05.727Z" }, + { url = "https://files.pythonhosted.org/packages/36/12/6db0d86c477147f67b9052de209421d76c3e855197b000c25fcbbe86b3a2/ty-0.0.24-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba44512db5b97c3bbd59d93e11296e8548d0c9a3bdd1280de36d7ff22d351896", size = 11280872, upload-time = "2026-03-19T16:56:02.899Z" }, + { url = "https://files.pythonhosted.org/packages/1b/fc/155fe83a97c06d33ccc9e0f428258b32df2e08a428300c715d34757f0111/ty-0.0.24-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a52b7f589c3205512a9c50ba5b2b1e8c0698b72e51b8b9285c90420c06f1cae8", size = 11060520, upload-time = "2026-03-19T16:55:59.956Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/f1/32c05a1c4c3c2a95c5b7361dee03a9bf1231d4ad096b161c838b45bce5a0/ty-0.0.24-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7981df5c709c054da4ac5d7c93f8feb8f45e69e829e4461df4d5f0988fe67d04", size = 10791455, upload-time = "2026-03-19T16:55:25.728Z" }, + { url = "https://files.pythonhosted.org/packages/17/2c/53c1ea6bedfa4d4ab64d4de262d8f5e405ecbffefd364459c628c0310d33/ty-0.0.24-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b2860151ad95a00d0f0280b8fef79900d08dcd63276b57e6e5774f2c055979c5", size = 10156708, upload-time = "2026-03-19T16:55:45.563Z" }, + { url = "https://files.pythonhosted.org/packages/45/39/7d2919cf194707169474d80720a5f3d793e983416f25e7ffcf80504c9df2/ty-0.0.24-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:5674a1146d927ab77ff198a88e0c4505134ced342a0e7d1beb4a076a728b7496", size = 10236263, upload-time = "2026-03-19T16:55:31.474Z" }, + { url = "https://files.pythonhosted.org/packages/cf/7f/48eac722f2fd12a5b7aae0effdcb75c46053f94b783d989e3ef0d7380082/ty-0.0.24-py3-none-musllinux_1_2_i686.whl", hash = "sha256:438ecbf1608a9b16dd84502f3f1b23ef2ef32bbd0ab3e0ca5a82f0e0d1cd41ea", size = 10402559, upload-time = "2026-03-19T16:55:39.602Z" }, + { url = "https://files.pythonhosted.org/packages/75/e0/8cf868b9749ce1e5166462759545964e95b02353243594062b927d8bff2a/ty-0.0.24-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ddeed3098dd92a83964e7aa7b41e509ba3530eb539fc4cd8322ff64a09daf1f5", size = 10893684, upload-time = "2026-03-19T16:55:51.439Z" }, + { url = "https://files.pythonhosted.org/packages/17/9f/f54bf3be01d2c2ed731d10a5afa3324dc66f987a6ae0a4a6cbfa2323d080/ty-0.0.24-py3-none-win32.whl", hash = "sha256:83013fb3a4764a8f8bcc6ca11ff8bdfd8c5f719fc249241cb2b8916e80778eb1", size = 9781542, upload-time = "2026-03-19T16:56:11.588Z" }, + { url = "https://files.pythonhosted.org/packages/fb/49/c004c5cc258b10b3a145666e9a9c28ae7678bc958c8926e8078d5d769081/ty-0.0.24-py3-none-win_amd64.whl", hash = 
"sha256:748a60eb6912d1cf27aaab105ffadb6f4d2e458a3fcadfbd3cf26db0d8062eeb", size = 10764801, upload-time = "2026-03-19T16:55:42.752Z" }, + { url = "https://files.pythonhosted.org/packages/e2/59/006a074e185bfccf5e4c026015245ab4fcd2362b13a8d24cf37a277909a9/ty-0.0.24-py3-none-win_arm64.whl", hash = "sha256:280a3d31e86d0721947238f17030c33f0911cae851d108ea9f4e3ab12a5ed01f", size = 10194093, upload-time = "2026-03-19T16:55:48.303Z" }, +] + [[package]] name = "typer" -version = "0.21.1" +version = "0.24.1" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "annotated-doc" }, { name = "click" }, { name = "rich" }, { name = "shellingham" }, - { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/36/bf/8825b5929afd84d0dabd606c67cd57b8388cb3ec385f7ef19c5cc2202069/typer-0.21.1.tar.gz", hash = "sha256:ea835607cd752343b6b2b7ce676893e5a0324082268b48f27aa058bdb7d2145d", size = 110371, upload-time = "2026-01-06T11:21:10.989Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload-time = "2026-02-21T16:54:40.609Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl", hash = "sha256:7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01", size = 47381, upload-time = "2026-01-06T11:21:09.824Z" }, + { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" }, ] [[package]] @@ -1735,6 +1976,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, ] +[[package]] +name = "uncalled-for" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/7c/b5b7d8136f872e3f13b0584e576886de0489d7213a12de6bebf29ff6ebfc/uncalled_for-0.2.0.tar.gz", hash = "sha256:b4f8fdbcec328c5a113807d653e041c5094473dd4afa7c34599ace69ccb7e69f", size = 49488, upload-time = "2026-02-27T17:40:58.137Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/7f/4320d9ce3be404e6310b915c3629fe27bf1e2f438a1a7a3cb0396e32e9a9/uncalled_for-0.2.0-py3-none-any.whl", hash = "sha256:2c0bd338faff5f930918f79e7eb9ff48290df2cb05fcc0b40a7f334e55d4d85f", size = 11351, upload-time = "2026-02-27T17:40:56.804Z" }, +] + [[package]] name = "upskill" version = "0.2.1" @@ -1753,19 +2003,21 @@ dev = [ { name = "pytest" }, { name = "pytest-asyncio" }, { name = "ruff" }, + { name = "ty" }, ] [package.metadata] requires-dist = [ { name = "click", specifier = ">=8.1" }, - { name = "fast-agent-mcp", specifier = ">=0.4.53" }, + { name = "fast-agent-mcp", specifier = ">=0.6.7" }, { name = "pydantic", specifier = ">=2.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" }, { name = "python-dotenv", specifier = ">=1.0" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "rich", specifier = ">=13.0" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4" }, + { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.11" }, + { name = "ty", marker = "extra == 'dev'", specifier = ">=0.0.23" }, ] provides-extras = ["dev"]