diff --git a/Makefile b/Makefile index 2fddd3f..764c1b4 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ ifeq ($(GOBIN),) GOBIN := $(shell go env GOPATH)/bin endif -.PHONY: deps build install uninstall test unit vet harness-validate codex-app-eval codex-app-eval-suite codex-memory-deep-eval docker-build docker-run compose-up compose-down compose-dev release-snapshot clean help +.PHONY: deps build install uninstall test unit vet harness-validate codex-app-eval codex-app-eval-suite codex-memory-deep-eval codex-skill-deep-eval docker-build docker-run compose-up compose-down compose-dev release-snapshot clean help .DEFAULT_GOAL := help @@ -57,6 +57,9 @@ codex-app-eval-suite: ## Run real Codex app-server memory/skill scenario suite codex-memory-deep-eval: ## Run deep real Codex app-server memory regression suite python3 scripts/codex_app_server_eval.py --suite --suite-name memory-deep +codex-skill-deep-eval: ## Run deep real Codex app-server skill regression suite + python3 scripts/codex_app_server_eval.py --suite --suite-name skill-deep + # ── Containers / Deployment ────────────────────────────────────────── docker-build: ## Build runtime Docker image diff --git a/docs/harness/eval/CODEX_APP_SERVER.md b/docs/harness/eval/CODEX_APP_SERVER.md index 76545e0..e9d6a52 100644 --- a/docs/harness/eval/CODEX_APP_SERVER.md +++ b/docs/harness/eval/CODEX_APP_SERVER.md @@ -37,6 +37,16 @@ The deep memory suite adds noisy recall filtering, stale-memory supersession, uncertain-preference rejection, secret-like value rejection, and multi-turn continuity through persisted `MEMORY.md`. +For longer skill-loop regression, run: + +```bash +make codex-skill-deep-eval +``` + +The deep skill suite adds transient evidence skip, missing-skill evidence, +approved active skill creation, host-surface preservation, and proposal-first +curation checks, plus reviewable skill authoring drafts. 
+ To trigger a real Codex turn, opt in explicitly: ```bash diff --git a/docs/zh/harness/eval/CODEX_APP_SERVER.md b/docs/zh/harness/eval/CODEX_APP_SERVER.md index bf89656..57290b2 100644 --- a/docs/zh/harness/eval/CODEX_APP_SERVER.md +++ b/docs/zh/harness/eval/CODEX_APP_SERVER.md @@ -35,6 +35,16 @@ make codex-memory-deep-eval deep memory suite 会额外覆盖:带噪声的相关 recall、过期 memory 覆盖、 不确定偏好拒绝、疑似 secret 值拒绝,以及通过持久化 `MEMORY.md` 完成多轮连续性。 +更长的 skill loop 回归可以运行: + +```bash +make codex-skill-deep-eval +``` + +deep skill suite 会额外覆盖:跳过临时 evidence、记录 missing-skill evidence、 +执行已批准的 active skill 创建、保护 host skill surface,以及 proposal-first +curation 不直接激活 skill,并验证 reviewable skill draft 的 authoring。 + 如果需要触发真实 Codex turn,可以显式开启: ```bash diff --git a/harness/eval/README.md b/harness/eval/README.md index 86bc6f2..28c0a4f 100644 --- a/harness/eval/README.md +++ b/harness/eval/README.md @@ -32,6 +32,12 @@ Run the longer memory regression suite with: make codex-memory-deep-eval ``` +Run the longer skill-loop regression suite with: + +```bash +make codex-skill-deep-eval +``` + To run an actual Codex turn, use: ```bash @@ -72,3 +78,12 @@ The `memory-deep` suite extends memory coverage with: - rejecting uncertain preference changes - rejecting secret-like values and generic restatements of existing safety policy - multi-turn continuity through persisted `MEMORY.md` + +The `skill-deep` suite extends skill-loop coverage with: + +- skipping transient one-off workflow evidence +- recording missing-skill evidence as JSONL +- applying an explicitly approved active skill creation +- preserving the host skill surface during canonical skill changes +- producing proposal-first curation output without activating skills +- drafting reviewable skill content without activating it diff --git a/harness/hosts/claude-code/projector.sh b/harness/hosts/claude-code/projector.sh index 8dcd459..9d05d5c 100755 --- a/harness/hosts/claude-code/projector.sh +++ b/harness/hosts/claude-code/projector.sh @@ -261,7 
+261,7 @@ export MNEMON_SKILL_LOOP_USAGE_FILE="${CANONICAL_MODULE_DIR}/skills/.usage.jsonl export MNEMON_SKILL_LOOP_PROPOSALS_DIR="${CANONICAL_MODULE_DIR}/proposals" export MNEMON_SKILL_LOOP_HOST_SKILLS_DIR="${host_skills_dir}" export MNEMON_SKILL_LOOP_REVIEW_MIN_EVENTS="\${MNEMON_SKILL_LOOP_REVIEW_MIN_EVENTS:-20}" -export MNEMON_SKILL_LOOP_PROTECTED_SKILLS="\${MNEMON_SKILL_LOOP_PROTECTED_SKILLS:-skill_observe,skill_curate,skill_manage,memory_get,memory_set}" +export MNEMON_SKILL_LOOP_PROTECTED_SKILLS="\${MNEMON_SKILL_LOOP_PROTECTED_SKILLS:-skill_observe,skill_curate,skill_author,skill_manage,memory_get,memory_set}" EOF chmod 0755 "${CONFIG_DIR}/mnemon-skill-loop/env.sh" } @@ -322,6 +322,7 @@ install_skill_loop() { "${CANONICAL_MODULE_DIR}/reports" \ "${HOST_SKILLS_DIR}/skill_observe" \ "${HOST_SKILLS_DIR}/skill_curate" \ + "${HOST_SKILLS_DIR}/skill_author" \ "${HOST_SKILLS_DIR}/skill_manage" \ "${CONFIG_DIR}/agents" \ "${CONFIG_DIR}/hooks/mnemon-skill-loop" @@ -329,6 +330,7 @@ install_skill_loop() { install_file "${MODULE_DIR}/skills/skill_observe.md" "${HOST_SKILLS_DIR}/skill_observe/SKILL.md" 0644 install_file "${MODULE_DIR}/skills/skill_curate.md" "${HOST_SKILLS_DIR}/skill_curate/SKILL.md" 0644 + install_file "${MODULE_DIR}/skills/skill_author.md" "${HOST_SKILLS_DIR}/skill_author/SKILL.md" 0644 install_file "${MODULE_DIR}/skills/skill_manage.md" "${HOST_SKILLS_DIR}/skill_manage/SKILL.md" 0644 install_file "${MODULE_DIR}/subagents/curator.md" "${CONFIG_DIR}/agents/mnemon-skill-curator.md" 0644 @@ -398,6 +400,7 @@ uninstall_skill_loop() { rm -rf "${CONFIG_DIR}/hooks/mnemon-skill-loop" rm -rf "${host_skills_dir}/skill_observe" rm -rf "${host_skills_dir}/skill_curate" + rm -rf "${host_skills_dir}/skill_author" rm -rf "${host_skills_dir}/skill_manage" rm -f "${CONFIG_DIR}/agents/mnemon-skill-curator.md" rm -rf "${CONFIG_DIR}/mnemon-skill-loop" diff --git a/harness/hosts/codex/projector.sh b/harness/hosts/codex/projector.sh index 132d24b..20eba51 100755 --- 
a/harness/hosts/codex/projector.sh +++ b/harness/hosts/codex/projector.sh @@ -289,6 +289,7 @@ install_skill_loop() { "${CANONICAL_MODULE_DIR}/reports" \ "${HOST_SKILLS_DIR}/skill_observe" \ "${HOST_SKILLS_DIR}/skill_curate" \ + "${HOST_SKILLS_DIR}/skill_author" \ "${HOST_SKILLS_DIR}/skill_manage" \ "${CONFIG_DIR}/mnemon-skill-loop" write_runtime_env "${CONFIG_DIR}/mnemon-skill-loop" "MNEMON_SKILL_LOOP_ENV" "MNEMON_SKILL_LOOP_DIR" @@ -301,13 +302,16 @@ export MNEMON_SKILL_LOOP_ARCHIVED_DIR="${CANONICAL_MODULE_DIR}/skills/archived" export MNEMON_SKILL_LOOP_USAGE_FILE="${CANONICAL_MODULE_DIR}/skills/.usage.jsonl" export MNEMON_SKILL_LOOP_PROPOSALS_DIR="${CANONICAL_MODULE_DIR}/proposals" export MNEMON_SKILL_LOOP_HOST_SKILLS_DIR="${HOST_SKILLS_DIR}" +export MNEMON_SKILL_LOOP_PROTECTED_SKILLS="\${MNEMON_SKILL_LOOP_PROTECTED_SKILLS:-skill_observe,skill_curate,skill_author,skill_manage,memory_get,memory_set}" EOF install_file "${MODULE_DIR}/skills/skill_observe.md" "${HOST_SKILLS_DIR}/skill_observe/SKILL.md" 0644 install_file "${MODULE_DIR}/skills/skill_curate.md" "${HOST_SKILLS_DIR}/skill_curate/SKILL.md" 0644 + install_file "${MODULE_DIR}/skills/skill_author.md" "${HOST_SKILLS_DIR}/skill_author/SKILL.md" 0644 install_file "${MODULE_DIR}/skills/skill_manage.md" "${HOST_SKILLS_DIR}/skill_manage/SKILL.md" 0644 append_codex_runtime_note "${HOST_SKILLS_DIR}/skill_observe/SKILL.md" "MNEMON_SKILL_LOOP_DIR" "${CONFIG_DIR}/mnemon-skill-loop/env.sh" append_codex_runtime_note "${HOST_SKILLS_DIR}/skill_curate/SKILL.md" "MNEMON_SKILL_LOOP_DIR" "${CONFIG_DIR}/mnemon-skill-loop/env.sh" + append_codex_runtime_note "${HOST_SKILLS_DIR}/skill_author/SKILL.md" "MNEMON_SKILL_LOOP_DIR" "${CONFIG_DIR}/mnemon-skill-loop/env.sh" append_codex_runtime_note "${HOST_SKILLS_DIR}/skill_manage/SKILL.md" "MNEMON_SKILL_LOOP_DIR" "${CONFIG_DIR}/mnemon-skill-loop/env.sh" write_host_manifest "${CONFIG_DIR}" @@ -356,6 +360,7 @@ uninstall_skill_loop() { local
host_skills_dir="${MNEMON_SKILL_LOOP_HOST_SKILLS_DIR:-${HOST_SKILLS_DIR:-${CONFIG_DIR}/skills}}" rm -rf "${host_skills_dir}/skill_observe" rm -rf "${host_skills_dir}/skill_curate" + rm -rf "${host_skills_dir}/skill_author" rm -rf "${host_skills_dir}/skill_manage" rm -rf "${CONFIG_DIR}/mnemon-skill-loop" if [[ "${PURGE_LIBRARY}" == "1" ]]; then diff --git a/harness/modules/skill-loop/GUIDE.md b/harness/modules/skill-loop/GUIDE.md index 1e2d713..861d0be 100644 --- a/harness/modules/skill-loop/GUIDE.md +++ b/harness/modules/skill-loop/GUIDE.md @@ -22,7 +22,9 @@ Record evidence when a session shows one of these signals: - a skill should be protected, pinned, restored, staled, or archived Skip evidence for one-off commands, transient progress, raw chat logs, secrets, -or facts better stored as memory. +or facts better stored as memory. Do not record evidence merely because a +single command succeeded or because the current prompt mentions the skill loop; +there must be a reusable workflow or lifecycle signal. ## Lifecycle diff --git a/harness/modules/skill-loop/README.md b/harness/modules/skill-loop/README.md index afa63e0..c02e21c 100644 --- a/harness/modules/skill-loop/README.md +++ b/harness/modules/skill-loop/README.md @@ -20,6 +20,7 @@ harness/modules/skill-loop/ ├── skills/ │ ├── skill_observe.md │ ├── skill_curate.md +│ ├── skill_author.md │ └── skill_manage.md ├── subagents/ │ └── curator.md @@ -43,6 +44,7 @@ harness/modules/skill-loop/ | `hooks/*.md` | Four lifecycle reminders. Prime syncs active skills; Nudge records evidence; Compact may trigger review; Remind is no-op by default. | | `skills/skill_observe.md` | Online evidence capture protocol. | | `skills/skill_curate.md` | Protocol for starting a curator review. | +| `skills/skill_author.md` | Protocol for drafting reviewable `SKILL.md` content. | | `skills/skill_manage.md` | Approved lifecycle mutation protocol. 
| | `subagents/curator.md` | Background reviewer that proposes create, patch, consolidate, stale, archive, or restore actions. | | Host adapter | Host-specific projection lives outside the module under `harness/hosts//`. | @@ -90,6 +92,7 @@ The key split is: GUIDE.md decides when skill evolution behavior is useful. skill_observe.md records evidence only. curator.md reviews evidence and proposes changes. +skill_author.md drafts skill content for review. skill_manage.md applies approved changes to canonical state. prime.sh projects active canonical skills into the host skill surface. ``` diff --git a/harness/modules/skill-loop/env.sh b/harness/modules/skill-loop/env.sh index 575ca5a..9276662 100644 --- a/harness/modules/skill-loop/env.sh +++ b/harness/modules/skill-loop/env.sh @@ -21,4 +21,4 @@ export MNEMON_SKILL_LOOP_USAGE_FILE="${MNEMON_SKILL_LOOP_USAGE_FILE:-${MNEMON_SK export MNEMON_SKILL_LOOP_PROPOSALS_DIR="${MNEMON_SKILL_LOOP_PROPOSALS_DIR:-${MNEMON_SKILL_LOOP_DIR}/proposals}" export MNEMON_SKILL_LOOP_HOST_SKILLS_DIR="${MNEMON_SKILL_LOOP_HOST_SKILLS_DIR:-${MNEMON_SKILL_LOOP_CONFIG_DIR}/skills}" export MNEMON_SKILL_LOOP_REVIEW_MIN_EVENTS="${MNEMON_SKILL_LOOP_REVIEW_MIN_EVENTS:-20}" -export MNEMON_SKILL_LOOP_PROTECTED_SKILLS="${MNEMON_SKILL_LOOP_PROTECTED_SKILLS:-skill_observe,skill_curate,skill_manage,memory_get,memory_set}" +export MNEMON_SKILL_LOOP_PROTECTED_SKILLS="${MNEMON_SKILL_LOOP_PROTECTED_SKILLS:-skill_observe,skill_curate,skill_author,skill_manage,memory_get,memory_set}" diff --git a/harness/modules/skill-loop/module.json b/harness/modules/skill-loop/module.json index 31141f9..537fdaf 100644 --- a/harness/modules/skill-loop/module.json +++ b/harness/modules/skill-loop/module.json @@ -21,6 +21,7 @@ "skills": [ "skills/skill_observe.md", "skills/skill_curate.md", + "skills/skill_author.md", "skills/skill_manage.md" ], "subagents": [ diff --git a/harness/modules/skill-loop/skills/skill_author.md b/harness/modules/skill-loop/skills/skill_author.md new 
file mode 100644 index 0000000..04cb667 --- /dev/null +++ b/harness/modules/skill-loop/skills/skill_author.md @@ -0,0 +1,56 @@ +--- +name: skill_author +description: Draft or revise high-quality SKILL.md content for approved or proposed Mnemon skill-loop changes. +--- + +# skill_author + +Use this skill when a curator proposal, user request, or approved lifecycle +change needs a concrete `SKILL.md` draft. + +## Boundary + +This skill authors skill content only. It does not decide lifecycle placement +and does not activate, stale, archive, restore, or delete skills. + +Write drafts under: + +```text +$MNEMON_SKILL_LOOP_PROPOSALS_DIR +``` + +Approved lifecycle placement is applied later with `skill_manage.md`. + +## Procedure + +1. Confirm the target skill id is hyphen-case: lowercase letters, numbers, and + `-`. +2. Confirm the skill captures a reusable procedure, not project facts, + preferences, credentials, raw transcripts, or one-off task context. +3. Draft a complete `SKILL.md` with: + - YAML frontmatter containing `name` and `description` + - a short trigger-oriented description + - a clear boundary section + - a concise procedure section + - safety or validation notes only when they change behavior +4. Keep the skill focused. Prefer one workflow per skill. +5. Use project-neutral language. Do not embed current branch names, temporary + tokens, credentials, private URLs, or task-specific facts. +6. Save the draft as a proposal artifact such as: + +```text +$MNEMON_SKILL_LOOP_PROPOSALS_DIR/<skill-id>.SKILL.md +``` + +7. Leave `skills/active`, `skills/stale`, `skills/archived`, and host skill + surfaces unchanged unless the user explicitly asks to use `skill_manage.md` + after approval. + +## Quality Checklist + +- The description tells the host when to use the skill. +- The body teaches reusable judgment or procedure the model would not reliably + infer from the current task alone. +- The content is short enough to load on demand. 
+- The skill avoids duplicated policy already covered by `GUIDE.md`. +- The draft is safe to review before activation. diff --git a/harness/modules/skill-loop/skills/skill_curate.md b/harness/modules/skill-loop/skills/skill_curate.md index 2222faf..04b43ac 100644 --- a/harness/modules/skill-loop/skills/skill_curate.md +++ b/harness/modules/skill-loop/skills/skill_curate.md @@ -29,7 +29,9 @@ It does not directly apply lifecycle changes. Approved changes are applied with - `.usage.jsonl` - existing proposals 3. Request proposals for create, patch, consolidate, stale, archive, or restore - actions only when evidence supports them. + actions only when evidence supports them. When a proposal needs concrete + skill content, use `skill_author.md` to draft reviewable `SKILL.md` content + under the proposals directory. 4. Keep the output proposal-first. Do not enable a new active skill in the current session unless the user explicitly approves and the host supports it. diff --git a/harness/modules/skill-loop/skills/skill_manage.md b/harness/modules/skill-loop/skills/skill_manage.md index 9079e30..73af16e 100644 --- a/harness/modules/skill-loop/skills/skill_manage.md +++ b/harness/modules/skill-loop/skills/skill_manage.md @@ -25,6 +25,7 @@ $MNEMON_SKILL_LOOP_ARCHIVED_DIR ## Allowed MVP Operations - create an approved skill under `active//SKILL.md` +- apply approved `SKILL.md` content drafted by `skill_author.md` - patch an existing skill in its current lifecycle directory - consolidate duplicated skills with an approved replacement - move `active -> stale` @@ -38,7 +39,8 @@ $MNEMON_SKILL_LOOP_ARCHIVED_DIR 1. Read the approved proposal and confirm the intended operation. 2. Check `MNEMON_SKILL_LOOP_PROTECTED_SKILLS`; do not modify protected skills unless the approval explicitly covers the exception. -3. Keep skill ids filesystem-safe: lowercase letters, numbers, `_`, and `-`. +3. Keep new user-facing skill ids hyphen-case: lowercase letters, numbers, and + `-`. 
Existing protocol skill ids may keep their established underscore names. 4. Apply the smallest canonical change under the lifecycle directories. 5. Prefer moving to `archived` over deletion. 6. Do not edit the host skill surface directly. Let Prime regenerate it. diff --git a/harness/modules/skill-loop/subagents/curator.md b/harness/modules/skill-loop/subagents/curator.md index 2adf3b7..fa7dd5e 100644 --- a/harness/modules/skill-loop/subagents/curator.md +++ b/harness/modules/skill-loop/subagents/curator.md @@ -4,6 +4,7 @@ description: Reviews Mnemon skill evidence and proposes skill lifecycle changes. tools: Read, Write, Edit, Bash, Grep, Glob skills: - skill_observe + - skill_author - skill_manage --- @@ -44,7 +45,8 @@ Run curator review when: 2. Inspect active, stale, and archived skills. 3. Review usage evidence and existing proposals. 4. Identify only evidence-backed opportunities: - - create a skill for a repeated workflow + - create a skill for a repeated workflow, using `skill_author` for draft + `SKILL.md` content when useful - patch a misleading, outdated, or incomplete skill - consolidate duplicated skills - move low-value active skills to stale diff --git a/scripts/codex_app_server_eval.py b/scripts/codex_app_server_eval.py index 8dcfad7..beaad33 100755 --- a/scripts/codex_app_server_eval.py +++ b/scripts/codex_app_server_eval.py @@ -301,6 +301,9 @@ def __init__( self.assert_result = assert_result +SKILL_LOOP_EXPECTED_SKILLS = ["skill_observe", "skill_curate", "skill_author", "skill_manage"] + + def setup_none(workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> None: del workspace, mnemon_dir, env @@ -513,7 +516,7 @@ def assert_memory_multiturn(report: dict[str, Any], workspace: Path, mnemon_dir: def assert_skill_observe(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: del report, workspace, env - usage_file = mnemon_dir / "harness" / "skill-loop" / "skills" / ".usage.jsonl" + usage_file = 
skill_usage_path(mnemon_dir) content = usage_file.read_text(encoding="utf-8") if usage_file.exists() else "" return [ {"name": "skill usage log exists", "passed": usage_file.exists(), "path": str(usage_file)}, @@ -521,6 +524,195 @@ def assert_skill_observe(report: dict[str, Any], workspace: Path, mnemon_dir: Pa ] +def skill_loop_path(mnemon_dir: Path) -> Path: + return mnemon_dir / "harness" / "skill-loop" + + +def skill_usage_path(mnemon_dir: Path) -> Path: + return skill_loop_path(mnemon_dir) / "skills" / ".usage.jsonl" + + +def skill_active_path(mnemon_dir: Path, skill_id: str) -> Path: + return skill_loop_path(mnemon_dir) / "skills" / "active" / skill_id / "SKILL.md" + + +def skill_stale_path(mnemon_dir: Path, skill_id: str) -> Path: + return skill_loop_path(mnemon_dir) / "skills" / "stale" / skill_id / "SKILL.md" + + +def skill_archived_path(mnemon_dir: Path, skill_id: str) -> Path: + return skill_loop_path(mnemon_dir) / "skills" / "archived" / skill_id / "SKILL.md" + + +def skill_proposals_dir(mnemon_dir: Path) -> Path: + return skill_loop_path(mnemon_dir) / "proposals" + + +def write_skill(path: Path, skill_id: str, description: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + "---\n" + f"name: {skill_id}\n" + f"description: {description}\n" + "---\n\n" + f"# {skill_id}\n\n" + "Use this skill for lifecycle eval fixtures.\n", + encoding="utf-8", + ) + + +def append_skill_usage(mnemon_dir: Path, item: dict[str, Any]) -> None: + path = skill_usage_path(mnemon_dir) + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(item, sort_keys=True) + "\n") + + +def setup_skill_curate_evidence(workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> None: + del workspace, env + for index, event in enumerate(["missing", "workflow", "feedback"], start=1): + append_skill_usage( + mnemon_dir, + { + "time": f"2026-05-15T00:0{index}:00Z", + "skill": None, + "event": 
event, + "outcome": "negative" if event == "missing" else "neutral", + "note": "Release handoff checklist workflow repeated across eval, docs, and push tasks.", + "source": "agent", + }, + ) + + +def setup_skill_active_release(workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> None: + del workspace, env + write_skill(skill_active_path(mnemon_dir, "release-checklist"), "release-checklist", "Release handoff checklist fixture.") + + +def setup_skill_active_legacy(workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> None: + del workspace, env + write_skill(skill_active_path(mnemon_dir, "legacy-release"), "legacy-release", "Legacy release workflow fixture.") + + +def setup_skill_stale_release(workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> None: + del workspace, env + write_skill(skill_stale_path(mnemon_dir, "release-checklist"), "release-checklist", "Stale release handoff checklist fixture.") + + +def load_jsonl(path: Path) -> list[dict[str, Any]]: + items: list[dict[str, Any]] = [] + if not path.exists(): + return items + for line in path.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + value = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(value, dict): + items.append(value) + return items + + +def assert_skill_skip_noise(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del report, workspace, env + path = skill_usage_path(mnemon_dir) + content = path.read_text(encoding="utf-8") if path.exists() else "" + return [ + {"name": "transient skill evidence was not recorded", "passed": not path.exists() or not content.strip(), "path": str(path)}, + {"name": "temporary token absent from skill evidence", "passed": "skill-temp-742913" not in content.lower(), "path": str(path)}, + ] + + +def assert_skill_missing_observed(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del 
report, workspace, env + path = skill_usage_path(mnemon_dir) + items = load_jsonl(path) + matching = [ + item for item in items + if item.get("event") == "missing" + and item.get("skill") == "release-checklist" + and "release handoff checklist" in str(item.get("note", "")).lower() + ] + return [ + {"name": "missing-skill evidence log exists", "passed": path.exists(), "path": str(path)}, + {"name": "missing release checklist evidence recorded", "passed": bool(matching), "path": str(path)}, + {"name": "evidence source is agent or user", "passed": bool(matching) and matching[-1].get("source") in {"agent", "user"}, "path": str(path)}, + ] + + +def assert_skill_manage_create(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del report, env + path = skill_active_path(mnemon_dir, "release-checklist") + host_path = workspace / ".codex" / "skills" / "release-checklist" / "SKILL.md" + return [ + {"name": "approved skill created in active library", "passed": path.exists(), "path": str(path)}, + assert_file_contains(path, "release-checklist", "created skill has release-checklist identity"), + {"name": "host skill surface was not directly edited", "passed": not host_path.exists(), "path": str(host_path)}, + ] + + +def assert_skill_curate_proposal(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del report, workspace, env + proposals = skill_proposals_dir(mnemon_dir) + files = sorted(path for path in proposals.rglob("*") if path.is_file()) if proposals.exists() else [] + combined = "\n".join(path.read_text(encoding="utf-8", errors="replace") for path in files) + active = skill_active_path(mnemon_dir, "release-checklist") + return [ + {"name": "curation proposal file created", "passed": bool(files), "path": str(proposals)}, + {"name": "proposal mentions release checklist", "passed": "release handoff checklist" in combined.lower() or "release-checklist" in 
combined.lower(), "path": str(proposals)}, + {"name": "curation did not directly activate skill", "passed": not active.exists(), "path": str(active)}, + ] + + +def assert_skill_unapproved_noop(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del report, workspace, env + active = skill_active_path(mnemon_dir, "release-checklist") + archived = skill_archived_path(mnemon_dir, "release-checklist") + return [ + {"name": "unapproved lifecycle request kept active skill", "passed": active.exists(), "path": str(active)}, + {"name": "unapproved lifecycle request did not archive skill", "passed": not archived.exists(), "path": str(archived)}, + ] + + +def assert_skill_stale_move(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del report, workspace, env + active = skill_active_path(mnemon_dir, "legacy-release") + stale = skill_stale_path(mnemon_dir, "legacy-release") + return [ + {"name": "approved stale move removed active skill", "passed": not active.exists(), "path": str(active)}, + {"name": "approved stale move created stale skill", "passed": stale.exists(), "path": str(stale)}, + ] + + +def assert_skill_restore(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del report, workspace, env + active = skill_active_path(mnemon_dir, "release-checklist") + stale = skill_stale_path(mnemon_dir, "release-checklist") + return [ + {"name": "approved restore created active skill", "passed": active.exists(), "path": str(active)}, + {"name": "approved restore removed stale skill", "passed": not stale.exists(), "path": str(stale)}, + ] + + +def assert_skill_author_draft(report: dict[str, Any], workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> list[dict[str, Any]]: + del report, env + proposals = skill_proposals_dir(mnemon_dir) + draft = proposals / "release-checklist.SKILL.md" + active = 
skill_active_path(mnemon_dir, "release-checklist") + host_path = workspace / ".codex" / "skills" / "release-checklist" / "SKILL.md" + content = draft.read_text(encoding="utf-8") if draft.exists() else "" + return [ + {"name": "skill author draft exists", "passed": draft.exists(), "path": str(draft)}, + {"name": "draft has release-checklist frontmatter", "passed": "name: release-checklist" in content.lower(), "path": str(draft)}, + {"name": "draft includes reusable procedure", "passed": "procedure" in content.lower() and "handoff" in content.lower(), "path": str(draft)}, + {"name": "draft did not activate skill", "passed": not active.exists(), "path": str(active)}, + {"name": "draft did not edit host skill surface", "passed": not host_path.exists(), "path": str(host_path)}, + {"name": "draft omits secret-like token", "passed": "sk-test-author" not in content.lower(), "path": str(draft)}, + ] + + SCENARIOS: dict[str, Scenario] = { "memory-skip-local": Scenario( name="memory-skip-local", @@ -633,7 +825,7 @@ def assert_skill_observe(report: dict[str, Any], workspace: Path, mnemon_dir: Pa "skill-observe-evidence": Scenario( name="skill-observe-evidence", modules=["skill-loop"], - expected_skills=["skill_observe", "skill_curate", "skill_manage"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, setup=setup_none, prompt=( "Use the Mnemon skill loop to record lightweight evidence that the eval-runner workflow " @@ -642,6 +834,108 @@ def assert_skill_observe(report: dict[str, Any], workspace: Path, mnemon_dir: Pa ), assert_result=assert_skill_observe, ), + "skill-skip-transient": Scenario( + name="skill-skip-transient", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_none, + prompt=( + "Apply the Mnemon skill loop guide. This turn used a one-off shell command " + "with temporary token skill-temp-742913 and no reusable workflow value. " + "Do not record skill evidence for it. Reply done." 
+ ), + assert_result=assert_skill_skip_noise, + ), + "skill-observe-missing": Scenario( + name="skill-observe-missing", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_none, + prompt=( + "Use the Mnemon skill loop to record missing-skill evidence. " + "The missing skill id is release-checklist, event is missing, outcome is negative, " + "and the note must contain 'release handoff checklist'. Append exactly one JSONL item " + "to the configured usage log. Do not create or patch skills. Reply done." + ), + assert_result=assert_skill_missing_observed, + ), + "skill-manage-approved-create": Scenario( + name="skill-manage-approved-create", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_none, + prompt=( + "Use the Mnemon skill loop skill_manage procedure. This eval is explicit approval " + "to create a new canonical active skill with id release-checklist. Create only " + "active/release-checklist/SKILL.md in the canonical skill library, with frontmatter " + "name: release-checklist and a short procedure for release handoff checks. Do not edit " + "the host .codex skills surface directly. Reply done." + ), + assert_result=assert_skill_manage_create, + ), + "skill-curate-proposal": Scenario( + name="skill-curate-proposal", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_skill_curate_evidence, + prompt=( + "Use the Mnemon skill loop skill_curate procedure to review accumulated evidence. " + "Create a proposal file under the configured proposals directory recommending a " + "release-checklist skill for the repeated release handoff checklist workflow. " + "Do not create active skills or modify the host skill surface. Reply done." 
+ ), + assert_result=assert_skill_curate_proposal, + ), + "skill-manage-unapproved-noop": Scenario( + name="skill-manage-unapproved-noop", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_skill_active_release, + prompt=( + "Apply the Mnemon skill loop skill_manage boundary. I am only considering " + "archiving active skill release-checklist someday, but this is not approved. " + "Do not move, archive, patch, or delete any skill. Reply with what you did." + ), + assert_result=assert_skill_unapproved_noop, + ), + "skill-manage-approved-stale": Scenario( + name="skill-manage-approved-stale", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_skill_active_legacy, + prompt=( + "Use the Mnemon skill loop skill_manage procedure. This eval explicitly approves " + "moving active skill legacy-release to stale because it is superseded. Move only " + "the canonical skill from active to stale. Do not edit the host .codex skill surface. Reply done." + ), + assert_result=assert_skill_stale_move, + ), + "skill-manage-approved-restore": Scenario( + name="skill-manage-approved-restore", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_skill_stale_release, + prompt=( + "Use the Mnemon skill loop skill_manage procedure. This eval explicitly approves " + "restoring stale skill release-checklist to active because renewed evidence supports it. " + "Move only the canonical skill from stale to active. Do not edit the host .codex skill surface. Reply done." + ), + assert_result=assert_skill_restore, + ), + "skill-author-draft": Scenario( + name="skill-author-draft", + modules=["skill-loop"], + expected_skills=SKILL_LOOP_EXPECTED_SKILLS, + setup=setup_none, + prompt=( + "Use the Mnemon skill loop skill_author procedure to draft a reviewable skill. " + "Create only the proposal draft release-checklist.SKILL.md under the configured proposals directory. 
" + "The skill id is release-checklist and it should teach a reusable release handoff checklist workflow. " + "Include frontmatter name and description plus a concise procedure. Do not activate the skill, do not edit " + "the host .codex skill surface, and do not include this temporary token: sk-test-author-742913. Reply done." + ), + assert_result=assert_skill_author_draft, + ), } @@ -667,6 +961,19 @@ def assert_skill_observe(report: dict[str, Any], workspace: Path, mnemon_dir: Pa ] +SKILL_DEEP_SUITE = [ + "skill-observe-evidence", + "skill-skip-transient", + "skill-observe-missing", + "skill-manage-approved-create", + "skill-curate-proposal", + "skill-manage-unapproved-noop", + "skill-manage-approved-stale", + "skill-manage-approved-restore", + "skill-author-draft", +] + + def scenario_args(base: argparse.Namespace, scenario: Scenario) -> argparse.Namespace: args = argparse.Namespace(**vars(base)) args.modules = scenario.modules @@ -813,7 +1120,7 @@ def parse_args(argv: list[str]) -> argparse.Namespace: ) parser.add_argument( "--suite-name", - choices=["default", "memory-deep"], + choices=["default", "memory-deep", "skill-deep"], default="default", help="Scenario suite to run with --suite.", ) @@ -855,7 +1162,7 @@ def parse_args(argv: list[str]) -> argparse.Namespace: if "memory-loop" in args.modules: expected.extend(["memory_get", "memory_set"]) if "skill-loop" in args.modules: - expected.extend(["skill_observe", "skill_curate", "skill_manage"]) + expected.extend(SKILL_LOOP_EXPECTED_SKILLS) args.expected_skills = expected return args @@ -865,7 +1172,12 @@ def run_suite(args: argparse.Namespace) -> dict[str, Any]: suite_root = Path(args.run_root) if args.run_root else root / ".testdata" / "codex-app-eval-suite" / utc_run_id() suite_root.mkdir(parents=True, exist_ok=True) reports = [] - suite_names = MEMORY_DEEP_SUITE if args.suite_name == "memory-deep" else DEFAULT_SUITE + if args.suite_name == "memory-deep": + suite_names = MEMORY_DEEP_SUITE + elif 
args.suite_name == "skill-deep": + suite_names = SKILL_DEEP_SUITE + else: + suite_names = DEFAULT_SUITE for name in suite_names: scenario = SCENARIOS[name] current = scenario_args(args, scenario)