From d626eb2574d9c20fbcfaacb2c61012eb22629f9d Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:46:14 +0800 Subject: [PATCH 01/13] feat: add ADR template and integration guide Architecture Decision Record template for documenting the 'why' behind architectural decisions, with flow-code integration guidance. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/adr-guide.md | 68 ++++++++++++++++++++++++++++++++++++++ references/adr-template.md | 54 ++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 docs/adr-guide.md create mode 100644 references/adr-template.md diff --git a/docs/adr-guide.md b/docs/adr-guide.md new file mode 100644 index 00000000..9fd29fa6 --- /dev/null +++ b/docs/adr-guide.md @@ -0,0 +1,68 @@ +# ADR Integration Guide + +Architecture Decision Records capture the *why* behind significant technical decisions. This guide covers when to write them, where to store them, and how they integrate with flow-code. + +## When to Create an ADR + +Write an ADR when making a decision that would be expensive to reverse: + +- **Technology choices** — frameworks, libraries, major dependencies +- **Architectural patterns** — data model design, API style, auth strategy +- **Infrastructure decisions** — hosting, build tools, deployment approach +- **Pattern changes** — moving from one approach to another (e.g., REST to GraphQL) + +Do NOT write ADRs for routine implementation choices, obvious decisions, or throwaway prototypes. + +## Where to Store ADRs + +Use `docs/decisions/` with sequential numbering: + +``` +docs/decisions/ + ADR-001-use-libsql-for-storage.md + ADR-002-wave-checkpoint-execution-model.md + ADR-003-teams-file-locking-protocol.md +``` + +Use the template at `references/adr-template.md` as your starting point. 

## Referencing ADRs in Task Specs

When a task implements or depends on an architectural decision, reference the ADR in the task spec:

```bash
flowctl task create --title "Implement file locking" \
  --spec "Implements ADR-003. See docs/decisions/ADR-003-teams-file-locking-protocol.md"
```

In inline code, link to the ADR near the relevant implementation:

```
// File locking per ADR-003. See docs/decisions/ADR-003-teams-file-locking-protocol.md
```

## Integration with /flow-code:plan

During planning, ADRs surface naturally at two points:

1. **Plan creation** — When `/flow-code:plan` encounters an architectural decision, create the ADR as a task in the epic. The ADR task should complete before implementation tasks that depend on it.

2. **Plan review** — `/flow-code:plan-review` should verify that significant architectural decisions have corresponding ADRs. Missing ADRs are a review finding.

### Example: ADR as a Plan Task

```
Epic: fn-50-migrate-to-graphql
  Task 1: Write ADR-005 documenting REST-to-GraphQL migration rationale
  Task 2: Implement GraphQL schema (depends on Task 1)
  Task 3: Migrate endpoints (depends on Task 2)
```

## ADR Lifecycle

```
PROPOSED -> ACCEPTED -> SUPERSEDED by ADR-XXX
                     -> DEPRECATED
```

Never delete old ADRs. When a decision changes, write a new ADR that supersedes the old one. The historical record is the whole point. 
diff --git a/references/adr-template.md b/references/adr-template.md new file mode 100644 index 00000000..0aa5584b --- /dev/null +++ b/references/adr-template.md @@ -0,0 +1,54 @@ +# ADR-NNN: [Short decision title] + +## Status + + +Proposed + +## Date + + +YYYY-MM-DD + +## Context + + + +## Decision + + + +## Alternatives Considered + + + +## Consequences + + + +--- + + From df685026f8713b1b7ae0becceec681426c0fe90e Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:47:08 +0800 Subject: [PATCH 02/13] feat: add deprecation and migration skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stack-agnostic skill for managing deprecation workflows — assessment, replacement, migration, and clean removal. Inspired by agent-skills. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/flow-code-deprecation/SKILL.md | 194 ++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 skills/flow-code-deprecation/SKILL.md diff --git a/skills/flow-code-deprecation/SKILL.md b/skills/flow-code-deprecation/SKILL.md new file mode 100644 index 00000000..43ef8184 --- /dev/null +++ b/skills/flow-code-deprecation/SKILL.md @@ -0,0 +1,194 @@ +--- +name: flow-code-deprecation +description: "Use when removing, replacing, or sunsetting features, APIs, modules, or dependencies in any codebase" +--- + +# Deprecation and Migration + +## Overview + +Code is a liability, not an asset — every line carries ongoing maintenance cost. This skill enforces a disciplined process for deprecating and removing code that no longer earns its keep, ensuring consumers are migrated safely before anything is removed. 
+ +## When to Use + +- Replacing an old system, API, library, or module with a new one +- Sunsetting a feature, CLI command, or config option that's no longer needed +- Consolidating duplicate implementations into a single path +- Removing dead code that nobody owns but everybody depends on +- Planning the lifecycle of a new system (deprecation planning starts at design time) +- Deciding whether to maintain a legacy system or invest in migration + +**When NOT to use:** +- Routine refactoring that doesn't remove public interfaces — use standard refactoring +- Adding new features alongside old ones without removing anything +- Version bumps that don't change or remove behavior +- Bug fixes in legacy code you intend to keep + +## The Deprecation Decision + +Before deprecating anything, answer these questions: + +``` +1. Does this system still provide unique value? + → If yes, maintain it. If no, proceed. + +2. How many consumers depend on it? + → Quantify: grep for imports, check API call logs, review dependency graphs. + +3. Does a replacement exist? + → If no, BUILD THE REPLACEMENT FIRST. Never deprecate without an alternative. + +4. What's the migration cost per consumer? + → If automatable (codemod, sed, script), do it yourself (the Churn Rule). + → If manual and high-effort, weigh against ongoing maintenance cost. + +5. What's the cost of NOT deprecating? + → Security risk, engineer time, dependency rot, onboarding friction. +``` + +If answers 1=no, 3=yes, and 5 > 4, proceed with deprecation. + +## Advisory vs Compulsory Deprecation + +| Type | When to Use | Mechanism | +|------|-------------|-----------| +| **Advisory** | Old system is stable, migration is optional | Warnings, documentation, nudges. Consumers migrate on their own timeline. | +| **Compulsory** | Security risk, blocks progress, or maintenance cost is unsustainable | Hard deadline. Old system removed by date X. Provide migration tooling. 
| + +**Default to advisory.** Use compulsory only when risk or cost justifies forcing migration. Compulsory deprecation requires providing migration tooling, documentation, and support — you cannot just announce a deadline. + +## Core Process + +### Phase 1: Assess Impact + +1. **Inventory all consumers** — grep for imports, API calls, config references, CLI invocations. Miss nothing. +2. **Map the dependency graph** — direct consumers and transitive dependents. Hyrum's Law: with enough users, every observable behavior becomes depended on, including bugs and timing quirks. +3. **Quantify maintenance cost** — security vulnerabilities, test failures, onboarding friction, dependency update burden. +4. **Document the assessment:** + ``` + Deprecated: + Consumers: + Maintenance cost: + Replacement: + Migration type: advisory | compulsory + ``` + +### Phase 2: Build the Replacement + +**Do NOT announce deprecation until the replacement is production-proven.** + +1. **Cover all critical use cases** of the old system +2. **Write a migration guide** with concrete steps and examples +3. **Prove it in production** — not just "theoretically better" +4. **Verify behavioral parity** on edge cases consumers depend on + +### Phase 3: Announce and Document + +Create a deprecation notice: + +``` +## Deprecation Notice: + +Status: Deprecated as of +Replacement: (see migration guide) +Removal date: Advisory — no hard deadline | Compulsory — +Reason: + +### Migration Steps +1. Replace with +2. Update configuration (see examples) +3. Run migration verification: +``` + +Place notices where consumers will see them: inline warnings, changelogs, documentation headers. + +### Phase 4: Migrate Consumers + +For each consumer: + +1. **Identify all touchpoints** with the deprecated system +2. **Update to the replacement** — provide codemods or scripts where possible +3. **Verify behavior matches** — tests, integration checks, manual validation +4. **Remove references** to the old system +5. 
**Confirm no regressions** + +**The Churn Rule:** If you own the infrastructure being deprecated, you are responsible for migrating your users — or providing backward-compatible shims. Do not announce deprecation and leave users to figure it out. + +### Phase 5: Remove Old Code + +Only after all consumers have migrated: + +1. **Verify zero active usage** — metrics, logs, dependency analysis, grep +2. **Remove the code** — implementation, types, exports +3. **Remove associated artifacts** — tests, documentation, configuration, feature flags +4. **Remove deprecation notices** — they served their purpose +5. **Run full verification:** + ```bash + $FLOWCTL guard + ``` + +### Phase 6: Verify Clean Removal + +1. **Search for orphaned references** — stale imports, dead config keys, broken links in docs +2. **Run the full test suite** — no failures from missing code +3. **Check build artifacts** — no phantom exports or dangling symbols +4. **Confirm documentation is updated** — no references to removed components + +For planning migration work across multiple tasks, use `/flow-code:plan`. + +## Migration Patterns + +### Strangler Pattern +Run old and new in parallel. Route traffic incrementally from old to new. Remove old when it handles 0%. + +### Adapter Pattern +Wrap old interface around new implementation. Consumers keep using the old interface while the backend migrates underneath. + +### Feature Flag Migration +Use feature flags to switch consumers from old to new one at a time. Roll back instantly if issues arise. + +## Zombie Code + +Code that nobody owns but everybody depends on. Signs: +- No commits in 6+ months but active consumers exist +- No assigned maintainer +- Failing tests nobody fixes +- Dependencies with known vulnerabilities nobody updates + +**Response:** Either assign an owner and maintain it, or deprecate it with a concrete migration plan. Zombie code cannot stay in limbo. 
+ +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "It still works, why remove it?" | Working code without maintenance accumulates security debt and complexity. Cost grows silently until it's a crisis. | +| "Someone might need it later" | If needed later, it can be rebuilt with better design. Keeping unused code "just in case" costs more than rebuilding. | +| "No one uses this anymore" | Did you verify that with grep, metrics, and logs? Undocumented consumers are the norm, not the exception. | +| "We can remove it later" | Later never comes. Every month you delay, more consumers may adopt the deprecated path. | +| "Breaking changes are fine for a major version" | A major version bump does not excuse removing things without migration paths. Semver is a signal, not a license to break users. | +| "The migration is too expensive" | Compare migration cost to 2-3 years of ongoing maintenance. Migration is almost always cheaper long-term. | +| "Users will migrate on their own" | They won't. Provide tooling, documentation, and support — or do the migration yourself (the Churn Rule). | +| "We can maintain both indefinitely" | Two systems doing the same thing means double the maintenance, testing, documentation, and onboarding cost. 
| + +## Red Flags + +- Deprecating a system with no replacement available or proven +- Deprecation announcement with no migration guide, tooling, or timeline +- "Soft" deprecation that has been advisory for months or years with no progress +- Removing code without verifying zero active consumers first +- New features added to a system already marked deprecated (invest in the replacement instead) +- Zombie code with no owner and active consumers left in limbo +- Deprecation without quantifying current usage + +## Verification + +After completing a deprecation cycle, confirm: + +- [ ] Deprecation decision documented with consumer count, maintenance cost, and replacement name +- [ ] Replacement is production-proven and covers all critical use cases +- [ ] Migration guide exists with concrete steps, examples, and verification commands +- [ ] All active consumers migrated (verified by grep, metrics, or logs — not assumption) +- [ ] Old code, tests, documentation, and configuration fully removed +- [ ] No orphaned references to the deprecated system remain in the codebase +- [ ] `$FLOWCTL guard` passes after removal +- [ ] Deprecation notices removed (they served their purpose) From 6f6c87809ecc43e8461cecbb1349b0dc2086255c Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:47:37 +0800 Subject: [PATCH 03/13] feat: add context engineering skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidates context optimization methodology — 5-tier hierarchy, CLAUDE.md optimization, and strategic use of existing context tools. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/flow-code-context-eng/SKILL.md | 182 ++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 skills/flow-code-context-eng/SKILL.md diff --git a/skills/flow-code-context-eng/SKILL.md b/skills/flow-code-context-eng/SKILL.md new file mode 100644 index 00000000..8568da10 --- /dev/null +++ b/skills/flow-code-context-eng/SKILL.md @@ -0,0 +1,182 @@ +--- +name: flow-code-context-eng +description: "Use when context window is filling up, agent output quality drops, or starting a new complex task that needs careful context loading" +--- + +# Context Engineering + +## Overview + +Context engineering is about loading the RIGHT information at the RIGHT time — not "load everything." The context window is finite, attention degrades with noise, and wrong context is worse than no context. This skill teaches strategic context management: what to load, when to load it, and when to prune. + +## When to Use + +- Starting a new session or complex task that spans multiple files +- Agent output quality is declining (wrong patterns, hallucinated APIs, ignored conventions) +- Switching between different parts of a codebase +- Agent ignores project conventions despite them being documented +- Context window pressure — responses are getting slower or less accurate +- Planning a review that needs cross-file understanding + +**When NOT to use:** +- Simple single-file edits where the file is already loaded +- Tasks where the relevant context is already in the conversation +- Quick fixes where the error message contains all needed information + +## Core Process + +### Phase 1: Audit Current Context + +Before loading anything new, assess what you already have. + +**Ask these questions:** +1. What files/specs are currently loaded in this conversation? +2. What information is missing for the current task? +3. What loaded context is stale or irrelevant (from a previous task)? +4. How deep into the conversation are we? 
(stale context risk increases with depth) + +``` +Current context audit: + Loaded: [list what's in the conversation] + Missing: [what the task needs but isn't loaded] + Stale: [what was loaded for a previous task] + Action: [load X, prune Y, refresh Z] +``` + +### Phase 2: Apply the 5-Tier Hierarchy + +Structure context from most persistent to most transient. Higher tiers have higher priority — when the window is tight, cut from the bottom. + +``` +┌─────────────────────────────────────────┐ +│ Tier 1: Rules files │ ← Persistent, highest priority +│ (CLAUDE.md, .cursorrules) │ +├─────────────────────────────────────────┤ +│ Tier 2: Specs & architecture │ ← Per-feature +│ (SPEC.md, ADRs, epic specs) │ +├─────────────────────────────────────────┤ +│ Tier 3: Source files │ ← Per-task +│ (relevant code, types, tests) │ +├─────────────────────────────────────────┤ +│ Tier 4: Error output │ ← Per-iteration +│ (test failures, build errors, logs) │ +├─────────────────────────────────────────┤ +│ Tier 5: Conversation history │ ← Accumulating, lowest priority +│ (previous messages, prior attempts) │ +└─────────────────────────────────────────┘ +``` + +**Tier 1 — Rules files:** Always loaded, never pruned. This is persistent context that survives across sessions. If it's not in the rules file, it doesn't exist for the agent. + +**Tier 2 — Specs & architecture:** Load the relevant section only. "Here's the auth section of our spec" beats "here's our entire 5000-word spec" when working on auth. + +**Tier 3 — Source files:** Load before editing. Read the file, its tests, one example of the pattern, and relevant type definitions. Use `structure` commands for signatures before committing to full file reads. + +**Tier 4 — Error output:** Feed specific errors, not entire logs. One failing test's traceback, not 500 lines of test output. + +**Tier 5 — Conversation history:** Compresses and degrades over time. Start fresh sessions when switching major features. 
Summarize progress when context is getting long. + +### Phase 3: Optimize CLAUDE.md + +CLAUDE.md is the highest-leverage context in any project. 10 lines there outweigh 1000 lines in conversation. + +**A good CLAUDE.md covers:** +- Tech stack and versions +- Build/test/lint commands (exact invocations) +- Code conventions (with one example) +- Boundaries (what NOT to do) +- Project-specific patterns + +**Audit checklist:** +- [ ] Does the CLAUDE.md exist? +- [ ] Does it cover the commands an agent needs to run? +- [ ] Does it show the conventions the agent keeps violating? +- [ ] Is it under 200 lines? (longer = lower compliance) + +If the agent keeps making the same mistake, the fix is almost always adding a line to CLAUDE.md — not correcting the agent in conversation. + +### Phase 4: Use Existing Tools Strategically + +Do not reinvent context gathering. flow-code has purpose-built tools for each context need. + +| Need | Tool | When | +|------|------|------| +| Deep codebase understanding | `context-scout` agent | Before planning or major implementation | +| AI-powered file discovery | `flow-code-rp-explorer` skill | When you need to find all files related to a feature | +| Cross-model review context | `flow-code-export-context` skill | When preparing context for external LLM review | +| Code signatures without full reads | `rp-cli structure` or `get_code_structure` | When you need function/type shapes, not full files | +| Targeted file search | `rp-cli search` or `file_search` | When you know what pattern to find | + +**Decision tree:** + +``` +Need to understand a feature? + ├─ Know the files → Read them directly (Tier 3) + ├─ Know the area → structure + targeted search + └─ Don't know where to start → context-scout agent or builder + +Need cross-file analysis? 
+ ├─ Architecture question → context_builder(response_type="question") + ├─ Planning → context_builder(response_type="plan") + └─ Review → context_builder(response_type="review") + +Need to share context externally? + └─ flow-code-export-context skill +``` + +### Phase 5: Monitor and Prune + +Context management is not set-and-forget. Monitor throughout the session. + +**Prune triggers:** +- Switched to a different task → remove files from the old task +- Error was fixed → remove the error output and failed attempts +- Conversation exceeds ~50 messages → consider fresh session +- Agent starts hallucinating APIs → context is stale, reload source files + +**Token budgeting principles:** +- Signatures (structure) cost ~500 tokens vs ~5000 for full files (10x savings) +- Slice reads (`--start-line --limit`) cost ~300 tokens for 50 lines +- Prefer structure first, full read only for files you will edit +- Never dump full files for context — use signatures + targeted slices + +**Fresh session checklist:** +1. Summarize progress so far (what's done, what's next) +2. Note any decisions or conventions discovered +3. Start new session with the summary + relevant Tier 1-2 context +4. Reload only the Tier 3 files needed for the next task + +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "I'll just load everything" | Context windows are finite. Irrelevant context dilutes signal and degrades output quality. Precision beats volume. | +| "CLAUDE.md doesn't matter much" | It's the highest-leverage file in any project. 10 lines in CLAUDE.md prevent more mistakes than 100 lines of in-conversation correction. | +| "I can hold it all in my head" | Context compresses and degrades. What you loaded 20 messages ago may be effectively gone. Verify, don't assume. | +| "More context is always better" | Research shows performance degrades with excess instructions. Wrong context is actively worse than no context — it introduces false patterns. 
| +| "I'll read files when I need them" | Reactive loading causes mid-task quality drops. Proactive context loading at task start prevents the "forgot the convention" class of errors entirely. | +| "The context window is huge, I'll use it all" | Window size is not attention budget. A 200K window with 190K of noise performs worse than a 50K window with 50K of signal. | +| "I don't need to start a fresh session" | Stale context from previous tasks actively misleads. The cost of reloading is minutes; the cost of stale context is hours of wrong-direction work. | + +## Red Flags + +- Agent output stops following project conventions that are documented in CLAUDE.md +- Agent invents APIs, imports, or utilities that don't exist in the codebase +- Agent re-implements something that already exists (failed to load existing code) +- Quality visibly degrades as conversation gets longer +- Agent gives generic advice instead of project-specific answers +- Same correction given to the agent more than twice in one session +- No rules file exists in the project (context starvation guaranteed) +- Full files dumped into context when only a function signature was needed + +## Verification + +After applying context engineering, confirm: + +- [ ] CLAUDE.md exists, is current, and covers tech stack, commands, conventions, and boundaries +- [ ] Only task-relevant files are loaded (no leftover context from previous tasks) +- [ ] Agent output references actual project files and APIs (not hallucinated ones) +- [ ] Structure/signatures used before full file reads (token efficiency) +- [ ] Context was refreshed when switching between major tasks +- [ ] Error output is specific (single failure, not full logs) +- [ ] Session was restarted if conversation exceeded ~50 messages on different topics From 778f42f2a051dc3ffe03b06703450ad6aef98926 Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:47:50 +0800 Subject: [PATCH 04/13] feat: add references/ directory with 4 universal checklists 
Stack-agnostic reference checklists for testing patterns, security, performance, and code review. Adapted from agent-skills patterns. Co-Authored-By: Claude Opus 4.6 (1M context) --- references/code-review-checklist.md | 105 +++++++++++++++++++++++ references/performance-checklist.md | 114 +++++++++++++++++++++++++ references/security-checklist.md | 127 ++++++++++++++++++++++++++++ references/testing-patterns.md | 124 +++++++++++++++++++++++++++ 4 files changed, 470 insertions(+) create mode 100644 references/code-review-checklist.md create mode 100644 references/performance-checklist.md create mode 100644 references/security-checklist.md create mode 100644 references/testing-patterns.md diff --git a/references/code-review-checklist.md b/references/code-review-checklist.md new file mode 100644 index 00000000..000ebbd0 --- /dev/null +++ b/references/code-review-checklist.md @@ -0,0 +1,105 @@ +# Code Review Checklist + +Quick reference for structured code review. Covers five axes: correctness, readability, architecture, security, and performance. 
+ +## Table of Contents + +- [Before Reviewing](#before-reviewing) +- [Correctness](#correctness) +- [Readability](#readability) +- [Architecture](#architecture) +- [Security](#security) +- [Performance](#performance) +- [Testing](#testing) +- [Review Communication](#review-communication) +- [Common Review Anti-Patterns](#common-review-anti-patterns) + +## Before Reviewing + +- [ ] Understand the purpose (read PR description, linked issue, or spec) +- [ ] Check the diff size (>400 lines = request split unless justified) +- [ ] Run the code locally or verify CI passes +- [ ] Review commits in logical order (not just the final diff) + +## Correctness + +- [ ] Logic matches stated requirements and acceptance criteria +- [ ] Edge cases handled (empty inputs, nil/null, boundary values, overflow) +- [ ] Error paths return appropriate errors (not silently swallowed) +- [ ] Concurrent access is safe (race conditions, data races) +- [ ] State transitions are valid (no impossible states representable) +- [ ] Backward compatibility maintained (or breaking change documented) +- [ ] Database migrations are reversible and safe for rollback + +## Readability + +- [ ] Names are descriptive and consistent with codebase conventions +- [ ] Functions do one thing at one level of abstraction +- [ ] No dead code, commented-out code, or TODO without issue reference +- [ ] Complex logic has explanatory comments (why, not what) +- [ ] Magic numbers replaced with named constants +- [ ] Nesting depth <= 3 levels (extract functions or early-return) +- [ ] File length reasonable (<300 lines, split if larger) +- [ ] Public API has documentation (function signatures, types, constraints) + +## Architecture + +- [ ] Change is in the right layer (not business logic in HTTP handler) +- [ ] No unnecessary coupling between modules +- [ ] Dependency direction follows architecture (no upward dependencies) +- [ ] Abstractions are justified (no premature generalization) +- [ ] Existing patterns followed 
(or deviation justified in PR description) +- [ ] No duplication that should be extracted (DRY at module boundaries) +- [ ] Configuration is externalized (not hardcoded values) +- [ ] New dependencies justified and minimal + +## Security + +- [ ] User input validated at system boundaries +- [ ] No secrets, tokens, or credentials in code +- [ ] SQL/command injection prevented (parameterized queries, no shell interpolation) +- [ ] Authorization checked on every resource access +- [ ] Sensitive data not logged or exposed in error responses +- [ ] File paths validated (no path traversal) +- [ ] Deserialization is type-restricted (no arbitrary object creation) + +## Performance + +- [ ] No N+1 queries or unbounded fetches +- [ ] Algorithmic complexity appropriate for expected data size +- [ ] No unnecessary allocations in hot paths +- [ ] Database queries use indexes (check query plan for new queries) +- [ ] Caching has eviction strategy and bounded size +- [ ] No blocking I/O in async contexts +- [ ] Pagination used for list endpoints + +## Testing + +- [ ] New code has tests (unit and/or integration as appropriate) +- [ ] Tests cover happy path AND error/edge cases +- [ ] Tests are deterministic (no flaky time/order dependencies) +- [ ] Test names describe behavior, not implementation +- [ ] Mocks are at boundaries only (not testing internal wiring) +- [ ] Existing tests updated if behavior changed +- [ ] No test-only code paths in production code + +## Review Communication + +- [ ] Comments are specific and actionable ("rename X to Y" not "this is confusing") +- [ ] Distinguish blocking issues from suggestions (prefix: "nit:", "suggestion:", "blocking:") +- [ ] Ask questions before assuming intent ("Is this intentional?" 
not "This is wrong") +- [ ] Praise good patterns when you see them +- [ ] Limit review rounds (aim for <=2 rounds before approval) + +## Common Review Anti-Patterns + +| Anti-Pattern | Problem | Better Approach | +|---|---|---| +| Rubber-stamping | Bugs and debt slip through | Review every diff, even small ones | +| Style nitpicking only | Misses logic and design bugs | Use linters for style, review for logic | +| Blocking on preference | Stalls velocity without quality gain | Approve with suggestion, not block | +| Reviewing without running | Misses runtime issues | Run locally or check CI output | +| Rewriting in review | Demoralizing, scope creep | Suggest direction, let author implement | +| Ignoring test quality | Tests pass but don't verify anything | Review tests as carefully as production code | +| Delayed reviews (>24h) | Blocks author, context decays | Review within 4 hours during work hours | +| Drive-by partial review | Author thinks it's approved | Review entire PR or say what you skipped | diff --git a/references/performance-checklist.md b/references/performance-checklist.md new file mode 100644 index 00000000..1228f5d9 --- /dev/null +++ b/references/performance-checklist.md @@ -0,0 +1,114 @@ +# Performance Checklist + +Quick reference for application performance. Stack-agnostic — covers backend, CLI, and system-level patterns. 
+ +## Table of Contents + +- [Measurement First](#measurement-first) +- [Algorithmic Efficiency](#algorithmic-efficiency) +- [Memory Management](#memory-management) +- [Concurrency](#concurrency) +- [Database Performance](#database-performance) +- [Network and I/O](#network-and-io) +- [Caching](#caching) +- [Build and Deploy](#build-and-deploy) +- [Common Anti-Patterns](#common-anti-patterns) + +## Measurement First + +- [ ] Performance targets defined before optimizing (latency p50/p95/p99, throughput) +- [ ] Benchmarks exist for hot paths (reproducible, tracked in CI) +- [ ] Profiler used before guessing bottleneck location +- [ ] Before/after measurements accompany every optimization PR + +| Language | Profiler | Benchmark Tool | +|----------|---------|---------------| +| Rust | `perf`, `flamegraph`, `cargo-flamegraph` | `criterion`, `#[bench]` | +| Python | `cProfile`, `py-spy` | `pytest-benchmark`, `timeit` | +| Go | `pprof` (built-in) | `testing.B` (built-in) | +| TypeScript | Chrome DevTools, `clinic.js` | `vitest bench`, `tinybench` | + +## Algorithmic Efficiency + +- [ ] Hot-path algorithms are appropriate complexity (no O(n^2) where O(n log n) works) +- [ ] Data structures match access patterns (hashmap for lookup, sorted array for range) +- [ ] No redundant computation in loops (hoist invariants out) +- [ ] String concatenation in loops uses builder/buffer pattern +- [ ] Sorting uses appropriate algorithm for data characteristics + +## Memory Management + +- [ ] No unbounded growth (buffers, caches, queues have size limits) +- [ ] Large allocations reuse buffers where possible +- [ ] Streaming used for large data (not loading entire files into memory) +- [ ] Object pools used for frequently allocated/freed objects in hot paths +- [ ] Memory leaks checked (event listeners, timers, closures holding references) + +| Language | Leak Detection | +|----------|---------------| +| Rust | `valgrind`, `miri`, ownership system prevents most leaks | +| Python | 
`tracemalloc`, `objgraph` | +| Go | `pprof heap`, `runtime.MemStats` | +| TypeScript | Chrome DevTools heap snapshot, `--inspect` | + +## Concurrency + +- [ ] I/O-bound work uses async/non-blocking patterns +- [ ] CPU-bound work parallelized across cores where beneficial +- [ ] Shared state minimized; prefer message passing or immutable data +- [ ] Lock granularity appropriate (no global locks in hot paths) +- [ ] Connection pools sized for expected concurrency +- [ ] Backpressure implemented for producer-consumer patterns + +## Database Performance + +- [ ] No N+1 query patterns (use joins, batch loading, or eager loading) +- [ ] Queries have appropriate indexes (check query plans) +- [ ] List endpoints paginated (never unbounded SELECT) +- [ ] Connection pooling configured and sized +- [ ] Slow query logging enabled (identify regressions) +- [ ] Bulk operations used instead of row-by-row inserts/updates +- [ ] Read replicas used for read-heavy workloads (if applicable) +- [ ] Transactions held for minimum duration + +## Network and I/O + +- [ ] API response times < 200ms (p95 target) +- [ ] Response compression enabled (gzip/brotli) +- [ ] Batch APIs available for multi-resource operations +- [ ] No synchronous blocking in async handlers +- [ ] File I/O uses buffered readers/writers +- [ ] DNS and connection reuse via keep-alive / connection pooling +- [ ] Timeouts set on all outbound calls (prevent hanging) + +## Caching + +- [ ] Cache invalidation strategy defined (TTL, event-based, or versioned keys) +- [ ] Cache hit rate monitored (< 80% hit rate = review strategy) +- [ ] Hot data cached close to consumer (in-process > local > distributed) +- [ ] Cache stampede protection (singleflight, probabilistic early expiry) +- [ ] Cache size bounded (LRU or similar eviction policy) +- [ ] Cached data has freshness expectations documented + +## Build and Deploy + +- [ ] Release builds use full optimizations (not debug builds in prod) +- [ ] Binary size minimized (strip 
symbols, LTO where appropriate) +- [ ] Startup time measured and optimized (lazy initialization) +- [ ] Health check endpoints respond quickly (no heavy initialization) +- [ ] Graceful shutdown drains in-flight requests + +## Common Anti-Patterns + +| Anti-Pattern | Impact | Fix | +|---|---|---| +| N+1 queries | Linear DB load growth | Use joins or batch loading | +| Unbounded queries | Memory exhaustion, timeouts | Always paginate, add LIMIT | +| Missing indexes | Slow reads as data grows | Add indexes for filtered/sorted columns | +| Premature optimization | Wasted effort, worse readability | Profile first, optimize measured bottlenecks | +| Global locks in hot paths | Serialized throughput | Fine-grained locks or lock-free structures | +| Allocating in tight loops | GC pressure, cache thrashing | Pre-allocate, reuse buffers | +| Synchronous I/O in async code | Thread starvation | Use async I/O or spawn blocking tasks | +| No timeouts on network calls | Resource exhaustion on failure | Set timeouts on every outbound call | +| Caching without eviction | Unbounded memory growth | Set max size with LRU eviction | +| Logging in hot paths | I/O bottleneck | Sample or batch log writes | diff --git a/references/security-checklist.md b/references/security-checklist.md new file mode 100644 index 00000000..cf646476 --- /dev/null +++ b/references/security-checklist.md @@ -0,0 +1,127 @@ +# Security Checklist + +Quick reference for application security. Stack-agnostic — applies to Rust, Python, Go, TypeScript, and beyond. 
+ +## Table of Contents + +- [Pre-Commit Checks](#pre-commit-checks) +- [Authentication](#authentication) +- [Authorization](#authorization) +- [Input Validation](#input-validation) +- [Cryptography](#cryptography) +- [Data Protection](#data-protection) +- [Dependency Security](#dependency-security) +- [Error Handling](#error-handling) +- [Infrastructure](#infrastructure) +- [OWASP Top 10 Quick Reference](#owasp-top-10-quick-reference) + +## Pre-Commit Checks + +- [ ] No secrets in code (passwords, API keys, tokens, private keys) +- [ ] `.gitignore` covers: `.env`, `*.pem`, `*.key`, `credentials.*` +- [ ] `.env.example` uses placeholder values only +- [ ] No hardcoded connection strings or endpoints +- [ ] Secret scanning enabled in CI (e.g., `gitleaks`, `trufflehog`) + +## Authentication + +- [ ] Passwords hashed with bcrypt (>=12 rounds), scrypt, or argon2 +- [ ] Session tokens are cryptographically random (>=256 bits) +- [ ] Session expiration configured with reasonable max-age +- [ ] Rate limiting on login endpoints (<=10 attempts per 15 minutes) +- [ ] Password reset tokens: time-limited (<=1 hour), single-use +- [ ] Account lockout after repeated failures (with notification) +- [ ] MFA supported for sensitive operations + +## Authorization + +- [ ] Every protected endpoint checks authentication first +- [ ] Every resource access checks ownership or role (prevents IDOR) +- [ ] Admin endpoints require admin role verification +- [ ] API keys/tokens scoped to minimum necessary permissions +- [ ] JWT tokens validated: signature, expiration, issuer, audience +- [ ] Default deny: new endpoints are protected unless explicitly public +- [ ] Service-to-service calls authenticated (mTLS, signed tokens) + +## Input Validation + +- [ ] All user input validated at system boundaries +- [ ] Validation uses allowlists, not denylists +- [ ] String lengths constrained (min/max) +- [ ] Numeric ranges validated and bounds-checked +- [ ] File uploads: type restricted, size 
limited, content verified +- [ ] SQL queries parameterized (no string concatenation) +- [ ] Output encoded/escaped for context (HTML, SQL, shell, URLs) +- [ ] URLs validated before redirect (prevent open redirect) +- [ ] Deserialization restricted to expected types (no arbitrary object creation) + +## Cryptography + +- [ ] TLS 1.2+ for all network communication +- [ ] No custom crypto implementations (use audited libraries) +- [ ] Symmetric encryption: AES-256-GCM or ChaCha20-Poly1305 +- [ ] Asymmetric: RSA >=2048 bits or Ed25519 +- [ ] Random values from CSPRNG, not `rand()` or `Math.random()` +- [ ] Keys rotated on schedule and on compromise + +| Language | Crypto Library | CSPRNG | +|----------|---------------|--------| +| Rust | `ring`, `rustls` | `rand::rngs::OsRng` | +| Python | `cryptography` | `secrets` module | +| Go | `crypto/*` stdlib | `crypto/rand` | +| TypeScript | `crypto` (Node) | `crypto.randomBytes` | + +## Data Protection + +- [ ] Sensitive fields excluded from API responses (password hashes, tokens) +- [ ] Sensitive data not logged (passwords, tokens, PII) +- [ ] PII encrypted at rest (if required by regulation) +- [ ] Database backups encrypted +- [ ] Data retention policy defined and enforced +- [ ] Personally identifiable data deletable on request + +## Dependency Security + +- [ ] Dependencies audited regularly for known vulnerabilities +- [ ] Lockfile committed and used in CI (reproducible builds) +- [ ] Automated alerts for vulnerable dependencies enabled +- [ ] Minimal dependency policy (fewer deps = smaller attack surface) + +| Language | Audit Command | +|----------|--------------| +| Rust | `cargo audit` | +| Python | `pip-audit`, `safety check` | +| Go | `govulncheck ./...` | +| TypeScript | `npm audit`, `yarn audit` | + +## Error Handling + +- [ ] Production errors are generic (no stack traces, SQL, or internals) +- [ ] Errors logged server-side with correlation IDs +- [ ] Error messages do not reveal user existence (login/signup) +- 
[ ] Panic/crash recovery does not expose sensitive state +- [ ] Rate-limited error responses (prevent enumeration attacks) + +## Infrastructure + +- [ ] HTTPS enforced (redirect HTTP to HTTPS) +- [ ] Security headers set (CSP, HSTS, X-Content-Type-Options, X-Frame-Options) +- [ ] CORS restricted to known origins (never `*` in production) +- [ ] Containers run as non-root user +- [ ] Network segmentation between services and databases +- [ ] Secrets managed via vault/KMS, not environment variables in code + +## OWASP Top 10 Quick Reference + +| # | Vulnerability | Prevention | +|---|---|---| +| 1 | Broken Access Control | Auth checks on every endpoint, ownership verification | +| 2 | Cryptographic Failures | HTTPS, strong hashing, no secrets in code | +| 3 | Injection | Parameterized queries, input validation | +| 4 | Insecure Design | Threat modeling, spec-driven development | +| 5 | Security Misconfiguration | Security headers, minimal permissions, audit deps | +| 6 | Vulnerable Components | Audit deps, keep updated, minimize dependencies | +| 7 | Auth Failures | Strong passwords, rate limiting, session management | +| 8 | Data Integrity Failures | Verify updates/dependencies, signed artifacts | +| 9 | Logging Failures | Log security events, don't log secrets | +| 10 | SSRF | Validate/allowlist URLs, restrict outbound requests | diff --git a/references/testing-patterns.md b/references/testing-patterns.md new file mode 100644 index 00000000..3bf1f301 --- /dev/null +++ b/references/testing-patterns.md @@ -0,0 +1,124 @@ +# Testing Patterns Reference + +Quick reference for testing patterns across stacks. Stack-agnostic — examples in Rust, Python, Go, TypeScript. 
+ +## Table of Contents + +- [Test Structure (Arrange-Act-Assert)](#test-structure-arrange-act-assert) +- [Test Naming Conventions](#test-naming-conventions) +- [Mocking Patterns](#mocking-patterns) +- [Property-Based Testing](#property-based-testing) +- [Integration Testing](#integration-testing) +- [Test Isolation](#test-isolation) +- [Test Anti-Patterns](#test-anti-patterns) + +## Test Structure (Arrange-Act-Assert) + +Every test follows the same three-phase pattern regardless of language: + +``` +// Arrange: Set up test data and preconditions +// Act: Perform the action being tested +// Assert: Verify the outcome +``` + +- [ ] Each test has exactly one reason to fail +- [ ] Test name describes expected behavior, not implementation +- [ ] Arrange phase uses factory functions, not inline construction +- [ ] Assert phase checks outcomes, not intermediate state + +## Test Naming Conventions + +``` +Pattern: [unit]_[expected_behavior]_[condition] + +Rust: #[test] fn parse_config_returns_error_when_file_missing() +Python: def test_parse_config_returns_error_when_file_missing(): +Go: func TestParseConfig_ReturnsError_WhenFileMissing(t *testing.T) +TS: it('returns error when config file is missing', () => {}) +``` + +- [ ] Names read as sentences describing behavior +- [ ] No `test1`, `test_thing`, or `it_works` names +- [ ] Failure message makes the bug obvious without reading test code + +## Mocking Patterns + +### Mock at Boundaries Only + +``` +Mock these: Don't mock these: ++-- Database calls +-- Internal utility functions ++-- HTTP/gRPC clients +-- Business logic ++-- File system operations +-- Data transformations ++-- External API calls +-- Validation functions ++-- Clock/time (when needed) +-- Pure functions +``` + +- [ ] Mocks verify interactions at system boundaries, not internal calls +- [ ] Each mock has explicit expectations (not just "was called") +- [ ] Mock setup is extracted into helpers when reused across tests +- [ ] Prefer fakes (in-memory 
implementations) over mocks for complex interfaces + +### Language-Specific Patterns + +| Language | Mock Library | Injection Pattern | +|----------|-------------|-------------------| +| Rust | `mockall`, test modules | Trait objects, generics | +| Python | `unittest.mock`, `pytest-mock` | Dependency injection, `monkeypatch` | +| Go | `gomock`, `testify/mock` | Interface parameters | +| TypeScript | `jest.fn()`, `vitest` | Constructor injection, module mocks | + +## Property-Based Testing + +Use when: inputs have wide ranges, edge cases are non-obvious, or round-trip invariants exist. + +- [ ] Encode invariants as properties (e.g., `serialize(deserialize(x)) == x`) +- [ ] Test commutativity, associativity, idempotency where applicable +- [ ] Use shrinking to find minimal failing cases +- [ ] Combine with example-based tests for known edge cases + +| Language | Library | +|----------|---------| +| Rust | `proptest`, `quickcheck` | +| Python | `hypothesis` | +| Go | `gopter`, `rapid` | +| TypeScript | `fast-check` | + +## Integration Testing + +- [ ] Tests use real dependencies (database, filesystem) where practical +- [ ] Each test creates and tears down its own state (no shared fixtures) +- [ ] Tests run in parallel-safe isolation (unique DB schemas, temp dirs) +- [ ] Network calls to external services are recorded/replayed (VCR pattern) +- [ ] CI runs integration tests in a separate stage from unit tests + +### Database Testing Patterns + +- [ ] Use transactions that roll back after each test (fastest isolation) +- [ ] Or: create per-test schemas/databases and drop after +- [ ] Seed minimal data per test, not shared global fixtures +- [ ] Assert on query results, not SQL strings + +## Test Isolation + +- [ ] No test depends on another test's execution or ordering +- [ ] No shared mutable state between tests (global variables, singletons) +- [ ] Temp files use unique directories per test, cleaned up after +- [ ] Environment variables restored after tests that 
modify them +- [ ] Parallel execution is the default; serial only when justified + +## Test Anti-Patterns + +| Anti-Pattern | Problem | Better Approach | +|---|---|---| +| Testing implementation details | Breaks on refactor | Test inputs/outputs only | +| Snapshot everything | No one reviews snapshot diffs | Assert specific values | +| Shared mutable state | Tests pollute each other | Setup/teardown per test | +| Testing third-party code | Wastes time, not your bug | Mock the boundary | +| `skip`/`ignore` permanently | Dead code hiding bugs | Remove or fix the test | +| Overly broad assertions | Doesn't catch regressions | Be specific about expectations | +| No async error handling | Swallowed errors, false passes | Always await async tests | +| Flaky time-dependent tests | Random CI failures | Use fake clocks, not `sleep` | +| Giant test fixtures | Hard to understand failures | Minimal data per test | +| Testing private internals | Couples tests to structure | Test through public API | From 0e43cdb57b0cb330149da85d40e3b3fc9437f935 Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:50:19 +0800 Subject: [PATCH 05/13] feat: add API and interface design skill Contract-first interface design skill covering REST APIs, CLI tools, libraries, RPC, and event schemas. Includes Hyrum's Law, idempotency, and boundary validation patterns. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/flow-code-api-design/SKILL.md | 183 +++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 skills/flow-code-api-design/SKILL.md diff --git a/skills/flow-code-api-design/SKILL.md b/skills/flow-code-api-design/SKILL.md new file mode 100644 index 00000000..89a5e7e9 --- /dev/null +++ b/skills/flow-code-api-design/SKILL.md @@ -0,0 +1,183 @@ +--- +name: flow-code-api-design +description: "Use when designing or modifying APIs, module boundaries, CLI interfaces, library public surfaces, RPC contracts, or event schemas" +--- + +# API and Interface Design + +## Overview + +Contract-first interface design: define the interface before implementing it, then enforce it at boundaries. Applicable to REST APIs, CLI tools, library APIs, RPC services, event schemas, and module boundaries. Good interfaces make the right thing easy and the wrong thing hard. + +## When to Use + +- Designing a new API, CLI command, library public surface, or RPC contract +- Changing an existing public interface (adding fields, modifying behavior) +- Defining cross-team or cross-module contracts +- Establishing event schemas or message formats +- Creating module boundaries within a codebase + +**When NOT to use:** +- Internal implementation details behind a stable interface +- Private functions within a module +- Refactoring internals without changing the public surface +- If debugging an API issue, use the `flow-code-debug` skill instead + +## Core Process + +### Phase 1: Define the Contract First + +Design the interface before writing any implementation. The contract is the spec. + +1. **Identify consumers** -- who calls this interface? Other services, CLI users, library consumers, event subscribers? +2. **Define input/output shapes** -- what goes in, what comes out. Be explicit about types, optionality, and defaults. +3. **Specify error semantics** -- every interface has failure modes. Define them upfront. 
+
+```
+Example contract (stack-agnostic pseudocode):
+
+  interface TaskStore:
+    create(input: CreateTaskInput) -> Task | ValidationError
+    get(id: string) -> Task | NotFoundError
+    list(filter: Filter, page: Page) -> PaginatedResult
+    update(id: string, patch: Partial<Task>) -> Task | NotFoundError | ValidationError
+    delete(id: string) -> void // idempotent: succeeds even if already deleted
+```
+
+For CLI tools, the contract is the command signature:
+```bash
+# Contract: flowctl task create --title <title> [--domain <domain>] [--files <files>]
+# Output: JSON with { id, title, status } on success
+# Exit code: 0 success, 1 validation error, 2 not found
+```
+
+For event schemas:
+```
+Event: task.completed
+  payload: { task_id: string, completed_at: timestamp, duration_seconds: int }
+  guarantees: at-least-once delivery, idempotent consumers expected
+```
+
+### Phase 2: Apply Hyrum's Law Awareness
+
+> With a sufficient number of users, all observable behaviors become de facto contracts.
+
+Every public behavior -- including undocumented quirks, error message text, ordering, and timing -- becomes a commitment once consumers depend on it.
+
+Design implications:
+- Be intentional about what you expose. If consumers can observe it, they will depend on it.
+- Do not leak implementation details through the interface (internal IDs, database structure, stack traces).
+- Treat error message formats as part of the contract -- consumers parse them.
+- For CLI tools: exit codes, output format (JSON vs text), and flag names are all contract surface.
+ +### Phase 3: Design Error Semantics + +Pick one error strategy and use it consistently across the entire interface: + +**Structured errors (recommended):** +``` +Error shape (any transport): + code: string // Machine-readable: "VALIDATION_ERROR", "NOT_FOUND" + message: string // Human-readable: "Title is required" + details?: any // Additional context when helpful + +Transport mapping: + REST: code -> HTTP status (400, 404, 409, 422, 500) + CLI: code -> exit code + JSON stderr + RPC: code -> status enum + metadata + Event: code -> dead-letter reason +``` + +Do not mix patterns. If some operations throw, others return null, and others return `{ error }` -- consumers cannot predict behavior. + +### Phase 4: Validate at Boundaries + +Trust internal code. Validate where external input enters the system: + +**Where validation belongs:** +- API route handlers (user input) +- CLI argument parsing (user input) +- External service response parsing (third-party data -- always untrusted) +- Event/message deserialization (cross-service data) +- Environment variable loading (configuration) + +**Where validation does NOT belong:** +- Between internal functions that share type contracts +- In utility functions called by already-validated code +- On data that just came from your own database + +### Phase 5: Plan for Evolution + +Design interfaces that can grow without breaking consumers: + +1. **Prefer addition over modification** -- new optional fields, new endpoints/commands, new event types. +2. **Never remove or change existing fields** -- every consumer becomes a constraint. +3. **Version when breaking changes are unavoidable** -- but prefer extension first. +4. **Use the One-Version Rule** -- avoid forcing consumers to choose between multiple versions. + +For CLI tools: new flags default to off, new subcommands are additive, output format changes require `--format` flags. 
+ +### Phase 6: Design for Idempotency + +Idempotency applies beyond HTTP -- CLI operations, event handlers, and RPC calls all benefit: + +- **DELETE/remove operations**: succeed even if resource already gone. +- **CLI operations**: `flowctl lock --task T1 --files f.py` is safe to call twice. +- **Event handlers**: processing the same event twice produces the same result. +- **Create with client ID**: client provides an idempotency key; server deduplicates. + +``` +Idempotency decision: + Is the operation naturally idempotent (GET, PUT, DELETE)? -> No extra work. + Is it a create/mutation? -> Require idempotency key or make it upsert. + Is it an event handler? -> Track processed event IDs or make handler pure. +``` + +### Phase 7: Document the Contract + +The contract must be committed alongside the implementation: + +- Type definitions or schema files (protobuf, JSON Schema, OpenAPI, CLI --help) +- Error code catalog (all possible error codes with descriptions) +- Examples of success and failure responses +- Migration notes for any changes to existing interfaces + +Reference `references/code-review-checklist.md` for review patterns -- particularly the Architecture and Correctness sections when reviewing API changes. + +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "I'll design the API after implementation" | API shapes emerge from implementation but don't match consumer needs. Contract-first catches mismatches before code exists. | +| "Internal APIs don't need contracts" | Hyrum's Law applies to internal APIs too. Internal consumers are still consumers -- contracts prevent coupling and enable parallel work. | +| "We can always change it later" | Every consumer becomes a constraint. The cost of change grows exponentially with adoption. | +| "The code is the documentation" | Consumers shouldn't need to read implementation to use your API. The contract is the interface, not the code behind it. 
| +| "Edge cases can wait" | Edge cases in APIs become permanent undefined behavior. Consumers will discover them and depend on whatever happens. | +| "We don't need error codes yet" | Without structured errors from day one, consumers parse message strings. Changing error format later breaks every consumer. | +| "This CLI is just for internal use" | Internal CLI tools accumulate scripts that depend on exact output format. Same contract discipline applies. | +| "Versioning is overkill" | Breaking changes without versioning break consumers silently. Design for extension from the start. | + +## Red Flags + +- Interface returns different shapes depending on conditions (inconsistent contract) +- Error formats differ across operations in the same API +- Validation scattered throughout internal code instead of at boundaries +- Breaking changes to existing fields (type changes, removals, renamed flags) +- List/search operations without pagination or size limits +- CLI commands with unparseable free-text output instead of structured (JSON) output +- No idempotency story for mutating operations +- Implementation details leaking through the interface (database IDs, internal error traces) +- Contract defined only in code comments, not in types or schema files + +## Verification + +After designing or modifying an interface: + +- [ ] Contract defined before implementation (types, schema, or CLI signature committed first) +- [ ] Every operation has explicit input and output shapes with typed errors +- [ ] Error responses follow a single consistent format across all operations +- [ ] Validation happens at system boundaries only (not scattered internally) +- [ ] New fields/flags are additive and optional (backward compatible) +- [ ] Mutating operations have an idempotency strategy +- [ ] Contract documentation committed alongside implementation +- [ ] Reviewed against `references/code-review-checklist.md` Architecture and Correctness sections From 
44bda5879bc9e9bb34fdfc1911aaaf205d80221d Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:50:31 +0800 Subject: [PATCH 06/13] feat: add CI/CD patterns skill Quality gate pipeline design, shift-left principle, feature flags, and staged rollouts. Platform-agnostic with GitHub Actions examples. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/flow-code-cicd/SKILL.md | 245 +++++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 skills/flow-code-cicd/SKILL.md diff --git a/skills/flow-code-cicd/SKILL.md b/skills/flow-code-cicd/SKILL.md new file mode 100644 index 00000000..90fa2089 --- /dev/null +++ b/skills/flow-code-cicd/SKILL.md @@ -0,0 +1,245 @@ +--- +name: flow-code-cicd +description: "Use when setting up, modifying, or troubleshooting CI/CD pipelines, quality gates, or deployment automation" +--- + +# CI/CD Patterns + +## Overview + +Automate quality gates so no change reaches production without passing verification. CI/CD is the enforcement mechanism for every other skill -- it catches what humans and agents miss, consistently on every change. Gates ordered from fastest to slowest; shift left to catch problems at the cheapest stage. + +## When to Use + +- Setting up a new project's CI pipeline +- Adding or modifying quality gates +- Configuring deployment pipelines or staged rollouts +- Integrating feature flags for deploy/release separation +- Debugging CI failures or slow pipelines +- **Especially when:** no CI exists yet, pipeline exceeds 10 minutes, deploys are manual + +**When NOT to use:** +- One-off manual deployments that won't recur +- Local-only development with no shared branches +- For test design details, see `references/testing-patterns.md` + +## Core Process + +### Phase 1: Design Quality Gate Pipeline + +Order gates from fastest to slowest. Every gate that fails stops the pipeline -- no skipping. + +``` +Change Pushed + | + v ++-------------------+ +| 1. LINT | Seconds. 
Catches style, unused vars, formatting. +| 2. TYPE CHECK | Seconds. Catches type errors statically. +| 3. UNIT TESTS | Seconds-minutes. Catches logic errors. +| 4. BUILD | Minutes. Catches compilation/bundling errors. +| 5. INTEGRATION | Minutes. Catches cross-component failures. +| 6. E2E (optional) | Minutes. Catches user-facing regressions. +| 7. SECURITY AUDIT | Minutes. Catches vulnerable deps, secrets. ++-------------------+ + | + v + Ready for review +``` + +**No gate can be skipped.** If lint fails, fix lint -- don't disable the rule. If a test fails, fix the code -- don't skip the test. + +### Phase 2: Apply Shift-Left Principle + +Catch problems as early as possible. A bug caught in linting costs minutes; the same bug caught in production costs hours. + +**Local pre-commit gate** -- run before pushing: + +```bash +$FLOWCTL guard +``` + +This runs all configured guards (lint, type-check, tests) locally before code leaves the developer's machine. CI then re-runs the same checks as a safety net, not the first line of defense. + +**Pipeline optimization rules:** +1. Cheapest checks first (lint before build, unit before integration) +2. Parallelize independent gates (lint + type-check can run simultaneously) +3. Cache dependencies aggressively +4. 
Use path filters to skip irrelevant jobs (docs-only PRs skip e2e) + +### Phase 3: Configure Pipeline + +Platform-agnostic principles with a GitHub Actions example: + +**Generic pipeline pattern** (adapt to any CI system): + +``` +trigger: pull_request, push to main +stages: + - stage: fast-checks (parallel) + jobs: [lint, type-check] + - stage: test + jobs: [unit-test, build] + depends_on: fast-checks + - stage: verify + jobs: [integration, security-audit] + depends_on: test + - stage: deploy-staging (auto on main) + depends_on: verify + - stage: deploy-production (manual gate or auto after staging) + depends_on: deploy-staging +``` + +**GitHub Actions example:** + +```yaml +# .github/workflows/ci.yml +name: CI +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Lint + run: | + # Language-specific: cargo clippy, npm run lint, ruff check, etc. + echo "Run your linter here" + + typecheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Type check + run: | + # cargo check, npx tsc --noEmit, mypy, etc. + echo "Run your type checker here" + + test: + needs: [lint, typecheck] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Unit tests + run: | + # cargo test, npm test, pytest, go test, etc. + echo "Run your tests here" + - name: Build + run: | + # cargo build --release, npm run build, go build, etc. + echo "Run your build here" + + security: + needs: [test] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Security audit + run: | + # cargo audit, npm audit, pip-audit, govulncheck + echo "Run your security audit here" +``` + +See `references/security-checklist.md` for the full security verification checklist including dependency auditing, secret scanning, and OWASP Top 10. + +### Phase 4: Configure Feature Flags + +Feature flags decouple deployment from release. Deploy code without enabling it. 
+ +``` +Feature flag lifecycle: + Create flag → Enable for testing → Canary (1%) → Staged rollout → Full rollout → Remove flag + +Flag rules: + - Every flag gets a cleanup date at creation + - Flags older than 30 days without rollout trigger review + - Dead flags (fully rolled out) are tech debt -- remove them +``` + +**Why flags matter:** +- Ship code to main without enabling it (reduces branch lifetime) +- Roll back without redeploying (disable the flag) +- Canary new features safely (1% of users first) +- A/B test behavior differences + +### Phase 5: Set Up Staged Rollouts + +Never deploy to 100% of users at once. + +``` +Staged rollout: + 1% → Monitor 15 min → errors? → Rollback + 10% → Monitor 30 min → errors? → Rollback + 50% → Monitor 1 hour → errors? → Rollback + 100% → Continue monitoring +``` + +**Rollback requirements:** +- Every deployment must be reversible within 5 minutes +- Rollback procedure documented and tested (not just "we'll figure it out") +- Rollback does not require a new build (revert to previous artifact) + +### Phase 6: Add Monitoring and Alerting + +Deployment without monitoring is flying blind. + +``` +Post-deploy checklist: + [ ] Error rate monitoring active + [ ] Latency monitoring active + [ ] Key business metrics dashboarded + [ ] Alerting configured for anomalies + [ ] Runbook exists for common failure modes +``` + +**CI pipeline health:** +- Track pipeline duration over time -- if it grows past 10 minutes, optimize +- Monitor flaky test rate -- flaky tests erode trust in CI +- Alert on pipeline failures in main branch (broken main = emergency) + +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "CI is too slow" | Slow CI means wrong gate order. Fastest checks first. Parallelize. Cache. A 5-minute pipeline prevents hours of debugging. | +| "We'll add tests to CI later" | CI without tests is just a build server. Tests are the point of CI. 
| +| "Feature flags add complexity" | Deploying untested code to everyone adds more complexity. Flags give you a kill switch. | +| "We don't need staging" | Staging catches issues that local environments can't reproduce. Network, data volume, concurrency -- staging surfaces them. | +| "Manual deploy is fine for now" | Manual processes are forgotten, skipped, and inconsistent. Automate on day one. | +| "This change is trivial, skip CI" | Trivial changes break builds. CI is fast for trivial changes anyway. | +| "The test is flaky, just re-run" | Flaky tests mask real bugs and waste everyone's time. Fix the flakiness. See `references/testing-patterns.md` for test isolation patterns. | +| "We'll figure out rollback if we need it" | You need it. Plan rollback before the first deploy, not during an outage. | + +## Red Flags + +- No CI pipeline in the project -- every shared project needs automated verification +- CI failures ignored or silenced ("it's probably flaky") +- Tests disabled in CI to make the pipeline pass +- Production deploys without staging verification +- No rollback mechanism -- deploying is a one-way door +- Secrets hardcoded in CI config files instead of a secrets manager +- Pipeline duration growing unchecked past 10 minutes +- Feature flags with no cleanup dates accumulating as dead code +- Main branch broken for hours with no one investigating + +## Verification + +After setting up or modifying a CI/CD pipeline, confirm: + +- [ ] All quality gates present and ordered fastest-to-slowest (lint, types, tests, build, integration, security) +- [ ] `$FLOWCTL guard` configured as local pre-commit gate +- [ ] Pipeline runs on every PR and push to main +- [ ] Failures block merge (branch protection or equivalent configured) +- [ ] Secrets stored in secrets manager, not in code or CI config (see `references/security-checklist.md`) +- [ ] Deployment has a tested rollback mechanism +- [ ] Pipeline completes in under 10 minutes for the standard test suite +- [ ] 
Feature flags have documented cleanup dates +- [ ] Post-deploy monitoring and alerting are active + +For performance gates in CI, see the `flow-code-performance` skill. +For test design and patterns, see `references/testing-patterns.md`. From 8ad20e3b78fc4ad32bf05565936f3b807f06af46 Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:50:50 +0800 Subject: [PATCH 07/13] feat: add performance methodology skill Measure-Identify-Fix-Verify-Guard workflow for systematic performance optimization. Stack-agnostic with multi-language profiling examples. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/flow-code-performance/SKILL.md | 240 ++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 skills/flow-code-performance/SKILL.md diff --git a/skills/flow-code-performance/SKILL.md b/skills/flow-code-performance/SKILL.md new file mode 100644 index 00000000..fbbb0a04 --- /dev/null +++ b/skills/flow-code-performance/SKILL.md @@ -0,0 +1,240 @@ +--- +name: flow-code-performance +description: "Use when investigating performance issues, optimizing hot paths, adding benchmarks, or establishing performance regression guards" +--- + +# Performance Methodology + +## Overview + +Systematic performance optimization: measure first, then fix. Never optimize without a baseline. Intuition about bottlenecks is wrong more often than it is right -- profiling data is the only reliable guide. Every optimization must be measured before and after, and guarded against regression. 
+ +## When to Use + +- Performance requirements exist in the spec (latency budgets, throughput SLAs) +- Users or monitoring report slow behavior +- Adding benchmarks to hot paths or critical operations +- Pre-release performance check or audit +- Investigating a suspected performance regression +- Optimizing build times, test suite duration, or CI pipeline speed + +**When NOT to use:** +- Premature optimization -- no evidence of a problem exists yet +- No baseline measurements exist (measure first, then come back) +- Micro-optimizing code that isn't on the hot path +- "Making it faster" without a target metric to hit + +## Core Process + +``` +1. MEASURE --> Establish baseline with profiling tools +2. IDENTIFY --> Find the actual bottleneck (profiler, not intuition) +3. FIX --> Apply targeted fix to the measured bottleneck only +4. VERIFY --> Re-measure. If not measurably faster, revert +5. GUARD --> Add regression test/benchmark to prevent regression +``` + +### Step 1: Measure + +**No baseline = no optimization.** Before touching any code, capture reproducible numbers. + +#### Multi-Stack Profiling Tools + +| Language | Profiler | Benchmark Tool | +|----------|----------|----------------| +| Rust | `cargo flamegraph`, `perf`, `flamegraph` | `criterion`, `cargo bench` | +| Python | `cProfile`, `py-spy` | `pytest-benchmark`, `timeit` | +| Go | `pprof` (built-in) | `go test -bench`, `testing.B` | +| General | `time`, `perf stat`, `hyperfine` | Custom harness with warm-up runs | + +#### What to Capture + +```bash +# Rust: criterion benchmark with baseline +cd project && cargo bench -- --save-baseline before + +# Python: profile a specific function +python -m cProfile -s cumtime script.py > profile_before.txt + +# Go: CPU profile +go test -bench=BenchmarkHotPath -cpuprofile=cpu_before.prof ./... + +# General: wall-clock timing with statistical rigor +hyperfine --warmup 3 './my_command' +``` + +Record these numbers in a file or commit message. You need them for Step 4. 
+ +### Step 2: Identify + +**The bottleneck is rarely where you think.** Use profiling data to find it. + +``` +Where is time spent? +|-- CPU-bound +| |-- Hot loop? --> Check algorithmic complexity, data structures +| |-- Redundant computation? --> Cache or hoist invariants +| +-- Serialized work? --> Parallelize across cores +|-- Memory-bound +| |-- Excessive allocation? --> Reuse buffers, pre-allocate +| |-- Cache thrashing? --> Improve data locality +| +-- Unbounded growth? --> Add size limits, eviction +|-- I/O-bound +| |-- Database? --> Check queries, indexes, N+1 patterns +| |-- Network? --> Batch requests, connection pooling +| |-- Disk? --> Buffer I/O, async operations ++-- Concurrency + |-- Lock contention? --> Fine-grained locks, lock-free structures + |-- Thread starvation? --> Tune pool sizes + +-- Synchronous blocking in async? --> Use non-blocking I/O +``` + +#### Reading Profiler Output + +```bash +# Rust: generate flamegraph +cargo flamegraph --bin my_binary -- --my-args +# Look for wide bars (time) and tall stacks (deep call chains) + +# Python: interactive profile browser +python -m cProfile -o profile.prof script.py +python -m pstats profile.prof +# sort by cumulative time: sort cumtime + +# Go: interactive pprof +go tool pprof cpu_before.prof +# top10, list FunctionName, web (visualize) +``` + +### Step 3: Fix + +**Address the measured bottleneck only.** Do not "clean up" nearby code. Do not optimize multiple things at once. 
+ +Common fixes by bottleneck type: + +**Algorithmic:** +- Replace O(n^2) with O(n log n) or O(n) where possible +- Use appropriate data structures (hashmap for lookup, sorted array for range queries) +- Hoist loop invariants, eliminate redundant computation + +**Memory:** +- Pre-allocate buffers to known sizes +- Reuse allocations in hot loops (object pools, arena allocators) +- Stream large data instead of loading entirely into memory + +**I/O and Database:** +- Add indexes for filtered/sorted columns +- Replace N+1 queries with joins or batch loading +- Paginate unbounded queries +- Use connection pooling + +**Concurrency:** +- Replace global locks with fine-grained or reader-writer locks +- Use lock-free data structures for contended hot paths +- Move CPU-bound work to dedicated threads/processes + +### Step 4: Verify + +**Re-measure with the same methodology as Step 1.** Compare directly. + +```bash +# Rust: compare against saved baseline +cargo bench -- --baseline before + +# Python: compare profiles +python -m cProfile -s cumtime script.py > profile_after.txt +diff profile_before.txt profile_after.txt + +# Go: compare benchmarks +go test -bench=BenchmarkHotPath -count=5 ./... > after.txt +benchstat before.txt after.txt + +# General: side-by-side comparison +hyperfine --warmup 3 './old_command' './new_command' +``` + +**Decision gate:** +- Measurably faster (statistically significant) --> proceed to Step 5 +- Not measurably faster --> **revert the change**. It added complexity without benefit +- Faster but broke correctness --> **revert**. Correctness always wins + +### Step 5: Guard + +**Add a regression guard so the improvement sticks.** + +```bash +# Rust: criterion benchmark checked in CI +# Add benchmark to benches/ directory, run in CI pipeline + +# Python: pytest-benchmark with --benchmark-autosave +pytest --benchmark-autosave tests/benchmarks/ + +# Go: benchmark test committed alongside unit tests +# go test -bench=. 
runs automatically with test suite + +# General: performance budget in CI +# Fail the build if response time > threshold +``` + +Guard types (choose at least one): +- **Benchmark test**: committed to repo, runs in CI, fails on regression +- **Performance budget**: threshold in CI config (e.g., "p95 < 200ms") +- **Monitoring alert**: production metric alert for latency/throughput +- **Size budget**: binary/artifact size threshold + +For CI performance gates, see the `flow-code-cicd` skill. + +## Amdahl's Law Reminder + +``` +Speedup = 1 / ((1 - P) + P/S) + +P = fraction of time in the bottleneck +S = speedup factor of your optimization + +Example: Bottleneck is 10% of runtime. You make it 10x faster. +Speedup = 1 / (0.9 + 0.1/10) = 1 / 0.91 = 1.10x (only 10% faster!) + +Lesson: Only the dominant bottleneck matters. A 2x speedup on 80% of +runtime beats a 100x speedup on 5% of runtime. +``` + +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "I know where the bottleneck is" | Profile first. Intuition is wrong >50% of the time. The actual bottleneck surprises even experienced engineers. | +| "This optimization is obvious" | Obvious optimizations often aren't. Compilers and runtimes already optimize the obvious stuff. Measure to confirm. | +| "We don't have time to benchmark" | You don't have time to optimize blindly either. A 10-minute benchmark saves hours of wasted effort on the wrong bottleneck. | +| "It's fast enough" | Without a baseline, you don't know what "enough" means. Define the target, measure against it. | +| "Let's optimize everything" | Amdahl's Law: only the bottleneck matters. Optimizing non-bottleneck code adds complexity for zero user-visible improvement. | +| "We'll add benchmarks later" | Later never comes. The regression ships silently, and you re-discover it in production under load. | +| "The fix is small, no need to re-measure" | Small fixes can have surprising effects (positive or negative). 
The measurement takes minutes. Just do it. | + +## Red Flags + +- Optimizing without profiling data to justify the target +- Multiple optimizations applied simultaneously (can't isolate which helped) +- No before/after numbers in the commit message or PR description +- Optimization that makes code significantly harder to read without measured justification +- "Performance refactor" that changes architecture without baseline measurements +- Benchmarks that don't use warm-up runs or statistical methods (single-run timings are noise) +- Reverting to debug builds or removing optimizations "temporarily" for debugging + +## Verification + +After any performance-related change, confirm: + +- [ ] Baseline measurements captured before optimization (specific numbers, not "it felt slow") +- [ ] Profiler identified the actual bottleneck (not assumed from code reading) +- [ ] After-measurements show statistically significant improvement over baseline +- [ ] Regression guard added (benchmark test, CI budget, or monitoring alert) +- [ ] Optimization did not break correctness (full test suite passes) +- [ ] Before/after numbers documented in commit message or PR description +- [ ] No unrelated "while I'm here" optimizations bundled in the same change + +## References + +- Detailed checklist: `references/performance-checklist.md` +- For CI performance gates, see the `flow-code-cicd` skill +- For debugging performance issues, see the `flow-code-debug` skill From 3de2d168f9ba43b72bd348b7e4d502d9ed04e764 Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 09:54:26 +0800 Subject: [PATCH 08/13] feat: audit and strengthen anti-rationalization tables across all skills Reviewed 13 skills for Common Rationalizations and Red Flags coverage. 10 skills already met the >=5 rationalizations, >=3 red flags minimum. Added both sections to the 3 skills that were missing them entirely: plan-review (7 rationalizations, 6 red flags), brainstorm (7, 6), and auto-improve (7, 6). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/flow-code-auto-improve/SKILL.md | 21 +++++++++++++++++++++ skills/flow-code-brainstorm/SKILL.md | 21 +++++++++++++++++++++ skills/flow-code-plan-review/SKILL.md | 21 +++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/skills/flow-code-auto-improve/SKILL.md b/skills/flow-code-auto-improve/SKILL.md index db54aea9..2b4e2be6 100644 --- a/skills/flow-code-auto-improve/SKILL.md +++ b/skills/flow-code-auto-improve/SKILL.md @@ -280,6 +280,27 @@ scripts/auto-improve/auto-improve.sh If `--watch` was passed, add `--watch` flag. +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "I know what to improve without analysis" | Intuition-driven improvements miss the highest-impact targets. Analysis finds the hotspots you don't see. | +| "All improvements are equally valuable" | Impact varies by orders of magnitude. Rank by measured impact, not gut feeling. | +| "Experiments are overhead" | Unverified improvements are guesses. A 2-minute before/after measurement proves the change actually helped. | +| "Guard passes, so the change is safe" | Guards catch regressions, not quality improvements. Passing guards means you didn't break anything — not that you improved anything. | +| "Let's improve everything at once" | Batch changes can't be isolated. One improvement per commit lets you revert the one that caused a regression. | +| "The codebase is too messy to measure" | That's exactly why you need baselines. Measurement reveals which mess matters most. | +| "This refactor is obviously better" | "Obviously better" without metrics is opinion. Measure readability, performance, or maintainability to confirm. 
| + +## Red Flags + +- Improvement commits with no before/after metrics in the message +- Multiple unrelated improvements in a single commit +- Auto-improve loop running without guard command configured (no safety net) +- program.md unchanged across many iterations (stale focus, diminishing returns) +- Improvements that increase code complexity without measured benefit +- Reverting to fix regressions introduced by "improvements" + ## Notes - First run auto-scaffolds `scripts/auto-improve/`. Subsequent runs reuse existing program.md (preserves user edits). diff --git a/skills/flow-code-brainstorm/SKILL.md b/skills/flow-code-brainstorm/SKILL.md index 676bfe30..00aa61aa 100644 --- a/skills/flow-code-brainstorm/SKILL.md +++ b/skills/flow-code-brainstorm/SKILL.md @@ -160,3 +160,24 @@ Requirements written to .flow/specs/-requirements.md Next step: Run /flow-code:plan .flow/specs/-requirements.md ``` + +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "I already know what to build" | Premature certainty is the most expensive mistake. Brainstorming surfaces assumptions you didn't know you had. | +| "Brainstorming is just talking, not real work" | Brainstorming produces the requirements doc that drives everything downstream. Skip it and you plan against assumptions. | +| "We don't have time to explore alternatives" | Exploring 3 approaches for 15 minutes is cheaper than rebuilding after choosing wrong on day one. | +| "The first idea is usually right" | First ideas are anchored on recent experience. Pressure-testing reveals options that outperform the obvious choice. | +| "Requirements are already in the ticket" | Tickets describe what someone wants, not what should be built. Brainstorming translates desire into actionable constraints. | +| "Let's just start and iterate" | Iteration without direction is wandering. Brainstorming sets the constraints that make iteration productive. 
| +| "This is too small to brainstorm" | Small scope doesn't mean small risk. A 5-minute pressure test on a "simple" feature often reveals hidden complexity. | + +## Red Flags + +- Jumping straight to /flow-code:plan without exploring requirements +- Only one approach considered (no alternatives generated or evaluated) +- Requirements doc has zero constraints or non-goals listed +- "Open Questions" section is empty (every problem has unknowns) +- Brainstorm output restates the original request without adding new insight +- User's actual problem never identified (solution proposed without understanding need) diff --git a/skills/flow-code-plan-review/SKILL.md b/skills/flow-code-plan-review/SKILL.md index dc883b08..bb48b6ba 100644 --- a/skills/flow-code-plan-review/SKILL.md +++ b/skills/flow-code-plan-review/SKILL.md @@ -200,3 +200,24 @@ $FLOWCTL checkpoint restore --epic --json ``` **CRITICAL**: For RP, re-reviews must stay in the SAME chat so reviewer has context. Only use `--new-chat` on the FIRST review. + +## Common Rationalizations + +| Excuse | Reality | +|--------|---------| +| "The plan looks reasonable, ship it" | Reasonable-looking plans fail at implementation. Review forces you to find the gaps before code exists. | +| "We already discussed this verbally" | Verbal agreement evaporates. Written review catches assumptions that felt obvious in conversation but aren't. | +| "Reviewer doesn't know our codebase" | External perspective catches blind spots. Codebase familiarity causes pattern blindness — fresh eyes find structural issues. | +| "Review is blocking progress" | Review prevents rework. A 30-minute review saves days of implementing the wrong design. | +| "The spec is too detailed to review" | Over-detailed specs hide weak architecture behind volume. If it's too complex to review, it's too complex to implement. | +| "Minor plan issues, we'll fix during implementation" | Plan issues compound during implementation. 
A wrong assumption in the spec becomes wrong code in every task. | +| "Just one more iteration and it's perfect" | Diminishing returns are real. Hit the circuit breaker, ship what's good enough, and capture remaining concerns as gaps. | + +## Red Flags + +- Plan approved without any reviewer reading the actual task specs +- SHIP verdict with zero questions asked (rubber-stamp review) +- Review feedback ignored because "we know better" +- Same structural issue found in implementation that was present in the plan (review missed it) +- Plan-review skipped because "we're behind schedule" +- Reviewer only checked formatting, not technical feasibility From 5799ccab9c3496ccf1edfa8312e54612b3cd65c9 Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 10:08:08 +0800 Subject: [PATCH 09/13] refactor: move reference checklists into skill directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each checklist now lives inside the skill that references it, matching the established pattern (django, browser skills). This ensures Skill tool path resolution works correctly. 
- testing-patterns.md, security-checklist.md → skills/flow-code-cicd/references/ - performance-checklist.md → skills/flow-code-performance/references/ - code-review-checklist.md → skills/flow-code-api-design/references/ - adr-template.md stays in root references/ (not skill-specific) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flow-code-api-design/references}/code-review-checklist.md | 0 .../flow-code-cicd/references}/security-checklist.md | 0 .../flow-code-cicd/references}/testing-patterns.md | 0 .../flow-code-performance/references}/performance-checklist.md | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {references => skills/flow-code-api-design/references}/code-review-checklist.md (100%) rename {references => skills/flow-code-cicd/references}/security-checklist.md (100%) rename {references => skills/flow-code-cicd/references}/testing-patterns.md (100%) rename {references => skills/flow-code-performance/references}/performance-checklist.md (100%) diff --git a/references/code-review-checklist.md b/skills/flow-code-api-design/references/code-review-checklist.md similarity index 100% rename from references/code-review-checklist.md rename to skills/flow-code-api-design/references/code-review-checklist.md diff --git a/references/security-checklist.md b/skills/flow-code-cicd/references/security-checklist.md similarity index 100% rename from references/security-checklist.md rename to skills/flow-code-cicd/references/security-checklist.md diff --git a/references/testing-patterns.md b/skills/flow-code-cicd/references/testing-patterns.md similarity index 100% rename from references/testing-patterns.md rename to skills/flow-code-cicd/references/testing-patterns.md diff --git a/references/performance-checklist.md b/skills/flow-code-performance/references/performance-checklist.md similarity index 100% rename from references/performance-checklist.md rename to skills/flow-code-performance/references/performance-checklist.md From 
efb1edf7d8215162c0ef018a3e82da370bbffa34 Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 10:42:15 +0800 Subject: [PATCH 10/13] feat(flowctl-db): add skills table and skill module for semantic routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add skills table (name, description, embedding) with vector index for semantic skill matching. SkillRepo provides upsert() and match_skills() using cosine similarity via BGE-small embeddings. Reuses embed_one()/ensure_embedder() from memory module — zero duplication. Co-Authored-By: Claude Opus 4.6 (1M context) --- flowctl/crates/flowctl-db/src/lib.rs | 2 + flowctl/crates/flowctl-db/src/memory.rs | 6 +- flowctl/crates/flowctl-db/src/pool.rs | 10 +- flowctl/crates/flowctl-db/src/schema.sql | 15 ++ flowctl/crates/flowctl-db/src/skill.rs | 260 +++++++++++++++++++++++ 5 files changed, 289 insertions(+), 4 deletions(-) create mode 100644 flowctl/crates/flowctl-db/src/skill.rs diff --git a/flowctl/crates/flowctl-db/src/lib.rs b/flowctl/crates/flowctl-db/src/lib.rs index abe78d3e..bd836d61 100644 --- a/flowctl/crates/flowctl-db/src/lib.rs +++ b/flowctl/crates/flowctl-db/src/lib.rs @@ -24,12 +24,14 @@ pub mod memory; pub mod metrics; pub mod pool; pub mod repo; +pub mod skill; pub use error::DbError; pub use indexer::{reindex, ReindexResult}; pub use events::{EventLog, TaskTokenSummary, TokenRecord, TokenUsageRow}; pub use memory::{MemoryEntry, MemoryFilter, MemoryRepo}; pub use metrics::StatsQuery; +pub use skill::{SkillEntry, SkillMatch, SkillRepo}; pub use pool::{cleanup, open_async, open_memory_async, resolve_db_path, resolve_libsql_path, resolve_state_dir}; pub use repo::{ DepRepo, EpicRepo, EventRepo, EventRow, EvidenceRepo, FileLockRepo, FileOwnershipRepo, diff --git a/flowctl/crates/flowctl-db/src/memory.rs b/flowctl/crates/flowctl-db/src/memory.rs index d2bdee81..d952c739 100644 --- a/flowctl/crates/flowctl-db/src/memory.rs +++ b/flowctl/crates/flowctl-db/src/memory.rs 
@@ -88,7 +88,7 @@ static EMBEDDER: OnceCell, String>> = OnceCell::cons /// model (~130MB) via fastembed; subsequent calls return the cached /// instance. Initialization runs on a blocking thread because fastembed /// performs synchronous file I/O. -async fn ensure_embedder() -> Result<(), DbError> { +pub(crate) async fn ensure_embedder() -> Result<(), DbError> { let res = EMBEDDER .get_or_init(|| async { match tokio::task::spawn_blocking(|| { @@ -109,7 +109,7 @@ async fn ensure_embedder() -> Result<(), DbError> { } /// Embed a single passage into a 384-dim vector. -async fn embed_one(text: &str) -> Result, DbError> { +pub(crate) async fn embed_one(text: &str) -> Result, DbError> { ensure_embedder().await?; let text = text.to_string(); let result = tokio::task::spawn_blocking(move || { @@ -132,7 +132,7 @@ async fn embed_one(text: &str) -> Result, DbError> { } /// Convert a `Vec` into a libSQL `vector32()` literal string. -fn vec_to_literal(v: &[f32]) -> String { +pub(crate) fn vec_to_literal(v: &[f32]) -> String { let parts: Vec = v.iter().map(std::string::ToString::to_string).collect(); format!("[{}]", parts.join(",")) } diff --git a/flowctl/crates/flowctl-db/src/pool.rs b/flowctl/crates/flowctl-db/src/pool.rs index 70147e1f..0e95390e 100644 --- a/flowctl/crates/flowctl-db/src/pool.rs +++ b/flowctl/crates/flowctl-db/src/pool.rs @@ -98,7 +98,7 @@ async fn apply_schema(conn: &Connection) -> Result<(), DbError> { .await .map_err(|e| DbError::Schema(format!("reverse deps backfill failed: {e}")))?; - // Try to create the vector index (requires libSQL server extensions). + // Try to create vector indexes (requires libSQL server extensions). // Gracefully degrade if not available (embedded/core mode). 
let _ = conn .execute( @@ -107,6 +107,13 @@ async fn apply_schema(conn: &Connection) -> Result<(), DbError> { ) .await; + let _ = conn + .execute( + "CREATE INDEX IF NOT EXISTS skills_emb_idx ON skills(libsql_vector_idx(embedding))", + (), + ) + .await; + Ok(()) } @@ -215,6 +222,7 @@ mod tests { "daily_rollup", "monthly_rollup", "memory", + "skills", ] { assert!( tables.contains(&expected.to_string()), diff --git a/flowctl/crates/flowctl-db/src/schema.sql b/flowctl/crates/flowctl-db/src/schema.sql index 8ba8b5ac..3ef5ea74 100644 --- a/flowctl/crates/flowctl-db/src/schema.sql +++ b/flowctl/crates/flowctl-db/src/schema.sql @@ -217,6 +217,21 @@ CREATE TABLE IF NOT EXISTS memory ( embedding BLOB ); +-- ── Skills with native vector embedding (BGE-small, 384-dim) ─────── + +CREATE TABLE IF NOT EXISTS skills ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL UNIQUE, + description TEXT NOT NULL, + plugin_path TEXT, + updated_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now')), + embedding BLOB +); + +-- Native libSQL vector index for semantic skill matching +-- NOTE: Applied separately in pool.rs with graceful degradation. +-- CREATE INDEX IF NOT EXISTS skills_emb_idx ON skills(libsql_vector_idx(embedding)); + -- ── Indexes ───────────────────────────────────────────────────────── CREATE INDEX IF NOT EXISTS idx_gaps_epic ON gaps(epic_id); diff --git a/flowctl/crates/flowctl-db/src/skill.rs b/flowctl/crates/flowctl-db/src/skill.rs new file mode 100644 index 00000000..c4428f9b --- /dev/null +++ b/flowctl/crates/flowctl-db/src/skill.rs @@ -0,0 +1,260 @@ +//! Skill repository with native libSQL vector search. +//! +//! Stores skill metadata (name, description, plugin path) with a 384-dim +//! BGE-small embedding for semantic matching via `vector_top_k`. +//! +//! Reuses `embed_one()`, `ensure_embedder()`, and `vec_to_literal()` from +//! the memory module -- zero duplication. 
+ +use libsql::{params, Connection}; + +use crate::error::DbError; +use crate::memory::{embed_one, vec_to_literal}; + +// ── Types ─────────────────────────────────────────────────────────── + +/// A skill match result from semantic search. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct SkillMatch { + pub name: String, + pub description: String, + pub score: f64, +} + +/// A registered skill entry. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct SkillEntry { + pub id: i64, + pub name: String, + pub description: String, + pub plugin_path: Option, + pub updated_at: String, +} + +// ── Repository ────────────────────────────────────────────────────── + +/// Async repository for skill metadata + semantic vector search. +pub struct SkillRepo { + conn: Connection, +} + +impl SkillRepo { + pub fn new(conn: Connection) -> Self { + Self { conn } + } + + /// Insert or replace a skill. Auto-generates an embedding from + /// `description` when the embedder is available; otherwise leaves the + /// embedding NULL and logs a warning. + pub async fn upsert( + &self, + name: &str, + description: &str, + plugin_path: Option<&str>, + ) -> Result<(), DbError> { + let now = chrono::Utc::now().to_rfc3339(); + + self.conn + .execute( + "INSERT INTO skills (name, description, plugin_path, updated_at) + VALUES (?1, ?2, ?3, ?4) + ON CONFLICT(name) DO UPDATE SET + description = excluded.description, + plugin_path = excluded.plugin_path, + updated_at = excluded.updated_at", + params![ + name.to_string(), + description.to_string(), + plugin_path.map(String::from), + now, + ], + ) + .await?; + + // Attempt to embed; swallow failures (NULL embedding is fine). 
+ match embed_one(description).await { + Ok(vec) => { + let lit = vec_to_literal(&vec); + self.conn + .execute( + "UPDATE skills SET embedding = vector32(?1) WHERE name = ?2", + params![lit, name.to_string()], + ) + .await?; + } + Err(e) => { + tracing::warn!( + skill = name, + error = %e, + "embedder unavailable; skill inserted without embedding" + ); + } + } + + Ok(()) + } + + /// Semantic search: embed the query, find nearest skills via + /// `vector_top_k`, convert L2 distance to cosine similarity, and + /// filter by threshold. + /// + /// Returns `Ok(vec![])` (not an error) if the embedder or vector + /// index is unavailable -- graceful degradation. + pub async fn match_skills( + &self, + query: &str, + limit: usize, + threshold: f64, + ) -> Result, DbError> { + let vec = match embed_one(query).await { + Ok(v) => v, + Err(_) => return Ok(vec![]), + }; + let lit = vec_to_literal(&vec); + + let rows_result = self + .conn + .query( + "SELECT s.name, s.description, top.distance + FROM vector_top_k('skills_emb_idx', vector32(?1), ?2) AS top + JOIN skills s ON s.rowid = top.id", + params![lit, limit as i64], + ) + .await; + + let mut rows = match rows_result { + Ok(r) => r, + Err(_) => return Ok(vec![]), // vector index unavailable + }; + + let mut out = Vec::new(); + while let Some(row) = rows.next().await? { + let dist: f64 = row.get(2)?; + // Convert L2 distance to cosine similarity. + // For unit-norm vectors: cos_sim = 1 - (L2^2 / 2). + let score = 1.0 - (dist * dist / 2.0); + if score >= threshold { + out.push(SkillMatch { + name: row.get::(0)?, + description: row.get::(1)?, + score, + }); + } + } + Ok(out) + } + + /// List all registered skills (for debugging / introspection). + pub async fn list(&self) -> Result, DbError> { + let mut rows = self + .conn + .query( + "SELECT id, name, description, plugin_path, updated_at + FROM skills + ORDER BY name ASC", + (), + ) + .await?; + + let mut out = Vec::new(); + while let Some(row) = rows.next().await? 
{ + out.push(SkillEntry { + id: row.get::(0)?, + name: row.get::(1)?, + description: row.get::(2)?, + plugin_path: row.get::>(3)?, + updated_at: row.get::(4)?, + }); + } + Ok(out) + } +} + +// ── Tests ─────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + use crate::pool::open_memory_async; + + async fn fresh_repo() -> SkillRepo { + let (_db, conn) = open_memory_async().await.expect("open memory db"); + let _ = Box::leak(Box::new(_db)); + SkillRepo::new(conn) + } + + #[tokio::test] + async fn test_upsert_and_list() { + let repo = fresh_repo().await; + repo.upsert("plan", "Plan and design tasks", Some("/plugins/flow")) + .await + .expect("upsert"); + repo.upsert("work", "Execute implementation tasks", None) + .await + .expect("upsert"); + + let skills = repo.list().await.expect("list"); + assert_eq!(skills.len(), 2); + assert_eq!(skills[0].name, "plan"); + assert_eq!(skills[1].name, "work"); + } + + #[tokio::test] + async fn test_upsert_replaces() { + let repo = fresh_repo().await; + repo.upsert("plan", "old description", None) + .await + .expect("upsert"); + repo.upsert("plan", "new description", Some("/new/path")) + .await + .expect("upsert"); + + let skills = repo.list().await.expect("list"); + assert_eq!(skills.len(), 1); + assert_eq!(skills[0].description, "new description"); + assert_eq!(skills[0].plugin_path.as_deref(), Some("/new/path")); + } + + #[tokio::test] + async fn test_match_skills_graceful_no_index() { + // In-memory DB won't have vector index; should return empty, not error. + let repo = fresh_repo().await; + repo.upsert("plan", "Plan tasks", None) + .await + .expect("upsert"); + + let matches = repo + .match_skills("planning", 5, 0.5) + .await + .expect("match_skills should not error"); + // May be empty if embedder or index is unavailable -- that's fine. + assert!(matches.len() <= 5); + } + + /// Semantic match end-to-end. 
Gated behind `#[ignore]` because the + /// first run downloads the BGE-small model (~130MB). + #[tokio::test] + #[ignore = "requires fastembed model (~130MB); run with --ignored"] + async fn test_match_skills_semantic() { + let repo = fresh_repo().await; + repo.upsert("plan", "Design and architect implementation plans", None) + .await + .expect("upsert"); + repo.upsert("work", "Execute coding tasks and write code", None) + .await + .expect("upsert"); + repo.upsert("review", "Review code changes for quality", None) + .await + .expect("upsert"); + + let matches = repo + .match_skills("architecture design", 2, 0.3) + .await + .expect("match_skills"); + assert!(!matches.is_empty(), "expected at least one match"); + assert_eq!( + matches[0].name, "plan", + "expected 'plan' as best match for architecture query" + ); + } +} From 13fdd3b7d1cd471738679f6d5929e910812ef558 Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 10:47:17 +0800 Subject: [PATCH 11/13] feat(flowctl-cli): add skill register and skill-match commands skill register: scans skills/*/SKILL.md, extracts YAML frontmatter (name + description), embeds via BGE-small, stores in skills table. skill match: semantic search against registered skills, returns ranked results with cosine similarity scores. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flowctl-cli/src/commands/db_shim.rs | 6 + .../crates/flowctl-cli/src/commands/mod.rs | 1 + .../crates/flowctl-cli/src/commands/skill.rs | 212 ++++++++++++++++++ flowctl/crates/flowctl-cli/src/main.rs | 7 + 4 files changed, 226 insertions(+) create mode 100644 flowctl/crates/flowctl-cli/src/commands/skill.rs diff --git a/flowctl/crates/flowctl-cli/src/commands/db_shim.rs b/flowctl/crates/flowctl-cli/src/commands/db_shim.rs index b154ebfb..b9a9eed6 100644 --- a/flowctl/crates/flowctl-cli/src/commands/db_shim.rs +++ b/flowctl/crates/flowctl-cli/src/commands/db_shim.rs @@ -30,6 +30,12 @@ impl Connection { fn inner(&self) -> libsql::Connection { self.conn.clone() } + + /// Public accessor for modules that need the raw libsql connection + /// (e.g. skill commands that call async repos directly). + pub fn inner_conn(&self) -> libsql::Connection { + self.conn.clone() + } } fn block_on(fut: F) -> F::Output { diff --git a/flowctl/crates/flowctl-cli/src/commands/mod.rs b/flowctl/crates/flowctl-cli/src/commands/mod.rs index a7cbe8d6..c3593ce6 100644 --- a/flowctl/crates/flowctl-cli/src/commands/mod.rs +++ b/flowctl/crates/flowctl-cli/src/commands/mod.rs @@ -18,4 +18,5 @@ pub mod rp; pub mod stack; pub mod stats; pub mod task; +pub mod skill; pub mod workflow; diff --git a/flowctl/crates/flowctl-cli/src/commands/skill.rs b/flowctl/crates/flowctl-cli/src/commands/skill.rs new file mode 100644 index 00000000..66fffabf --- /dev/null +++ b/flowctl/crates/flowctl-cli/src/commands/skill.rs @@ -0,0 +1,212 @@ +//! Skill commands: register, match. +//! +//! `skill register` scans `skills/*/SKILL.md` files, extracts YAML +//! frontmatter (name + description), and upserts each into the DB with +//! a BGE-small embedding for semantic matching. +//! +//! `skill match` performs semantic vector search against registered +//! skills and returns ranked results. 
+ +use clap::Subcommand; +use serde::Deserialize; +use serde_json::json; + +use crate::output::{error_exit, json_output, pretty_output}; + +use super::db_shim; + +// ── CLI definition ───────────────────────────────────────────────── + +#[derive(Subcommand, Debug)] +pub enum SkillCmd { + /// Scan skills/*/SKILL.md and register into DB with embeddings. + Register { + /// Directory to scan (default: DROID_PLUGIN_ROOT or CLAUDE_PLUGIN_ROOT). + #[arg(long)] + dir: Option, + }, + /// Semantic search against registered skills. + Match { + /// Search query text. + query: String, + /// Maximum results to return. + #[arg(long, default_value = "5")] + limit: usize, + /// Minimum cosine similarity threshold. + #[arg(long, default_value = "0.75")] + threshold: f64, + }, +} + +// ── Frontmatter struct ───────────────────────────────────────────── + +#[derive(Deserialize)] +struct SkillFrontmatter { + name: String, + description: String, +} + +// ── Dispatch ─────────────────────────────────────────────────────── + +pub fn dispatch(cmd: &SkillCmd, json: bool) { + match cmd { + SkillCmd::Register { dir } => cmd_skill_register(json, dir.as_deref()), + SkillCmd::Match { + query, + limit, + threshold, + } => cmd_skill_match(json, query, *limit, *threshold), + } +} + +// ── Register ─────────────────────────────────────────────────────── + +fn cmd_skill_register(json: bool, dir: Option<&str>) { + // Resolve plugin root directory. 
+ let root = match dir { + Some(d) => std::path::PathBuf::from(d), + None => { + if let Ok(d) = std::env::var("DROID_PLUGIN_ROOT") { + std::path::PathBuf::from(d) + } else if let Ok(d) = std::env::var("CLAUDE_PLUGIN_ROOT") { + std::path::PathBuf::from(d) + } else { + error_exit("No --dir given and DROID_PLUGIN_ROOT / CLAUDE_PLUGIN_ROOT not set"); + } + } + }; + + let skills_dir = root.join("skills"); + if !skills_dir.is_dir() { + error_exit(&format!("Skills directory not found: {}", skills_dir.display())); + } + + // Walk skills/*/SKILL.md + let mut entries: Vec<(String, String, String)> = Vec::new(); // (name, description, path) + let read_dir = std::fs::read_dir(&skills_dir).unwrap_or_else(|e| { + error_exit(&format!("Cannot read {}: {e}", skills_dir.display())); + }); + + for entry in read_dir.flatten() { + if !entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false) { + continue; + } + let skill_md = entry.path().join("SKILL.md"); + if !skill_md.is_file() { + continue; + } + let content = match std::fs::read_to_string(&skill_md) { + Ok(c) => c, + Err(e) => { + eprintln!("warn: cannot read {}: {e}", skill_md.display()); + continue; + } + }; + let fm: SkillFrontmatter = + match flowctl_core::frontmatter::parse_frontmatter(&content) { + Ok(f) => f, + Err(e) => { + eprintln!( + "warn: cannot parse frontmatter in {}: {e}", + skill_md.display() + ); + continue; + } + }; + entries.push(( + fm.name, + fm.description, + skill_md.to_string_lossy().to_string(), + )); + } + + // Upsert each skill into DB. 
+ let conn = db_shim::require_db().unwrap_or_else(|e| { + error_exit(&format!("Cannot open DB: {e}")); + }); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create tokio runtime"); + + let repo = flowctl_db::skill::SkillRepo::new(conn.inner_conn()); + + for (name, desc, path) in &entries { + rt.block_on(async { + repo.upsert(name, desc, Some(path.as_str())) + .await + .unwrap_or_else(|e| { + eprintln!("warn: failed to upsert skill '{}': {e}", name); + }); + }); + } + + let skills_json: Vec = entries + .iter() + .map(|(n, d, _)| json!({"name": n, "description": d})) + .collect(); + + if json { + json_output(json!({ + "registered": entries.len(), + "skills": skills_json, + })); + } else { + pretty_output("skill_register", &format!("Registered {} skills", entries.len())); + for (name, desc, _) in &entries { + pretty_output("skill_register", &format!(" {} — {}", name, desc)); + } + } +} + +// ── Match ────────────────────────────────────────────────────────── + +fn cmd_skill_match(json: bool, query: &str, limit: usize, threshold: f64) { + let conn = db_shim::require_db().unwrap_or_else(|e| { + error_exit(&format!("Cannot open DB: {e}")); + }); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create tokio runtime"); + + let repo = flowctl_db::skill::SkillRepo::new(conn.inner_conn()); + let matches = rt.block_on(async { + repo.match_skills(query, limit, threshold) + .await + .unwrap_or_else(|e| { + error_exit(&format!("match_skills failed: {e}")); + }) + }); + + if json { + let out: Vec = matches + .iter() + .map(|m| { + json!({ + "name": m.name, + "description": m.description, + "score": (m.score * 100.0).round() / 100.0, + }) + }) + .collect(); + json_output(json!(out)); + } else { + if matches.is_empty() { + pretty_output("skill_match", "No matching skills found."); + return; + } + pretty_output( + "skill_match", + &format!(" {:<6} {:<28} {}", "Score", 
"Name", "Description"), + ); + for m in &matches { + pretty_output( + "skill_match", + &format!(" {:<6.2} {:<28} {}", m.score, m.name, m.description), + ); + } + } +} diff --git a/flowctl/crates/flowctl-cli/src/main.rs b/flowctl/crates/flowctl-cli/src/main.rs index 46126e75..d8464fc2 100644 --- a/flowctl/crates/flowctl-cli/src/main.rs +++ b/flowctl/crates/flowctl-cli/src/main.rs @@ -23,6 +23,7 @@ use commands::{ query, ralph::RalphCmd, rp::RpCmd, + skill::SkillCmd, stack::{InvariantsCmd, StackCmd}, stats::StatsCmd, task::TaskCmd, @@ -216,6 +217,11 @@ enum Commands { #[command(subcommand)] cmd: RalphCmd, }, + /// Skill registry commands (register, match). + Skill { + #[command(subcommand)] + cmd: SkillCmd, + }, /// RepoPrompt helpers. Rp { #[command(subcommand)] @@ -484,6 +490,7 @@ fn main() { Commands::Stack { cmd } => commands::stack::dispatch(&cmd, json), Commands::Invariants { cmd } => commands::stack::dispatch_invariants(&cmd, json), Commands::Ralph { cmd } => commands::ralph::dispatch(&cmd, json), + Commands::Skill { cmd } => commands::skill::dispatch(&cmd, json), Commands::Rp { cmd } => commands::rp::dispatch(&cmd, json), Commands::Codex { cmd } => commands::codex::dispatch(&cmd, json), Commands::Hook { cmd } => commands::hook::dispatch(&cmd), From 75984a09ec7e5d39c7fce46f94713ce258ac436f Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 10:48:35 +0800 Subject: [PATCH 12/13] feat: integrate skill-match into Plan Step 0.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Step 0.6 (skill routing) between clarity check and research. Translates request to English keywords, calls flowctl skill-match, and injects matched skills into task spec Approach sections. Non-blocking: embedder failure or no matches → skip silently. Max 3 skill references per task to avoid spec bloat. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/flow-code-plan/steps.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/skills/flow-code-plan/steps.md b/skills/flow-code-plan/steps.md index a7f2c9ec..d7b8c960 100644 --- a/skills/flow-code-plan/steps.md +++ b/skills/flow-code-plan/steps.md @@ -59,6 +59,29 @@ $FLOWCTL init --json 3. Pick best by: blast radius, value/effort, codebase alignment 4. Output: `Clarified: "" → "" | Approach: ` +## Step 0.6: Skill routing (auto — non-blocking) + +After clarity check, match the request against registered engineering discipline skills to auto-inject relevant guidance into task specs. + +1. Translate the request to English keywords (if not already English). This costs zero tokens — you're already processing the request. +2. Run: + ```bash + $FLOWCTL skill match "" --threshold 0.75 --limit 3 --json + ``` +3. If matches found (non-empty JSON array): save them for Step 5 (task spec writing). Each matched skill will be referenced in the task's Approach section. +4. If empty result, error, or embedder unavailable: skip silently. Skill routing is advisory, never blocking. + +**Output** (inline, no user prompt): +``` +Skill routing: flow-code-api-design (0.87), flow-code-performance (0.42 — below threshold) +``` + +**Integration in Step 5** (task spec writing): For each skill with score ≥ threshold, add to the task's Approach section: +```markdown +- Reference `flow-code-` skill principles when implementing +``` +Max 3 skill references per task to avoid spec bloat. + ## Step 1: Fast research (parallel) **If input is a Flow ID** (fn-N-slug or fn-N-slug.M, including legacy fn-N/fn-N-xxx): First fetch it with `$FLOWCTL show --json` and `$FLOWCTL cat ` to get the request context. 
From 273bc6bf63ba2d36e307b3068c06d5fae276e7cd Mon Sep 17 00:00:00 2001 From: z23cc Date: Tue, 7 Apr 2026 11:09:57 +0800 Subject: [PATCH 13/13] fix(flowctl): use vector_distance_cos for skill matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace vector_top_k (requires ANN index, fails in embedded libSQL) with vector_distance_cos (exact search, no index needed). Works unconditionally in embedded mode. Performance is <1ms for 30 skills. Also lower default threshold from 0.75 to 0.70 based on testing — "deprecate old module" scores 0.73 against flow-code-deprecation. Tested: all 5 new skills match correctly with relevant queries. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../crates/flowctl-cli/src/commands/skill.rs | 2 +- flowctl/crates/flowctl-db/src/skill.rs | 35 +++++++++++++------ skills/flow-code-plan/steps.md | 2 +- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/flowctl/crates/flowctl-cli/src/commands/skill.rs b/flowctl/crates/flowctl-cli/src/commands/skill.rs index 66fffabf..11bee184 100644 --- a/flowctl/crates/flowctl-cli/src/commands/skill.rs +++ b/flowctl/crates/flowctl-cli/src/commands/skill.rs @@ -33,7 +33,7 @@ pub enum SkillCmd { #[arg(long, default_value = "5")] limit: usize, /// Minimum cosine similarity threshold. - #[arg(long, default_value = "0.75")] + #[arg(long, default_value = "0.70")] threshold: f64, }, } diff --git a/flowctl/crates/flowctl-db/src/skill.rs b/flowctl/crates/flowctl-db/src/skill.rs index c4428f9b..d2855fb9 100644 --- a/flowctl/crates/flowctl-db/src/skill.rs +++ b/flowctl/crates/flowctl-db/src/skill.rs @@ -112,27 +112,33 @@ impl SkillRepo { }; let lit = vec_to_literal(&vec); + // Use vector_distance_cos() instead of vector_top_k() — works without + // a vector index (no ANN index required in embedded mode). Exact search + // via full table scan; perfectly fast for <10,000 rows (~30 skills). 
let rows_result = self .conn .query( - "SELECT s.name, s.description, top.distance - FROM vector_top_k('skills_emb_idx', vector32(?1), ?2) AS top - JOIN skills s ON s.rowid = top.id", + "SELECT s.name, s.description, + vector_distance_cos(s.embedding, vector32(?1)) AS distance + FROM skills s + WHERE s.embedding IS NOT NULL + ORDER BY distance ASC + LIMIT ?2", params![lit, limit as i64], ) .await; let mut rows = match rows_result { Ok(r) => r, - Err(_) => return Ok(vec![]), // vector index unavailable + Err(_) => return Ok(vec![]), // vector functions unavailable }; let mut out = Vec::new(); while let Some(row) = rows.next().await? { let dist: f64 = row.get(2)?; - // Convert L2 distance to cosine similarity. - // For unit-norm vectors: cos_sim = 1 - (L2^2 / 2). - let score = 1.0 - (dist * dist / 2.0); + // Cosine distance → cosine similarity: sim = 1 - dist + // (0 = identical, 1 = orthogonal, 2 = opposite) + let score = 1.0 - dist; if score >= threshold { out.push(SkillMatch { name: row.get::(0)?, @@ -231,8 +237,9 @@ mod tests { assert!(matches.len() <= 5); } - /// Semantic match end-to-end. Gated behind `#[ignore]` because the - /// first run downloads the BGE-small model (~130MB). + /// Semantic match end-to-end using vector_distance_cos (no index needed). + /// Gated behind `#[ignore]` because the first run downloads the + /// BGE-small model (~130MB). #[tokio::test] #[ignore = "requires fastembed model (~130MB); run with --ignored"] async fn test_match_skills_semantic() { @@ -248,13 +255,19 @@ mod tests { .expect("upsert"); let matches = repo - .match_skills("architecture design", 2, 0.3) + .match_skills("architecture design", 3, 0.3) .await .expect("match_skills"); assert!(!matches.is_empty(), "expected at least one match"); + // "plan" (Design and architect...) 
should be the best match assert_eq!( matches[0].name, "plan", - "expected 'plan' as best match for architecture query" + "expected 'plan' as best match for architecture query, got '{}'", + matches[0].name ); + // Scores should be between 0 and 1 + for m in &matches { + assert!(m.score > 0.0 && m.score <= 1.0, "score out of range: {}", m.score); + } } } diff --git a/skills/flow-code-plan/steps.md b/skills/flow-code-plan/steps.md index d7b8c960..c66d7de4 100644 --- a/skills/flow-code-plan/steps.md +++ b/skills/flow-code-plan/steps.md @@ -66,7 +66,7 @@ After clarity check, match the request against registered engineering discipline 1. Translate the request to English keywords (if not already English). This costs zero tokens — you're already processing the request. 2. Run: ```bash - $FLOWCTL skill match "" --threshold 0.75 --limit 3 --json + $FLOWCTL skill match "" --threshold 0.70 --limit 3 --json ``` 3. If matches found (non-empty JSON array): save them for Step 5 (task spec writing). Each matched skill will be referenced in the task's Approach section. 4. If empty result, error, or embedder unavailable: skip silently. Skill routing is advisory, never blocking.