diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
new file mode 100644
index 00000000..6562a2ef
--- /dev/null
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,137 @@
+# CASCADE Architecture
+
+CASCADE is a PM-to-Code automation platform that connects project management tools (Trello, JIRA), source control (GitHub), and monitoring (Sentry) to AI-powered agents that autonomously implement features, review PRs, debug failures, and manage backlogs. Webhooks from external providers flow through a router, get queued in Redis, and are processed by ephemeral worker containers that run agents against cloned repositories.
+
+> **Relationship to CLAUDE.md**: `CLAUDE.md` is the operational reference (commands, env vars, how-to). This document and its deep-dives cover the *system design* — how components fit together and why.
+
+## System Overview
+
+```mermaid
+graph TB
+ subgraph External["External Providers"]
+ Trello
+ JIRA
+ GitHub
+ Sentry
+ end
+
+ subgraph CASCADE["CASCADE Platform"]
+        Router["Router :3000<br/>Webhook receiver"]
+ Redis[(Redis / BullMQ)]
+        Worker["Worker containers<br/>One job per container"]
+        Dashboard["Dashboard :3001<br/>API + tRPC"]
+ DB[(PostgreSQL)]
+ end
+
+ subgraph Clients
+ WebUI["Dashboard UI"]
+ CLI["cascade CLI"]
+ end
+
+ Trello -->|webhook| Router
+ JIRA -->|webhook| Router
+ GitHub -->|webhook| Router
+ Sentry -->|webhook| Router
+
+ Router -->|enqueue job| Redis
+ Redis -->|dequeue job| Worker
+
+ Worker -->|PRs, comments| GitHub
+ Worker -->|status updates| Trello
+ Worker -->|status updates| JIRA
+
+ Router <--> DB
+ Worker <--> DB
+ Dashboard <--> DB
+ Dashboard <--> Redis
+
+ WebUI <--> Dashboard
+ CLI <--> Dashboard
+```
+
+See also: [`docs/architecture.d2`](architecture.d2) for the D2 source diagram.
+
+## Service Topology
+
+| Service | Entry Point | Default Port | Responsibility |
+|---------|-------------|-------------|----------------|
+| **Router** | `src/router/index.ts` | 3000 | Receive webhooks, verify signatures, run trigger dispatch, enqueue jobs to Redis, manage worker containers |
+| **Worker** | `src/worker-entry.ts` | N/A (ephemeral) | Process one job per container — run trigger handlers, execute agents, exit on completion |
+| **Dashboard** | `src/dashboard.ts` | 3001 | tRPC API for web UI and CLI, session auth, serve frontend static files in self-hosted mode |
+
+## End-to-End Request Flow
+
+The canonical path from webhook to pull request:
+
+```mermaid
+sequenceDiagram
+    participant P as Provider<br/>(Trello/GitHub/JIRA/Sentry)
+ participant R as Router
+ participant Q as Redis/BullMQ
+ participant W as Worker
+ participant A as Agent Engine
+
+ P->>R: POST /provider/webhook
+ R->>R: Parse, verify signature, dedup
+ R->>R: Lookup project, dispatch triggers
+ R->>R: Check concurrency, post ack comment
+ R->>Q: Enqueue job
+ Q->>W: Spawn container with job env vars
+ W->>W: Bootstrap integrations, dispatch by job type
+ W->>W: Match trigger, resolve agent definition
+ W->>A: Execute agent (clone repo, run engine)
+ A->>A: LLM loop: read, edit, test, commit
+ A-->>P: Create PR / post comments / update status
+ W->>W: Finalize run record, cleanup, exit
+```
+
+## Architectural Patterns
+
+**Registry pattern** — Integrations, triggers, engines, PM providers, and capabilities all use registries (singleton maps populated at bootstrap). Infrastructure code looks up by key with no provider-specific branching.
+
+**Capability-driven tool resolution** — Agent YAML definitions declare required capabilities (`fs:read`, `pm:write`, `scm:pr`). At runtime, capabilities are resolved against available integrations to determine which gadgets (tools) the agent receives.
+
+**Two-tier credential resolution** — In the router and dashboard, credentials are read from the `project_credentials` database table. In workers, the router pre-loads credentials as environment variables to avoid giving workers direct DB access to secrets.
+
+**Dual-persona GitHub model** — Each project uses two GitHub bot accounts (implementer and reviewer) to prevent feedback loops. Agent type determines which persona token is used.
+
+**YAML-based agent definitions** — Agents are defined declaratively in YAML files specifying identity, capabilities, triggers, prompts, and lifecycle hooks. Definitions resolve via three tiers: in-memory cache, database, then YAML files on disk.
+
+**AsyncLocalStorage credential scoping** — Provider clients (GitHub, Trello, JIRA) use Node.js `AsyncLocalStorage` to scope credentials per-request, preventing cross-request credential leakage.
+
+## Directory Map
+
+| Directory | Purpose |
+|-----------|---------|
+| `src/router/` | Webhook receiver, BullMQ producer, worker container management |
+| `src/webhook/` | Shared webhook handler factory, parsers, signature verification, logging |
+| `src/triggers/` | Event-to-agent routing: TriggerRegistry, TriggerHandler implementations |
+| `src/agents/` | Agent definitions (YAML), profiles, capabilities, prompt templates |
+| `src/backends/` | LLM execution engines: Claude Code, LLMist, Codex, OpenCode |
+| `src/gadgets/` | Tool implementations agents use (file ops, PM, SCM, alerting, shell) |
+| `src/integrations/` | Unified integration interfaces, registry, bootstrap |
+| `src/pm/` | PM abstraction layer: provider interface, Trello/JIRA adapters, lifecycle |
+| `src/github/` | GitHub API client, dual-persona model, PR operations |
+| `src/trello/` | Trello API client |
+| `src/jira/` | JIRA API client (jira.js wrapper) |
+| `src/sentry/` | Sentry API client, alerting integration |
+| `src/config/` | Configuration provider, caching, credential resolution, integration roles |
+| `src/db/` | Drizzle ORM schema, repositories, migrations |
+| `src/api/` | tRPC routers for dashboard API |
+| `src/cli/` | Two CLIs: `cascade` (dashboard) and `cascade-tools` (agent tools) |
+| `src/utils/` | Logging, repo cloning, lifecycle/watchdog, env scrubbing |
+| `src/types/` | Shared TypeScript types |
+| `src/queue/` | BullMQ queue helpers |
+
+## Deep-Dive Documents
+
+1. [Services and Deployment](./architecture/01-services.md) — Three-service architecture, startup sequences, container model
+2. [Webhook Pipeline](./architecture/02-webhook-pipeline.md) — Handler factory, platform adapters, processing pipeline
+3. [Trigger System](./architecture/03-trigger-system.md) — TriggerRegistry, handlers, config resolution, context pipeline
+4. [Agent System](./architecture/04-agent-system.md) — YAML definitions, profiles, capabilities, prompts, hooks
+5. [Engine Backends](./architecture/05-engine-backends.md) — AgentEngine interface, archetypes, execution adapter
+6. [Integration Layer](./architecture/06-integration-layer.md) — IntegrationModule, registry, categories, provider implementations
+7. [Gadgets](./architecture/07-gadgets.md) — Capability-to-gadget mapping, built-in tools, cascade-tools CLI
+8. [Configuration and Credentials](./architecture/08-config-credentials.md) — Config provider, credential resolution, encryption
+9. [Database](./architecture/09-database.md) — Schema, ER diagram, repositories, migrations
+10. [Resilience](./architecture/10-resilience.md) — Watchdog, concurrency controls, rate limiting, retry, loop prevention
diff --git a/docs/architecture/01-services.md b/docs/architecture/01-services.md
new file mode 100644
index 00000000..f2769834
--- /dev/null
+++ b/docs/architecture/01-services.md
@@ -0,0 +1,172 @@
+# Services and Deployment
+
+CASCADE runs as three independent services. There is no monolithic server mode — each service has a distinct entry point, lifecycle, and scaling model.
+
+```mermaid
+graph LR
+ subgraph Router["Router Container"]
+ R_Hono["Hono :3000"]
+ R_BullMQ["BullMQ Producer"]
+ R_WM["Worker Manager"]
+ end
+
+ subgraph Workers["Worker Containers (ephemeral)"]
+ W1["Worker 1"]
+ W2["Worker 2"]
+ WN["Worker N"]
+ end
+
+ subgraph Dashboard["Dashboard Container"]
+ D_Hono["Hono :3001"]
+ D_tRPC["tRPC Router"]
+ end
+
+ Redis[(Redis)]
+ DB[(PostgreSQL)]
+
+ R_Hono --> R_BullMQ --> Redis
+ R_WM --> Workers
+ Redis --> R_WM
+
+ D_Hono --> D_tRPC
+ Dashboard <--> DB
+ Router <--> DB
+ Workers <--> DB
+```
+
+## Router
+
+**Entry point**: `src/router/index.ts`
+**Default port**: 3000
+
+The router is the webhook ingestion point. It receives HTTP POST requests from external providers, processes them through a multi-step pipeline, and enqueues jobs to Redis for worker containers.
+
+### Webhook endpoints
+
+| Route | Provider | Notes |
+|-------|----------|-------|
+| `POST /trello/webhook` | Trello | HEAD/GET returns 200 for Trello's verification |
+| `POST /github/webhook` | GitHub | Injects `X-GitHub-Event` header into payload |
+| `POST /jira/webhook` | JIRA | HEAD/GET returns 200 for JIRA verification |
+| `POST /sentry/webhook/:projectId` | Sentry | Project ID in URL for unambiguous routing |
+| `GET /health` | Internal | Queue stats, active worker count |
+
+### Startup sequence
+
+Module-load phase (runs at import time, before `startRouter()`):
+1. `registerBuiltInEngines()` — register engine settings schemas (required before any `loadConfig()`)
+2. `createTriggerRegistry()` + `registerBuiltInTriggers()` — populate trigger handlers
+
+`startRouter()` async phase:
+3. `seedAgentDefinitions()` — sync built-in YAML definitions to database
+4. `initAgentMessages()` — load ack message templates
+5. `initPrompts()` — load prompt templates
+6. `startCancelListener()` — listen for run cancellation requests
+7. `startWorkerProcessor()` — begin polling BullMQ for jobs and spawning containers
+8. `serve()` — start Hono HTTP server
+
+### Key modules
+
+| File | Purpose |
+|------|---------|
+| `webhook-processor.ts` | Generic 12-step pipeline (see [02-webhook-pipeline](./02-webhook-pipeline.md)) |
+| `platform-adapter.ts` | `RouterPlatformAdapter` interface |
+| `adapters/` | Per-provider adapter implementations |
+| `worker-manager.ts` | Spawns/monitors Docker worker containers |
+| `queue.ts` | BullMQ `addJob()`, queue stats |
+| `action-dedup.ts` | In-memory deduplication of webhook deliveries |
+| `work-item-lock.ts` | Prevents concurrent agents on the same work item |
+| `agent-type-lock.ts` | Agent-type concurrency limits |
+| `cancel-listener.ts` | Listens for run cancellation via BullMQ events |
+| `webhookVerification.ts` | HMAC signature verification per provider |
+
+## Worker
+
+**Entry point**: `src/worker-entry.ts`
+**Port**: None (ephemeral container, no HTTP server)
+
+Workers are stateless, one-job-per-container processes spawned by the router's worker manager. Each worker reads its job from environment variables, processes it, and exits.
+
+### Environment variables
+
+The router passes job data to workers via Docker container env vars:
+
+| Variable | Purpose |
+|----------|---------|
+| `JOB_ID` | Unique job identifier |
+| `JOB_TYPE` | `trello`, `github`, `jira`, `sentry`, `manual-run`, `retry-run`, `debug-analysis` |
+| `JOB_DATA` | JSON-encoded job payload |
+| `CASCADE_CREDENTIAL_KEYS` | Comma-separated list of credential env var names |
+| Individual credential vars | Pre-loaded project credentials (e.g., `GITHUB_TOKEN_IMPLEMENTER`) |
+
+### Job types
+
+```typescript
+type JobData =
+ | TrelloJobData // Trello webhook payload
+ | GitHubJobData // GitHub webhook payload
+ | JiraJobData // JIRA webhook payload
+ | SentryJobData // Sentry webhook payload
+ | ManualRunJobData // Dashboard-initiated run
+ | RetryRunJobData // Retry a failed run
+ | DebugAnalysisJobData; // Post-mortem debug analysis
+```
+
+### Startup sequence
+
+1. `loadEnvConfigSafe()` — load `.cascade/env` if present
+2. `getDb()` — eagerly initialize DB connection (caches pool before env scrub)
+3. `registerBuiltInEngines()` — register engine settings schemas (before `loadConfig()`)
+4. `loadConfig()` — cache project config from database
+5. `seedAgentDefinitions()` — sync built-in YAML definitions to database
+6. `initAgentMessages()` — load ack message templates
+7. `initPrompts()` — load prompt templates
+8. `scrubSensitiveEnv()` — remove `DATABASE_URL` and other secrets from `process.env`
+9. `createTriggerRegistry()` + `registerBuiltInTriggers()` — populate trigger handlers
+10. `dispatchJob()` — route to the appropriate handler based on `JOB_TYPE`
+
+The security scrub in step 8 prevents agent engines (which execute arbitrary LLM-generated commands) from accessing database credentials. Note that trigger registration (step 9) happens after the scrub — it only needs the in-memory config, not the database.
+
+### Dispatch flow
+
+`dispatchJob()` switches on the job type:
+- **Webhook jobs** (`trello`, `github`, `jira`, `sentry`) — call the provider-specific webhook processor, which re-runs trigger dispatch and executes the matched agent
+- **Dashboard jobs** (`manual-run`, `retry-run`, `debug-analysis`) — call `processDashboardJob()`, which loads project config and invokes the appropriate runner
+
+## Dashboard
+
+**Entry point**: `src/dashboard.ts`
+**Default port**: 3001
+
+The dashboard serves the tRPC API consumed by both the web frontend and the `cascade` CLI. In self-hosted mode, it also serves the built frontend as static files.
+
+### Routes
+
+| Route | Purpose |
+|-------|---------|
+| `POST /api/auth/login` | Email/password authentication |
+| `POST /api/auth/logout` | Session invalidation |
+| `/trpc/*` | tRPC API endpoints |
+| `GET /health` | Service health check |
+| `/*` (static) | Frontend files from `dist/web/` (self-hosted mode only) |
+
+### Startup sequence
+
+Module-load phase (runs at import time, before `startDashboard()`):
+1. `registerBuiltInEngines()` — register engine settings schemas
+2. CORS middleware, logging middleware registered on Hono app
+3. Auth routes mounted (`/api/auth/login`, `/api/auth/logout`)
+4. tRPC router mounted with session-based context resolution
+5. Static file serving (if `dist/web/` exists)
+
+`startDashboard()` async phase:
+6. `initPrompts()` — load prompt templates
+7. `serve()` — start Hono HTTP server
+
+### tRPC context
+
+Every tRPC request builds a context containing:
+- `user` — resolved from session cookie via `resolveUserFromSession()`
+- `effectiveOrgId` — computed from user's org membership or `x-org-context` header
+
+Procedure types enforce auth levels: `publicProcedure`, `protectedProcedure`, `adminProcedure`, `superAdminProcedure`.
diff --git a/docs/architecture/02-webhook-pipeline.md b/docs/architecture/02-webhook-pipeline.md
new file mode 100644
index 00000000..dfd929ed
--- /dev/null
+++ b/docs/architecture/02-webhook-pipeline.md
@@ -0,0 +1,149 @@
+# Webhook Pipeline
+
+Webhooks from external providers (Trello, GitHub, JIRA, Sentry) are processed through a two-layer system: a **webhook handler factory** that handles HTTP concerns, and a **router platform adapter** that implements the business logic pipeline.
+
+## Webhook Handler Factory
+
+`src/webhook/webhookHandlers.ts` — `createWebhookHandler()`
+
+The factory creates Hono route handlers with a standard lifecycle:
+
+```
+HTTP POST → Parse payload → Verify signature → Process webhook → Log result → Return 200/4xx
+```
+
+Each webhook endpoint provides a `WebhookHandlerConfig`:
+
+```typescript
+interface WebhookHandlerConfig {
+ source: string; // 'trello' | 'github' | 'jira' | 'sentry'
+ parsePayload: (c: Context) => ParseResult;
+ verifySignature?: (ctx, rawBody, projectId?) => VerificationResult | null;
+  processWebhook: (payload, eventType?, headers?) => Promise<unknown>;
+}
+```
+
+The factory handles:
+- Payload parsing with per-provider parsers (`src/webhook/webhookParsers.ts`)
+- Optional signature verification (`src/webhook/signatureVerification.ts`)
+- Fire-and-forget acknowledgment reactions
+- Webhook logging to `webhook_logs` table (`src/webhook/webhookLogging.ts`)
+- Error handling (parse failures → 400, signature failures → 401)
+
+### Platform Parsers
+
+| Parser | Source | Event type extraction |
+|--------|--------|----------------------|
+| `parseGitHubPayload()` | JSON or form-encoded body | `X-GitHub-Event` header |
+| `parseTrelloPayload()` | JSON body | `action.type` field |
+| `parseJiraPayload()` | JSON body | `webhookEvent` field |
+| `parseSentryPayload()` | JSON body | `Sentry-Hook-Resource` header |
+
+## Platform Adapters
+
+`src/router/platform-adapter.ts` — `RouterPlatformAdapter` interface
+
+Each provider implements this interface to plug into the generic `processRouterWebhook()` pipeline:
+
+```typescript
+interface RouterPlatformAdapter {
+ readonly type: string;
+  parseWebhook(payload: unknown): Promise<ParsedWebhookEvent>;
+  isProcessableEvent(event: ParsedWebhookEvent): boolean;
+  isSelfAuthored(event: ParsedWebhookEvent, payload: unknown): Promise<boolean>;
+  sendReaction(event: ParsedWebhookEvent, payload: unknown): void;
+  resolveProject(event: ParsedWebhookEvent): Promise<ProjectConfig | null>;
+  dispatchWithCredentials(event, payload, project, triggerRegistry): Promise<TriggerResult | null>;
+  postAck(event, payload, project, agentType, triggerResult): Promise<AckResult>;
+  buildJob(event, payload, project, triggerResult, ackResult): CascadeJob;
+ firePreActions?(job, payload): void;
+}
+```
+
+### Normalized event
+
+All platforms normalize their webhook payload into a `ParsedWebhookEvent`:
+
+```typescript
+interface ParsedWebhookEvent {
+ projectIdentifier: string; // Board ID, repo name, JIRA project key
+ eventType: string; // Human-readable event descriptor
+ workItemId?: string; // Card ID, PR number, issue key
+ isCommentEvent: boolean; // Whether this needs ack reaction
+ actionId?: string; // Platform-specific ID for dedup
+}
+```
+
+### Provider adapters
+
+| Adapter | File | Project lookup key |
+|---------|------|--------------------|
+| `TrelloRouterAdapter` | `src/router/adapters/trello.ts` | `boardId` |
+| `GitHubRouterAdapter` | `src/router/adapters/github.ts` | `repoFullName` |
+| `JiraRouterAdapter` | `src/router/adapters/jira.ts` | JIRA project key |
+| `SentryRouterAdapter` | `src/router/adapters/sentry.ts` | CASCADE `projectId` (from URL) |
+
+## The 12-Step Pipeline
+
+`src/router/webhook-processor.ts` — `processRouterWebhook()`
+
+```mermaid
+flowchart TD
+ A[1. Parse payload] --> B{2. Duplicate?}
+ B -->|Yes| SKIP1[Skip: duplicate action]
+ B -->|No| C{3. Processable event?}
+ C -->|No| SKIP2[Skip: event type not processable]
+ C -->|Yes| D{4. Self-authored?}
+ D -->|Yes| SKIP3[Skip: loop prevention]
+ D -->|No| E[5. Fire ack reaction]
+ E --> F{6. Resolve project config}
+ F -->|Not found| SKIP4[Skip: no project config]
+ F -->|Found| G[7. Dispatch triggers with credentials]
+ G -->|No match| SKIP5[Skip: no trigger matched]
+ G -->|Matched| H{8. Work-item / agent-type lock}
+ H -->|Locked| SKIP6[Skip: concurrency limit]
+ H -->|Free| I[9. Post ack comment]
+ I --> J[10. Build job]
+ J --> K[11. Fire pre-actions]
+ K --> L[12. Enqueue to Redis]
+```
+
+### Step details
+
+1. **Parse** — Adapter normalizes raw payload into `ParsedWebhookEvent`
+2. **Dedup** — Check in-memory set of recently processed `actionId`s (`action-dedup.ts`)
+3. **Filter** — Adapter's `isProcessableEvent()` checks event type relevance
+4. **Self-check** — Adapter's `isSelfAuthored()` detects bot's own actions (loop prevention)
+5. **Reaction** — Fire-and-forget emoji reaction on the source event
+6. **Resolve config** — Look up project by platform identifier (board ID, repo, etc.)
+7. **Dispatch triggers** — Within credential scope, call `TriggerRegistry.dispatch()` to find matching agent
+8. **Concurrency** — Check work-item lock (`work-item-lock.ts`) and agent-type concurrency (`agent-type-lock.ts`)
+9. **Ack comment** — Post an acknowledgment comment to the work item or PR
+10. **Build job** — Package trigger result + payload + ack info into a `CascadeJob`
+11. **Pre-actions** — Optional fire-and-forget actions (e.g., GitHub eyes reaction)
+12. **Enqueue** — Add job to BullMQ Redis queue; mark work item and agent type as enqueued
+
+### Concurrency controls
+
+| Mechanism | File | Purpose |
+|-----------|------|---------|
+| Action dedup | `action-dedup.ts` | Prevent processing same webhook delivery twice |
+| Work-item lock | `work-item-lock.ts` | Prevent concurrent agents on the same card/issue |
+| Agent-type lock | `agent-type-lock.ts` | Configurable `max_concurrency` per agent type per project |
+
+All locks are in-memory with TTL expiry. They are conservative (enqueue-time only) — the worker performs its own verification before executing.
+
+## Signature Verification
+
+`src/router/webhookVerification.ts`
+
+Each provider's verification function checks for a stored `webhook_secret` credential and validates the signature header:
+
+| Provider | Header | Algorithm |
+|----------|--------|-----------|
+| GitHub | `X-Hub-Signature-256` | HMAC-SHA256 |
+| Trello | Custom verification | Trello-specific |
+| JIRA | `X-Hub-Signature` | HMAC-SHA256 |
+| Sentry | `Sentry-Hook-Signature` | HMAC-SHA256 |
+
+If no webhook secret is configured for a project, verification is skipped (returns `null`).
diff --git a/docs/architecture/03-trigger-system.md b/docs/architecture/03-trigger-system.md
new file mode 100644
index 00000000..1c44b24c
--- /dev/null
+++ b/docs/architecture/03-trigger-system.md
@@ -0,0 +1,180 @@
+# Trigger System
+
+The trigger system routes webhook events to the appropriate agent. When a webhook arrives, the router builds a `TriggerContext` and calls `TriggerRegistry.dispatch()` to find the first matching handler. The matched handler returns a `TriggerResult` specifying which agent to run and with what input.
+
+## TriggerRegistry
+
+`src/triggers/registry.ts`
+
+A simple ordered list of handlers with first-match-wins dispatch:
+
+```typescript
+class TriggerRegistry {
+ register(handler: TriggerHandler): void;
+  dispatch(ctx: TriggerContext): Promise<TriggerResult | null>;
+ getHandlers(): TriggerHandler[];
+}
+```
+
+`dispatch()` iterates handlers in registration order. For each handler:
+1. Call `matches(ctx)` — if `false`, skip
+2. Call `handle(ctx)` — if it returns a `TriggerResult`, return it
+3. If `handle()` returns `null`, continue to next handler
+
+## TriggerHandler
+
+`src/triggers/types.ts`
+
+```typescript
+interface TriggerHandler {
+ name: string;
+ description: string;
+ matches(ctx: TriggerContext): boolean;
+  handle(ctx: TriggerContext): Promise<TriggerResult | null>;
+}
+```
+
+### TriggerContext
+
+```typescript
+interface TriggerContext {
+ project: ProjectConfig;
+ source: TriggerSource; // 'trello' | 'github' | 'jira' | 'sentry'
+ payload: unknown; // Raw webhook payload
+ personaIdentities?: PersonaIdentities; // GitHub bot identities
+}
+```
+
+### TriggerResult
+
+```typescript
+interface TriggerResult {
+ agentType: string | null; // Which agent to run
+ agentInput: AgentInput; // Input data for the agent
+ workItemId?: string;
+ workItemUrl?: string;
+ workItemTitle?: string;
+ prNumber?: number;
+ prUrl?: string;
+ prTitle?: string;
+ waitForChecks?: boolean; // Poll CI before starting
+ onBlocked?: () => void; // Cleanup if job can't be enqueued
+}
+```
+
+## Built-in Triggers
+
+Registration happens in `src/triggers/builtins.ts`, which delegates to per-platform `register.ts` files:
+
+```typescript
+function registerBuiltInTriggers(registry: TriggerRegistry): void {
+ registerTrelloTriggers(registry);
+ registerJiraTriggers(registry);
+ registerGitHubTriggers(registry);
+ registerSentryTriggers(registry);
+}
+```
+
+### Trello triggers (`src/triggers/trello/`)
+
+| Handler | Event | Agent |
+|---------|-------|-------|
+| `TrelloCommentMentionTrigger` | Bot mentioned in comment | Varies by context |
+| `TrelloStatusChangedSplittingTrigger` | Card → Splitting list | `splitting` |
+| `TrelloStatusChangedPlanningTrigger` | Card → Planning list | `planning` |
+| `TrelloStatusChangedTodoTrigger` | Card → Todo list | `implementation` |
+| `TrelloStatusChangedBacklogTrigger` | Card → Backlog list | `backlog-manager` |
+| `TrelloStatusChangedMergedTrigger` | Card → Merged list | `backlog-manager` |
+| `ReadyToProcessLabelTrigger` | "cascade-ready" label added | `splitting` |
+
+### JIRA triggers (`src/triggers/jira/`)
+
+| Handler | Event | Agent |
+|---------|-------|-------|
+| `JiraCommentMentionTrigger` | Bot mentioned in comment | Varies |
+| `JiraStatusChangedTrigger` | Issue status transition | Per-status mapping |
+| `JiraLabelAddedTrigger` | "cascade-ready" label added | `splitting` |
+
+### GitHub triggers (`src/triggers/github/`)
+
+| Handler | Event | Agent |
+|---------|-------|-------|
+| `CheckSuiteSuccessTrigger` | CI passed | `review` (with `authorMode` param) |
+| `CheckSuiteFailureTrigger` | CI failed | `respond-to-ci` |
+| `PrReviewSubmittedTrigger` | Review with changes_requested | `respond-to-review` |
+| `ReviewRequestedTrigger` | Bot requested as reviewer | `review` |
+| `PrOpenedTrigger` | PR opened | `review` |
+| `PrCommentMentionTrigger` | Bot @mentioned in PR comment | `respond-to-pr-comment` |
+| `PrMergedTrigger` | PR merged | PM status update (no agent) |
+| `PrReadyToMergeTrigger` | PR approved + checks pass | PM status update (no agent) |
+| `PrConflictDetectedTrigger` | Merge conflict on PR | `resolve-conflicts` |
+
+### Sentry triggers (`src/triggers/sentry/`)
+
+| Handler | Event | Agent |
+|---------|-------|-------|
+| `AlertingIssueTrigger` | Sentry issue alert | `alerting` |
+| `AlertingMetricTrigger` | Sentry metric alert | `alerting` |
+
+## Trigger Configuration
+
+### Event format
+
+Triggers use category-prefixed events: `{category}:{event-name}`
+- `pm:status-changed`, `pm:label-added`
+- `scm:check-suite-success`, `scm:pr-review-submitted`, `scm:review-requested`
+- `alerting:issue-created`, `alerting:metric-alert`
+
+### Config resolution
+
+`src/triggers/config-resolver.ts`
+
+Each trigger handler calls `isTriggerEnabled()` to check if it should fire. Resolution follows a three-tier cascade:
+
+1. **Database overrides** — `agent_trigger_configs` table entries per project/agent/event
+2. **Definition defaults** — `defaultEnabled` and default parameters from YAML definitions
+3. **Legacy fallback** — `project_integrations.triggers` JSONB (migrated automatically)
+
+### Context pipeline
+
+Each trigger in a YAML agent definition can declare a `contextPipeline` — an ordered list of context-fetching steps that run before the agent starts:
+
+| Step | Purpose |
+|------|---------|
+| `directoryListing` | List repository file structure |
+| `contextFiles` | Read key project files (README, etc.) |
+| `squint` | Query Squint semantic index |
+| `workItem` | Fetch work item details from PM tool |
+| `prepopulateTodos` | Pre-populate todo list from work item checklists |
+| `prContext` | Fetch PR details, diff, reviews |
+| `prConversation` | Fetch PR comments and review threads |
+| `pipelineSnapshot` | Fetch CI pipeline status |
+| `alertingIssue` | Fetch Sentry issue and event details |
+
+## Shared Agent Execution
+
+`src/triggers/shared/agent-execution.ts`
+
+After a trigger matches, the shared execution layer handles the agent lifecycle:
+
+```mermaid
+flowchart TD
+ A[Trigger matched] --> B[PM lifecycle: prepareForAgent]
+ B --> C[Check budget]
+ C -->|Over budget| D[Post budget warning, skip]
+ C -->|Within budget| E[Resolve agent definition]
+ E --> F[Set credential scope]
+ F --> G[Run agent via engine]
+ G -->|Success| H[PM lifecycle: handleSuccess]
+ G -->|Failure| I[PM lifecycle: handleFailure]
+ H --> J[Trigger debug analysis if configured]
+ I --> J
+```
+
+This includes:
+- PM lifecycle management (move card to "In Progress", post labels)
+- Budget checking (`workItemBudgetUsd`)
+- Credential scoping via `withCredentials()`
+- Agent execution via `runAgent()` (see [05-engine-backends](./05-engine-backends.md))
+- Post-run lifecycle (move card to "In Review", link PR, sync checklists)
+- Debug analysis triggering on failure
diff --git a/docs/architecture/04-agent-system.md b/docs/architecture/04-agent-system.md
new file mode 100644
index 00000000..eb583a7a
--- /dev/null
+++ b/docs/architecture/04-agent-system.md
@@ -0,0 +1,250 @@
+# Agent System
+
+Agents are the core automation units in CASCADE. Each agent is defined declaratively in YAML, specifying its identity, capabilities, triggers, prompts, and lifecycle hooks. At runtime, definitions are compiled into profiles that determine which tools the agent receives and how it interacts with the PM/SCM systems.
+
+## Agent Definitions
+
+`src/agents/definitions/`
+
+### YAML structure
+
+Each built-in agent is a YAML file in `src/agents/definitions/`. Custom agents are stored in the `agent_definitions` database table. The schema is defined in `src/agents/definitions/schema.ts`.
+
+```yaml
+identity:
+ emoji: "..."
+ label: "Implementation"
+ roleHint: "Writes code, runs tests, and prepares a pull request"
+ initialMessage: "**Implementing changes** — ..."
+
+integrations:
+ required: [pm, scm] # Fail if not configured
+ optional: [alerting] # Use if available
+
+capabilities:
+ required:
+ - fs:read
+ - fs:write
+ - shell:exec
+ - session:ctrl
+ - pm:read
+ - pm:write
+ - scm:pr
+ optional:
+ - pm:checklist
+
+triggers:
+ - event: pm:status-changed
+ label: "Status Changed to Todo"
+ defaultEnabled: false
+ parameters:
+ - name: targetStatus
+ type: select
+ options: [todo]
+ defaultValue: todo
+ contextPipeline: [directoryListing, contextFiles, squint, workItem, prepopulateTodos]
+
+prompts:
+ taskPrompt: |
+ Analyze and process the work item with ID: <%= it.workItemId %>.
+
+hooks:
+ trailing:
+ scm:
+ gitStatus: true
+ prStatus: true
+ builtin:
+ diagnostics: true
+ todoProgress: true
+ reminder: true
+ finish:
+ scm:
+ requiresPR: true
+ lifecycle:
+ moveOnPrepare: inProgress
+ moveOnSuccess: inReview
+ linkPR: true
+ syncChecklist: true
+
+hint: >-
+ Complete the current todo in as few iterations as possible.
+```
+
+### Key schema fields
+
+| Field | Purpose |
+|-------|---------|
+| `identity` | Agent display info (emoji, label, role hint, initial message) |
+| `integrations` | Explicit integration requirements (required/optional categories) |
+| `capabilities` | Required and optional capabilities that determine tool access |
+| `triggers` | Events that activate this agent, with parameters and context pipelines |
+| `prompts.taskPrompt` | Eta template for the agent's task prompt |
+| `hooks.trailing` | Info appended to each LLM turn (git status, PR status, diagnostics) |
+| `hooks.finish` | Completion requirements (must have PR, must have review, etc.) |
+| `hooks.lifecycle` | PM card movement on prepare/success, PR linking, checklist sync |
+| `hint` | Persistent guidance injected into the LLM context |
+| `strategies` | Engine-specific strategy overrides |
+| `gadgetOptions` | Special gadget builder flags (e.g., `includeReviewComments`) |
+
+### Three-tier definition resolution
+
+`src/agents/definitions/loader.ts`
+
+```
+1. In-memory cache (fastest, populated on first load)
+ ↓ miss
+2. Database lookup (agent_definitions table — custom agents)
+ ↓ miss
+3. YAML file on disk (src/agents/definitions/*.yaml — built-in agents)
+```
+
+Key functions:
+- `resolveAgentDefinition(agentType)` — single agent, three-tier
+- `resolveAllAgentDefinitions()` — merge DB + YAML
+- `resolveKnownAgentTypes()` — list all known types
+
+## Built-in Agents
+
+| Agent | Capabilities | Persona | Key Triggers |
+|-------|-------------|---------|--------------|
+| `implementation` | fs, shell, session, pm, scm:pr | Implementer | `pm:status-changed` (todo) |
+| `splitting` | fs, session, pm | Implementer | `pm:status-changed`, `pm:label-added` |
+| `planning` | fs, session, pm | Implementer | `pm:status-changed` (planning) |
+| `review` | fs, shell, scm:read, scm:review | Reviewer | `scm:check-suite-success`, `scm:review-requested` |
+| `respond-to-review` | fs, shell, session, pm, scm | Implementer | `scm:pr-review-submitted` |
+| `respond-to-ci` | fs, shell, session, scm | Implementer | `scm:check-suite-failure` |
+| `respond-to-pr-comment` | fs, shell, session, scm | Implementer | `scm:pr-comment-mention` |
+| `respond-to-planning-comment` | fs, session, pm | Implementer | `pm:comment-mention` |
+| `backlog-manager` | fs, session, pm, scm:read | Implementer | `pm:status-changed` (backlog, merged) |
+| `resolve-conflicts` | fs, shell, session, scm | Implementer | `scm:pr-conflict-detected` |
+| `alerting` | fs, shell, session, alerting, scm | Implementer | `alerting:issue-created`, `alerting:metric-alert` |
+| `debug` | fs, session, pm | Implementer | `internal:debug-analysis` |
+
+## Capabilities
+
+`src/agents/capabilities/`
+
+Capabilities are the bridge between agent definitions and concrete tools. The system maps capabilities to gadgets (for SDK engines) and SDK tools (for native-tool engines).
+
+### Registry
+
+`src/agents/capabilities/registry.ts`
+
+```typescript
+const CAPABILITIES = [
+ // Built-in (always available)
+ 'fs:read', 'fs:write', 'shell:exec', 'session:ctrl',
+ // PM integration
+ 'pm:read', 'pm:write', 'pm:checklist',
+ // SCM integration
+ 'scm:read', 'scm:ci-logs', 'scm:comment', 'scm:review', 'scm:pr',
+ // Alerting integration
+ 'alerting:read',
+] as const;
+```
+
+Each capability maps to a `CapabilityDefinition`:
+
+```typescript
+interface CapabilityDefinition {
+ integration: IntegrationCategory | null; // null = built-in
+ description: string;
+ gadgetNames: string[]; // LLMist gadget classes
+ sdkToolNames: string[]; // Claude Code SDK tool names
+ cliToolNames: string[]; // cascade-tools CLI commands
+}
+```
+
+### Resolution flow
+
+`src/agents/capabilities/resolver.ts`
+
+```mermaid
+flowchart TD
+ A["Agent definition
(capabilities.required + optional)"] --> B[Create integration checker]
+ B --> C["Check hasPmIntegration(),
hasScmIntegration(),
hasAlertingIntegration()"]
+ C --> D[resolveEffectiveCapabilities]
+ D --> E["Built-in caps: always included"]
+ D --> F["Integration caps: only if provider configured"]
+ E --> G[buildGadgetsFromCapabilities]
+ F --> G
+ G --> H["Instantiate gadget classes
via GADGET_CONSTRUCTORS"]
+ H --> I["Gadget[] passed to engine"]
+```
+
+- Built-in capabilities (`fs:*`, `shell:*`, `session:*`) are always available
+- Integration capabilities (`pm:*`, `scm:*`, `alerting:*`) require the corresponding integration to be configured for the project
+- Optional capabilities degrade gracefully — missing integrations are noted in the system prompt
+
+## Prompts
+
+`src/agents/prompts/`
+
+Agent prompts are built using the [Eta](https://eta.js.org/) template engine.
+
+### Template context
+
+The `PromptContext` object passed to templates includes:
+- `workItemId`, `workItemUrl`, `workItemTitle` — from trigger result
+- `prNumber`, `prUrl`, `prBranch` — for SCM-focused agents
+- `projectConfig` — full project configuration
+- `agentType` — the running agent type
+- `capabilities` — resolved capability list
+- `hint` — persistent guidance from definition
+
+### Prompt partials
+
+Organizations can customize agent prompts via **prompt partials** — named template fragments stored in the `prompt_partials` database table. Partials are Eta includes (`<%~ include('partialName') %>`) that override default content when a custom version exists.
+
+Managed via:
+- Dashboard: Settings > Prompts
+- CLI: `cascade prompts set-partial`, `cascade prompts reset-partial`
+
+## Hooks
+
+### Trailing hooks
+
+Appended to each LLM turn as ephemeral context:
+
+| Hook | Purpose |
+|------|---------|
+| `scm.gitStatus` | Current git status (uncommitted changes) |
+| `scm.prStatus` | PR state, review status, CI checks |
+| `builtin.diagnostics` | TypeScript/lint errors in recently edited files |
+| `builtin.todoProgress` | Current todo list progress |
+| `builtin.reminder` | Iteration budget and guidance reminders |
+
+### Finish hooks
+
+Completion requirements verified before the agent can finish:
+
+| Hook | Purpose |
+|------|---------|
+| `scm.requiresPR` | Agent must have created/updated a PR |
+| `scm.requiresReview` | Agent must have submitted a review |
+| `scm.requiresPushedChanges` | Agent must have pushed commits |
+
+### Lifecycle hooks
+
+PM card management during agent execution:
+
+| Hook | Purpose |
+|------|---------|
+| `moveOnPrepare` | Move card to status on agent start (e.g., "In Progress") |
+| `moveOnSuccess` | Move card to status on success (e.g., "In Review") |
+| `linkPR` | Link the created PR to the work item |
+| `syncChecklist` | Sync todo list back to PM card checklists |
+
+## Agent Profiles
+
+`src/agents/definitions/profiles.ts`
+
+At runtime, a definition is compiled into an `AgentProfile` — the operational interface used by the execution pipeline:
+
+- `filterTools(allTools)` — filter available tools based on capabilities
+- `allCapabilities` — resolved capability list
+- `fetchContext(params)` — run context pipeline steps
+- `buildTaskPrompt(input)` — render Eta task prompt template
+- `getLlmistGadgets()` — instantiate gadgets for LLMist engine
+- `finishHooks` — PR/review/push requirements
+- `lifecycleHooks` — PM card movement rules
diff --git a/docs/architecture/05-engine-backends.md b/docs/architecture/05-engine-backends.md
new file mode 100644
index 00000000..fe638fe8
--- /dev/null
+++ b/docs/architecture/05-engine-backends.md
@@ -0,0 +1,154 @@
+# Engine Backends
+
+CASCADE abstracts LLM execution behind the `AgentEngine` interface. Multiple engines (Claude Code, LLMist, Codex, OpenCode) implement this interface, and a shared execution adapter orchestrates the full lifecycle around any engine.
+
+## AgentEngine Interface
+
+`src/backends/types.ts`
+
+```typescript
+interface AgentEngine {
+ readonly definition: AgentEngineDefinition;
+
+ execute(plan: AgentExecutionPlan): Promise<AgentEngineResult>;
+ supportsAgentType(agentType: string): boolean;
+
+ // Optional hooks
+ resolveModel?(cascadeModel: string): string;
+ getSettingsSchema?(): ZodType<Record<string, unknown>>;
+ beforeExecute?(plan: AgentExecutionPlan): Promise<void>;
+ afterExecute?(plan: AgentExecutionPlan, result: AgentEngineResult): Promise<void>;
+}
+```
+
+### AgentEngineDefinition
+
+Describes engine capabilities and configuration:
+
+```typescript
+interface AgentEngineDefinition {
+ readonly id: string; // 'claude-code', 'llmist', 'codex', 'opencode'
+ readonly label: string; // Display name
+ readonly archetype: 'sdk' | 'native-tool';
+ readonly capabilities: string[];
+ readonly modelSelection: { type: 'free-text' } | { type: 'select', options: [...] };
+ readonly logLabel: string;
+ readonly settings?: AgentEngineSettingsDefinition;
+}
+```
+
+### AgentExecutionPlan
+
+The fully resolved plan passed to `engine.execute()`, combining context, prompts, and policy:
+
+```typescript
+interface AgentExecutionPlan
+ extends AgentExecutionContext, // repoDir, project, agentInput, logWriter, etc.
+ AgentPromptSpec, // systemPrompt, taskPrompt, availableTools, contextInjections
+ AgentEnginePolicy { // maxIterations, model, budgetUsd, engineSettings
+ cliToolsDir: string;
+ nativeToolShimDir?: string;
+ completionRequirements?: CompletionRequirements;
+}
+```
+
+## Two Engine Archetypes
+
+### `native-tool` — Subprocess-based CLI tools
+
+Used when the engine runs as an external CLI process with its own built-in file/bash tools.
+
+**Base class**: `NativeToolEngine` (`src/backends/shared/NativeToolEngine.ts`)
+
+Provides:
+- `buildEngineEnv()` — construct subprocess environment with allowlisted env vars and project secrets
+- `resolveModel()` delegation to `resolveEngineModel()`
+- `afterExecute()` cleanup for offloaded context files
+
+**Implementations**: Claude Code (`src/backends/claude-code/`), Codex (`src/backends/codex/`), OpenCode (`src/backends/opencode/`)
+
+Native-tool engines invoke CASCADE domain tools (PM, SCM, alerting) via the `cascade-tools` CLI binary through Bash commands. File operations use the engine's built-in tools (Read, Write, Edit, Bash, Glob, Grep).
+
+### `sdk` — In-process SDK integrations
+
+Used when the engine runs in-process and manages its own LLM API calls.
+
+**Implementation**: LLMist (`src/backends/llmist/`)
+
+SDK engines invoke gadgets server-side as synthetic tool calls — the engine calls the gadget function directly and injects the result into the LLM context.
+
+## Engine Registry
+
+`src/backends/registry.ts`
+
+```typescript
+function registerEngine(engine: AgentEngine): void;
+function getEngine(name: string): AgentEngine;
+function getEngineCatalog(): AgentEngineDefinition[];
+```
+
+Engines are registered at bootstrap (`src/backends/bootstrap.ts`) before any config loading or webhook processing begins.
+
+### Engine resolution
+
+When an agent runs, the engine is resolved in order:
+1. Agent-type override (from `agent_configs.agent_engine` for this project + agent type)
+2. Project-level default (`project.agentEngine.default`)
+3. Global fallback: `'claude-code'`
+
+## Execution Adapter
+
+`src/backends/adapter.ts` — `executeWithEngine()`
+
+This is the central orchestration function that wraps every engine call. It handles everything that is common across engines:
+
+```mermaid
+sequenceDiagram
+ participant C as Caller
+ participant A as Adapter
+ participant S as Secret Orchestrator
+ participant E as Engine
+ participant D as Database
+
+ C->>A: executeWithEngine(engine, agentType, input)
+ A->>A: Setup repo directory (clone if needed)
+ A->>A: Create FileLogger + LogWriter
+ A->>D: Create run record
+ A->>S: Build AgentExecutionPlan
+ S->>S: Resolve model, fetch context, build prompts
+ S->>S: Resolve project secrets, engine settings
+ A->>A: Start progress monitor
+ A->>E: engine.beforeExecute(plan)
+ A->>E: engine.execute(plan)
+ E-->>A: AgentEngineResult
+ A->>E: engine.afterExecute(plan, result)
+ A->>A: Post-process result (extract PR evidence)
+ A->>A: Run continuation loop if needed
+ A->>D: Finalize run record (status, cost, logs)
+ A->>A: Cleanup (repo deletion, temp files)
+ A-->>C: AgentResult
+```
+
+### Key stages
+
+1. **Repo setup** — Clone repository or use existing working directory
+2. **Run record** — Create `agent_runs` database entry with `running` status
+3. **Plan building** (`src/backends/secretOrchestrator.ts`) — Resolve model, fetch context injections, build system/task prompts, gather project secrets, merge engine settings
+4. **Progress monitoring** (`src/backends/progressMonitor.ts`) — Timer-based progress updates posted to PM card and/or GitHub PR comment
+5. **Engine execution** — `beforeExecute()` → `execute()` → `afterExecute()`
+6. **Completion verification** (`src/backends/completion.ts`) — Check sidecar files for PR/review/push evidence
+7. **Continuation loop** (`src/backends/shared/continuationLoop.ts`) — Re-invoke engine if completion requirements not met
+8. **Finalization** — Update run record with status, duration, cost, logs; upload logs
+
+### LLM call logging
+
+`src/backends/shared/llmCallLogger.ts`
+
+All LLM requests and responses are logged to the `agent_run_llm_calls` table, tracking:
+- Request/response content
+- Token counts (input, output, cached)
+- Cost (USD)
+- Duration
+- Tool calls made
+
+For further details on adding a new engine, see [`docs/adding-engines.md`](../adding-engines.md).
diff --git a/docs/architecture/06-integration-layer.md b/docs/architecture/06-integration-layer.md
new file mode 100644
index 00000000..8cc8bf4d
--- /dev/null
+++ b/docs/architecture/06-integration-layer.md
@@ -0,0 +1,173 @@
+# Integration Layer
+
+CASCADE uses a unified integration abstraction so that infrastructure code (router, worker, webhook handlers) never branches on provider type. Every PM, SCM, and alerting provider is a class implementing `IntegrationModule`, registered into a singleton `IntegrationRegistry` at bootstrap.
+
+## IntegrationModule
+
+`src/integrations/types.ts`
+
+The base contract for all integrations:
+
+```typescript
+interface IntegrationModule {
+ readonly type: string; // 'trello', 'jira', 'github', 'sentry'
+ readonly category: IntegrationCategory; // 'pm' | 'scm' | 'alerting'
+
+ withCredentials<T>(projectId: string, fn: () => Promise<T>): Promise<T>;
+ hasIntegration(projectId: string): Promise<boolean>;
+
+ // Optional webhook methods
+ parseWebhookPayload?(raw: unknown): IntegrationWebhookEvent | null;
+ isSelfAuthored?(event: unknown, projectId: string): Promise<boolean>;
+ lookupProject?(identifier: string): Promise<{ project; config } | null>;
+ extractWorkItemId?(text: string): string | null;
+}
+```
+
+### Credential scoping
+
+`withCredentials()` uses `AsyncLocalStorage` to set provider-specific env vars for the duration of a callback, then restores the original values. This provides per-request credential isolation without global state mutation.
+
+### Integration checking
+
+`hasIntegration()` checks that all required credential roles for the provider are configured for the given project. Role definitions come from `src/config/integrationRoles.ts`.
+
+## IntegrationRegistry
+
+`src/integrations/registry.ts`
+
+```typescript
+class IntegrationRegistry {
+ register(integration: IntegrationModule): void;
+ get(type: string): IntegrationModule; // throws if missing
+ getOrNull(type: string): IntegrationModule | null;
+ getByCategory(category: IntegrationCategory): IntegrationModule[];
+ all(): IntegrationModule[];
+}
+
+const integrationRegistry: IntegrationRegistry; // singleton
+```
+
+## Category Interfaces
+
+### PMIntegration
+
+`src/pm/integration.ts` — extends `IntegrationModule` with PM-specific methods:
+
+- `createProvider(project)` — create a `PMProvider` instance for CRUD operations
+- `resolveLifecycleConfig(project)` — extract labels, statuses, list IDs from project config
+- `postAckComment(projectId, workItemId, message)` — post acknowledgment comment
+- `deleteAckComment(projectId, workItemId, commentId)` — remove ack comment
+- `sendReaction(projectId, event)` — add emoji reaction to source event
+- `lookupProject(identifier)` — find project by board ID or project key
+- `extractWorkItemId(text)` — parse work item ID from text (e.g., Trello URL, JIRA key)
+
+### SCMIntegration
+
+`src/integrations/scm.ts` — extends `IntegrationModule` with SCM-specific methods for webhook payload parsing and project lookup by repository name.
+
+### AlertingIntegration
+
+`src/integrations/alerting.ts` — extends `IntegrationModule` with alerting-specific methods.
+
+## Bootstrap
+
+`src/integrations/bootstrap.ts`
+
+Single, idempotent registration point for all four built-in integrations. Safe to import from router, worker, and dashboard — it does not pull in the agent execution pipeline or template files.
+
+```
+TrelloIntegration → integrationRegistry + pmRegistry
+JiraIntegration → integrationRegistry + pmRegistry
+GitHubSCMIntegration → integrationRegistry
+SentryAlertingIntegration → integrationRegistry
+```
+
+## Credential Roles
+
+`src/config/integrationRoles.ts`
+
+Each provider declares its credential roles — the mapping from logical role names to environment variable keys:
+
+| Provider | Category | Required Roles | Optional Roles |
+|----------|----------|---------------|----------------|
+| Trello | pm | `api_key` → `TRELLO_API_KEY`, `token` → `TRELLO_TOKEN` | `api_secret` |
+| JIRA | pm | `email` → `JIRA_EMAIL`, `api_token` → `JIRA_API_TOKEN` | `webhook_secret` |
+| GitHub | scm | `implementer_token` → `GITHUB_TOKEN_IMPLEMENTER`, `reviewer_token` → `GITHUB_TOKEN_REVIEWER` | `webhook_secret` |
+| Sentry | alerting | `api_token` → `SENTRY_API_TOKEN` | `webhook_secret` |
+
+## Provider Implementations
+
+### Trello (`src/pm/trello/`, `src/trello/`)
+
+- `TrelloIntegration` implements `PMIntegration`
+- `TrelloPMProvider` implements `PMProvider` (card CRUD, comments, labels, checklists)
+- `trelloClient` — Octokit-style client with AsyncLocalStorage credential scoping
+- Media extraction from markdown in card descriptions/comments
+- Status = list ID (cards grouped by lists)
+
+### JIRA (`src/pm/jira/`, `src/jira/`)
+
+- `JiraIntegration` implements `PMIntegration`
+- `JiraPMProvider` implements `PMProvider` (issue CRUD, transitions, comments)
+- `jiraClient` — wraps `jira.js` Version3Client with AsyncLocalStorage scoping
+- ADF (Atlassian Document Format) ↔ markdown conversion (`src/pm/jira/adf.ts`)
+- Status transitions via JIRA transition ID lookup
+- Issue key extraction via regex: `[A-Z][A-Z0-9]+-\d+`
+
+### GitHub (`src/github/`)
+
+- `GitHubSCMIntegration` implements `SCMIntegration`
+- `githubClient` — Octokit wrapper with `withGitHubToken()` AsyncLocalStorage scoping
+- **Dual-persona model** (`src/github/personas.ts`):
+ - **Implementer** — writes code, creates PRs (used by most agents)
+ - **Reviewer** — reviews PRs, can approve or request changes (used by `review` agent)
+ - `isCascadeBot(login)` — checks if a GitHub login belongs to either persona
+ - `resolvePersonaIdentities()` — resolves both tokens to usernames (cached 60s per project)
+- Loop prevention: `respond-to-review` only fires on reviewer's `changes_requested`; comment triggers skip @mentions from any known persona
+
+### Sentry (`src/sentry/`)
+
+- `SentryAlertingIntegration` implements `AlertingIntegration`
+- `sentryClient` — REST API client with Bearer token auth
+- Supports issue alerts, metric alerts, and issue lifecycle webhooks
+- Config: `organizationSlug` stored in `project_integrations.config` JSONB
+
+## PM Abstraction
+
+`src/pm/`
+
+### PMProvider interface
+
+Lower-level data operations consumed by gadgets and lifecycle hooks:
+
+```typescript
+interface PMProvider {
+ getWorkItem(id: string): Promise<WorkItem>;
+ listWorkItems(filter?): Promise<WorkItem[]>;
+ createWorkItem(config): Promise<WorkItem>;
+ updateWorkItem(id, updates): Promise<void>;
+ moveToStatus(id, status): Promise<void>;
+ addComment(id, text): Promise<void>;
+ getChecklists(id): Promise<Checklist[]>;
+ addLabel(id, label): Promise<void>;
+ removeLabel(id, label): Promise<void>;
+ linkPR(id, prUrl): Promise<void>;
+ // ... more operations
+}
+```
+
+### PMRegistry
+
+`src/pm/registry.ts` — backward-compatible PM-specific registry. Maps PM type to integration instance. Used by trigger handlers and gadgets that need PM operations.
+
+### PM Lifecycle Manager
+
+`src/pm/lifecycle.ts` — orchestrates card/issue state during agent execution:
+
+- `prepareForAgent()` — add processing label, move to "In Progress"
+- `handleSuccess()` — add processed label, move to "In Review", link PR
+- `handleFailure()` — add error label, post error comment
+- `cleanupProcessing()` — remove processing label
+
+For the complete step-by-step guide to adding a new integration, see [`src/integrations/README.md`](../../src/integrations/README.md).
diff --git a/docs/architecture/07-gadgets.md b/docs/architecture/07-gadgets.md
new file mode 100644
index 00000000..3d1ab228
--- /dev/null
+++ b/docs/architecture/07-gadgets.md
@@ -0,0 +1,119 @@
+# Gadgets
+
+Gadgets are the tool implementations that agents use to interact with their environment. They are the concrete operations behind capabilities — when an agent definition declares `fs:write`, the capability registry maps that to gadgets like `WriteFile`, `FileSearchAndReplace`, and `FileMultiEdit`.
+
+## Capability-to-Gadget Mapping
+
+The `CAPABILITY_REGISTRY` in `src/agents/capabilities/registry.ts` is the single source of truth:
+
+```
+Agent YAML definition
+ → capabilities.required + optional
+ → CAPABILITY_REGISTRY lookup
+ → gadgetNames[] per capability
+ → GADGET_CONSTRUCTORS instantiation
+ → Gadget[] passed to engine
+```
+
+For **SDK engines** (LLMist): gadgets are instantiated as server-side classes and invoked directly when the LLM makes a tool call.
+
+For **native-tool engines** (Claude Code, Codex, OpenCode): the engine uses its own built-in tools for file/shell operations. Domain tools (PM, SCM, alerting) are invoked via the `cascade-tools` CLI binary through Bash commands.
+
+## Built-in Gadgets
+
+### File system (`fs:read`, `fs:write`)
+
+| Gadget | Capability | Purpose |
+|--------|-----------|---------|
+| `ListDirectory` | `fs:read` | List directory contents |
+| `ReadFile` | `fs:read` | Read file contents |
+| `RipGrep` | `fs:read` | Regex code search |
+| `AstGrep` | `fs:read` | AST-based code search |
+| `WriteFile` | `fs:write` | Write file contents |
+| `FileSearchAndReplace` | `fs:write` | Search and replace in files |
+| `FileMultiEdit` | `fs:write` | Multiple edits in a single file |
+| `VerifyChanges` | `fs:write` | Verify edits produce expected results |
+
+All file gadgets validate paths against allowed directories (working directory + `/tmp`). Write gadgets run post-edit diagnostics to catch syntax errors immediately.
+
+### Shell (`shell:exec`)
+
+| Gadget | Capability | Purpose |
+|--------|-----------|---------|
+| `Tmux` | `shell:exec` | Execute shell commands in a tmux session |
+| `Sleep` | `shell:exec` | Wait for a specified duration |
+
+### Session (`session:ctrl`)
+
+| Gadget | Capability | Purpose |
+|--------|-----------|---------|
+| `Finish` | `session:ctrl` | Signal task completion |
+| `TodoUpsert` | `session:ctrl` | Create or update a todo item |
+| `TodoUpdateStatus` | `session:ctrl` | Mark todo as pending/in_progress/done |
+| `TodoDelete` | `session:ctrl` | Remove a todo item |
+
+Todos are stored in `.claude/todos.json` within the repo working directory.
+
+### PM (`pm:read`, `pm:write`, `pm:checklist`)
+
+| Gadget | Capability | Purpose |
+|--------|-----------|---------|
+| `ReadWorkItem` | `pm:read` | Fetch work item details |
+| `ListWorkItems` | `pm:read` | List work items with filters |
+| `UpdateWorkItem` | `pm:write` | Update work item fields |
+| `CreateWorkItem` | `pm:write` | Create new work item |
+| `MoveWorkItem` | `pm:write` | Move work item to a status/list |
+| `PostComment` | `pm:write` | Post comment on work item |
+| `AddChecklist` | `pm:write` | Add checklist to work item |
+| `PMUpdateChecklistItem` | `pm:checklist` | Update checklist item status |
+| `PMDeleteChecklistItem` | `pm:checklist` | Delete checklist item |
+
+PM gadgets use the active `PMProvider` from `AsyncLocalStorage` context, making them provider-agnostic.
+
+### SCM (`scm:read`, `scm:ci-logs`, `scm:comment`, `scm:review`, `scm:pr`)
+
+| Gadget | Capability | Purpose |
+|--------|-----------|---------|
+| `GetPRDetails` | `scm:read` | Fetch PR metadata and state |
+| `GetPRDiff` | `scm:read` | Get PR diff (additions/deletions) |
+| `GetPRChecks` | `scm:read` | Get CI check status |
+| `GetCIRunLogs` | `scm:ci-logs` | Download failed CI job logs |
+| `PostPRComment` | `scm:comment` | Post issue comment on PR |
+| `UpdatePRComment` | `scm:comment` | Update existing comment |
+| `GetPRComments` | `scm:comment` | List PR comments |
+| `ReplyToReviewComment` | `scm:comment` | Reply to inline review comment |
+| `CreatePRReview` | `scm:review` | Submit code review |
+| `CreatePR` | `scm:pr` | Create pull request |
+
+### Alerting (`alerting:read`)
+
+| Gadget | Capability | Purpose |
+|--------|-----------|---------|
+| `GetAlertingIssue` | `alerting:read` | Fetch Sentry issue details |
+| `GetAlertingEventDetail` | `alerting:read` | Fetch specific event with stacktrace |
+| `ListAlertingEvents` | `alerting:read` | List recent events for an issue |
+
+## cascade-tools CLI
+
+`src/cli/` — the `cascade-tools` binary
+
+Native-tool engines cannot invoke gadget classes directly (they run as subprocesses). Instead, they call `cascade-tools` via Bash commands. The CLI is organized by category:
+
+| Category | Commands | Example |
+|----------|----------|---------|
+| PM | `cascade-tools pm read-card`, `list-cards`, `update-card`, etc. | `cascade-tools pm read-card --cardId=abc123 --raw-json` |
+| SCM | `cascade-tools github get-pr-details`, `get-diff`, `post-comment`, etc. | `cascade-tools github get-pr-details --pr-number=42` |
+| Alerting | `cascade-tools sentry get-issue`, `list-events`, etc. | `cascade-tools sentry get-issue --issue-id=12345` |
+| Session | `cascade-tools session todo-upsert`, `todo-status`, etc. | `cascade-tools session todo-upsert --id=1 --title="Fix tests"` |
+
+The `cascade-tools` binary uses a separate oclif config (`bin/cascade-tools.js`) that discovers all non-dashboard commands, while `cascade` discovers only dashboard commands.
+
+## Session State
+
+`src/gadgets/sessionState.ts`
+
+Gadgets communicate session-level state via a shared `SessionState` object:
+- Progress comment ID (for updating in-place ack comments)
+- GitHub auth mode (which persona is active)
+- Read tracking — which files have been read (avoids re-reads)
+- Edited files tracking — for post-edit diagnostics
diff --git a/docs/architecture/08-config-credentials.md b/docs/architecture/08-config-credentials.md
new file mode 100644
index 00000000..700548c5
--- /dev/null
+++ b/docs/architecture/08-config-credentials.md
@@ -0,0 +1,153 @@
+# Configuration and Credentials
+
+CASCADE stores all project configuration in PostgreSQL. There are no config files read at runtime — the database is the sole source of truth.
+
+## Config Provider
+
+`src/config/provider.ts`
+
+The config provider loads project configuration from the database with in-memory caching.
+
+### Loading functions
+
+| Function | Lookup key | Returns |
+|----------|-----------|---------|
+| `loadConfig()` | All projects | `CascadeConfig` (all projects in org) |
+| `loadProjectConfigByBoardId(boardId)` | Trello board ID | `{ project, config }` |
+| `loadProjectConfigByRepo(repo)` | GitHub `owner/repo` | `{ project, config }` |
+| `loadProjectConfigByJiraProjectKey(key)` | JIRA project key | `{ project, config }` |
+| `loadProjectConfigById(id)` | CASCADE project ID | `{ project, config }` |
+
+### Caching
+
+`src/config/configCache.ts` — in-memory cache with TTL populated at service startup. Caches:
+- Full config object
+- Per-project lookups by board ID, repo, JIRA key
+- Invalidated on config writes (via tRPC mutations)
+
+## Config Schema
+
+`src/config/schema.ts`
+
+Project configuration is validated with Zod schemas. Key fields:
+
+```typescript
+interface ProjectConfig {
+ id: string;
+ orgId: string;
+ name: string;
+ repo?: string; // GitHub owner/repo
+ baseBranch: string; // default: 'main'
+ branchPrefix: string; // default: 'feature/'
+ model: string; // LLM model identifier
+ maxIterations: number; // default: 50
+ watchdogTimeoutMs: number; // default: 30 min
+ workItemBudgetUsd: number; // default: $5
+ progressModel: string;
+ progressIntervalMinutes: number; // default: 5
+ agentEngine?: { default: string; overrides: Record<string, string> };
+ engineSettings?: EngineSettings;
+ agentEngineSettings?: Record<string, EngineSettings>;
+ runLinksEnabled: boolean;
+ maxInFlightItems?: number;
+ // ... PM config (trello/jira), agent models, snapshot settings
+}
+```
+
+## Credential Resolution
+
+CASCADE uses a two-tier credential resolution system, selecting the appropriate resolver based on execution context.
+
+### Router / Dashboard context
+
+Uses `DbCredentialResolver` — reads credentials from the `project_credentials` database table:
+
+```typescript
+getIntegrationCredential(projectId, category, role) // e.g., ('proj1', 'pm', 'api_key')
+getAllProjectCredentials(projectId) // All credentials as env-var-key map
+```
+
+### Worker context
+
+Uses `EnvCredentialResolver` — reads from `process.env` (pre-loaded by the router's `worker-env.ts`):
+
+The router builds the worker's environment by:
+1. Loading all project credentials from the database
+2. Setting them as individual env vars on the Docker container
+3. Setting `CASCADE_CREDENTIAL_KEYS` with a comma-separated list of the env var names
+
+When the worker starts, it detects `CASCADE_CREDENTIAL_KEYS` and uses `EnvCredentialResolver` instead of hitting the database.
+
+### Auto-selection
+
+```typescript
+// If CASCADE_CREDENTIAL_KEYS is set → worker context (env resolver)
+// Otherwise → router/dashboard context (DB resolver)
+```
+
+### AsyncLocalStorage scoping
+
+Provider clients use `AsyncLocalStorage` for per-request credential isolation:
+
+```typescript
+// GitHub
+await withGitHubToken(token, async () => {
+ // All GitHub API calls in this scope use this token
+});
+
+// Trello
+await withTrelloCredentials({ apiKey, token }, async () => {
+ // All Trello API calls use these credentials
+});
+
+// JIRA
+await withJiraCredentials({ email, apiToken, baseUrl }, async () => {
+ // All JIRA API calls use these credentials
+});
+```
+
+## Credential Encryption
+
+`src/db/crypto.ts`
+
+When `CREDENTIAL_MASTER_KEY` is set (64-char hex string = 32-byte AES-256 key), credentials are encrypted at rest.
+
+- **Algorithm**: AES-256-GCM with 12-byte random IV and 16-byte auth tag
+- **AAD**: `projectId` (additional authenticated data)
+- **Storage format**: `enc:v1:<iv>:<authTag>:<ciphertext>`
+- **Transparent**: `writeProjectCredential()` encrypts before DB write; read functions decrypt automatically
+- **Opt-in**: Without the env var, credentials are stored and read as plaintext
+
+### Key management
+
+```bash
+npm run credentials:generate-key # Generate new 32-byte key
+npm run credentials:encrypt # Encrypt all existing plaintext credentials
+npm run credentials:decrypt # Rollback to plaintext
+npm run credentials:rotate-key # Re-encrypt with CREDENTIAL_MASTER_KEY_NEW
+```
+
+## Integration Roles
+
+`src/config/integrationRoles.ts`
+
+Maps provider → category → credential roles. Each role maps a logical name to an env var key:
+
+```typescript
+registerCredentialRoles('trello', 'pm', [
+ { role: 'api_key', label: 'API Key', envVarKey: 'TRELLO_API_KEY' },
+ { role: 'token', label: 'Token', envVarKey: 'TRELLO_TOKEN' },
+]);
+```
+
+`hasIntegration()` returns `true` only if all non-optional roles have values stored.
+
+## Engine Settings
+
+`src/config/engineSettings.ts`
+
+Per-engine configuration schemas registered dynamically at bootstrap. Settings are merged at execution time:
+1. Project-level `engineSettings` (base)
+2. Agent-config-level `agentEngineSettings[agentType]` (override)
+
+Each engine optionally provides a `getSettingsSchema()` method that returns a Zod schema, registered via `registerEngineSettingsSchema()`.
diff --git a/docs/architecture/09-database.md b/docs/architecture/09-database.md
new file mode 100644
index 00000000..55ba8f08
--- /dev/null
+++ b/docs/architecture/09-database.md
@@ -0,0 +1,197 @@
+# Database
+
+CASCADE uses PostgreSQL with [Drizzle ORM](https://orm.drizzle.team/) for type-safe database access. All data access goes through repository modules — no raw SQL in application code.
+
+## Schema
+
+`src/db/schema/`
+
+```mermaid
+erDiagram
+ organizations ||--o{ projects : "has"
+ organizations ||--o{ users : "has"
+ organizations ||--o{ prompt_partials : "has"
+
+ projects ||--o{ project_integrations : "has"
+ projects ||--o{ project_credentials : "has"
+ projects ||--o{ agent_configs : "has"
+ projects ||--o{ agent_definitions : "has"
+ projects ||--o{ agent_trigger_configs : "has"
+ projects ||--o{ agent_runs : "tracks"
+ projects ||--o{ pr_work_items : "maps"
+
+ agent_runs ||--o| agent_run_logs : "has"
+ agent_runs ||--o{ agent_run_llm_calls : "logs"
+ agent_runs ||--o| debug_analyses : "analyzed by"
+
+ users ||--o{ sessions : "has"
+
+ organizations {
+ text id PK
+ text name
+ jsonb settings
+ }
+
+ projects {
+ text id PK
+ text org_id FK
+ text name
+ text repo
+ text base_branch
+ text model
+ integer max_iterations
+ integer watchdog_timeout_ms
+ numeric work_item_budget_usd
+ jsonb agent_engine
+ jsonb engine_settings
+ }
+
+ project_integrations {
+ uuid id PK
+ text project_id FK
+ text category
+ text provider
+ jsonb config
+ jsonb triggers
+ }
+
+ project_credentials {
+ uuid id PK
+ text project_id FK
+ text env_var_key
+ text value
+ }
+
+ agent_configs {
+ uuid id PK
+ text project_id FK
+ text agent_type
+ text model
+ integer max_iterations
+ text agent_engine
+ jsonb agent_engine_settings
+ integer max_concurrency
+ text system_prompt
+ text task_prompt
+ }
+
+ agent_trigger_configs {
+ uuid id PK
+ text project_id FK
+ text agent_type
+ text event
+ boolean enabled
+ jsonb parameters
+ }
+
+ agent_runs {
+ uuid id PK
+ text project_id FK
+ text agent_type
+ text status
+ text model
+ integer llm_iterations
+ integer gadget_calls
+ numeric cost_usd
+ integer duration_ms
+ text pr_url
+ text work_item_id
+ text error
+ }
+
+ agent_run_logs {
+ uuid id PK
+ uuid run_id FK
+ text cascade_log
+ text engine_log
+ }
+
+ agent_run_llm_calls {
+ uuid id PK
+ uuid run_id FK
+ integer call_number
+ jsonb request
+ jsonb response
+ integer input_tokens
+ integer output_tokens
+ numeric cost_usd
+ integer duration_ms
+ }
+```
+
+### Key tables
+
+| Table | Purpose | Key constraints |
+|-------|---------|-----------------|
+| `organizations` | Multi-tenant organization definitions | — |
+| `projects` | Per-project config (repo, model, budget, engine) | `repo` UNIQUE |
+| `project_integrations` | Integration configs with category/provider | UNIQUE(`project_id`, `category`) |
+| `project_credentials` | Encrypted credentials keyed by env var name | UNIQUE(`project_id`, `env_var_key`) |
+| `agent_configs` | Per-agent-type overrides per project | UNIQUE(`project_id`, `agent_type`), `project_id NOT NULL` |
+| `agent_definitions` | Agent YAML definitions (built-in + custom) | UNIQUE(`agent_type`) |
+| `agent_trigger_configs` | Trigger enable/disable + parameters per project/agent/event | UNIQUE(`project_id`, `agent_type`, `event`) |
+| `agent_runs` | Agent execution records with status, cost, duration | Indexed on `project_id`, `status`, `started_at` |
+| `agent_run_logs` | Cascade log + engine log per run | One-to-one with `agent_runs` |
+| `agent_run_llm_calls` | LLM request/response pairs with token/cost tracking | — |
+| `prompt_partials` | Org-scoped prompt template customizations | UNIQUE(`org_id`, `name`) |
+| `pr_work_items` | Maps PRs to work items for run-link display | — |
+| `webhook_logs` | Raw webhook payloads for debugging | — |
+| `users` | Dashboard users (email, bcrypt hash, role) | Org-scoped |
+| `sessions` | Session tokens for cookie auth (30-day expiry) | — |
+| `debug_analyses` | AI debug analysis results | — |
+
+## Repositories
+
+`src/db/repositories/`
+
+Each table has a dedicated repository providing typed query methods. Key repositories:
+
+| Repository | Purpose |
+|------------|---------|
+| `configRepository` | Load full project config from DB, merge integrations + credentials |
+| `configMapper` | Transform raw DB rows to typed `ProjectConfig` objects |
+| `credentialsRepository` | Credential CRUD with transparent encryption/decryption |
+| `runsRepository` | Run lifecycle (create, update status, query by project/status) |
+| `runLogsRepository` | Store and retrieve cascade + engine logs |
+| `llmCallsRepository` | Log and query LLM request/response pairs |
+| `agentConfigsRepository` | Per-agent settings CRUD |
+| `agentDefinitionsRepository` | Agent definition CRUD (YAML ↔ JSONB) |
+| `agentTriggerConfigsRepository` | Trigger enable/disable/params per project/agent/event |
+| `integrationsRepository` | Query integration configuration |
+| `projectsRepository` | Project CRUD |
+| `organizationsRepository` | Organization CRUD |
+| `usersRepository` | User management |
+| `partialsRepository` | Prompt partial CRUD |
+| `prWorkItemsRepository` | PR ↔ work item mapping |
+| `webhookLogsRepository` | Webhook audit trail |
+| `debugAnalysisRepository` | Debug analysis results |
+
+## Connection Management
+
+`src/db/client.ts`
+
+- `DatabaseContext` class wraps Drizzle instance + `pg.Pool`
+- `getDb()` returns a singleton, lazily initialized from `DATABASE_URL`
+- SSL support with optional CA certificate (`DATABASE_CA_CERT`)
+- In workers, the DB connection is initialized eagerly (before env scrub removes `DATABASE_URL`)
+
+## Migrations
+
+Migrations are hand-written SQL files in `src/db/migrations/`, tracked by drizzle-kit's journal (`meta/_journal.json`).
+
+### Adding a migration
+
+1. Create `src/db/migrations/NNNN_description.sql`
+2. Add entry to `src/db/migrations/meta/_journal.json` with unique `when` timestamp and `tag` matching filename
+3. Run `npm run db:migrate`
+
+### Scripts
+
+| Command | Purpose |
+|---------|---------|
+| `npm run db:migrate` | Apply pending migrations |
+| `npm run db:generate` | Generate migration SQL from schema changes |
+| `npm run db:push` | Push schema directly (dev only) |
+| `npm run db:studio` | Open Drizzle Studio |
+| `npm run db:seed` | Seed from `config/projects.json` |
+| `npm run db:bootstrap-journal` | Register existing migrations (one-time for `push`-initialized DBs) |
diff --git a/docs/architecture/10-resilience.md b/docs/architecture/10-resilience.md
new file mode 100644
index 00000000..8123e192
--- /dev/null
+++ b/docs/architecture/10-resilience.md
@@ -0,0 +1,141 @@
+# Resilience
+
+CASCADE runs long-lived agent sessions (up to 30+ minutes) against external LLM APIs. The resilience layer ensures reliable operation through watchdog timers, concurrency controls, rate limiting, retry strategies, and loop prevention.
+
+## Watchdog
+
+`src/utils/lifecycle.ts`
+
+Each worker container has a configurable watchdog timer that force-exits the process if the agent exceeds its timeout:
+
+- **Timeout**: Configurable per project via `watchdogTimeoutMs` (default: 30 minutes)
+- **Cleanup**: A cleanup callback is registered via `setWatchdogCleanup()` and called before force exit (with a 10-second cap)
+- **Router-side buffer**: The router's worker manager adds a 2-minute buffer on top of the worker watchdog before considering a container orphaned
+
+```typescript
+startWatchdog(timeoutMs, () => {
+ // cleanup callback: finalize run record, upload logs
+});
+```
+
+## Concurrency Controls
+
+### Work-item lock
+
+`src/router/work-item-lock.ts`
+
+Prevents multiple agents from working on the same card/issue simultaneously. The lock is in-memory (router process) with TTL expiry.
+
+- Checked at webhook processing time (step 8 of the pipeline)
+- Marked when job is enqueued, cleared when worker completes
+- Key: `(projectId, workItemId, agentType)`
+
+### Agent-type concurrency limit
+
+`src/router/agent-type-lock.ts`
+
+Configurable `max_concurrency` per agent type per project (set via `agent_configs.max_concurrency`). Prevents too many instances of the same agent type running simultaneously.
+
+- Tracks enqueued + running counts
+- Blocks new jobs when limit reached
+- Includes a "recently dispatched" window to prevent race conditions between enqueueing and worker startup
+
+### Max in-flight items
+
+`projects.max_in_flight_items` — project-level cap on total concurrent agent runs. Checked during trigger dispatch.
+
+### BullMQ concurrency
+
+The router's worker manager limits how many Docker containers run in parallel via `routerConfig.maxWorkers`.
+
+## Rate Limiting
+
+`src/config/rateLimits.ts`
+
+Proactive, model-specific rate limits prevent hitting LLM provider quotas. Configured per model with safety margins (80-90% of actual limits):
+
+- **RPM** (requests per minute)
+- **TPM** (tokens per minute)
+- **Daily token limit**
+
+Rate limits are enforced by the LLMist SDK for `sdk`-archetype engines. Native-tool engines (Claude Code, Codex) handle rate limiting internally.
+
+## Retry Strategy
+
+`src/config/retryConfig.ts`
+
+Handles transient LLM API failures:
+
+- **5 retry attempts** with exponential backoff (1s base, 60s max)
+- **Jitter** randomization prevents thundering herd
+- **Respects `Retry-After` headers** (capped at 2 minutes)
+- **Custom detection** for undici/fetch stream termination errors
+- **Logging** and Sentry breadcrumbs on each retry and exhaustion
+
+Retries cover: HTTP 429 (rate limit), 5xx (server errors), timeouts, and connection failures.
+
+## Context Compaction
+
+`src/config/compactionConfig.ts`
+
+Prevents context window overflow during long-running agent sessions:
+
+- **Trigger**: 80% context usage
+- **Target**: Reduce to 50%
+- **Preserve**: 5 most recent turns
+- **Strategy**: Hybrid summarization + sliding window
+- Summarization preserves: task goals, key decisions, discovered facts, errors, and failed approaches (to avoid repeating them)
+- Clears read-tracking state after compaction
+
+## Iteration Hints
+
+`src/config/hintConfig.ts`
+
+Ephemeral trailing messages showing the agent its iteration budget:
+
+- Displayed at configurable thresholds
+- Urgency warnings at >80%: "ITERATION BUDGET: 17/20 - Only 3 remaining!"
+- Helps the LLM prioritize and wrap up before hitting limits
+
+## Loop Prevention
+
+### Bot identity detection
+
+`src/github/personas.ts` — `isCascadeBot(login)`
+
+Both GitHub persona usernames (implementer + reviewer) are resolved and cached. Event handlers check if the event author is a known persona to prevent self-triggered loops:
+
+- `respond-to-review` only fires when the **reviewer** persona submits `changes_requested`
+- `respond-to-pr-comment` skips @mentions from **any** known persona
+- Trello/JIRA handlers check their bot member/account IDs similarly
+
+### Self-authored event filtering
+
+Each `RouterPlatformAdapter.isSelfAuthored()` checks the webhook payload author against known bot identities. Self-authored events are logged and discarded at step 4 of the webhook pipeline.
+
+## Security
+
+### Environment scrubbing
+
+`src/utils/envScrub.ts` — `scrubSensitiveEnv()`
+
+After the worker initializes its DB connection and caches config, sensitive env vars (`DATABASE_URL`, master keys) are removed from `process.env`. This prevents LLM-generated shell commands (executed by agents) from accessing database credentials.
+
+### Credential encryption at rest
+
+See [08-config-credentials](./08-config-credentials.md) — AES-256-GCM encryption with transparent encrypt/decrypt.
+
+## Orphan Cleanup
+
+`src/router/orphan-cleanup.ts`
+
+Periodic scan for Docker containers that outlived their expected lifetime (watchdog timeout + buffer). Orphans are killed and their run records marked as failed.
+
+## Snapshot Management
+
+`src/router/snapshot-manager.ts`, `src/router/snapshot-cleanup.ts`
+
+Optional container snapshots for warm restarts:
+- After a worker completes, its container state can be snapshotted
+- Subsequent runs for the same project reuse the snapshot (faster startup, cached dependencies)
+- Snapshots have a configurable TTL (`snapshotTtlMs`) and are cleaned up periodically
diff --git a/tests/unit/architecture-docs.test.ts b/tests/unit/architecture-docs.test.ts
new file mode 100644
index 00000000..037eef5e
--- /dev/null
+++ b/tests/unit/architecture-docs.test.ts
@@ -0,0 +1,163 @@
+import { existsSync, readFileSync } from 'node:fs';
+import path from 'node:path';
+
+const DOCS_ROOT = path.resolve(__dirname, '../../docs'); // repo-level docs/ directory (tests live two levels down)
+const ARCH_DIR = path.join(DOCS_ROOT, 'architecture'); // deep-dive documents live here
+
+// Read a documentation file and return its contents as UTF-8 text.
+const readDoc = (filePath: string): string =>
+  readFileSync(filePath, { encoding: 'utf-8' });
+
+function extractMarkdownLinks(content: string): string[] {
+  const linkPattern = /\[[^\]]*\]\((?![A-Za-z][A-Za-z0-9+.-]*:)([^)#]+\.md)(?:#[^)]*)?\)/g; // any relative .md target (bare, ./ or ../), optional #anchor stripped; lookahead skips scheme URLs (https:, mailto:)
+  return Array.from(content.matchAll(linkPattern), (m) => m[1]);
+}
+
+describe('Architecture documentation', () => { // guards docs/ARCHITECTURE.md and its deep-dive set against drift
+  describe('hub document (ARCHITECTURE.md)', () => {
+    const hubPath = path.join(DOCS_ROOT, 'ARCHITECTURE.md');
+
+    it('exists', () => {
+      expect(existsSync(hubPath)).toBe(true);
+    });
+
+    it('contains expected sections', () => {
+      const content = readDoc(hubPath);
+      const expectedSections = [ // top-level headings the hub must keep
+        'System Overview',
+        'Service Topology',
+        'End-to-End Request Flow',
+        'Architectural Patterns',
+        'Directory Map',
+        'Deep-Dive Documents',
+      ];
+      for (const section of expectedSections) {
+        expect(content).toContain(section);
+      }
+    });
+
+    it('contains mermaid diagrams', () => {
+      const content = readDoc(hubPath);
+      expect(content).toContain('```mermaid');
+    });
+
+    it('links to all 10 deep-dive documents', () => { // substring check only; resolvability is covered by the cross-references suite
+      const content = readDoc(hubPath);
+      const deepDiveFiles = [
+        '01-services.md',
+        '02-webhook-pipeline.md',
+        '03-trigger-system.md',
+        '04-agent-system.md',
+        '05-engine-backends.md',
+        '06-integration-layer.md',
+        '07-gadgets.md',
+        '08-config-credentials.md',
+        '09-database.md',
+        '10-resilience.md',
+      ];
+      for (const file of deepDiveFiles) {
+        expect(content).toContain(file);
+      }
+    });
+  });
+
+  const deepDiveDocuments = [ // manifest: each deep-dive file plus the heading/sections it must contain
+    {
+      file: '01-services.md',
+      expectedHeading: 'Services and Deployment',
+      expectedSections: ['Router', 'Worker', 'Dashboard'],
+    },
+    {
+      file: '02-webhook-pipeline.md',
+      expectedHeading: 'Webhook Pipeline',
+      expectedSections: ['Webhook Handler Factory', 'Platform Adapters'],
+    },
+    {
+      file: '03-trigger-system.md',
+      expectedHeading: 'Trigger System',
+      expectedSections: ['TriggerRegistry', 'TriggerHandler', 'Built-in Triggers'],
+    },
+    {
+      file: '04-agent-system.md',
+      expectedHeading: 'Agent System',
+      expectedSections: ['Agent Definitions', 'Capabilities', 'Prompts'],
+    },
+    {
+      file: '05-engine-backends.md',
+      expectedHeading: 'Engine Backends',
+      expectedSections: ['AgentEngine Interface', 'Execution Adapter'],
+    },
+    {
+      file: '06-integration-layer.md',
+      expectedHeading: 'Integration Layer',
+      expectedSections: ['IntegrationModule', 'IntegrationRegistry'],
+    },
+    {
+      file: '07-gadgets.md',
+      expectedHeading: 'Gadgets',
+      expectedSections: ['Capability-to-Gadget Mapping', 'Built-in Gadgets'],
+    },
+    {
+      file: '08-config-credentials.md',
+      expectedHeading: 'Configuration and Credentials',
+      expectedSections: ['Config Provider', 'Credential Resolution'],
+    },
+    {
+      file: '09-database.md',
+      expectedHeading: 'Database',
+      expectedSections: ['Schema', 'Repositories'],
+    },
+    {
+      file: '10-resilience.md',
+      expectedHeading: 'Resilience',
+      expectedSections: ['Watchdog', 'Concurrency Controls'],
+    },
+  ];
+
+  describe.each(deepDiveDocuments)('$file', ({ file, expectedHeading, expectedSections }) => { // one generated suite per manifest entry; $file is interpolated by jest
+    const filePath = path.join(ARCH_DIR, file);
+
+    it('exists', () => {
+      expect(existsSync(filePath)).toBe(true);
+    });
+
+    it(`contains heading: ${expectedHeading}`, () => {
+      const content = readDoc(filePath);
+      expect(content).toContain(expectedHeading);
+    });
+
+    it('contains expected sections', () => {
+      const content = readDoc(filePath);
+      for (const section of expectedSections) {
+        expect(content).toContain(section);
+      }
+    });
+  });
+
+  describe('cross-references', () => {
+    it('all relative .md links in hub document resolve to existing files', () => {
+      const hubPath = path.join(DOCS_ROOT, 'ARCHITECTURE.md');
+      const content = readDoc(hubPath);
+      const links = extractMarkdownLinks(content);
+
+      expect(links.length).toBeGreaterThan(0); // sanity check: the link regex actually found links
+      for (const link of links) {
+        const resolved = path.resolve(DOCS_ROOT, link); // hub links are relative to docs/
+        expect(existsSync(resolved)).toBe(true);
+      }
+    });
+
+    it('all relative .md links in deep-dive documents resolve to existing files', () => {
+      for (const { file } of deepDiveDocuments) {
+        const filePath = path.join(ARCH_DIR, file);
+        if (!existsSync(filePath)) continue; // existence is asserted by the per-file suite; avoid cascading failures here
+        const content = readDoc(filePath);
+        const links = extractMarkdownLinks(content);
+        for (const link of links) {
+          const resolved = path.resolve(ARCH_DIR, link); // deep-dive links are relative to docs/architecture/
+          expect(existsSync(resolved)).toBe(true);
+        }
+      }
+    });
+  });
+});
diff --git a/web/src/components/projects/project-harness-form.tsx b/web/src/components/projects/project-harness-form.tsx
index 31f3e085..e6370e0a 100644
--- a/web/src/components/projects/project-harness-form.tsx
+++ b/web/src/components/projects/project-harness-form.tsx
@@ -42,7 +42,6 @@ function capitalize(s: string): string {
return s.charAt(0).toUpperCase() + s.slice(1);
}
-// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: multiple query dependencies and per-engine tab rendering for credentials and settings
export function ProjectHarnessForm({ project }: { project: Project }) {
const updateMutation = useProjectUpdate(project.id);
const enginesQuery = useQuery(trpc.agentConfigs.engines.queryOptions());