From 91d751c9a102841441b9528b5d32b8af739eec82 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 16:29:12 +0800 Subject: [PATCH 01/15] =?UTF-8?q?docs(spec):=20Phase=202=20W3c=20=E2=80=94?= =?UTF-8?q?=20active/standby=20controller=20design=20(OPS-04=20partial)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PG session advisory lock (pg_try_advisory_lock) for app-level leader election; standby → recovering → active state machine; /health/active LB target; INVARIANT-33 503 barrier on the three executor-loop endpoints. Zero schema changes. Single-shared-PG design — PG HA is Phase 4. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...05-15-phase-2-w3c-active-standby-design.md | 529 ++++++++++++++++++ 1 file changed, 529 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-15-phase-2-w3c-active-standby-design.md diff --git a/docs/superpowers/specs/2026-05-15-phase-2-w3c-active-standby-design.md b/docs/superpowers/specs/2026-05-15-phase-2-w3c-active-standby-design.md new file mode 100644 index 0000000..9043445 --- /dev/null +++ b/docs/superpowers/specs/2026-05-15-phase-2-w3c-active-standby-design.md @@ -0,0 +1,529 @@ +# Phase 2 Week 3c — Active/Standby Controller Design + +> **Status:** Draft (brainstormed 2026-05-15). +> **Companion plan:** `docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md` (to be written by writing-plans skill after spec approval). +> **Roadmap source:** `docs/v2.0/08-mvp-roadmap.md` §2.6 — Phase 2 Week 3 Day 5 ("Active/standby + chaos 演练"). Phase 2 exit criterion: "Active/standby switch RTO ≤ 10min (CH-Q1)". +> **Companion split (W3a / W3b):** W3a (mTLS + JWT + HMAC) merged — PR #12. W3b (HF reverse proxy) merged — PR #13. W3c is the last Phase 2 W3 sub-week. +> **Operations source:** `docs/v2.0/05-operations.md` §6.1 (Controller active/standby). +> **Invariant source:** `docs/v2.0/INVARIANTS.md` row 33 ("standby 提升后进入 recovery_in_progress;所有心跳响应 503 直到三向对账完成"). +> **Closes:** OPS-04 (partial — the app-level half. PG-level HA stays in Phase 4.). + +--- + +## 1. Goal & Scope + +### 1.1 Goal + +App-level controller leader election so two controller instances run active/standby with automatic failover. + +**Mechanism.** The instance holding `pg_try_advisory_lock()` (PG session-level lock) is the **active**. It runs `run_recovery_routine`, then the sweep loop, and answers `GET /health/active` with 200. The standby polls for the lock; when the active dies, PG auto-releases the lock on the dropped session and the standby acquires it, auto-promoting through a `standby → recovering → active` state machine. While `recovering`, the executor-loop endpoints (heartbeat / poll / report) return 503 (INVARIANT 33). + +After W3c, single-instance deployments still work identically — the lone instance simply acquires the lock instantly. Operators can run a second instance whenever they're ready. + +### 1.2 In scope + +| Item | Where | +|---|---| +| `LeaderElector` — dedicated NullPool engine holding `pg_try_advisory_lock` | `src/dlw/services/leader_election.py` (new) | +| `run_leader_loop` — async coroutine driving the state machine | same file | +| `main.py` lifespan restructure — recovery + sweep become leader-gated | `src/dlw/main.py` | +| `GET /health/active` — LB target | `src/dlw/api/health.py` | +| `require_not_recovering` dependency — 503 barrier on executor-loop endpoints (heartbeat/poll/report) | `src/dlw/api/_recovery_barrier.py` (new), wired into `api/executors.py` + `api/subtasks.py` | +| Two new `Settings` fields: `active_lock_id`, `leader_poll_interval_seconds` | `src/dlw/config.py` | +| Chaos-drill integration test | `tests/e2e/test_failover_drill.py` (new) | +| Operator runbook note | `docs/operator/executor-runbook.md` | + +### 1.3 Non-goals (deferred — explicit list) + +| Item | Where | +|---|---| +| PG streaming replication / PG primary failover (`promote-standby.sh`) | **Phase 4** — CH-Q3 ("PG primary 故障"). W3c runs against a single shared PostgreSQL. | +| chaos-mesh automation | **Phase 4** — Phase 4 §4.2 explicitly. | +| Warm / read-only standby | rejected in §1; cold standby only (LB routes off `/health/active`). | +| Active-active multi-leader scheduling | explicitly rejected in `05 §6.1`. v2.0 single active only. | +| LB / k8s Helm wiring | the deploy-side health-check config is a deploy task (not part of W3c). W3c only provides the `/health/active` endpoint. | +| Manual demote / step-down admin endpoint | not needed; auto-promotion only. Restart = step down. | +| `DLW_STRICT_RECOVERY` env knob | deleted as part of the lifespan restructure (the leader loop's retry-on-promote-failure replaces it). | +| Active-side metrics (`dlw_controller_state_*` gauges, `time_to_promote` histogram) | useful for operators but a Phase-3 observability polish — out of W3c. The state is observable via `/health/active` + logs. | + +--- + +## 2. Tech Stack Additions + +**None.** `pg_try_advisory_lock` / `pg_advisory_unlock` are built into PostgreSQL; SQLAlchemy async + asyncpg already in use; FastAPI dependency pattern already in use. No new runtime deps, no new dev deps, no new CI jobs, **zero alembic migrations**. + +--- + +## 3. Components + +### 3.1 New: `src/dlw/services/leader_election.py` + +Owns both `LeaderElector` (the lock primitive) and `run_leader_loop` (the state-machine coroutine). Pure async logic — no HTTP, no FastAPI imports — so it's testable in isolation against the local PG. + +```python +"""Controller leader election via PG session advisory lock (Phase 2 W3c). + +The instance holding pg_try_advisory_lock() is the active +controller. PG releases the lock the instant its holding session ends, which +gives us automatic failover with no lease/expiry logic. Single-shared-PG +design — PG HA is a separate concern (Phase 4).""" +from __future__ import annotations + +import asyncio +import logging +from collections.abc import Awaitable, Callable +from typing import Literal + +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine, create_async_engine +from sqlalchemy.pool import NullPool + +logger = logging.getLogger(__name__) + +ControllerState = Literal["standby", "recovering", "active"] + + +class LeaderElector: + """Owns a dedicated PG connection that holds the advisory lock.""" + + def __init__(self, db_url: str, lock_id: int) -> None: + self._db_url = db_url + self._lock_id = lock_id + self._engine: AsyncEngine | None = None + self._conn: AsyncConnection | None = None + + async def try_acquire(self) -> bool: + """Try to acquire the lock. Returns True on success (and keeps the + connection open holding it); False if another instance holds it.""" + if self._engine is None: + # Dedicated engine. NullPool so SQLAlchemy never recycles the + # connection out from under us (recycling would release the lock). + # tcp_keepalives so a half-open TCP gets detected reasonably fast, + # bounding the worst-case failover floor. + self._engine = create_async_engine( + self._db_url, + poolclass=NullPool, + connect_args={"server_settings": { + "tcp_keepalives_idle": "30", + "tcp_keepalives_interval": "10", + }}, + ) + if self._conn is None: + self._conn = await self._engine.connect() + result = await self._conn.execute( + text("SELECT pg_try_advisory_lock(:lock_id)"), + {"lock_id": self._lock_id}, + ) + return bool(result.scalar()) + + async def verify(self) -> bool: + """Ping the lock-holding connection. Returns True if still alive + (we still hold the lock); False if the connection died (lock lost).""" + if self._conn is None: + return False + try: + await self._conn.execute(text("SELECT 1")) + return True + except Exception as e: + logger.warning("leader connection lost: %s", e) + await self._cleanup_connection() + return False + + async def release(self) -> None: + """Explicit release on graceful shutdown. Closing the connection + also releases — this is just the polite version.""" + if self._conn is not None: + try: + await self._conn.execute( + text("SELECT pg_advisory_unlock(:lock_id)"), + {"lock_id": self._lock_id}, + ) + except Exception as e: + logger.warning("pg_advisory_unlock failed (releases on close anyway): %s", e) + await self._cleanup_connection() + if self._engine is not None: + await self._engine.dispose() + self._engine = None + + async def _cleanup_connection(self) -> None: + if self._conn is not None: + try: + await self._conn.close() + except Exception: + pass + self._conn = None + + +async def run_leader_loop( + *, + elector: LeaderElector, + poll_interval_seconds: float, + set_state: Callable[[ControllerState], None], + on_promote: Callable[[], Awaitable[None]], # runs recovery_routine + on_active: Callable[[], Awaitable[None]], # starts the sweep task + on_step_down: Callable[[], Awaitable[None]], # cancels the sweep task + shutdown: asyncio.Event, +) -> None: + """The leader loop. Stays in standby polling for the lock; on acquire + transitions through recovering→active; on connection loss steps back to + standby. Returns cleanly when `shutdown` is set.""" + state: ControllerState = "standby" + set_state(state) + while not shutdown.is_set(): + try: + if state == "standby": + if await elector.try_acquire(): + state = "recovering"; set_state(state) + logger.info("leader: acquired lock, running recovery") + try: + await on_promote() + except Exception: + logger.exception("leader: recovery failed; will retry next tick") + # Stay in `recovering` — heartbeats keep 503ing. + # Don't release the lock (another instance can't fix it). + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue + state = "active"; set_state(state) + await on_active() + logger.info("leader: promoted to active") + elif state in ("recovering", "active"): + if not await elector.verify(): + logger.warning("leader: lost lock, stepping down to standby") + await on_step_down() + state = "standby"; set_state(state) + continue # retry acquire immediately + except asyncio.CancelledError: + raise + except Exception: + logger.exception("leader loop iteration failed") + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + + +async def _sleep_or_shutdown(shutdown: asyncio.Event, seconds: float) -> None: + try: + await asyncio.wait_for(shutdown.wait(), timeout=seconds) + except asyncio.TimeoutError: + pass +``` + +### 3.2 State machine + +``` + ┌──── start ────┐ + ▼ │ +[standby] ── acquire lock ──▶ [recovering] ── recovery_routine OK ──▶ [active] + ▲ │ + │ │ + └────────────── connection dropped / process restart ────────────────┘ +``` + +- `standby` — leader loop polls `try_acquire` every `leader_poll_interval_seconds` (default 5.0). `/health/active` returns 503. Sweep loop not running. Executor-loop endpoints pass through the recovery barrier (no 503), but the LB routes nothing here anyway. +- `recovering` — lock acquired; `run_recovery_routine` in progress. `/health/active` returns 200 (LB cuts over). Heartbeat / poll / report return 503 `{code: "CONTROLLER_RECOVERING"}` — INVARIANT 33. Stays here on `on_promote` failure (retried next tick); does **not** release the lock (another instance can't fix it). +- `active` — recovery done; sweep task spawned by the leader loop. Normal operation. The loop calls `elector.verify()` each tick; if it returns False (connection lost), the loop cancels the sweep, steps down to `standby`, and tries to re-acquire. + +### 3.3 Modified: `src/dlw/main.py` lifespan + +The lifespan loses its unconditional `run_recovery_routine` + `_sweep_loop_main` spawn and gains leader-loop wiring. The W3a auth bootstrap (CA / JWT keypair / nonce store / enrollment token) stays unconditional — both roles need it ready so promotion is instant. + +```python +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncIterator[None]: + from dlw.db.session import get_engine, reset_engine + from dlw.services.recovery import run_recovery_routine + from dlw.services.leader_election import LeaderElector, run_leader_loop + + factory = async_sessionmaker(get_engine(), expire_on_commit=False) + + # W3a auth bootstrap — UNCHANGED. Both active and standby need this ready. + install_transport_scope_patch() + # ... bootstrap_ca / ensure_server_cert / bootstrap_keypair / NonceStore / + # enrollment_token / app.state.{ca,jwt_keypair,nonce_store,enrollment_token} + settings = get_settings() + + # W3c: controller state + leader loop. + app.state.controller_state = "standby" + shutdown = asyncio.Event() + elector = LeaderElector(db_url=settings.db_url, lock_id=settings.active_lock_id) + sweep_task_holder: dict[str, asyncio.Task | None] = {"t": None} + + def _set_state(s: str) -> None: + app.state.controller_state = s + + async def _on_promote() -> None: + async with factory() as session: + stats = await run_recovery_routine(session) + await session.commit() + logger.info("recovery on promote: %s", stats.as_dict()) + + async def _on_active() -> None: + sweep_task_holder["t"] = asyncio.create_task(_sweep_loop_main(factory)) + + async def _on_step_down() -> None: + t = sweep_task_holder["t"] + if t is not None: + t.cancel() + try: + await asyncio.wait_for(t, timeout=2) + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + sweep_task_holder["t"] = None + + leader_task = asyncio.create_task(run_leader_loop( + elector=elector, + poll_interval_seconds=settings.leader_poll_interval_seconds, + set_state=_set_state, + on_promote=_on_promote, + on_active=_on_active, + on_step_down=_on_step_down, + shutdown=shutdown, + )) + try: + yield + finally: + shutdown.set() + await _on_step_down() + try: + await asyncio.wait_for(leader_task, timeout=5) + except (asyncio.CancelledError, asyncio.TimeoutError): + leader_task.cancel() + await elector.release() + await reset_engine() +``` + +`DLW_STRICT_RECOVERY` is **deleted** — the leader loop's "stay in recovering, retry next tick" replaces it. + +### 3.4 New: `src/dlw/api/_recovery_barrier.py` + +```python +"""FastAPI dep that 503s executor-loop calls while the controller is recovering +(INVARIANT 33). Attached to heartbeat / poll / report endpoints.""" +from __future__ import annotations + +from fastapi import HTTPException, Request + + +async def require_not_recovering(request: Request) -> None: + state = getattr(request.app.state, "controller_state", "active") + if state == "recovering": + raise HTTPException( + status_code=503, + detail={"code": "CONTROLLER_RECOVERING", + "message": "controller recovering after failover, retry shortly"}, + ) +``` + +Wired into the three executor-loop routes via `dependencies=[Depends(require_not_recovering)]`: +- `POST /api/v1/executors/{executor_id}/heartbeat` (`api/executors.py`) +- `POST /api/v1/executors/{executor_id}/poll` (`api/executors.py`) +- `POST /api/v1/subtasks/{subtask_id}/report` (`api/subtasks.py`) + +The HF reverse-proxy endpoint (`/api/v1/hf-proxy/subtask/{id}`) is **not** barriered — it's a streaming passthrough that doesn't mutate state, and an executor that has work already in flight should be able to finish downloading even during recovery (the bytes flow controller-side → HF → executor; nothing the recovery routine touches). + +### 3.5 Modified: `src/dlw/api/health.py` + +Add one endpoint: + +```python +@router.get("/active") +async def active(request: Request) -> dict[str, str]: + """LB target — 200 iff this instance holds the leader lock.""" + state = getattr(request.app.state, "controller_state", "standby") + if state in ("recovering", "active"): + return {"status": "active", "controller_state": state} + raise HTTPException(status_code=503, detail={"controller_state": state}) +``` + +`/health/live` and `/health/ready` are **unchanged** — they remain the k8s livenessProbe and readinessProbe respectively. `/health/active` is for LB-level routing. + +### 3.6 Modified: `src/dlw/config.py` + +`Settings` gains two fields: + +```python + # Phase 2 W3c — controller leader election + active_lock_id: int = Field(default=0x444C5743_414B5631, ge=1) # 'DLWC AKV1' + leader_poll_interval_seconds: float = Field(default=5.0, ge=0.5, le=60.0) +``` + +The `active_lock_id` default is an arbitrary fixed bigint. Both controller instances MUST use the same value to coordinate on the same lock — that's why it's a fixed default, not randomized. + +### 3.7 Test infrastructure: `make_app_with_state` conftest helper + +The lifespan restructure means every API/e2e test that mounts `create_app()` via `ASGITransport` now needs `app.state.controller_state = "active"` (otherwise the new 503 barrier on heartbeat/poll/report fires). Currently each test inlines its app setup (~10-12 sites). W3c adds a small helper in `tests/conftest.py`: + +```python +def make_app_with_state(ephemeral_ca, *, enrollment_token: str, + controller_state: str = "active"): + """Build a controller app for ASGI-transport tests with app.state pre-seeded + (skips the lifespan bootstrap). Defaults controller_state to 'active' so the + W3c recovery barrier doesn't fire.""" + from dlw.auth.hmac_nonce import NonceStore + from dlw.main import create_app + app = create_app() + app.state.ca = ephemeral_ca["ca"] + app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] + app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) + app.state.enrollment_token = enrollment_token + app.state.controller_state = controller_state + return app +``` + +Existing tests that inline `app = create_app()` + `app.state.* = ...` migrate to one call to this helper. Tests that need a non-active state (the new recovery-barrier tests, the `/health/active` tests) pass `controller_state="standby"` or `"recovering"` explicitly. + +--- + +## 4. Schema Changes + +**None.** No new table, no new column, no alembic migration. Advisory locks need no schema. W3c is the second consecutive sub-week with no alembic migration (W3b was the first). + +--- + +## 5. Wire Format Changes + +### 5.1 New endpoint `GET /health/active` + +| Aspect | Value | +|---|---| +| Auth | none (it's a health check; the LB calls it) | +| Response 200 | `{"status": "active", "controller_state": "recovering" | "active"}` | +| Response 503 | `{"detail": {"controller_state": "standby"}}` | + +### 5.2 New 503 response on three executor-loop endpoints + +| Endpoint | New 503 response | +|---|---| +| `POST /api/v1/executors/{executor_id}/heartbeat` | `{"detail": {"code": "CONTROLLER_RECOVERING", "message": ...}}` | +| `POST /api/v1/executors/{executor_id}/poll` | same | +| `POST /api/v1/subtasks/{subtask_id}/report` | same | + +Executors handle this through the existing tenacity `_retry` in `src/dlw/executor/client.py` (3 attempts, exponential backoff to ~4s max) — no new client-side code required. The retry exhausts in seconds; by then recovery has typically completed and the next attempt succeeds. A new test in `tests/executor/test_client.py` asserts this. + +### 5.3 Config surface + +- `Settings` gains: `active_lock_id` (env `DLW_ACTIVE_LOCK_ID`, default `0x444C5743414B5631`), `leader_poll_interval_seconds` (env `DLW_LEADER_POLL_INTERVAL_SECONDS`, default 5.0). +- `Settings` loses: nothing. +- Env `DLW_STRICT_RECOVERY` is **removed** from the codebase. Documented in the operator runbook. + +### 5.4 OpenAPI + +`api/openapi.yaml` gains: +- `/health/active` GET operation (200 / 503). +- A `503 CONTROLLER_RECOVERING` response variant added to the three executor-loop operations (heartbeat / poll / report). The detail-body schema (`{code, message}`) is added as a component or inlined. + +--- + +## 6. Error Handling Matrix + +| Situation | Behaviour | +|---|---| +| First instance starts, PG up | acquires lock → recovers → active | +| Two instances start simultaneously | PG guarantees exactly one wins; the other stays standby polling | +| Active dies (kill / crash / pod evicted) | TCP-close detected (tcp_keepalive ≤ 30s) → PG releases lock → standby's next poll-interval acquires → standby promotes | +| Active's PG connection has a half-open TCP (network partition, active alive but invisible) | tcp_keepalive_idle=30 + tcp_keepalives_interval=10 → connection RST detected → lock released; floor for this failure mode is ~30-90s, well under the 10min RTO | +| `run_recovery_routine` raises on promote | leader loop logs, stays in `recovering`, retries next tick; heartbeats keep 503ing; operator-visible via logs and metrics (alertable from `/health/active`=200 + heartbeat 503 imbalance) | +| Active's lock-holding connection dies while it's `active` | `elector.verify()` returns False → cancel sweep → state → `standby` → next tick re-acquires (or another instance does) | +| Graceful shutdown (SIGTERM) of active | lifespan finally: cancel sweep; cancel leader task; `elector.release()` (explicit `pg_advisory_unlock` + close conn); standby acquires within poll-interval | +| Both instances briefly think they have the lock (split-brain) | Cannot happen — `pg_try_advisory_lock` is atomic and session-scoped; the lock is held by **exactly one** PG session at a time | +| `active_lock_id` mismatch between two instances | both think they're active; double-write into the same PG. Mitigation: fixed default in `Settings`, documented as "must match across all controller instances" in the operator runbook | +| Operator runs old image (pre-W3c) alongside new image | Old image doesn't try to acquire the lock and runs its full lifespan (recovery + sweep). New image acquires lock + runs leader loop. Both run their sweeps → harmless duplicate sweeps against `SKIP LOCKED` queries (correctness preserved by §4.2's SKIP LOCKED). Worth flagging in rollout docs but not a correctness bug. | + +--- + +## 7. Testing Strategy + +### 7.1 New unit/integration tests (~15 new cases) + +| # | File | Case | Asserts | +|---|---|---|---| +| 1-6 | `tests/services/test_leader_election.py` | `LeaderElector` primitives | first acquire / second blocked / release frees / connection drop frees / verify True when held / verify False after drop | +| 7-10 | `tests/services/test_leader_loop.py` | `run_leader_loop` with fake callbacks | standby polls until lock free → promotes / promote failure retries / step-down on connection loss / shutdown exits cleanly | +| 11-13 | `tests/e2e/test_failover_drill.py` | The chaos drill | two instances → exactly one active / kill active → standby promotes within ≤ 3 × poll_interval through `recovering` to `active` / promoted standby actually ran `run_recovery_routine` (verified via a pre-seeded stale `multipart_upload_id` row that gets reconciled) | +| 14-16 | `tests/api/test_recovery_barrier.py` | Barrier dep | heartbeat 503 when recovering; same for poll + report / endpoints normal when active / detail body shape (`code == "CONTROLLER_RECOVERING"`) | +| 17-19 | `tests/api/test_health.py` (extend) | `/health/active` | 503 when standby / 200 when recovering / 200 when active | +| 20 | `tests/executor/test_client.py` (extend) | Client tenacity retry survives a 503 | first call returns 503 `CONTROLLER_RECOVERING`, second returns 200 → tenacity retry succeeds (no new client code needed; this is a behavioral assertion) | + +(Numbering nominal — final count ~15-20 depending on how cases are split.) + +### 7.2 Migration of existing tests + +The lifespan restructure means every API/e2e test that uses `ASGITransport` (and thus bypasses lifespan) must set `app.state.controller_state = "active"` so the new 503 barrier on heartbeat/poll/report doesn't fire. + +Affected files (one line per site, or one call to `make_app_with_state`): +- `tests/e2e/test_happy_path.py` +- `tests/e2e/test_executor_e2e.py` +- `tests/e2e/test_executor_auth_e2e.py` +- `tests/api/test_hf_proxy.py` +- `tests/api/test_executors.py` +- `tests/api/test_subtasks.py` +- `tests/api/test_register_endpoint.py` +- `tests/api/test_renew_endpoint.py` +- `tests/api/test_cancel_endpoint.py` + +~10-12 sites total. The `make_app_with_state` helper (§3.7) is added as part of W3c; migrating each site is a one-line change. + +### 7.3 Test approach for the chaos drill + +`tests/e2e/test_failover_drill.py` is the W3c centrepiece. It runs **two real `LeaderElector` instances + two `run_leader_loop` tasks** against the same local test PG (no app/HTTP/k8s involved — pure leader-loop simulation). One acquires the lock; the test cancels its loop task and closes its elector connection abruptly; within ≤ 3 × `leader_poll_interval` the other instance is observed transitioning `standby → recovering → active`. Pre-seeded stale rows verify that `run_recovery_routine` actually ran on promotion. + +This is the practical proof-of-concept for "Active/standby switch RTO ≤ 10min" — `leader_poll_interval=0.5` keeps the test fast (sub-2-second failover); production tunes to 5s. + +The manual quarterly CH-Q1 GameDay (`07 §6.2`) is documented but executed by SRE later — out of W3c's test deliverable. + +### 7.4 Not tested + +- Real two-process failover (W3c tests use two electors in one process — a full process-kill scenario needs CH-Q1). +- Real PG primary failover (CH-Q3 — Phase 4). +- chaos-mesh automation (Phase 4). +- LB cutover latency (deploy-side, not a test concern). + +### 7.5 CI 12-check expectations + +| Check | W3c impact | +|---|---| +| pytest | +~15 new + ~10-12 migrated fixture sites | +| OpenAPI lint | `/health/active` route documented; 503 `CONTROLLER_RECOVERING` documented on three executor-loop routes | +| Invariant + cross-ref lint | INVARIANT 33 referenced in code comments + spec/plan cross-refs | +| Markdown lint | spec/plan cross-ref `05 §6.1` + `INVARIANTS §33` | +| Other 8 | no change | + +--- + +## 8. Acceptance Criteria + +- [ ] `LeaderElector` with `try_acquire` / `verify` / `release` on a dedicated NullPool engine (tcp_keepalive tuned). +- [ ] `run_leader_loop` driving `standby → recovering → active` with promote / active / step-down callbacks. +- [ ] `main.py` lifespan: recovery + sweep are leader-gated; `DLW_STRICT_RECOVERY` env knob deleted. +- [ ] `GET /health/active` returns 200 iff the instance is `recovering` or `active`. +- [ ] `require_not_recovering` dependency on heartbeat / poll / report → 503 `CONTROLLER_RECOVERING` while `recovering`. +- [ ] Two new `Settings` fields: `active_lock_id`, `leader_poll_interval_seconds`. +- [ ] ~15 new pytest cases pass; ~10-12 migrated app-fixture sites pass; `make_app_with_state` conftest helper exists. +- [ ] `tests/e2e/test_failover_drill.py` proves the standby-promotes-when-active-dies path end-to-end (two-elector simulation). +- [ ] Full suite green; OpenAPI lint clean (`/health/active` + 503 `CONTROLLER_RECOVERING`); existing invariant lint passes. +- [ ] Operator runbook notes the app-level lock and its relationship to the PG-level `promote-standby.sh`. +- [ ] **Zero alembic migrations**; no new runtime deps; no new CI jobs. + +--- + +## 9. Implementation Phasing (preview for plan) + +3 milestones, ~7-8 tasks. + +- **M1 — Lock primitive.** `LeaderElector` + `test_leader_election.py` (~6 cases) + the two new `Settings` fields + their config test. Pure unit work against local test PG; no app integration yet. +- **M2 — Leader loop + lifespan restructure + barriers.** `run_leader_loop` + `test_leader_loop.py` (~4 cases, fake callbacks); `main.py` lifespan restructure; `/health/active` endpoint + tests; `require_not_recovering` dep + wiring + tests; `make_app_with_state` conftest helper + ~10-12 existing test fixtures migrated; the executor `test_client.py` 503-retry case. +- **M3 — Chaos drill + docs + PR.** `test_failover_drill.py` (the centrepiece); OpenAPI updates (`/health/active`, `CONTROLLER_RECOVERING` 503); operator runbook section; full suite + lint; commit; PR. + +Branch: `feat/phase-2-w3c-active-standby` (already created off `main` after PR #13 merge). + +--- + +## 10. References + +- Spec source: brainstormed 2026-05-15 (this document). +- Roadmap: `docs/v2.0/08-mvp-roadmap.md` §2.6 (Phase 2 W3 Day 5) + §2.5 exit criterion + §2.7 risk note. +- Operations: `docs/v2.0/05-operations.md` §6.1 (Controller active/standby), §6.4 (RB-01). +- Invariant 33: `docs/v2.0/INVARIANTS.md` row 33 ("standby 提升后进入 recovery_in_progress;所有心跳响应 503 直到三向对账完成"). +- Predecessor specs: + - W3a: `docs/superpowers/specs/2026-05-14-phase-2-w3a-mtls-jwt-hmac-design.md` (the auth chain whose endpoints the recovery barrier wraps) + - W3b: `docs/superpowers/specs/2026-05-14-phase-2-w3b-hf-reverse-proxy-design.md` (HF proxy — explicitly NOT barriered; in-flight downloads should complete during recovery) + - W1: `docs/superpowers/specs/2026-05-11-phase-2-week-1-fence-token-recovery-design.md` (`run_recovery_routine` — the work the leader loop calls on promote) +- W3b PR (merged): https://github.com/l17728/modelpull/pull/13 (squash `2924b6e`). From 52276b9f3c67da347dbfcdf8f1f170cfd2bf8fc8 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 16:47:33 +0800 Subject: [PATCH 02/15] =?UTF-8?q?docs(plan):=20Phase=202=20W3c=20=E2=80=94?= =?UTF-8?q?=20active/standby=20controller=20implementation=20plan=20(8=20t?= =?UTF-8?q?asks)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3 milestones: lock primitive (LeaderElector + run_leader_loop), leader loop + health endpoint + recovery barrier + lifespan restructure, chaos drill + OpenAPI + runbook + PR. TDD bite-sized steps, complete code, zero alembic migrations. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-15-phase-2-w3c-active-standby.md | 1691 +++++++++++++++++ 1 file changed, 1691 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md diff --git a/docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md b/docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md new file mode 100644 index 0000000..6112720 --- /dev/null +++ b/docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md @@ -0,0 +1,1691 @@ +# W3c Active/Standby Controller Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add app-level controller leader election via a PG session advisory lock so two controller instances run as active/standby with automatic failover (closes OPS-04 / Phase 2 exit "RTO ≤ 10min"). + +**Architecture:** A `LeaderElector` holds `pg_try_advisory_lock()` on a dedicated NullPool engine; an async `run_leader_loop` drives a `standby → recovering → active` state machine, gating `run_recovery_routine` and the sweep loop on leadership. `GET /health/active` is the LB target (200 iff leader); a `require_not_recovering` FastAPI dep returns 503 on the three executor-loop endpoints while `recovering` (INVARIANT 33). + +**Tech Stack:** SQLAlchemy 2.x async + asyncpg (existing) + `pg_try_advisory_lock` (built into PostgreSQL); FastAPI dependencies; pytest with the existing local PG fixture. **No new runtime/dev deps, no new CI jobs, zero alembic migrations.** + +**Spec:** `docs/superpowers/specs/2026-05-15-phase-2-w3c-active-standby-design.md` + +**Branch:** `feat/phase-2-w3c-active-standby` (already created off `main` at `2924b6e` — the W3b PR merge commit). + +--- + +## File Structure + +**New files:** +- `src/dlw/services/leader_election.py` — `LeaderElector` (lock primitive) + `run_leader_loop` (state-machine coroutine). One file, two cohesive symbols. No HTTP, no FastAPI imports — pure async logic, testable in isolation. +- `src/dlw/api/_recovery_barrier.py` — `require_not_recovering` FastAPI dependency (one tiny function). +- `tests/services/test_leader_election.py` — unit/integration tests for `LeaderElector` (~6 cases against local PG). +- `tests/services/test_leader_loop.py` — unit tests for `run_leader_loop` driven by fake callbacks (~4 cases). +- `tests/api/test_health_active.py` — tests for the new `/health/active` endpoint (~3 cases). +- `tests/api/test_recovery_barrier.py` — tests for the 503 barrier (~3 cases). +- `tests/e2e/test_failover_drill.py` — the W3c centerpiece: two real `LeaderElector` instances driving a failover scenario (~3 cases). + +**Modified files:** +- `src/dlw/config.py` — `Settings` gains `active_lock_id` + `leader_poll_interval_seconds`. +- `tests/test_config.py` — extend with the two new field tests. +- `src/dlw/api/health.py` — add `GET /health/active`. +- `src/dlw/api/executors.py` — attach `Depends(require_not_recovering)` to heartbeat + poll routes. +- `src/dlw/api/subtasks.py` — attach `Depends(require_not_recovering)` to the report route. +- `tests/conftest.py` — add `make_app_with_state` helper. +- `tests/api/test_executors.py`, `test_register_endpoint.py`, `test_renew_endpoint.py`, `test_subtasks.py`, `test_hf_proxy.py` — migrate the 4-line `app.state.*` block to a single `make_app_with_state(...)` call (5 fixture sites). +- `tests/e2e/test_executor_e2e.py`, `tests/e2e/test_happy_path.py` — same migration (2 fixture sites). Total ~7 fixture sites. +- `src/dlw/main.py` — lifespan restructure: `run_recovery_routine` + `_sweep_loop_main` become leader-gated; `DLW_STRICT_RECOVERY` env knob deleted. +- `tests/executor/test_client.py` — append one test asserting tenacity retries past a 503 `CONTROLLER_RECOVERING` response. +- `api/openapi.yaml` — document `/health/active` + 503 `CONTROLLER_RECOVERING` on the three executor-loop routes. +- `docs/operator/executor-runbook.md` — add a "Controller leadership" subsection. + +**Out of scope (do NOT touch):** +- `deploy/runbooks/scripts/promote-standby.sh` — it's a PG-level failover script, orthogonal to W3c app-level leadership. A one-liner clarifying comment is added in T8 if appropriate, but the script's logic stays. +- Any Helm / k8s Service / LB YAML — deploy-side wiring is a deploy task, not a controller task. + +--- + +## Milestone 1 — Lock primitive + +### Task 1: Add `active_lock_id` + `leader_poll_interval_seconds` to controller Settings + +**Files:** +- Modify: `src/dlw/config.py:39-40` +- Test: `tests/test_config.py` (append) + +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_config.py`: + +```python +def test_settings_has_active_lock_id_default(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("DLW_ACTIVE_LOCK_ID", raising=False) + s = Settings() + assert s.active_lock_id == 0x444C5743_414B5631 + + +def test_settings_active_lock_id_reads_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("DLW_ACTIVE_LOCK_ID", "12345") + s = Settings() + assert s.active_lock_id == 12345 + + +def test_settings_active_lock_id_rejects_zero() -> None: + with pytest.raises(ValidationError): + Settings(active_lock_id=0) + + +def test_settings_has_leader_poll_interval_default(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("DLW_LEADER_POLL_INTERVAL_SECONDS", raising=False) + s = Settings() + assert s.leader_poll_interval_seconds == 5.0 + + +def test_settings_leader_poll_interval_rejects_below_min() -> None: + with pytest.raises(ValidationError): + Settings(leader_poll_interval_seconds=0.1) + + +def test_settings_leader_poll_interval_rejects_above_max() -> None: + with pytest.raises(ValidationError): + Settings(leader_poll_interval_seconds=99.0) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_config.py -v` +Expected: 6 new tests fail — `AttributeError: 'Settings' object has no attribute 'active_lock_id'` (and similar for the poll-interval field). + +- [ ] **Step 3: Add the fields** + +In `src/dlw/config.py`, after the W3b block (`hf_proxy_timeout_seconds: int = Field(default=300, ge=10, le=3600)`) and before the `@property` `db_url`, add: + +```python + # Phase 2 W3c — controller leader election + active_lock_id: int = Field(default=0x444C5743_414B5631, ge=1) # 'DLWC AKV1' + leader_poll_interval_seconds: float = Field(default=5.0, ge=0.5, le=60.0) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/test_config.py -v` +Expected: all `test_config.py` tests pass (the pre-existing ones plus the 6 new ones). + +- [ ] **Step 5: Commit** + +```bash +git add src/dlw/config.py tests/test_config.py +git commit -m "feat(config): add active_lock_id + leader_poll_interval_seconds (W3c)" +``` + +--- + +### Task 2: `LeaderElector` — the PG-advisory-lock primitive + +**Files:** +- Create: `src/dlw/services/leader_election.py` +- Test: `tests/services/test_leader_election.py` (create) + +- [ ] **Step 1: Write the failing test file** + +Create `tests/services/test_leader_election.py`: + +```python +"""Tests for LeaderElector (Phase 2 W3c — PG advisory lock primitive).""" +from __future__ import annotations + +import pytest + +from dlw.services.leader_election import LeaderElector + + +def _db_url() -> str: + """Test DB URL — matches the conftest engine fixture.""" + import os + env = { + "host": os.environ.get("DLW_TEST_PG_HOST", "localhost"), + "port": os.environ.get("DLW_TEST_PG_PORT", "5433"), + "user": os.environ.get("DLW_TEST_PG_USER", "postgres"), + "password": os.environ.get("DLW_TEST_PG_PASSWORD", ""), + } + auth = f"{env['user']}:{env['password']}@" if env["password"] else f"{env['user']}@" + return f"postgresql+asyncpg://{auth}{env['host']}:{env['port']}/postgres" + + +_LOCK_ID = 0x4C45_4145_4445_5231 # 'LEAD ER1' + + +@pytest.mark.slow +async def test_first_acquire_succeeds() -> None: + """A fresh elector acquires the advisory lock.""" + e = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await e.try_acquire() is True + finally: + await e.release() + + +@pytest.mark.slow +async def test_second_elector_cannot_acquire() -> None: + """While elector A holds the lock, elector B (same lock_id) gets False.""" + a = LeaderElector(_db_url(), _LOCK_ID) + b = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await a.try_acquire() is True + assert await b.try_acquire() is False + finally: + await b.release() + await a.release() + + +@pytest.mark.slow +async def test_release_frees_lock_for_next_elector() -> None: + """After A releases, B can acquire.""" + a = LeaderElector(_db_url(), _LOCK_ID) + b = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await a.try_acquire() is True + await a.release() + assert await b.try_acquire() is True + finally: + await b.release() + + +@pytest.mark.slow +async def test_connection_drop_frees_lock() -> None: + """The crash-failover guarantee: if A's lock-holding connection is closed + without an explicit unlock, PG releases the lock and B can acquire.""" + a = LeaderElector(_db_url(), _LOCK_ID) + b = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await a.try_acquire() is True + # Simulate process death by closing the underlying connection directly, + # bypassing release()'s polite pg_advisory_unlock. + assert a._conn is not None + await a._conn.close() + a._conn = None + # PG releases on session end; B should now acquire. + assert await b.try_acquire() is True + finally: + await b.release() + await a.release() + + +@pytest.mark.slow +async def test_verify_returns_true_when_holding() -> None: + """verify() pings the connection and confirms it's alive.""" + e = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await e.try_acquire() is True + assert await e.verify() is True + finally: + await e.release() + + +@pytest.mark.slow +async def test_verify_returns_false_after_connection_drop() -> None: + """verify() returns False if the lock connection has died, and cleans up + so the next try_acquire() opens a fresh connection.""" + e = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await e.try_acquire() is True + assert e._conn is not None + await e._conn.close() + e._conn = None + assert await e.verify() is False + finally: + await e.release() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/services/test_leader_election.py -v` +Expected: 6 ImportError failures (`ModuleNotFoundError: No module named 'dlw.services.leader_election'`). + +- [ ] **Step 3: Create the `LeaderElector` module** + +Create `src/dlw/services/leader_election.py`: + +```python +"""Controller leader election via PG session advisory lock (Phase 2 W3c). + +The instance holding pg_try_advisory_lock() is the active +controller. PG releases the lock the instant its holding session ends, which +gives us automatic failover with no lease/expiry logic. Single-shared-PG +design — PG HA is a separate concern (Phase 4).""" +from __future__ import annotations + +import asyncio +import logging +from collections.abc import Awaitable, Callable +from typing import Literal + +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine, create_async_engine +from sqlalchemy.pool import NullPool + +logger = logging.getLogger(__name__) + +ControllerState = Literal["standby", "recovering", "active"] + + +class LeaderElector: + """Owns a dedicated PG connection that holds the advisory lock.""" + + def __init__(self, db_url: str, lock_id: int) -> None: + self._db_url = db_url + self._lock_id = lock_id + self._engine: AsyncEngine | None = None + self._conn: AsyncConnection | None = None + + async def try_acquire(self) -> bool: + """Try to acquire the lock. Returns True on success (and keeps the + connection open holding it); False if another instance holds it.""" + if self._engine is None: + # Dedicated engine. NullPool so SQLAlchemy never recycles the + # connection out from under us (recycling would release the lock). + # tcp_keepalives so a half-open TCP gets detected reasonably fast, + # bounding the worst-case failover floor. + self._engine = create_async_engine( + self._db_url, + poolclass=NullPool, + connect_args={"server_settings": { + "tcp_keepalives_idle": "30", + "tcp_keepalives_interval": "10", + }}, + ) + if self._conn is None: + self._conn = await self._engine.connect() + result = await self._conn.execute( + text("SELECT pg_try_advisory_lock(:lock_id)"), + {"lock_id": self._lock_id}, + ) + return bool(result.scalar()) + + async def verify(self) -> bool: + """Ping the lock-holding connection. Returns True if still alive + (we still hold the lock); False if the connection died (lock lost).""" + if self._conn is None: + return False + try: + await self._conn.execute(text("SELECT 1")) + return True + except Exception as e: + logger.warning("leader connection lost: %s", e) + await self._cleanup_connection() + return False + + async def release(self) -> None: + """Explicit release on graceful shutdown. Closing the connection + also releases — this is just the polite version.""" + if self._conn is not None: + try: + await self._conn.execute( + text("SELECT pg_advisory_unlock(:lock_id)"), + {"lock_id": self._lock_id}, + ) + except Exception as e: + logger.warning("pg_advisory_unlock failed (releases on close anyway): %s", e) + await self._cleanup_connection() + if self._engine is not None: + await self._engine.dispose() + self._engine = None + + async def _cleanup_connection(self) -> None: + if self._conn is not None: + try: + await self._conn.close() + except Exception: + pass + self._conn = None +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/services/test_leader_election.py -v` +Expected: 6 passed. + +- [ ] **Step 5: Commit** + +```bash +git add src/dlw/services/leader_election.py tests/services/test_leader_election.py +git commit -m "feat(controller): LeaderElector — PG advisory-lock primitive (W3c)" +``` + +--- + +## Milestone 2 — Leader loop, health endpoint, barriers, lifespan + +### Task 3: `run_leader_loop` — the state-machine coroutine + +**Files:** +- Modify: `src/dlw/services/leader_election.py` (append `run_leader_loop`) +- Test: `tests/services/test_leader_loop.py` (create) + +- [ ] **Step 1: Write the failing test file** + +Create `tests/services/test_leader_loop.py`: + +```python +"""Tests for run_leader_loop driven by fake callbacks (Phase 2 W3c).""" +from __future__ import annotations + +import asyncio + +import pytest + + +class _FakeElector: + """Drives try_acquire/verify return values from a queue.""" + def __init__(self, acquires: list[bool], verifies: list[bool] | None = None) -> None: + self._acquires = list(acquires) + self._verifies = list(verifies or []) + + async def try_acquire(self) -> bool: + return self._acquires.pop(0) if self._acquires else False + + async def verify(self) -> bool: + return self._verifies.pop(0) if self._verifies else True + + async def release(self) -> None: + pass + + +async def _run_for(loop_coro_factory, ticks: float) -> None: + """Spawn the leader loop, let it run for `ticks` seconds, then signal + shutdown and await clean exit.""" + shutdown = asyncio.Event() + task = asyncio.create_task(loop_coro_factory(shutdown)) + await asyncio.sleep(ticks) + shutdown.set() + await asyncio.wait_for(task, timeout=2.0) + + +@pytest.mark.slow +async def test_standby_polls_until_lock_available() -> None: + """Standby retries try_acquire across ticks; on success, transitions + through recovering → active and runs both callbacks exactly once.""" + from dlw.services.leader_election import run_leader_loop + + states: list[str] = [] + promote_calls = 0 + active_calls = 0 + + async def on_promote() -> None: + nonlocal promote_calls + promote_calls += 1 + + async def on_active() -> None: + nonlocal active_calls + active_calls += 1 + + async def on_step_down() -> None: + pass + + elector = _FakeElector(acquires=[False, False, True], verifies=[True] * 10) + + def loop_factory(shutdown): + return run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + ) + + await _run_for(loop_factory, ticks=0.5) + # Standby (initial) → standby (failed acquire 1) → standby (failed acquire 2) + # → recovering → active. We see at minimum: ["standby", "recovering", "active"]. + assert "standby" in states + assert "recovering" in states + assert "active" in states + assert states.index("recovering") < states.index("active") + assert promote_calls == 1 + assert active_calls == 1 + + +@pytest.mark.slow +async def test_recovery_failure_stays_in_recovering_and_retries() -> None: + """on_promote raising keeps state at recovering; the loop retries next + tick; once on_promote succeeds, state advances to active.""" + from dlw.services.leader_election import run_leader_loop + + states: list[str] = [] + promote_calls = 0 + + async def on_promote() -> None: + nonlocal promote_calls + promote_calls += 1 + if promote_calls == 1: + raise RuntimeError("recovery boom") + # second call succeeds + + async def on_active() -> None: + pass + + async def on_step_down() -> None: + pass + + elector = _FakeElector(acquires=[True], verifies=[True] * 10) + + def loop_factory(shutdown): + return run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + ) + + await _run_for(loop_factory, ticks=0.5) + # First promote raises → stays recovering. State sequence must show + # 'recovering' present and eventually 'active' after the 2nd promote. + assert promote_calls >= 2 + assert states.count("recovering") >= 1 + assert "active" in states + + +@pytest.mark.slow +async def test_step_down_on_connection_loss() -> None: + """When verify() returns False (lost connection) the loop calls + on_step_down and reverts state to standby.""" + from dlw.services.leader_election import run_leader_loop + + states: list[str] = [] + step_down_calls = 0 + + async def on_promote() -> None: + pass + + async def on_active() -> None: + pass + + async def on_step_down() -> None: + nonlocal step_down_calls + step_down_calls += 1 + + # acquire → True; verify ticks: True, then False (connection died), then + # the loop tries to re-acquire — fake elector returns False thereafter. + elector = _FakeElector(acquires=[True, False, False], verifies=[True, False]) + + def loop_factory(shutdown): + return run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + ) + + await _run_for(loop_factory, ticks=0.5) + assert step_down_calls >= 1 + # We must have transitioned: standby → recovering → active → standby + # (last 'standby' is the step-down). Assert at least one re-entry to + # standby AFTER 'active' appeared. + first_active_idx = states.index("active") + assert any(s == "standby" for s in states[first_active_idx + 1:]) + + +@pytest.mark.slow +async def test_shutdown_event_exits_cleanly() -> None: + """Setting the shutdown event causes the loop to exit within one tick.""" + from dlw.services.leader_election import run_leader_loop + + async def on_promote() -> None: + pass + + async def on_active() -> None: + pass + + async def on_step_down() -> None: + pass + + elector = _FakeElector(acquires=[True], verifies=[True] * 100) + states: list[str] = [] + shutdown = asyncio.Event() + task = asyncio.create_task(run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + )) + await asyncio.sleep(0.1) + shutdown.set() + # Must exit within one poll interval (0.05s) + a small slack. + await asyncio.wait_for(task, timeout=0.5) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/services/test_leader_loop.py -v` +Expected: 4 failures — `ImportError: cannot import name 'run_leader_loop' from 'dlw.services.leader_election'`. + +- [ ] **Step 3: Append `run_leader_loop` to `leader_election.py`** + +At the end of `src/dlw/services/leader_election.py`, append: + +```python +async def run_leader_loop( + *, + elector: LeaderElector, + poll_interval_seconds: float, + set_state: Callable[[ControllerState], None], + on_promote: Callable[[], Awaitable[None]], + on_active: Callable[[], Awaitable[None]], + on_step_down: Callable[[], Awaitable[None]], + shutdown: asyncio.Event, +) -> None: + """The leader loop. Stays in standby polling for the lock; on acquire + transitions through recovering→active; on connection loss steps back to + standby. Returns cleanly when `shutdown` is set.""" + state: ControllerState = "standby" + set_state(state) + while not shutdown.is_set(): + try: + if state == "standby": + if await elector.try_acquire(): + state = "recovering" + set_state(state) + logger.info("leader: acquired lock, running recovery") + try: + await on_promote() + except Exception: + logger.exception("leader: recovery failed; will retry next tick") + # Stay in `recovering` — heartbeats keep 503ing. + # Don't release the lock (another instance can't fix it). + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue + state = "active" + set_state(state) + await on_active() + logger.info("leader: promoted to active") + elif state in ("recovering", "active"): + if not await elector.verify(): + logger.warning("leader: lost lock, stepping down to standby") + await on_step_down() + state = "standby" + set_state(state) + continue + except asyncio.CancelledError: + raise + except Exception: + logger.exception("leader loop iteration failed") + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + + +async def _sleep_or_shutdown(shutdown: asyncio.Event, seconds: float) -> None: + try: + await asyncio.wait_for(shutdown.wait(), timeout=seconds) + except asyncio.TimeoutError: + pass +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/services/test_leader_loop.py -v` +Expected: 4 passed. + +- [ ] **Step 5: Commit** + +```bash +git add src/dlw/services/leader_election.py tests/services/test_leader_loop.py +git commit -m "feat(controller): run_leader_loop state machine (W3c)" +``` + +--- + +### Task 4: `make_app_with_state` conftest helper + migrate 7 existing fixture sites + +**Files:** +- Modify: `tests/conftest.py` (append helper) +- Modify: `tests/api/test_executors.py`, `test_register_endpoint.py`, `test_renew_endpoint.py`, `test_subtasks.py`, `test_hf_proxy.py`, `tests/e2e/test_executor_e2e.py`, `tests/e2e/test_happy_path.py` + +- [ ] **Step 1: Add the helper to conftest** + +Append to `tests/conftest.py` (at the bottom of the file, after `make_fake_controller_client`): + +```python +def make_app_with_state( + ephemeral_ca, + *, + enrollment_token: str, + controller_state: str = "active", +): + """Build a controller app for ASGI-transport tests with app.state pre-seeded + (skips the lifespan bootstrap). Defaults controller_state to 'active' so + the W3c recovery barrier doesn't fire.""" + from dlw.auth.hmac_nonce import NonceStore + from dlw.main import create_app + app = create_app() + app.state.ca = ephemeral_ca["ca"] + app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] + app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) + app.state.enrollment_token = enrollment_token + app.state.controller_state = controller_state + return app +``` + +- [ ] **Step 2: Migrate `tests/api/test_executors.py`** + +In `tests/api/test_executors.py`, find the `client` fixture (around line 60-70). Currently: + +```python +@pytest.fixture +async def client(ephemeral_ca): + from dlw.main import create_app + from dlw.auth.hmac_nonce import NonceStore + app = create_app() + app.state.ca = ephemeral_ca["ca"] + app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] + app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) + app.state.enrollment_token = _ENROLL + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + yield c +``` + +Replace with: + +```python +@pytest.fixture +async def client(ephemeral_ca): + from tests.conftest import make_app_with_state + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + yield c +``` + +- [ ] **Step 3: Migrate `tests/api/test_register_endpoint.py`** (lines ~40-49) + +Replace the inline app setup with one call to `make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL)`. Drop the `from dlw.main import create_app` and `from dlw.auth.hmac_nonce import NonceStore` imports from the fixture (they're no longer needed there). Same surrounding `AsyncClient(transport=ASGITransport(app=app), base_url="http://test")` wrapper. + +- [ ] **Step 4: Migrate `tests/api/test_renew_endpoint.py`** (lines ~38-49) + +Same pattern as Step 3. + +- [ ] **Step 5: Migrate `tests/api/test_subtasks.py`** (lines ~70-78) + +Same pattern. + +- [ ] **Step 6: Migrate `tests/api/test_hf_proxy.py`** (the `proxy_app` fixture, lines ~70-76) + +Currently: +```python +@pytest.fixture +def proxy_app(ephemeral_ca): + from dlw.auth.hmac_nonce import NonceStore + from dlw.main import create_app + app = create_app() + app.state.ca = ephemeral_ca["ca"] + app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] + app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) + app.state.enrollment_token = _ENROLL + return app +``` + +Replace with: +```python +@pytest.fixture +def proxy_app(ephemeral_ca): + from tests.conftest import make_app_with_state + return make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) +``` + +- [ ] **Step 7: Migrate `tests/e2e/test_executor_e2e.py`** (lines ~125-134) + +The e2e test constructs the app inline mid-test. Find: +```python + from dlw.main import create_app + from dlw.auth.hmac_nonce import NonceStore + from dlw.executor.auth_lifecycle import AuthState + from tests.conftest import register_test_executor + app = create_app() + # W3a: inject the auth substrate onto app.state (skip the lifespan + # bootstrap — this test drives the ASGI app directly, no real server). + app.state.ca = ephemeral_ca["ca"] + app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] + app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) + app.state.enrollment_token = _ENROLL + asgi_transport = httpx.ASGITransport(app=app) +``` + +Replace with: +```python + from dlw.executor.auth_lifecycle import AuthState + from tests.conftest import make_app_with_state, register_test_executor + # W3a: app.state injected by helper (skip the lifespan bootstrap). + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) + asgi_transport = httpx.ASGITransport(app=app) +``` + +- [ ] **Step 8: Migrate `tests/e2e/test_happy_path.py`** (lines ~55-65) + +Same pattern: replace the inline `app = create_app()` + 4-line `app.state.*` block with `app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL)`. + +- [ ] **Step 9: Run the affected test suites to verify everything still passes** + +Run: `uv run pytest tests/api/test_executors.py tests/api/test_register_endpoint.py tests/api/test_renew_endpoint.py tests/api/test_subtasks.py tests/api/test_hf_proxy.py tests/e2e/test_happy_path.py tests/e2e/test_executor_e2e.py -v 2>&1 | tail -20` +Expected: PASS — all pre-existing tests pass (the helper sets `controller_state="active"` by default, identical effective behavior). + +- [ ] **Step 10: Commit** + +```bash +git add tests/conftest.py tests/api/test_executors.py tests/api/test_register_endpoint.py \ + tests/api/test_renew_endpoint.py tests/api/test_subtasks.py tests/api/test_hf_proxy.py \ + tests/e2e/test_executor_e2e.py tests/e2e/test_happy_path.py +git commit -m "test: make_app_with_state helper + migrate 7 fixture sites (W3c)" +``` + +--- + +### Task 5: `GET /health/active` endpoint + +**Files:** +- Modify: `src/dlw/api/health.py` +- Test: `tests/api/test_health_active.py` (create) + +- [ ] **Step 1: Write the failing tests** + +Create `tests/api/test_health_active.py`: + +```python +"""Tests for GET /health/active (Phase 2 W3c — LB target endpoint).""" +from __future__ import annotations + +import pytest +from httpx import ASGITransport, AsyncClient + + +@pytest.fixture +def app_factory(ephemeral_ca): + from tests.conftest import make_app_with_state + def _make(controller_state: str): + return make_app_with_state( + ephemeral_ca, enrollment_token="ignored", controller_state=controller_state, + ) + return _make + + +@pytest.mark.slow +async def test_health_active_503_when_standby(app_factory) -> None: + app = app_factory("standby") + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + r = await c.get("/health/active") + assert r.status_code == 503 + assert r.json()["detail"]["controller_state"] == "standby" + + +@pytest.mark.slow +async def test_health_active_200_when_recovering(app_factory) -> None: + app = app_factory("recovering") + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + r = await c.get("/health/active") + assert r.status_code == 200 + body = r.json() + assert body["status"] == "active" + assert body["controller_state"] == "recovering" + + +@pytest.mark.slow +async def test_health_active_200_when_active(app_factory) -> None: + app = app_factory("active") + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + r = await c.get("/health/active") + assert r.status_code == 200 + body = r.json() + assert body["status"] == "active" + assert body["controller_state"] == "active" +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/api/test_health_active.py -v` +Expected: 3 failures — `404 Not Found` (route doesn't exist yet). + +- [ ] **Step 3: Add the endpoint** + +In `src/dlw/api/health.py`, add `Request` to the FastAPI imports and append a new route. Replace: + +```python +from fastapi import APIRouter, HTTPException +``` + +with: + +```python +from fastapi import APIRouter, HTTPException, Request +``` + +Then at the end of the file (after the `/ready` route), append: + +```python +@router.get("/active") +async def active(request: Request) -> dict[str, str]: + """W3c: LB target — 200 iff this instance holds the leader lock + (controller_state in {'recovering', 'active'}). 503 otherwise.""" + state = getattr(request.app.state, "controller_state", "standby") + if state in ("recovering", "active"): + return {"status": "active", "controller_state": state} + raise HTTPException(status_code=503, detail={"controller_state": state}) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/api/test_health_active.py -v` +Expected: 3 passed. + +- [ ] **Step 5: Commit** + +```bash +git add src/dlw/api/health.py tests/api/test_health_active.py +git commit -m "feat(api): GET /health/active LB target (W3c)" +``` + +--- + +### Task 6: `_recovery_barrier.py` dep + wire into 3 executor-loop routes + tests + +**Files:** +- Create: `src/dlw/api/_recovery_barrier.py` +- Modify: `src/dlw/api/executors.py` (heartbeat + poll routes) +- Modify: `src/dlw/api/subtasks.py` (report route) +- Test: `tests/api/test_recovery_barrier.py` (create) +- Test: `tests/executor/test_client.py` (append 1 case) + +- [ ] **Step 1: Write the failing tests** + +Create `tests/api/test_recovery_barrier.py`: + +```python +"""Tests for require_not_recovering dep — 503 barrier on executor-loop endpoints +during controller recovery (Phase 2 W3c, INVARIANT 33).""" +from __future__ import annotations + +import uuid + +import pytest +from httpx import ASGITransport, AsyncClient +from sqlalchemy.ext.asyncio import async_sessionmaker + +from dlw.db.base import Base + + +_ENROLL = "test-enroll-barrier" + + +@pytest.fixture(scope="module", autouse=True) +async def _bootstrap(engine): + from dlw.db.models.storage import StorageBackend + from dlw.db.models.tenant import Project, Tenant, User + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + factory = async_sessionmaker(engine, expire_on_commit=False) + async with factory() as s: + s.add(Tenant(id=1, slug="d", display_name="D")) + await s.flush() + s.add(Project(id=1, tenant_id=1, name="d")) + s.add(User(id=1, tenant_id=1, oidc_subject="d", + email="d@l", role="tenant_admin")) + s.add(StorageBackend(id=1, tenant_id=1, name="d", + backend_type="s3", config_encrypted=b"")) + await s.commit() + yield + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + + +@pytest.fixture(autouse=True) +def _set_env(monkeypatch: pytest.MonkeyPatch): + from dlw.config import get_settings + monkeypatch.setenv("DLW_TLS_TRUSTED_PROXY", "1") + get_settings.cache_clear() + yield + get_settings.cache_clear() + + +@pytest.mark.slow +async def test_heartbeat_503_when_recovering(ephemeral_ca) -> None: + from tests.conftest import ( + make_app_with_state, register_test_executor, signed_heartbeat_headers, + ) + app = make_app_with_state( + ephemeral_ca, enrollment_token=_ENROLL, controller_state="recovering", + ) + async with AsyncClient(transport=ASGITransport(app=app), + base_url="http://test") as c: + reg = await register_test_executor( + c, enrollment_token=_ENROLL, + executor_id="bar-worker-1", host_id="bar-host", + ) + hb_body = b'{"health_score": 100, "parts_dir_bytes": 0, "disk_free_gb": 100}' + r = await c.post("/api/v1/executors/bar-worker-1/heartbeat", + content=hb_body, + headers=signed_heartbeat_headers(reg, hb_body)) + assert r.status_code == 503 + assert r.json()["detail"]["code"] == "CONTROLLER_RECOVERING" + + +@pytest.mark.slow +async def test_poll_and_report_also_503_when_recovering(ephemeral_ca) -> None: + from tests.conftest import ( + executor_request_headers, make_app_with_state, register_test_executor, + ) + app = make_app_with_state( + ephemeral_ca, enrollment_token=_ENROLL, controller_state="recovering", + ) + async with AsyncClient(transport=ASGITransport(app=app), + base_url="http://test") as c: + reg = await register_test_executor( + c, enrollment_token=_ENROLL, + executor_id="bar-worker-2", host_id="bar-host", + ) + # /poll → 503 + r = await c.post("/api/v1/executors/bar-worker-2/poll", + headers=executor_request_headers(reg)) + assert r.status_code == 503 + assert r.json()["detail"]["code"] == "CONTROLLER_RECOVERING" + # /report → 503 + r = await c.post(f"/api/v1/subtasks/{uuid.uuid4()}/report", json={ + "status": "succeeded", "actual_sha256": "f" * 64, + "bytes_downloaded": 0, + }, headers=executor_request_headers(reg)) + assert r.status_code == 503 + assert r.json()["detail"]["code"] == "CONTROLLER_RECOVERING" + + +@pytest.mark.slow +async def test_heartbeat_passes_when_active(ephemeral_ca) -> None: + """Sanity: with controller_state='active', the barrier does not fire.""" + from tests.conftest import ( + make_app_with_state, register_test_executor, signed_heartbeat_headers, + ) + app = make_app_with_state( + ephemeral_ca, enrollment_token=_ENROLL, controller_state="active", + ) + async with AsyncClient(transport=ASGITransport(app=app), + base_url="http://test") as c: + reg = await register_test_executor( + c, enrollment_token=_ENROLL, + executor_id="bar-worker-3", host_id="bar-host", + ) + hb_body = b'{"health_score": 100, "parts_dir_bytes": 0, "disk_free_gb": 100}' + r = await c.post("/api/v1/executors/bar-worker-3/heartbeat", + content=hb_body, + headers=signed_heartbeat_headers(reg, hb_body)) + assert r.status_code == 200 +``` + +Append to `tests/executor/test_client.py`: + +```python +@pytest.mark.slow +async def test_controller_recovering_503_is_retried_by_tenacity(tmp_path) -> None: + """W3c: a 503 with detail.code=CONTROLLER_RECOVERING from the controller is + a transient response. The existing tenacity _retry on ControllerClient + retries it; by the second attempt the controller is active and returns 200.""" + call_count = 0 + + def handler(request: httpx.Request) -> httpx.Response: + nonlocal call_count + call_count += 1 + if call_count == 1: + return httpx.Response(503, json={ + "detail": {"code": "CONTROLLER_RECOVERING", + "message": "controller recovering after failover"}, + }) + return httpx.Response(200, json={ + "id": "ex-recov", "status": "healthy", "health_score": 100, + }) + + state = make_fake_auth_state( + tmp_path, executor_id="ex-recov", epoch=1, jwt="jwt-recov", + hmac_seed=_HMAC_SEED, + ) + async with ControllerClient( + base_url="http://test", auth_state=state, + _transport=httpx.MockTransport(handler), + ) as c: + r = await c.heartbeat(executor_id="ex-recov", health_score=100, + parts_dir_bytes=0) + assert r["status"] == "healthy" + assert call_count == 2 +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/api/test_recovery_barrier.py tests/executor/test_client.py::test_controller_recovering_503_is_retried_by_tenacity -v` +Expected: barrier tests pass `test_heartbeat_passes_when_active` but FAIL `test_heartbeat_503_when_recovering` / `test_poll_and_report_also_503_when_recovering` (the barrier dep doesn't exist yet, so endpoints proceed and return 200/4xx not 503). The executor-client test should already PASS — the existing tenacity `_retry` already retries `HTTPStatusError` 5xx, so a 503 followed by 200 succeeds on second attempt. (If it fails, it indicates tenacity coverage is narrower than expected — investigate before continuing.) + +- [ ] **Step 3: Create the barrier dependency** + +Create `src/dlw/api/_recovery_barrier.py`: + +```python +"""FastAPI dep that 503s executor-loop calls while the controller is recovering +after a failover (Phase 2 W3c, INVARIANT 33).""" +from __future__ import annotations + +from fastapi import HTTPException, Request + + +async def require_not_recovering(request: Request) -> None: + """Returns normally if the controller is active (or state is unset, which + defaults to active for tests bypassing the lifespan). Raises 503 with + detail.code='CONTROLLER_RECOVERING' if state == 'recovering'.""" + state = getattr(request.app.state, "controller_state", "active") + if state == "recovering": + raise HTTPException( + status_code=503, + detail={"code": "CONTROLLER_RECOVERING", + "message": "controller recovering after failover, retry shortly"}, + ) +``` + +- [ ] **Step 4: Wire the dep into the heartbeat and poll routes** + +In `src/dlw/api/executors.py`, add the import: + +```python +from dlw.api._recovery_barrier import require_not_recovering +``` + +(Place it near the other `from dlw.auth...` and `from dlw.api...` imports.) + +Then add the dep to the heartbeat route. Find: + +```python +@router.post("/{executor_id}/heartbeat") +async def post_heartbeat( + body: ExecutorHeartbeat, + _hmac: Executor = Depends(require_hmac_heartbeat), + executor: Executor = Depends(require_executor_epoch), + session: AsyncSession = Depends(_session), +) -> ExecutorRead: +``` + +Replace with: + +```python +@router.post("/{executor_id}/heartbeat", + dependencies=[Depends(require_not_recovering)]) +async def post_heartbeat( + body: ExecutorHeartbeat, + _hmac: Executor = Depends(require_hmac_heartbeat), + executor: Executor = Depends(require_executor_epoch), + session: AsyncSession = Depends(_session), +) -> ExecutorRead: +``` + +Same change for the poll route — find: + +```python +@router.post("/{executor_id}/poll") +async def post_poll( + executor: Executor = Depends(require_executor_epoch), + session: AsyncSession = Depends(_session), +) -> AssignmentResponse: +``` + +Replace with: + +```python +@router.post("/{executor_id}/poll", + dependencies=[Depends(require_not_recovering)]) +async def post_poll( + executor: Executor = Depends(require_executor_epoch), + session: AsyncSession = Depends(_session), +) -> AssignmentResponse: +``` + +- [ ] **Step 5: Wire the dep into the report route** + +In `src/dlw/api/subtasks.py`, add the import: + +```python +from dlw.api._recovery_barrier import require_not_recovering +``` + +Then find: + +```python +@router.post("/{subtask_id}/report") +async def post_report( +``` + +Replace with: + +```python +@router.post("/{subtask_id}/report", + dependencies=[Depends(require_not_recovering)]) +async def post_report( +``` + +- [ ] **Step 6: Run the barrier tests + the broader executor/subtasks suites** + +Run: `uv run pytest tests/api/test_recovery_barrier.py tests/api/test_executors.py tests/api/test_subtasks.py tests/executor/test_client.py -v 2>&1 | tail -20` +Expected: all PASS — the new barrier tests pass; pre-existing `test_executors.py` and `test_subtasks.py` keep passing because their fixtures use `make_app_with_state(..., controller_state="active")` (which is the helper's default — applied in Task 4). + +- [ ] **Step 7: Commit** + +```bash +git add src/dlw/api/_recovery_barrier.py src/dlw/api/executors.py src/dlw/api/subtasks.py \ + tests/api/test_recovery_barrier.py tests/executor/test_client.py +git commit -m "feat(api): require_not_recovering 503 barrier on executor-loop endpoints (W3c)" +``` + +--- + +### Task 7: `main.py` lifespan restructure — leader-gate recovery + sweep + +**Files:** +- Modify: `src/dlw/main.py` + +This task has no new tests — the leader loop is tested separately (T2/T3), and the lifespan's behavior is observable end-to-end in the chaos drill (T8). What this task DOES is ensure pre-existing tests keep passing under the new lifespan structure. + +- [ ] **Step 1: Restructure `lifespan`** + +In `src/dlw/main.py`, replace the entire `lifespan` function body. Current `lifespan` runs `run_recovery_routine` unconditionally before serving + spawns `_sweep_loop_main` unconditionally. The new version starts the leader loop and lets it gate both. + +Find the `lifespan` function (lines 19-96) and replace it entirely with: + +```python +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncIterator[None]: + """W3c: leader-gated lifespan. The W3a auth substrate is bootstrapped + unconditionally (both roles need it ready so promotion is instant). The + run_recovery_routine + sweep loop are started by the leader loop only + after this instance acquires the leader advisory lock.""" + from dlw.db.session import get_engine, reset_engine + from dlw.services.leader_election import LeaderElector, run_leader_loop + from dlw.services.recovery import run_recovery_routine + + factory = async_sessionmaker(get_engine(), expire_on_commit=False) + + # W3a auth bootstrap — UNCHANGED. Both active and standby need this ready. + from dlw.auth.uvicorn_tls_patch import install_transport_scope_patch + install_transport_scope_patch() + + from pathlib import Path + from dlw.auth.ca import bootstrap_ca, ensure_server_cert + from dlw.auth.jwt_signing import bootstrap_keypair + from dlw.auth.hmac_nonce import NonceStore + import secrets as _secrets + from dlw.config import get_settings as _gs + _settings = _gs() + _ca_dir = Path(_settings.ca_dir) + _ca_dir.mkdir(mode=0o700, parents=True, exist_ok=True) + _ca = bootstrap_ca(_ca_dir) + ensure_server_cert(_ca, _ca_dir, hostname=_settings.controller_hostname) + _jwt_kp = bootstrap_keypair(_ca_dir) + if _settings.enrollment_token: + _enroll = _settings.enrollment_token + else: + _tok_path = _ca_dir / "enrollment.token" + if _tok_path.exists(): + _enroll = _tok_path.read_text().strip() + else: + _enroll = _secrets.token_hex(32) + _tok_path.write_text(_enroll) + _tok_path.chmod(0o600) + logger.info("generated enrollment token (copy to executors): %s", _enroll) + app.state.ca = _ca + app.state.jwt_keypair = _jwt_kp + app.state.nonce_store = NonceStore(maxsize=10_000, ttl_seconds=300) + app.state.enrollment_token = _enroll + + # W3c: controller state + leader loop. + app.state.controller_state = "standby" + shutdown = asyncio.Event() + elector = LeaderElector( + db_url=_settings.db_url, lock_id=_settings.active_lock_id, + ) + sweep_task_holder: dict[str, asyncio.Task | None] = {"t": None} + + def _set_state(s: str) -> None: + app.state.controller_state = s + + async def _on_promote() -> None: + async with factory() as session: + stats = await run_recovery_routine(session) + await session.commit() + logger.info("recovery on promote: %s", stats.as_dict()) + + async def _on_active() -> None: + sweep_task_holder["t"] = asyncio.create_task(_sweep_loop_main(factory)) + + async def _on_step_down() -> None: + t = sweep_task_holder["t"] + if t is not None: + t.cancel() + try: + await asyncio.wait_for(t, timeout=2) + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + sweep_task_holder["t"] = None + + leader_task = asyncio.create_task(run_leader_loop( + elector=elector, + poll_interval_seconds=_settings.leader_poll_interval_seconds, + set_state=_set_state, + on_promote=_on_promote, + on_active=_on_active, + on_step_down=_on_step_down, + shutdown=shutdown, + )) + try: + yield + finally: + shutdown.set() + await _on_step_down() + try: + await asyncio.wait_for(leader_task, timeout=5) + except (asyncio.CancelledError, asyncio.TimeoutError): + leader_task.cancel() + await elector.release() + await reset_engine() +``` + +The `_sweep_loop_main` function below `lifespan` stays exactly as-is. + +The `DLW_STRICT_RECOVERY` env knob — and its `import os` inside the lifespan — are deleted by this replacement. (The leader loop's "stay in recovering, retry next tick on failure" replaces it.) + +- [ ] **Step 2: Verify nothing references `DLW_STRICT_RECOVERY` elsewhere** + +Run: `git grep -n "DLW_STRICT_RECOVERY\|strict_recovery"` +Expected: no hits in `src/` after this edit. If there are hits in `tests/` (a test that monkeypatches the env), update or remove them — they no longer affect anything. + +- [ ] **Step 3: Run the full suite to verify nothing regressed** + +Run: `uv run pytest -q 2>&1 | tail -10` +Expected: PASS (test count unchanged from before this task). Tests that use `ASGITransport` bypass the lifespan entirely, so the restructure has no visible effect on them. Tests that hit `/health/active` (Task 5) or the barrier (Task 6) work via the helper-injected `app.state.controller_state`. + +- [ ] **Step 4: Commit** + +```bash +git add src/dlw/main.py +git commit -m "feat(controller): lifespan leader-gates recovery + sweep, delete STRICT_RECOVERY (W3c)" +``` + +--- + +## Milestone 3 — Chaos drill, docs, PR + +### Task 8: `test_failover_drill.py` + OpenAPI + operator runbook + PR + +**Files:** +- Create: `tests/e2e/test_failover_drill.py` +- Modify: `api/openapi.yaml` +- Modify: `docs/operator/executor-runbook.md` + +- [ ] **Step 1: Write the failover drill integration test** + +Create `tests/e2e/test_failover_drill.py`: + +```python +"""W3c chaos drill: two LeaderElector instances + two run_leader_loop tasks +against the same test PG. Simulate active death; assert the standby acquires +the lock and transitions standby → recovering → active end-to-end.""" +from __future__ import annotations + +import asyncio +import os +import uuid + +import pytest +from sqlalchemy.ext.asyncio import async_sessionmaker + +from dlw.db.base import Base +from dlw.services.leader_election import LeaderElector, run_leader_loop + + +def _test_db_url(test_db_name: str) -> str: + env = { + "host": os.environ.get("DLW_TEST_PG_HOST", "localhost"), + "port": os.environ.get("DLW_TEST_PG_PORT", "5433"), + "user": os.environ.get("DLW_TEST_PG_USER", "postgres"), + "password": os.environ.get("DLW_TEST_PG_PASSWORD", ""), + } + auth = f"{env['user']}:{env['password']}@" if env["password"] else f"{env['user']}@" + return f"postgresql+asyncpg://{auth}{env['host']}:{env['port']}/{test_db_name}" + + +_LOCK_ID = 0x4D45_4145_5044_5631 # 'MEAPDV1' — distinct from test_leader_election + + +@pytest.fixture(scope="module", autouse=True) +async def _bootstrap(engine): + """Tables only — recovery_routine reconciles against real schema.""" + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + yield + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + + +@pytest.mark.slow +async def test_two_instances_one_active(test_db_name) -> None: + """Both leader loops start simultaneously; PG guarantees exactly one + reaches 'active'; the other stays 'standby'.""" + db_url = _test_db_url(test_db_name) + states_a: list[str] = [] + states_b: list[str] = [] + + async def noop_promote(): pass + async def noop_active(): pass + async def noop_step_down(): pass + + elector_a = LeaderElector(db_url, _LOCK_ID) + elector_b = LeaderElector(db_url, _LOCK_ID) + shutdown = asyncio.Event() + + task_a = asyncio.create_task(run_leader_loop( + elector=elector_a, poll_interval_seconds=0.05, + set_state=states_a.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown, + )) + task_b = asyncio.create_task(run_leader_loop( + elector=elector_b, poll_interval_seconds=0.05, + set_state=states_b.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown, + )) + try: + await asyncio.sleep(0.5) + # One reached 'active', the other is stuck in 'standby'. + a_active = states_a and states_a[-1] == "active" + b_active = states_b and states_b[-1] == "active" + assert (a_active or b_active) and not (a_active and b_active), \ + f"expected exactly one active: states_a={states_a}, states_b={states_b}" + finally: + shutdown.set() + await asyncio.gather(task_a, task_b, return_exceptions=True) + await elector_a.release() + await elector_b.release() + + +@pytest.mark.slow +async def test_failover_promotes_standby_within_rto(test_db_name) -> None: + """Active dies (loop cancelled + elector connection closed). Within + ≤ 3 × poll_interval, the standby goes standby → recovering → active.""" + db_url = _test_db_url(test_db_name) + states_a: list[str] = [] + states_b: list[str] = [] + poll_interval = 0.05 + + async def noop_promote(): pass + async def noop_active(): pass + async def noop_step_down(): pass + + elector_a = LeaderElector(db_url, _LOCK_ID) + elector_b = LeaderElector(db_url, _LOCK_ID) + shutdown_a = asyncio.Event() + shutdown_b = asyncio.Event() + + task_a = asyncio.create_task(run_leader_loop( + elector=elector_a, poll_interval_seconds=poll_interval, + set_state=states_a.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_a, + )) + task_b = asyncio.create_task(run_leader_loop( + elector=elector_b, poll_interval_seconds=poll_interval, + set_state=states_b.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_b, + )) + try: + # Let A become active. + await asyncio.sleep(0.3) + assert states_a[-1] == "active", \ + f"A should be active by now: states_a={states_a}" + assert states_b[-1] == "standby", \ + f"B should still be standby: states_b={states_b}" + + # Kill A: shut down its loop AND abruptly close its lock-holding + # connection (simulating crash, not graceful release). + shutdown_a.set() + await asyncio.wait_for(task_a, timeout=1.0) + if elector_a._conn is not None: + await elector_a._conn.close() + elector_a._conn = None + + # Within ≤ 3 × poll_interval B should pick up the lock and promote. + await asyncio.sleep(poll_interval * 6) + assert "recovering" in states_b, \ + f"B never entered recovering: states_b={states_b}" + assert states_b[-1] == "active", \ + f"B did not reach active: states_b={states_b}" + finally: + shutdown_b.set() + await asyncio.gather(task_b, return_exceptions=True) + await elector_a.release() + await elector_b.release() + + +@pytest.mark.slow +async def test_promoted_standby_runs_recovery_callback(test_db_name) -> None: + """The on_promote callback actually runs on the standby's promotion path — + proves run_recovery_routine would execute. Uses a counter callback, not + the real recovery routine (which is tested in tests/services/test_recovery.py).""" + db_url = _test_db_url(test_db_name) + poll_interval = 0.05 + + promote_a_count = 0 + promote_b_count = 0 + + async def promote_a(): + nonlocal promote_a_count + promote_a_count += 1 + + async def promote_b(): + nonlocal promote_b_count + promote_b_count += 1 + + async def noop_active(): pass + async def noop_step_down(): pass + + states_a: list[str] = [] + states_b: list[str] = [] + + elector_a = LeaderElector(db_url, _LOCK_ID) + elector_b = LeaderElector(db_url, _LOCK_ID) + shutdown_a = asyncio.Event() + shutdown_b = asyncio.Event() + + task_a = asyncio.create_task(run_leader_loop( + elector=elector_a, poll_interval_seconds=poll_interval, + set_state=states_a.append, on_promote=promote_a, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_a, + )) + task_b = asyncio.create_task(run_leader_loop( + elector=elector_b, poll_interval_seconds=poll_interval, + set_state=states_b.append, on_promote=promote_b, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_b, + )) + try: + await asyncio.sleep(0.3) + # One of A/B has promoted. Identify the dead-active path. + if states_a[-1] == "active": + assert promote_a_count == 1 + shutdown_a.set() + await asyncio.wait_for(task_a, timeout=1.0) + if elector_a._conn is not None: + await elector_a._conn.close() + elector_a._conn = None + await asyncio.sleep(poll_interval * 6) + assert promote_b_count == 1, \ + f"B's on_promote did not run after A died: {promote_b_count=}" + else: + assert promote_b_count == 1 + shutdown_b.set() + await asyncio.wait_for(task_b, timeout=1.0) + if elector_b._conn is not None: + await elector_b._conn.close() + elector_b._conn = None + await asyncio.sleep(poll_interval * 6) + assert promote_a_count == 1, \ + f"A's on_promote did not run after B died: {promote_a_count=}" + finally: + shutdown_a.set() + shutdown_b.set() + await asyncio.gather(task_a, task_b, return_exceptions=True) + await elector_a.release() + await elector_b.release() +``` + +- [ ] **Step 2: Run the failover drill to verify it passes** + +Run: `uv run pytest tests/e2e/test_failover_drill.py -v` +Expected: 3 passed. If `test_failover_promotes_standby_within_rto` flakes (timing-sensitive — the 6×poll-interval window can be tight on a slow machine), bump the wait to `poll_interval * 10` rather than fundamentally changing the test. + +- [ ] **Step 3: Document `/health/active` and the 503 in OpenAPI** + +First READ `api/openapi.yaml` to learn its structure. Then: + +(a) Add `/health/active` under `paths:` (near the existing `/health/live` and `/health/ready` entries — search for those to find the right location). Match the file's indentation: + +```yaml + /health/active: + get: + tags: [health] + summary: LB target — 200 iff this instance holds the leader advisory lock + operationId: healthActive + responses: + '200': + description: Active or recovering — LB should route to this instance. + content: + application/json: + schema: + type: object + properties: + status: { type: string, enum: [active] } + controller_state: { type: string, enum: [recovering, active] } + '503': + description: Standby — LB must NOT route to this instance. + content: + application/json: + schema: + type: object + properties: + detail: + type: object + properties: + controller_state: { type: string, enum: [standby] } +``` + +(b) Add a `503 CONTROLLER_RECOVERING` response on the three executor-loop operations. Find the existing operations for `/api/v1/executors/{executorId}/heartbeat`, `/api/v1/executors/{executorId}/poll`, and `/api/v1/subtasks/{subtaskId}/report`. Each has a `responses:` block; add this entry to each: + +```yaml + '503': + description: Controller is recovering after a failover (INVARIANT 33). Retry shortly. + content: + application/json: + schema: + type: object + properties: + detail: + type: object + properties: + code: { type: string, enum: [CONTROLLER_RECOVERING] } + message: { type: string } +``` + +- [ ] **Step 4: Verify the OpenAPI spec lints clean** + +Find the spectral CI command (`.github/workflows/`) and run it; otherwise run `npx --yes @stoplight/spectral-cli lint api/openapi.yaml`. +Expected: 0 NEW errors. Pre-existing warnings on other operations are acceptable. + +- [ ] **Step 5: Update the operator runbook** + +In `docs/operator/executor-runbook.md`, append a new section (match the existing heading levels — likely `##`): + +```markdown +## Controller leadership (W3c) + +As of Phase 2 W3c, the controller supports active/standby deployments via an +app-level leader election. The instance holding a session-level +PostgreSQL advisory lock (`pg_try_advisory_lock()`) is +the **active**; all others are **standby**. + +**LB routing:** point the load balancer's health check at `GET /health/active` +(returns 200 only when this instance holds the lock). `/health/live` and +`/health/ready` remain the k8s liveness/readiness probes — unchanged. + +**Failover behaviour:** when the active dies, PG auto-releases the advisory +lock the instant its holding session ends. A standby's leader-loop poll +(default 5 s, configurable via `DLW_LEADER_POLL_INTERVAL_SECONDS`) acquires +the freed lock and promotes through `standby → recovering → active`. During +the `recovering` phase the executor-loop endpoints (heartbeat, poll, report) +return **503 `CONTROLLER_RECOVERING`** — executors retry through their +existing tenacity backoff. Total RTO target: ≤ 10 min. + +**Relationship to PG-level failover (`promote-standby.sh`):** the app-level +lock is orthogonal to PostgreSQL primary failover. The runbook +`deploy/runbooks/scripts/promote-standby.sh` promotes the PG primary itself +(CH-Q3); after that script runs, the controller pods reconnect and the +advisory lock is re-acquired automatically by whichever pod wins the race. + +**Required environment variables:** + +- `DLW_ACTIVE_LOCK_ID` — bigint advisory-lock key. Default + `0x444C5743414B5631`. **All controller instances MUST use the same value**; + a mismatch causes both to think they are active. +- `DLW_LEADER_POLL_INTERVAL_SECONDS` — standby poll interval (default 5.0, + range 0.5–60.0). + +**Removed environment variables (W3c):** + +- `DLW_STRICT_RECOVERY` — deleted. Recovery failures now keep the controller + in `recovering` and retry on the next leader-loop tick (heartbeats keep + 503ing; alertable from log volume). +``` + +- [ ] **Step 6: Run the full test suite + all lints** + +Run: `uv run pytest -q 2>&1 | tail -10` +Expected: PASS — entire suite green. + +Run: `uv run python tools/lint_invariants.py` +Expected: exit 0, `OK: ...`. + +- [ ] **Step 7: Commit** + +```bash +git add tests/e2e/test_failover_drill.py api/openapi.yaml docs/operator/executor-runbook.md +git commit -m "test(e2e)+docs: failover drill + OpenAPI + operator runbook (W3c)" +``` + +- [ ] **Step 8: Push and open the PR** + +The controller (not the implementer subagent) handles push + PR per the established W-cycle workflow. **Stop after Step 7 and report DONE.** The push + `gh pr create` step is run by the controller after the final whole-implementation review. + +--- + +## Self-Review + +**1. Spec coverage:** +- §1 Goal & Scope (app-level leader election; auto-promotion; cold standby; single-PG) → all 8 tasks collectively. +- §3.1 `LeaderElector` → T2. +- §3.1 `run_leader_loop` → T3. +- §3.2 state machine → encoded in T3's loop body + verified by T8's drill. +- §3.3 lifespan restructure → T7. +- §3.4 `require_not_recovering` dep + heartbeat/poll/report wiring → T6. +- §3.5 `/health/active` endpoint → T5. +- §3.6 `Settings` fields → T1. +- §3.7 `make_app_with_state` helper + ~10-12 site migration → T4 (7 fixture sites; see migration scope refinement below). +- §4 zero schema changes → no alembic task (correct). +- §5 wire format → T5 (`/health/active`), T6 (503 response), T8 Step 3 (OpenAPI). +- §6 error matrix → covered by T2 (connection drop), T3 (recovery failure stays in recovering), T8 (chaos drill). +- §7 testing strategy: + - 7.1 `test_leader_election.py` → T2 (6 cases). + - 7.1 `test_leader_loop.py` → T3 (4 cases). + - 7.1 `test_failover_drill.py` → T8 (3 cases). + - 7.1 `test_recovery_barrier.py` → T6 (3 cases). + - 7.1 `test_health_active.py` → T5 (3 cases). + - 7.1 executor client 503-retry → T6. + - 7.2 migrations → T4 (covers the 7 affected fixture sites; the spec said "10-12 sites" but the actual count of fixture sites grepped was 7 — the discrepancy is because the spec counted some non-fixture test functions that inline `app = create_app()`; the 7 fixture sites are the canonical ones to migrate). +- §8 acceptance criteria → all line items map to a task. +- §9 phasing (M1/M2/M3) → matches the three milestones. + +**2. Placeholder scan:** No "TBD" / "TODO" / "implement later" / "handle edge cases" / "similar to Task N". Every code step has a complete code block or a precise before/after snippet. The one judgement call left to the implementer (T8 Step 3 placing the new OpenAPI operation among siblings) is bounded — "search for `/health/live` / `/health/ready` to find the right location". + +**3. Type consistency:** `ControllerState = Literal["standby", "recovering", "active"]` is defined in T2 (`leader_election.py`) and used in T3's `run_leader_loop` signature. The barrier's default `"active"` (T6) and `/health/active`'s default `"standby"` (T5) are intentional and consistent with the spec §3.4/§3.5. `LeaderElector.try_acquire / verify / release` signatures are stable across T2 (definition), T3 (use in `run_leader_loop`), T7 (use in lifespan), and T8 (use in failover drill). `make_app_with_state(ephemeral_ca, *, enrollment_token, controller_state="active")` — same signature in T4 definition and across all migration sites + the new T5/T6 tests. From 220bd29f4475f60ab40b31189a572dc6e1cd3ec5 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:09:32 +0800 Subject: [PATCH 03/15] feat(config): add active_lock_id + leader_poll_interval_seconds (W3c) --- src/dlw/config.py | 4 ++++ tests/test_config.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/dlw/config.py b/src/dlw/config.py index bff44a3..4bea108 100644 --- a/src/dlw/config.py +++ b/src/dlw/config.py @@ -39,6 +39,10 @@ class Settings(BaseSettings): # Phase 2 W3b — HF reverse-proxy hf_proxy_timeout_seconds: int = Field(default=300, ge=10, le=3600) + # Phase 2 W3c — controller leader election + active_lock_id: int = Field(default=0x444C5743_414B5631, ge=1) # 'DLWC AKV1' + leader_poll_interval_seconds: float = Field(default=5.0, ge=0.5, le=60.0) + @property def db_url(self) -> str: auth = f"{self.db_user}:{self.db_password}" if self.db_password else self.db_user diff --git a/tests/test_config.py b/tests/test_config.py index 1d0be0c..3cb1cfc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -27,3 +27,36 @@ def test_settings_hf_proxy_timeout_reads_env(monkeypatch: pytest.MonkeyPatch) -> monkeypatch.setenv("DLW_HF_PROXY_TIMEOUT_SECONDS", "600") s = Settings() assert s.hf_proxy_timeout_seconds == 600 + + +def test_settings_has_active_lock_id_default(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("DLW_ACTIVE_LOCK_ID", raising=False) + s = Settings() + assert s.active_lock_id == 0x444C5743_414B5631 + + +def test_settings_active_lock_id_reads_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("DLW_ACTIVE_LOCK_ID", "12345") + s = Settings() + assert s.active_lock_id == 12345 + + +def test_settings_active_lock_id_rejects_zero() -> None: + with pytest.raises(ValidationError): + Settings(active_lock_id=0) + + +def test_settings_has_leader_poll_interval_default(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("DLW_LEADER_POLL_INTERVAL_SECONDS", raising=False) + s = Settings() + assert s.leader_poll_interval_seconds == 5.0 + + +def test_settings_leader_poll_interval_rejects_below_min() -> None: + with pytest.raises(ValidationError): + Settings(leader_poll_interval_seconds=0.1) + + +def test_settings_leader_poll_interval_rejects_above_max() -> None: + with pytest.raises(ValidationError): + Settings(leader_poll_interval_seconds=99.0) From 85fb0870656f247d777e17797ba269b90514e720 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:14:38 +0800 Subject: [PATCH 04/15] refactor(config): cap active_lock_id at PG bigint max + env-override test (W3c) --- src/dlw/config.py | 6 +++++- tests/test_config.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/dlw/config.py b/src/dlw/config.py index 4bea108..edfa9cb 100644 --- a/src/dlw/config.py +++ b/src/dlw/config.py @@ -40,7 +40,11 @@ class Settings(BaseSettings): hf_proxy_timeout_seconds: int = Field(default=300, ge=10, le=3600) # Phase 2 W3c — controller leader election - active_lock_id: int = Field(default=0x444C5743_414B5631, ge=1) # 'DLWC AKV1' + active_lock_id: int = Field( + default=0x444C5743_414B5631, + ge=1, + le=9_223_372_036_854_775_807, # PG bigint max (2**63 - 1) + ) # 'DLWC AKV1' leader_poll_interval_seconds: float = Field(default=5.0, ge=0.5, le=60.0) @property diff --git a/tests/test_config.py b/tests/test_config.py index 3cb1cfc..62314d7 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -46,12 +46,23 @@ def test_settings_active_lock_id_rejects_zero() -> None: Settings(active_lock_id=0) +def test_settings_active_lock_id_rejects_above_pg_bigint_max() -> None: + with pytest.raises(ValidationError): + Settings(active_lock_id=9_223_372_036_854_775_808) + + def test_settings_has_leader_poll_interval_default(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("DLW_LEADER_POLL_INTERVAL_SECONDS", raising=False) s = Settings() assert s.leader_poll_interval_seconds == 5.0 +def test_settings_leader_poll_interval_reads_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("DLW_LEADER_POLL_INTERVAL_SECONDS", "10.0") + s = Settings() + assert s.leader_poll_interval_seconds == 10.0 + + def test_settings_leader_poll_interval_rejects_below_min() -> None: with pytest.raises(ValidationError): Settings(leader_poll_interval_seconds=0.1) From c2ea20d2a2a58b8d04eed139bec9966be88155e6 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:17:25 +0800 Subject: [PATCH 05/15] =?UTF-8?q?feat(controller):=20LeaderElector=20?= =?UTF-8?q?=E2=80=94=20PG=20advisory-lock=20primitive=20(W3c)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- src/dlw/services/leader_election.py | 91 ++++++++++++++++++++++ tests/services/test_leader_election.py | 104 +++++++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 src/dlw/services/leader_election.py create mode 100644 tests/services/test_leader_election.py diff --git a/src/dlw/services/leader_election.py b/src/dlw/services/leader_election.py new file mode 100644 index 0000000..cb218a6 --- /dev/null +++ b/src/dlw/services/leader_election.py @@ -0,0 +1,91 @@ +"""Controller leader election via PG session advisory lock (Phase 2 W3c). + +The instance holding pg_try_advisory_lock() is the active +controller. PG releases the lock the instant its holding session ends, which +gives us automatic failover with no lease/expiry logic. Single-shared-PG +design — PG HA is a separate concern (Phase 4).""" +from __future__ import annotations + +import asyncio +import logging +from collections.abc import Awaitable, Callable +from typing import Literal + +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine, create_async_engine +from sqlalchemy.pool import NullPool + +logger = logging.getLogger(__name__) + +ControllerState = Literal["standby", "recovering", "active"] + + +class LeaderElector: + """Owns a dedicated PG connection that holds the advisory lock.""" + + def __init__(self, db_url: str, lock_id: int) -> None: + self._db_url = db_url + self._lock_id = lock_id + self._engine: AsyncEngine | None = None + self._conn: AsyncConnection | None = None + + async def try_acquire(self) -> bool: + """Try to acquire the lock. Returns True on success (and keeps the + connection open holding it); False if another instance holds it.""" + if self._engine is None: + # Dedicated engine. NullPool so SQLAlchemy never recycles the + # connection out from under us (recycling would release the lock). + # tcp_keepalives so a half-open TCP gets detected reasonably fast, + # bounding the worst-case failover floor. + self._engine = create_async_engine( + self._db_url, + poolclass=NullPool, + connect_args={"server_settings": { + "tcp_keepalives_idle": "30", + "tcp_keepalives_interval": "10", + }}, + ) + if self._conn is None: + self._conn = await self._engine.connect() + result = await self._conn.execute( + text("SELECT pg_try_advisory_lock(:lock_id)"), + {"lock_id": self._lock_id}, + ) + return bool(result.scalar()) + + async def verify(self) -> bool: + """Ping the lock-holding connection. Returns True if still alive + (we still hold the lock); False if the connection died (lock lost).""" + if self._conn is None: + return False + try: + await self._conn.execute(text("SELECT 1")) + return True + except Exception as e: + logger.warning("leader connection lost: %s", e) + await self._cleanup_connection() + return False + + async def release(self) -> None: + """Explicit release on graceful shutdown. Closing the connection + also releases — this is just the polite version.""" + if self._conn is not None: + try: + await self._conn.execute( + text("SELECT pg_advisory_unlock(:lock_id)"), + {"lock_id": self._lock_id}, + ) + except Exception as e: + logger.warning("pg_advisory_unlock failed (releases on close anyway): %s", e) + await self._cleanup_connection() + if self._engine is not None: + await self._engine.dispose() + self._engine = None + + async def _cleanup_connection(self) -> None: + if self._conn is not None: + try: + await self._conn.close() + except Exception: + pass + self._conn = None diff --git a/tests/services/test_leader_election.py b/tests/services/test_leader_election.py new file mode 100644 index 0000000..e2b9d6b --- /dev/null +++ b/tests/services/test_leader_election.py @@ -0,0 +1,104 @@ +"""Tests for LeaderElector (Phase 2 W3c — PG advisory lock primitive).""" +from __future__ import annotations + +import pytest + +from dlw.services.leader_election import LeaderElector + + +def _db_url() -> str: + """Test DB URL — matches the conftest engine fixture.""" + import os + env = { + "host": os.environ.get("DLW_TEST_PG_HOST", "localhost"), + "port": os.environ.get("DLW_TEST_PG_PORT", "5433"), + "user": os.environ.get("DLW_TEST_PG_USER", "postgres"), + "password": os.environ.get("DLW_TEST_PG_PASSWORD", ""), + } + auth = f"{env['user']}:{env['password']}@" if env["password"] else f"{env['user']}@" + return f"postgresql+asyncpg://{auth}{env['host']}:{env['port']}/postgres" + + +_LOCK_ID = 0x4C45_4145_4445_5231 # 'LEAD ER1' + + +@pytest.mark.slow +async def test_first_acquire_succeeds() -> None: + """A fresh elector acquires the advisory lock.""" + e = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await e.try_acquire() is True + finally: + await e.release() + + +@pytest.mark.slow +async def test_second_elector_cannot_acquire() -> None: + """While elector A holds the lock, elector B (same lock_id) gets False.""" + a = LeaderElector(_db_url(), _LOCK_ID) + b = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await a.try_acquire() is True + assert await b.try_acquire() is False + finally: + await b.release() + await a.release() + + +@pytest.mark.slow +async def test_release_frees_lock_for_next_elector() -> None: + """After A releases, B can acquire.""" + a = LeaderElector(_db_url(), _LOCK_ID) + b = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await a.try_acquire() is True + await a.release() + assert await b.try_acquire() is True + finally: + await b.release() + + +@pytest.mark.slow +async def test_connection_drop_frees_lock() -> None: + """The crash-failover guarantee: if A's lock-holding connection is closed + without an explicit unlock, PG releases the lock and B can acquire.""" + a = LeaderElector(_db_url(), _LOCK_ID) + b = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await a.try_acquire() is True + # Simulate process death by closing the underlying connection directly, + # bypassing release()'s polite pg_advisory_unlock. + assert a._conn is not None + await a._conn.close() + a._conn = None + # PG releases on session end; B should now acquire. + assert await b.try_acquire() is True + finally: + await b.release() + await a.release() + + +@pytest.mark.slow +async def test_verify_returns_true_when_holding() -> None: + """verify() pings the connection and confirms it's alive.""" + e = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await e.try_acquire() is True + assert await e.verify() is True + finally: + await e.release() + + +@pytest.mark.slow +async def test_verify_returns_false_after_connection_drop() -> None: + """verify() returns False if the lock connection has died, and cleans up + so the next try_acquire() opens a fresh connection.""" + e = LeaderElector(_db_url(), _LOCK_ID) + try: + assert await e.try_acquire() is True + assert e._conn is not None + await e._conn.close() + e._conn = None + assert await e.verify() is False + finally: + await e.release() From 610226274773eeb5ff486dc3d1d17d65c3f99c80 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:28:25 +0800 Subject: [PATCH 06/15] refactor(controller): exercise verify exception branch + log cleanup errors (W3c) Co-Authored-By: Claude Sonnet 4.6 --- src/dlw/services/leader_election.py | 4 ++-- tests/services/test_leader_election.py | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/dlw/services/leader_election.py b/src/dlw/services/leader_election.py index cb218a6..89f0421 100644 --- a/src/dlw/services/leader_election.py +++ b/src/dlw/services/leader_election.py @@ -86,6 +86,6 @@ async def _cleanup_connection(self) -> None: if self._conn is not None: try: await self._conn.close() - except Exception: - pass + except Exception as exc: + logger.debug("conn.close() during cleanup raised (ignored): %s", exc) self._conn = None diff --git a/tests/services/test_leader_election.py b/tests/services/test_leader_election.py index e2b9d6b..ef627fc 100644 --- a/tests/services/test_leader_election.py +++ b/tests/services/test_leader_election.py @@ -7,7 +7,10 @@ def _db_url() -> str: - """Test DB URL — matches the conftest engine fixture.""" + """URL for the PostgreSQL admin DB used by these tests. Advisory locks + are cluster-wide (not per-DB), so the choice of DB doesn't affect lock + isolation — only the lock_id does. This file's _LOCK_ID is unique and + pytest runs serially within a session, so collisions aren't a concern.""" import os env = { "host": os.environ.get("DLW_TEST_PG_HOST", "localhost"), @@ -91,14 +94,20 @@ async def test_verify_returns_true_when_holding() -> None: @pytest.mark.slow async def test_verify_returns_false_after_connection_drop() -> None: - """verify() returns False if the lock connection has died, and cleans up - so the next try_acquire() opens a fresh connection.""" + """verify() must exercise its exception branch — when _conn is still set + but the underlying connection is dead, the SELECT 1 ping raises and + verify() cleans up + returns False. (Pre-NULLing _conn would short-circuit + on the `if self._conn is None: return False` guard and skip the path + that matters in production.)""" e = LeaderElector(_db_url(), _LOCK_ID) try: assert await e.try_acquire() is True assert e._conn is not None await e._conn.close() - e._conn = None + # _conn intentionally LEFT non-None — verify() must hit the + # SELECT-1-raises exception branch, not the early None-guard. assert await e.verify() is False + # And cleanup happened — _conn is None now: + assert e._conn is None finally: await e.release() From 2b2dc0b87870d054c5564d0db53bc052ddbb459c Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:32:36 +0800 Subject: [PATCH 07/15] feat(controller): run_leader_loop state machine (W3c) Co-Authored-By: Claude Sonnet 4.6 --- src/dlw/services/leader_election.py | 74 +++++++++++ tests/services/test_leader_loop.py | 183 ++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 tests/services/test_leader_loop.py diff --git a/src/dlw/services/leader_election.py b/src/dlw/services/leader_election.py index 89f0421..91b87b5 100644 --- a/src/dlw/services/leader_election.py +++ b/src/dlw/services/leader_election.py @@ -89,3 +89,77 @@ async def _cleanup_connection(self) -> None: except Exception as exc: logger.debug("conn.close() during cleanup raised (ignored): %s", exc) self._conn = None + + +async def run_leader_loop( + *, + elector: LeaderElector, + poll_interval_seconds: float, + set_state: Callable[[ControllerState], None], + on_promote: Callable[[], Awaitable[None]], + on_active: Callable[[], Awaitable[None]], + on_step_down: Callable[[], Awaitable[None]], + shutdown: asyncio.Event, +) -> None: + """The leader loop. Stays in standby polling for the lock; on acquire + transitions through recovering→active; on connection loss steps back to + standby. Returns cleanly when `shutdown` is set.""" + state: ControllerState = "standby" + set_state(state) + while not shutdown.is_set(): + try: + if state == "standby": + if await elector.try_acquire(): + state = "recovering" + set_state(state) + logger.info("leader: acquired lock, running recovery") + try: + await on_promote() + except Exception: + logger.exception("leader: recovery failed; will retry next tick") + # Stay in `recovering` — heartbeats keep 503ing. + # Don't release the lock (another instance can't fix it). + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue + state = "active" + set_state(state) + await on_active() + logger.info("leader: promoted to active") + elif state == "recovering": + # Lock is held but promotion hasn't completed. Verify lock still + # alive, then retry on_promote. + if not await elector.verify(): + logger.warning("leader: lost lock during recovery, stepping down") + await on_step_down() + state = "standby" + set_state(state) + continue + try: + await on_promote() + except Exception: + logger.exception("leader: recovery failed; will retry next tick") + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue + state = "active" + set_state(state) + await on_active() + logger.info("leader: promoted to active (after retry)") + elif state == "active": + if not await elector.verify(): + logger.warning("leader: lost lock, stepping down to standby") + await on_step_down() + state = "standby" + set_state(state) + continue + except asyncio.CancelledError: + raise + except Exception: + logger.exception("leader loop iteration failed") + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + + +async def _sleep_or_shutdown(shutdown: asyncio.Event, seconds: float) -> None: + try: + await asyncio.wait_for(shutdown.wait(), timeout=seconds) + except asyncio.TimeoutError: + pass diff --git a/tests/services/test_leader_loop.py b/tests/services/test_leader_loop.py new file mode 100644 index 0000000..47fed68 --- /dev/null +++ b/tests/services/test_leader_loop.py @@ -0,0 +1,183 @@ +"""Tests for run_leader_loop driven by fake callbacks (Phase 2 W3c).""" +from __future__ import annotations + +import asyncio + +import pytest + + +class _FakeElector: + """Drives try_acquire/verify return values from a queue.""" + def __init__(self, acquires: list[bool], verifies: list[bool] | None = None) -> None: + self._acquires = list(acquires) + self._verifies = list(verifies or []) + + async def try_acquire(self) -> bool: + return self._acquires.pop(0) if self._acquires else False + + async def verify(self) -> bool: + return self._verifies.pop(0) if self._verifies else True + + async def release(self) -> None: + pass + + +async def _run_for(loop_coro_factory, ticks: float) -> None: + """Spawn the leader loop, let it run for `ticks` seconds, then signal + shutdown and await clean exit.""" + shutdown = asyncio.Event() + task = asyncio.create_task(loop_coro_factory(shutdown)) + await asyncio.sleep(ticks) + shutdown.set() + await asyncio.wait_for(task, timeout=2.0) + + +@pytest.mark.slow +async def test_standby_polls_until_lock_available() -> None: + """Standby retries try_acquire across ticks; on success, transitions + through recovering → active and runs both callbacks exactly once.""" + from dlw.services.leader_election import run_leader_loop + + states: list[str] = [] + promote_calls = 0 + active_calls = 0 + + async def on_promote() -> None: + nonlocal promote_calls + promote_calls += 1 + + async def on_active() -> None: + nonlocal active_calls + active_calls += 1 + + async def on_step_down() -> None: + pass + + elector = _FakeElector(acquires=[False, False, True], verifies=[True] * 10) + + def loop_factory(shutdown): + return run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + ) + + await _run_for(loop_factory, ticks=0.5) + # Standby (initial) → standby (failed acquire 1) → standby (failed acquire 2) + # → recovering → active. We see at minimum: ["standby", "recovering", "active"]. + assert "standby" in states + assert "recovering" in states + assert "active" in states + assert states.index("recovering") < states.index("active") + assert promote_calls == 1 + assert active_calls == 1 + + +@pytest.mark.slow +async def test_recovery_failure_stays_in_recovering_and_retries() -> None: + """on_promote raising keeps state at recovering; the loop retries next + tick; once on_promote succeeds, state advances to active.""" + from dlw.services.leader_election import run_leader_loop + + states: list[str] = [] + promote_calls = 0 + + async def on_promote() -> None: + nonlocal promote_calls + promote_calls += 1 + if promote_calls == 1: + raise RuntimeError("recovery boom") + # second call succeeds + + async def on_active() -> None: + pass + + async def on_step_down() -> None: + pass + + elector = _FakeElector(acquires=[True], verifies=[True] * 10) + + def loop_factory(shutdown): + return run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + ) + + await _run_for(loop_factory, ticks=0.5) + # First promote raises → stays recovering. State sequence must show + # 'recovering' present and eventually 'active' after the 2nd promote. + assert promote_calls >= 2 + assert states.count("recovering") >= 1 + assert "active" in states + + +@pytest.mark.slow +async def test_step_down_on_connection_loss() -> None: + """When verify() returns False (lost connection) the loop calls + on_step_down and reverts state to standby.""" + from dlw.services.leader_election import run_leader_loop + + states: list[str] = [] + step_down_calls = 0 + + async def on_promote() -> None: + pass + + async def on_active() -> None: + pass + + async def on_step_down() -> None: + nonlocal step_down_calls + step_down_calls += 1 + + # acquire → True; verify ticks: True, then False (connection died), then + # the loop tries to re-acquire — fake elector returns False thereafter. + elector = _FakeElector(acquires=[True, False, False], verifies=[True, False]) + + def loop_factory(shutdown): + return run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + ) + + await _run_for(loop_factory, ticks=0.5) + assert step_down_calls >= 1 + # We must have transitioned: standby → recovering → active → standby + # (last 'standby' is the step-down). Assert at least one re-entry to + # standby AFTER 'active' appeared. + first_active_idx = states.index("active") + assert any(s == "standby" for s in states[first_active_idx + 1:]) + + +@pytest.mark.slow +async def test_shutdown_event_exits_cleanly() -> None: + """Setting the shutdown event causes the loop to exit within one tick.""" + from dlw.services.leader_election import run_leader_loop + + async def on_promote() -> None: + pass + + async def on_active() -> None: + pass + + async def on_step_down() -> None: + pass + + elector = _FakeElector(acquires=[True], verifies=[True] * 100) + states: list[str] = [] + shutdown = asyncio.Event() + task = asyncio.create_task(run_leader_loop( + elector=elector, poll_interval_seconds=0.05, + set_state=states.append, on_promote=on_promote, + on_active=on_active, on_step_down=on_step_down, + shutdown=shutdown, + )) + await asyncio.sleep(0.1) + shutdown.set() + # Must exit within one poll interval (0.05s) + a small slack. + await asyncio.wait_for(task, timeout=0.5) From 7fd8e308930870e88d061953876a95aae480b0b9 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:39:19 +0800 Subject: [PATCH 08/15] refactor(controller): guard on_active failures, strengthen step-down test (W3c) Co-Authored-By: Claude Sonnet 4.6 --- src/dlw/services/leader_election.py | 18 ++++++++++++++++-- tests/services/test_leader_loop.py | 9 +++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/dlw/services/leader_election.py b/src/dlw/services/leader_election.py index 91b87b5..cbd4d51 100644 --- a/src/dlw/services/leader_election.py +++ b/src/dlw/services/leader_election.py @@ -123,7 +123,14 @@ async def run_leader_loop( continue state = "active" set_state(state) - await on_active() + try: + await on_active() + except Exception: + logger.exception("leader: on_active failed; reverting to recovering for retry") + state = "recovering" + set_state(state) + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue logger.info("leader: promoted to active") elif state == "recovering": # Lock is held but promotion hasn't completed. Verify lock still @@ -142,7 +149,14 @@ async def run_leader_loop( continue state = "active" set_state(state) - await on_active() + try: + await on_active() + except Exception: + logger.exception("leader: on_active failed; reverting to recovering for retry") + state = "recovering" + set_state(state) + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue logger.info("leader: promoted to active (after retry)") elif state == "active": if not await elector.verify(): diff --git a/tests/services/test_leader_loop.py b/tests/services/test_leader_loop.py index 47fed68..db9943c 100644 --- a/tests/services/test_leader_loop.py +++ b/tests/services/test_leader_loop.py @@ -117,17 +117,21 @@ def loop_factory(shutdown): @pytest.mark.slow async def test_step_down_on_connection_loss() -> None: """When verify() returns False (lost connection) the loop calls - on_step_down and reverts state to standby.""" + on_step_down and reverts state to standby. Also verifies on_active ran + exactly once before the step-down (regression-guards the unguarded- + on_active concern from W3c T3 code review).""" from dlw.services.leader_election import run_leader_loop states: list[str] = [] + active_calls = 0 step_down_calls = 0 async def on_promote() -> None: pass async def on_active() -> None: - pass + nonlocal active_calls + active_calls += 1 async def on_step_down() -> None: nonlocal step_down_calls @@ -146,6 +150,7 @@ def loop_factory(shutdown): ) await _run_for(loop_factory, ticks=0.5) + assert active_calls == 1, f"on_active should have run exactly once before step-down: got {active_calls}" assert step_down_calls >= 1 # We must have transitioned: standby → recovering → active → standby # (last 'standby' is the step-down). Assert at least one re-entry to From 0af287ebbbeff36a1b38c8945840360b2a42f98b Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:41:53 +0800 Subject: [PATCH 09/15] test: make_app_with_state helper + migrate 7 fixture sites (W3c) Co-Authored-By: Claude Sonnet 4.6 --- tests/api/test_executors.py | 9 ++------- tests/api/test_hf_proxy.py | 10 ++-------- tests/api/test_register_endpoint.py | 9 ++------- tests/api/test_renew_endpoint.py | 9 ++------- tests/api/test_subtasks.py | 9 ++------- tests/conftest.py | 20 ++++++++++++++++++++ tests/e2e/test_executor_e2e.py | 13 +++---------- tests/e2e/test_happy_path.py | 11 +++-------- 8 files changed, 36 insertions(+), 54 deletions(-) diff --git a/tests/api/test_executors.py b/tests/api/test_executors.py index d6e50bb..8e6cb78 100644 --- a/tests/api/test_executors.py +++ b/tests/api/test_executors.py @@ -59,13 +59,8 @@ def auth() -> dict[str, str]: @pytest.fixture async def client(ephemeral_ca): """App with W3a auth state injected onto app.state from the session CA.""" - from dlw.main import create_app - from dlw.auth.hmac_nonce import NonceStore - app = create_app() - app.state.ca = ephemeral_ca["ca"] - app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] - app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) - app.state.enrollment_token = _ENROLL + from tests.conftest import make_app_with_state + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: yield c diff --git a/tests/api/test_hf_proxy.py b/tests/api/test_hf_proxy.py index 10f16f9..e83ed82 100644 --- a/tests/api/test_hf_proxy.py +++ b/tests/api/test_hf_proxy.py @@ -66,14 +66,8 @@ def _set_env(monkeypatch: pytest.MonkeyPatch): @pytest.fixture def proxy_app(ephemeral_ca): - from dlw.auth.hmac_nonce import NonceStore - from dlw.main import create_app - app = create_app() - app.state.ca = ephemeral_ca["ca"] - app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] - app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) - app.state.enrollment_token = _ENROLL - return app + from tests.conftest import make_app_with_state + return make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) def _install_hf_mock(monkeypatch, handler): diff --git a/tests/api/test_register_endpoint.py b/tests/api/test_register_endpoint.py index 4981a55..2184a32 100644 --- a/tests/api/test_register_endpoint.py +++ b/tests/api/test_register_endpoint.py @@ -37,13 +37,8 @@ async def _create_tables(engine): async def client(ephemeral_ca, monkeypatch, tmp_path): """App with W3a auth state set directly on app.state (skip the real lifespan bootstrap — set ca/jwt_keypair/enrollment_token from ephemeral_ca).""" - from dlw.main import create_app - from dlw.auth.hmac_nonce import NonceStore - app = create_app() - app.state.ca = ephemeral_ca["ca"] - app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] - app.state.nonce_store = NonceStore(maxsize=100, ttl_seconds=300) - app.state.enrollment_token = _ENROLL + from tests.conftest import make_app_with_state + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: yield c diff --git a/tests/api/test_renew_endpoint.py b/tests/api/test_renew_endpoint.py index 7fc506a..94ddbf5 100644 --- a/tests/api/test_renew_endpoint.py +++ b/tests/api/test_renew_endpoint.py @@ -34,14 +34,9 @@ async def _create_tables(engine): @pytest.fixture async def client(ephemeral_ca, monkeypatch): - from dlw.main import create_app - from dlw.auth.hmac_nonce import NonceStore + from tests.conftest import make_app_with_state monkeypatch.setenv("DLW_TLS_TRUSTED_PROXY", "1") - app = create_app() - app.state.ca = ephemeral_ca["ca"] - app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] - app.state.nonce_store = NonceStore(maxsize=100, ttl_seconds=300) - app.state.enrollment_token = _ENROLL + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: yield c diff --git a/tests/api/test_subtasks.py b/tests/api/test_subtasks.py index ffb0079..1fd72f3 100644 --- a/tests/api/test_subtasks.py +++ b/tests/api/test_subtasks.py @@ -67,13 +67,8 @@ def auth() -> dict[str, str]: @pytest.fixture async def client(ephemeral_ca): - from dlw.main import create_app - from dlw.auth.hmac_nonce import NonceStore - app = create_app() - app.state.ca = ephemeral_ca["ca"] - app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] - app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) - app.state.enrollment_token = _ENROLL + from tests.conftest import make_app_with_state + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: yield c diff --git a/tests/conftest.py b/tests/conftest.py index 08bee50..fbd482e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -303,3 +303,23 @@ async def stream_hf(self, *, subtask_id, assignment_token, yield resp return _FakeControllerClient() + + +def make_app_with_state( + ephemeral_ca, + *, + enrollment_token: str, + controller_state: str = "active", +): + """Build a controller app for ASGI-transport tests with app.state pre-seeded + (skips the lifespan bootstrap). Defaults controller_state to 'active' so + the W3c recovery barrier doesn't fire.""" + from dlw.auth.hmac_nonce import NonceStore + from dlw.main import create_app + app = create_app() + app.state.ca = ephemeral_ca["ca"] + app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] + app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) + app.state.enrollment_token = enrollment_token + app.state.controller_state = controller_state + return app diff --git a/tests/e2e/test_executor_e2e.py b/tests/e2e/test_executor_e2e.py index 9dfe16f..d40f229 100644 --- a/tests/e2e/test_executor_e2e.py +++ b/tests/e2e/test_executor_e2e.py @@ -120,17 +120,10 @@ def hf_handler(request: httpx.Request) -> httpx.Response: s3 = boto3.client("s3", region_name="us-east-1") s3.create_bucket(Bucket=_BUCKET) - from dlw.main import create_app - from dlw.auth.hmac_nonce import NonceStore from dlw.executor.auth_lifecycle import AuthState - from tests.conftest import register_test_executor - app = create_app() - # W3a: inject the auth substrate onto app.state (skip the lifespan - # bootstrap — this test drives the ASGI app directly, no real server). - app.state.ca = ephemeral_ca["ca"] - app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] - app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) - app.state.enrollment_token = _ENROLL + from tests.conftest import make_app_with_state, register_test_executor + # W3a: app.state injected by helper (skip the lifespan bootstrap). + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) asgi_transport = httpx.ASGITransport(app=app) async with httpx.AsyncClient( diff --git a/tests/e2e/test_happy_path.py b/tests/e2e/test_happy_path.py index 1395a54..3114dfc 100644 --- a/tests/e2e/test_happy_path.py +++ b/tests/e2e/test_happy_path.py @@ -52,16 +52,11 @@ def _set_token(monkeypatch: pytest.MonkeyPatch): @pytest.mark.slow async def test_full_task_lifecycle_via_http(ephemeral_ca) -> None: - from dlw.main import create_app - from dlw.auth.hmac_nonce import NonceStore from tests.conftest import ( - executor_request_headers, register_test_executor, signed_heartbeat_headers, + executor_request_headers, make_app_with_state, register_test_executor, + signed_heartbeat_headers, ) - app = create_app() - app.state.ca = ephemeral_ca["ca"] - app.state.jwt_keypair = ephemeral_ca["jwt_keypair"] - app.state.nonce_store = NonceStore(maxsize=1000, ttl_seconds=300) - app.state.enrollment_token = _ENROLL + app = make_app_with_state(ephemeral_ca, enrollment_token=_ENROLL) auth = {"Authorization": f"Bearer {_TOKEN}"} async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: From b2b72400f465f368244248f7a727e1cce13a604e Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:48:03 +0800 Subject: [PATCH 10/15] feat(api): GET /health/active LB target (W3c) --- src/dlw/api/health.py | 12 ++++++++- tests/api/test_health_active.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tests/api/test_health_active.py diff --git a/src/dlw/api/health.py b/src/dlw/api/health.py index 91cc61b..6fb9334 100644 --- a/src/dlw/api/health.py +++ b/src/dlw/api/health.py @@ -1,7 +1,7 @@ """Health endpoints: /health/live (process up?) + /health/ready (db reachable?).""" from __future__ import annotations -from fastapi import APIRouter, HTTPException +from fastapi import APIRouter, HTTPException, Request from sqlalchemy import text from sqlalchemy.ext.asyncio import async_sessionmaker @@ -31,3 +31,13 @@ async def ready() -> dict[str, str]: except Exception as exc: raise HTTPException(status_code=503, detail=f"db unreachable: {exc}") from exc return {"status": "ready", "db": "ok"} + + +@router.get("/active") +async def active(request: Request) -> dict[str, str]: + """W3c: LB target — 200 iff this instance holds the leader lock + (controller_state in {'recovering', 'active'}). 503 otherwise.""" + state = getattr(request.app.state, "controller_state", "standby") + if state in ("recovering", "active"): + return {"status": "active", "controller_state": state} + raise HTTPException(status_code=503, detail={"controller_state": state}) diff --git a/tests/api/test_health_active.py b/tests/api/test_health_active.py new file mode 100644 index 0000000..ca1c9a3 --- /dev/null +++ b/tests/api/test_health_active.py @@ -0,0 +1,46 @@ +"""Tests for GET /health/active (Phase 2 W3c — LB target endpoint).""" +from __future__ import annotations + +import pytest +from httpx import ASGITransport, AsyncClient + + +@pytest.fixture +def app_factory(ephemeral_ca): + from tests.conftest import make_app_with_state + def _make(controller_state: str): + return make_app_with_state( + ephemeral_ca, enrollment_token="ignored", controller_state=controller_state, + ) + return _make + + +@pytest.mark.slow +async def test_health_active_503_when_standby(app_factory) -> None: + app = app_factory("standby") + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + r = await c.get("/health/active") + assert r.status_code == 503 + assert r.json()["detail"]["controller_state"] == "standby" + + +@pytest.mark.slow +async def test_health_active_200_when_recovering(app_factory) -> None: + app = app_factory("recovering") + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + r = await c.get("/health/active") + assert r.status_code == 200 + body = r.json() + assert body["status"] == "active" + assert body["controller_state"] == "recovering" + + +@pytest.mark.slow +async def test_health_active_200_when_active(app_factory) -> None: + app = app_factory("active") + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + r = await c.get("/health/active") + assert r.status_code == 200 + body = r.json() + assert body["status"] == "active" + assert body["controller_state"] == "active" From 073dc0b1b576bdb9459dc7f39793c276b3b05491 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:52:47 +0800 Subject: [PATCH 11/15] feat(api): require_not_recovering 503 barrier on executor-loop endpoints (W3c) Co-Authored-By: Claude Sonnet 4.6 --- src/dlw/api/_recovery_barrier.py | 18 +++++ src/dlw/api/executors.py | 7 +- src/dlw/api/subtasks.py | 4 +- tests/api/test_recovery_barrier.py | 116 +++++++++++++++++++++++++++++ tests/executor/test_client.py | 33 ++++++++ 5 files changed, 175 insertions(+), 3 deletions(-) create mode 100644 src/dlw/api/_recovery_barrier.py create mode 100644 tests/api/test_recovery_barrier.py diff --git a/src/dlw/api/_recovery_barrier.py b/src/dlw/api/_recovery_barrier.py new file mode 100644 index 0000000..c3666d5 --- /dev/null +++ b/src/dlw/api/_recovery_barrier.py @@ -0,0 +1,18 @@ +"""FastAPI dep that 503s executor-loop calls while the controller is recovering +after a failover (Phase 2 W3c, INVARIANT 33).""" +from __future__ import annotations + +from fastapi import HTTPException, Request + + +async def require_not_recovering(request: Request) -> None: + """Returns normally if the controller is active (or state is unset, which + defaults to active for tests bypassing the lifespan). Raises 503 with + detail.code='CONTROLLER_RECOVERING' if state == 'recovering'.""" + state = getattr(request.app.state, "controller_state", "active") + if state == "recovering": + raise HTTPException( + status_code=503, + detail={"code": "CONTROLLER_RECOVERING", + "message": "controller recovering after failover, retry shortly"}, + ) diff --git a/src/dlw/api/executors.py b/src/dlw/api/executors.py index 6b1551f..c7f179b 100644 --- a/src/dlw/api/executors.py +++ b/src/dlw/api/executors.py @@ -15,6 +15,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from sqlalchemy.ext.asyncio import AsyncSession +from dlw.api._recovery_barrier import require_not_recovering from dlw.api.tasks import _session from dlw.auth.ca import fingerprint_of, sign_csr from dlw.auth import jwt_signing @@ -123,7 +124,8 @@ async def post_renew( ) -@router.post("/{executor_id}/heartbeat") +@router.post("/{executor_id}/heartbeat", + dependencies=[Depends(require_not_recovering)]) async def post_heartbeat( body: ExecutorHeartbeat, _hmac: Executor = Depends(require_hmac_heartbeat), @@ -138,7 +140,8 @@ async def post_heartbeat( return ExecutorRead.model_validate(ex) -@router.post("/{executor_id}/poll") +@router.post("/{executor_id}/poll", + dependencies=[Depends(require_not_recovering)]) async def post_poll( executor: Executor = Depends(require_executor_epoch), session: AsyncSession = Depends(_session), diff --git a/src/dlw/api/subtasks.py b/src/dlw/api/subtasks.py index 3f06d02..7838ce6 100644 --- a/src/dlw/api/subtasks.py +++ b/src/dlw/api/subtasks.py @@ -10,6 +10,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException from sqlalchemy.ext.asyncio import AsyncSession +from dlw.api._recovery_barrier import require_not_recovering from dlw.api.tasks import _session from dlw.auth.executor_jwt_dep import require_executor_jwt from dlw.db.models.executor import Executor @@ -20,7 +21,8 @@ router = APIRouter(prefix="/api/v1/subtasks", tags=["subtasks"]) -@router.post("/{subtask_id}/report") +@router.post("/{subtask_id}/report", + dependencies=[Depends(require_not_recovering)]) async def post_report( subtask_id: uuid.UUID, body: SubTaskReport, diff --git a/tests/api/test_recovery_barrier.py b/tests/api/test_recovery_barrier.py new file mode 100644 index 0000000..10b0f9b --- /dev/null +++ b/tests/api/test_recovery_barrier.py @@ -0,0 +1,116 @@ +"""Tests for require_not_recovering dep — 503 barrier on executor-loop endpoints +during controller recovery (Phase 2 W3c, INVARIANT 33).""" +from __future__ import annotations + +import uuid + +import pytest +from httpx import ASGITransport, AsyncClient +from sqlalchemy.ext.asyncio import async_sessionmaker + +from dlw.db.base import Base + + +_ENROLL = "test-enroll-barrier" + + +@pytest.fixture(scope="module", autouse=True) +async def _bootstrap(engine): + from dlw.db.models.storage import StorageBackend + from dlw.db.models.tenant import Project, Tenant, User + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + factory = async_sessionmaker(engine, expire_on_commit=False) + async with factory() as s: + s.add(Tenant(id=1, slug="d", display_name="D")) + await s.flush() + s.add(Project(id=1, tenant_id=1, name="d")) + s.add(User(id=1, tenant_id=1, oidc_subject="d", + email="d@l", role="tenant_admin")) + s.add(StorageBackend(id=1, tenant_id=1, name="d", + backend_type="s3", config_encrypted=b"")) + await s.commit() + yield + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + + +@pytest.fixture(autouse=True) +def _set_env(monkeypatch: pytest.MonkeyPatch): + from dlw.config import get_settings + monkeypatch.setenv("DLW_TLS_TRUSTED_PROXY", "1") + get_settings.cache_clear() + yield + get_settings.cache_clear() + + +@pytest.mark.slow +async def test_heartbeat_503_when_recovering(ephemeral_ca) -> None: + from tests.conftest import ( + make_app_with_state, register_test_executor, signed_heartbeat_headers, + ) + app = make_app_with_state( + ephemeral_ca, enrollment_token=_ENROLL, controller_state="recovering", + ) + async with AsyncClient(transport=ASGITransport(app=app), + base_url="http://test") as c: + reg = await register_test_executor( + c, enrollment_token=_ENROLL, + executor_id="bar-worker-1", host_id="bar-host", + ) + hb_body = b'{"health_score": 100, "parts_dir_bytes": 0, "disk_free_gb": 100}' + r = await c.post("/api/v1/executors/bar-worker-1/heartbeat", + content=hb_body, + headers=signed_heartbeat_headers(reg, hb_body)) + assert r.status_code == 503 + assert r.json()["detail"]["code"] == "CONTROLLER_RECOVERING" + + +@pytest.mark.slow +async def test_poll_and_report_also_503_when_recovering(ephemeral_ca) -> None: + from tests.conftest import ( + executor_request_headers, make_app_with_state, register_test_executor, + ) + app = make_app_with_state( + ephemeral_ca, enrollment_token=_ENROLL, controller_state="recovering", + ) + async with AsyncClient(transport=ASGITransport(app=app), + base_url="http://test") as c: + reg = await register_test_executor( + c, enrollment_token=_ENROLL, + executor_id="bar-worker-2", host_id="bar-host", + ) + # /poll → 503 + r = await c.post("/api/v1/executors/bar-worker-2/poll", + headers=executor_request_headers(reg)) + assert r.status_code == 503 + assert r.json()["detail"]["code"] == "CONTROLLER_RECOVERING" + # /report → 503 + r = await c.post(f"/api/v1/subtasks/{uuid.uuid4()}/report", json={ + "status": "succeeded", "actual_sha256": "f" * 64, + "bytes_downloaded": 0, + }, headers=executor_request_headers(reg)) + assert r.status_code == 503 + assert r.json()["detail"]["code"] == "CONTROLLER_RECOVERING" + + +@pytest.mark.slow +async def test_heartbeat_passes_when_active(ephemeral_ca) -> None: + """Sanity: with controller_state='active', the barrier does not fire.""" + from tests.conftest import ( + make_app_with_state, register_test_executor, signed_heartbeat_headers, + ) + app = make_app_with_state( + ephemeral_ca, enrollment_token=_ENROLL, controller_state="active", + ) + async with AsyncClient(transport=ASGITransport(app=app), + base_url="http://test") as c: + reg = await register_test_executor( + c, enrollment_token=_ENROLL, + executor_id="bar-worker-3", host_id="bar-host", + ) + hb_body = b'{"health_score": 100, "parts_dir_bytes": 0, "disk_free_gb": 100}' + r = await c.post("/api/v1/executors/bar-worker-3/heartbeat", + content=hb_body, + headers=signed_heartbeat_headers(reg, hb_body)) + assert r.status_code == 200 diff --git a/tests/executor/test_client.py b/tests/executor/test_client.py index 68c93b0..8c4a654 100644 --- a/tests/executor/test_client.py +++ b/tests/executor/test_client.py @@ -209,3 +209,36 @@ def handler(request: httpx.Request) -> httpx.Response: assert seen["headers"]["x-executor-epoch"] == "4" assert seen["headers"]["x-assignment-token"] == str(tok) assert seen["headers"]["range"] == "bytes=0-1023" + + +@pytest.mark.slow +async def test_controller_recovering_503_is_retried_by_tenacity(tmp_path) -> None: + """W3c: a 503 with detail.code=CONTROLLER_RECOVERING from the controller is + a transient response. The existing tenacity _retry on ControllerClient + retries it; by the second attempt the controller is active and returns 200.""" + call_count = 0 + + def handler(request: httpx.Request) -> httpx.Response: + nonlocal call_count + call_count += 1 + if call_count == 1: + return httpx.Response(503, json={ + "detail": {"code": "CONTROLLER_RECOVERING", + "message": "controller recovering after failover"}, + }) + return httpx.Response(200, json={ + "id": "ex-recov", "status": "healthy", "health_score": 100, + }) + + state = make_fake_auth_state( + tmp_path, executor_id="ex-recov", epoch=1, jwt="jwt-recov", + hmac_seed=_HMAC_SEED, + ) + async with ControllerClient( + base_url="http://test", auth_state=state, + _transport=httpx.MockTransport(handler), + ) as c: + r = await c.heartbeat(executor_id="ex-recov", health_score=100, + parts_dir_bytes=0) + assert r["status"] == "healthy" + assert call_count == 2 From 7a83b7808d3f00929342161bd49bf68dc2e480e8 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 17:58:44 +0800 Subject: [PATCH 12/15] feat(controller): lifespan leader-gates recovery + sweep, delete STRICT_RECOVERY (W3c) Co-Authored-By: Claude Sonnet 4.6 --- src/dlw/main.py | 80 +++++++++++++++++------------ tests/e2e/test_executor_auth_e2e.py | 1 - 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/src/dlw/main.py b/src/dlw/main.py index e8a44dd..2cabeab 100644 --- a/src/dlw/main.py +++ b/src/dlw/main.py @@ -18,22 +18,17 @@ @asynccontextmanager async def lifespan(app: FastAPI) -> AsyncIterator[None]: - """Phase 2 W1: run recovery_routine before serving + spawn reclaim_loop. - - Order: - 1. Recovery routine (synchronous; must complete before serving traffic) - 2. Spawn background reclaim_loop task - 3. yield (app serves traffic) - 4. Cancel reclaim_loop + dispose engine on shutdown - """ + """W3c: leader-gated lifespan. The W3a auth substrate is bootstrapped + unconditionally (both roles need it ready so promotion is instant). The + run_recovery_routine + sweep loop are started by the leader loop only + after this instance acquires the leader advisory lock.""" from dlw.db.session import get_engine, reset_engine + from dlw.services.leader_election import LeaderElector, run_leader_loop from dlw.services.recovery import run_recovery_routine factory = async_sessionmaker(get_engine(), expire_on_commit=False) - # W3a: bootstrap CA + JWT signing key + server cert + nonce store + enrollment token. - # Install the uvicorn transport scope patch so _extract_peer_cert can read - # the mTLS peer cert from scope["transport"] in direct-TLS deployments. + # W3a auth bootstrap — UNCHANGED. Both active and standby need this ready. from dlw.auth.uvicorn_tls_patch import install_transport_scope_patch install_transport_scope_patch() @@ -65,34 +60,55 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: app.state.nonce_store = NonceStore(maxsize=10_000, ttl_seconds=300) app.state.enrollment_token = _enroll - # W6-J: spec §7 says recovery failure aborts startup. Permissive dev mode - # via DLW_STRICT_RECOVERY=false env override (defaults to strict). - import os - strict_recovery = os.environ.get("DLW_STRICT_RECOVERY", "true").lower() != "false" - try: - async with factory() as session: - stats = await run_recovery_routine(session) - await session.commit() # W6-E: caller commits (service is no-commit) - logger.info("startup recovery: %s", stats.as_dict()) - except Exception: - if strict_recovery: - logger.exception("startup recovery_routine failed; aborting startup (strict mode)") - raise - logger.exception( - "startup recovery_routine failed; continuing in permissive mode " - "(DLW_STRICT_RECOVERY=false)" - ) + # W3c: controller state + leader loop. + app.state.controller_state = "standby" + shutdown = asyncio.Event() + elector = LeaderElector( + db_url=_settings.db_url, lock_id=_settings.active_lock_id, + ) + sweep_task_holder: dict[str, asyncio.Task | None] = {"t": None} - sweep_task = asyncio.create_task(_sweep_loop_main(factory)) + def _set_state(s: str) -> None: + app.state.controller_state = s + async def _on_promote() -> None: + async with factory() as session: + stats = await run_recovery_routine(session) + await session.commit() + logger.info("recovery on promote: %s", stats.as_dict()) + + async def _on_active() -> None: + sweep_task_holder["t"] = asyncio.create_task(_sweep_loop_main(factory)) + + async def _on_step_down() -> None: + t = sweep_task_holder["t"] + if t is not None: + t.cancel() + try: + await asyncio.wait_for(t, timeout=2) + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + sweep_task_holder["t"] = None + + leader_task = asyncio.create_task(run_leader_loop( + elector=elector, + poll_interval_seconds=_settings.leader_poll_interval_seconds, + set_state=_set_state, + on_promote=_on_promote, + on_active=_on_active, + on_step_down=_on_step_down, + shutdown=shutdown, + )) try: yield finally: - sweep_task.cancel() + shutdown.set() + await _on_step_down() try: - await asyncio.wait_for(sweep_task, timeout=2) + await asyncio.wait_for(leader_task, timeout=5) except (asyncio.CancelledError, asyncio.TimeoutError): - pass + leader_task.cancel() + await elector.release() await reset_engine() diff --git a/tests/e2e/test_executor_auth_e2e.py b/tests/e2e/test_executor_auth_e2e.py index ea1ada5..7447282 100644 --- a/tests/e2e/test_executor_auth_e2e.py +++ b/tests/e2e/test_executor_auth_e2e.py @@ -83,7 +83,6 @@ async def test_register_then_heartbeat_full_flow(tmp_path, engine, test_db_name) "DLW_DB_USER": os.environ.get("DLW_TEST_PG_USER", "postgres"), "DLW_DB_PASSWORD": os.environ.get("DLW_TEST_PG_PASSWORD", ""), "DLW_DB_NAME": test_db_name, - "DLW_STRICT_RECOVERY": "false", # don't abort startup on recovery hiccups } proc = subprocess.Popen( [sys.executable, "-m", "uvicorn", "dlw.main:app", From d1e36b2f3873f18d3c724917e3fe7535418ed901 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 18:10:32 +0800 Subject: [PATCH 13/15] test(e2e)+docs: failover drill + OpenAPI + operator runbook (W3c) --- api/openapi.yaml | 67 ++++++++++ docs/operator/executor-runbook.md | 39 ++++++ tests/e2e/test_failover_drill.py | 212 ++++++++++++++++++++++++++++++ 3 files changed, 318 insertions(+) create mode 100644 tests/e2e/test_failover_drill.py diff --git a/api/openapi.yaml b/api/openapi.yaml index 9914d66..b279c25 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -35,6 +35,8 @@ servers: description: Local dev tags: + - name: health + description: Health and leader-election probes - name: tasks description: Download task management - name: subtasks @@ -434,6 +436,35 @@ paths: application/json: schema: {$ref: '#/components/schemas/ModelInfo'} + # ========== Health ========== + /health/active: + get: + tags: [health] + summary: LB target — 200 iff this instance holds the leader advisory lock + operationId: healthActive + security: [] + responses: + '200': + description: Active or recovering — LB should route to this instance. + content: + application/json: + schema: + type: object + properties: + status: { type: string, enum: [active] } + controller_state: { type: string, enum: [recovering, active] } + '503': + description: Standby — LB must NOT route to this instance. + content: + application/json: + schema: + type: object + properties: + detail: + type: object + properties: + controller_state: { type: string, enum: [standby] } + # ========== Executors ========== /executors/register: post: @@ -644,6 +675,18 @@ paths: description: Server requesting backoff headers: Retry-After: {schema: {type: integer}} + '503': + description: Controller is recovering after a failover (INVARIANT 33). Retry shortly. + content: + application/json: + schema: + type: object + properties: + detail: + type: object + properties: + code: { type: string, enum: [CONTROLLER_RECOVERING] } + message: { type: string } /executors/{executorId}/renew: parameters: @@ -729,6 +772,18 @@ paths: type: integer got: type: integer + '503': + description: Controller is recovering after a failover (INVARIANT 33). Retry shortly. + content: + application/json: + schema: + type: object + properties: + detail: + type: object + properties: + code: { type: string, enum: [CONTROLLER_RECOVERING] } + message: { type: string } /hf-proxy/subtask/{subtaskId}: get: @@ -899,6 +954,18 @@ paths: type: integer '409': description: STALE_ASSIGNMENT or epoch fence violation + '503': + description: Controller is recovering after a failover (INVARIANT 33). Retry shortly. + content: + application/json: + schema: + type: object + properties: + detail: + type: object + properties: + code: { type: string, enum: [CONTROLLER_RECOVERING] } + message: { type: string } # ========== Tenants / Quota / Audit ========== /tenants/{tenantId}: diff --git a/docs/operator/executor-runbook.md b/docs/operator/executor-runbook.md index ca25d13..60a83b9 100644 --- a/docs/operator/executor-runbook.md +++ b/docs/operator/executor-runbook.md @@ -112,3 +112,42 @@ and any deployment manifests — they are now ignored): rather than executor→HF directly. For the internal beta this is acceptable; global rate-limit coordination and an executor-local credential pool are Phase 3 items. + +## Controller leadership (W3c) + +As of Phase 2 W3c, the controller supports active/standby deployments via an +app-level leader election. The instance holding a session-level +PostgreSQL advisory lock (`pg_try_advisory_lock()`) is +the **active**; all others are **standby**. + +**LB routing:** point the load balancer's health check at `GET /health/active` +(returns 200 only when this instance holds the lock). `/health/live` and +`/health/ready` remain the k8s liveness/readiness probes — unchanged. + +**Failover behaviour:** when the active dies, PG auto-releases the advisory +lock the instant its holding session ends. A standby's leader-loop poll +(default 5 s, configurable via `DLW_LEADER_POLL_INTERVAL_SECONDS`) acquires +the freed lock and promotes through `standby → recovering → active`. During +the `recovering` phase the executor-loop endpoints (heartbeat, poll, report) +return **503 `CONTROLLER_RECOVERING`** — executors retry through their +existing tenacity backoff. Total RTO target: ≤ 10 min. + +**Relationship to PG-level failover (`promote-standby.sh`):** the app-level +lock is orthogonal to PostgreSQL primary failover. The runbook +`deploy/runbooks/scripts/promote-standby.sh` promotes the PG primary itself +(CH-Q3); after that script runs, the controller pods reconnect and the +advisory lock is re-acquired automatically by whichever pod wins the race. + +**Required environment variables:** + +- `DLW_ACTIVE_LOCK_ID` — bigint advisory-lock key. Default + `0x444C5743414B5631`. **All controller instances MUST use the same value**; + a mismatch causes both to think they are active. +- `DLW_LEADER_POLL_INTERVAL_SECONDS` — standby poll interval (default 5.0, + range 0.5–60.0). + +**Removed environment variables (W3c):** + +- `DLW_STRICT_RECOVERY` — deleted. Recovery failures now keep the controller + in `recovering` and retry on the next leader-loop tick (heartbeats keep + 503ing; alertable from log volume). diff --git a/tests/e2e/test_failover_drill.py b/tests/e2e/test_failover_drill.py new file mode 100644 index 0000000..8d7e135 --- /dev/null +++ b/tests/e2e/test_failover_drill.py @@ -0,0 +1,212 @@ +"""W3c chaos drill: two LeaderElector instances + two run_leader_loop tasks +against the same test PG. Simulate active death; assert the standby acquires +the lock and transitions standby → recovering → active end-to-end.""" +from __future__ import annotations + +import asyncio +import os +import uuid + +import pytest +from sqlalchemy.ext.asyncio import async_sessionmaker + +from dlw.db.base import Base +from dlw.services.leader_election import LeaderElector, run_leader_loop + + +def _test_db_url(test_db_name: str) -> str: + env = { + "host": os.environ.get("DLW_TEST_PG_HOST", "localhost"), + "port": os.environ.get("DLW_TEST_PG_PORT", "5433"), + "user": os.environ.get("DLW_TEST_PG_USER", "postgres"), + "password": os.environ.get("DLW_TEST_PG_PASSWORD", ""), + } + auth = f"{env['user']}:{env['password']}@" if env["password"] else f"{env['user']}@" + return f"postgresql+asyncpg://{auth}{env['host']}:{env['port']}/{test_db_name}" + + +_LOCK_ID = 0x4D45_4145_5044_5631 # 'MEAPDV1' — distinct from test_leader_election + + +@pytest.fixture(scope="module", autouse=True) +async def _bootstrap(engine): + """Tables only — recovery_routine reconciles against real schema.""" + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + yield + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + + +@pytest.mark.slow +async def test_two_instances_one_active(test_db_name) -> None: + """Both leader loops start simultaneously; PG guarantees exactly one + reaches 'active'; the other stays 'standby'.""" + db_url = _test_db_url(test_db_name) + states_a: list[str] = [] + states_b: list[str] = [] + + async def noop_promote(): pass + async def noop_active(): pass + async def noop_step_down(): pass + + elector_a = LeaderElector(db_url, _LOCK_ID) + elector_b = LeaderElector(db_url, _LOCK_ID) + shutdown = asyncio.Event() + + task_a = asyncio.create_task(run_leader_loop( + elector=elector_a, poll_interval_seconds=0.05, + set_state=states_a.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown, + )) + task_b = asyncio.create_task(run_leader_loop( + elector=elector_b, poll_interval_seconds=0.05, + set_state=states_b.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown, + )) + try: + await asyncio.sleep(0.5) + # One reached 'active', the other is stuck in 'standby'. + a_active = states_a and states_a[-1] == "active" + b_active = states_b and states_b[-1] == "active" + assert (a_active or b_active) and not (a_active and b_active), \ + f"expected exactly one active: states_a={states_a}, states_b={states_b}" + finally: + shutdown.set() + await asyncio.gather(task_a, task_b, return_exceptions=True) + await elector_a.release() + await elector_b.release() + + +@pytest.mark.slow +async def test_failover_promotes_standby_within_rto(test_db_name) -> None: + """Active dies (loop cancelled + elector connection closed). Within + ≤ 3 × poll_interval, the standby goes standby → recovering → active.""" + db_url = _test_db_url(test_db_name) + states_a: list[str] = [] + states_b: list[str] = [] + poll_interval = 0.05 + + async def noop_promote(): pass + async def noop_active(): pass + async def noop_step_down(): pass + + elector_a = LeaderElector(db_url, _LOCK_ID) + elector_b = LeaderElector(db_url, _LOCK_ID) + shutdown_a = asyncio.Event() + shutdown_b = asyncio.Event() + + task_a = asyncio.create_task(run_leader_loop( + elector=elector_a, poll_interval_seconds=poll_interval, + set_state=states_a.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_a, + )) + task_b = asyncio.create_task(run_leader_loop( + elector=elector_b, poll_interval_seconds=poll_interval, + set_state=states_b.append, on_promote=noop_promote, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_b, + )) + try: + # Let A become active. + await asyncio.sleep(0.3) + assert states_a[-1] == "active", \ + f"A should be active by now: states_a={states_a}" + assert states_b[-1] == "standby", \ + f"B should still be standby: states_b={states_b}" + + # Kill A: shut down its loop AND abruptly close its lock-holding + # connection (simulating crash, not graceful release). + shutdown_a.set() + await asyncio.wait_for(task_a, timeout=1.0) + if elector_a._conn is not None: + await elector_a._conn.close() + elector_a._conn = None + + # Within ≤ 3 × poll_interval B should pick up the lock and promote. + await asyncio.sleep(poll_interval * 6) + assert "recovering" in states_b, \ + f"B never entered recovering: states_b={states_b}" + assert states_b[-1] == "active", \ + f"B did not reach active: states_b={states_b}" + finally: + shutdown_b.set() + await asyncio.gather(task_b, return_exceptions=True) + await elector_a.release() + await elector_b.release() + + +@pytest.mark.slow +async def test_promoted_standby_runs_recovery_callback(test_db_name) -> None: + """The on_promote callback actually runs on the standby's promotion path — + proves run_recovery_routine would execute. Uses a counter callback, not + the real recovery routine (which is tested in tests/services/test_recovery.py).""" + db_url = _test_db_url(test_db_name) + poll_interval = 0.05 + + promote_a_count = 0 + promote_b_count = 0 + + async def promote_a(): + nonlocal promote_a_count + promote_a_count += 1 + + async def promote_b(): + nonlocal promote_b_count + promote_b_count += 1 + + async def noop_active(): pass + async def noop_step_down(): pass + + states_a: list[str] = [] + states_b: list[str] = [] + + elector_a = LeaderElector(db_url, _LOCK_ID) + elector_b = LeaderElector(db_url, _LOCK_ID) + shutdown_a = asyncio.Event() + shutdown_b = asyncio.Event() + + task_a = asyncio.create_task(run_leader_loop( + elector=elector_a, poll_interval_seconds=poll_interval, + set_state=states_a.append, on_promote=promote_a, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_a, + )) + task_b = asyncio.create_task(run_leader_loop( + elector=elector_b, poll_interval_seconds=poll_interval, + set_state=states_b.append, on_promote=promote_b, + on_active=noop_active, on_step_down=noop_step_down, + shutdown=shutdown_b, + )) + try: + await asyncio.sleep(0.3) + # One of A/B has promoted. Identify the dead-active path. + if states_a[-1] == "active": + assert promote_a_count == 1 + shutdown_a.set() + await asyncio.wait_for(task_a, timeout=1.0) + if elector_a._conn is not None: + await elector_a._conn.close() + elector_a._conn = None + await asyncio.sleep(poll_interval * 6) + assert promote_b_count == 1, \ + f"B's on_promote did not run after A died: {promote_b_count=}" + else: + assert promote_b_count == 1 + shutdown_b.set() + await asyncio.wait_for(task_b, timeout=1.0) + if elector_b._conn is not None: + await elector_b._conn.close() + elector_b._conn = None + await asyncio.sleep(poll_interval * 6) + assert promote_a_count == 1, \ + f"A's on_promote did not run after B died: {promote_a_count=}" + finally: + shutdown_a.set() + shutdown_b.set() + await asyncio.gather(task_a, task_b, return_exceptions=True) + await elector_a.release() + await elector_b.release() From 35fd4d004435b879a33c5156200f5588b9d993ff Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 18:23:27 +0800 Subject: [PATCH 14/15] fix(w3c): remove unused imports + await leader task on shutdown (W3c) Co-Authored-By: Claude Sonnet 4.6 --- src/dlw/main.py | 4 ++++ tests/e2e/test_failover_drill.py | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/dlw/main.py b/src/dlw/main.py index 2cabeab..34373e0 100644 --- a/src/dlw/main.py +++ b/src/dlw/main.py @@ -108,6 +108,10 @@ async def _on_step_down() -> None: await asyncio.wait_for(leader_task, timeout=5) except (asyncio.CancelledError, asyncio.TimeoutError): leader_task.cancel() + try: + await leader_task + except (asyncio.CancelledError, Exception): + pass await elector.release() await reset_engine() diff --git a/tests/e2e/test_failover_drill.py b/tests/e2e/test_failover_drill.py index 8d7e135..1b0dc41 100644 --- a/tests/e2e/test_failover_drill.py +++ b/tests/e2e/test_failover_drill.py @@ -5,10 +5,8 @@ import asyncio import os -import uuid import pytest -from sqlalchemy.ext.asyncio import async_sessionmaker from dlw.db.base import Base from dlw.services.leader_election import LeaderElector, run_leader_loop From 6a33597cb7fed3aa9f9205a1a6e4226f36c01ff2 Mon Sep 17 00:00:00 2001 From: l17728 Date: Fri, 15 May 2026 18:24:05 +0800 Subject: [PATCH 15/15] docs(plan): sync W3c plan with review-driven fixes during execution Captures: active_lock_id bigint upper bound + env-override test, _db_url docstring correction + cleanup debug log, run_leader_loop split into recovering/active branches + on_active exception guard, conftest helper defaults, recovery barrier wiring, lifespan shutdown await fix. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-15-phase-2-w3c-active-standby.md | 75 ++++++++++++++++--- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md b/docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md index 6112720..acd15d0 100644 --- a/docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md +++ b/docs/superpowers/plans/2026-05-15-phase-2-w3c-active-standby.md @@ -75,12 +75,23 @@ def test_settings_active_lock_id_rejects_zero() -> None: Settings(active_lock_id=0) +def test_settings_active_lock_id_rejects_above_pg_bigint_max() -> None: + with pytest.raises(ValidationError): + Settings(active_lock_id=9_223_372_036_854_775_808) + + def test_settings_has_leader_poll_interval_default(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("DLW_LEADER_POLL_INTERVAL_SECONDS", raising=False) s = Settings() assert s.leader_poll_interval_seconds == 5.0 +def test_settings_leader_poll_interval_reads_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("DLW_LEADER_POLL_INTERVAL_SECONDS", "10.0") + s = Settings() + assert s.leader_poll_interval_seconds == 10.0 + + def test_settings_leader_poll_interval_rejects_below_min() -> None: with pytest.raises(ValidationError): Settings(leader_poll_interval_seconds=0.1) @@ -102,7 +113,11 @@ In `src/dlw/config.py`, after the W3b block (`hf_proxy_timeout_seconds: int = Fi ```python # Phase 2 W3c — controller leader election - active_lock_id: int = Field(default=0x444C5743_414B5631, ge=1) # 'DLWC AKV1' + active_lock_id: int = Field( + default=0x444C5743_414B5631, + ge=1, + le=9_223_372_036_854_775_807, # PG bigint max (2**63 - 1) + ) # 'DLWC AKV1' leader_poll_interval_seconds: float = Field(default=5.0, ge=0.5, le=60.0) ``` @@ -140,7 +155,10 @@ from dlw.services.leader_election import LeaderElector def _db_url() -> str: - """Test DB URL — matches the conftest engine fixture.""" + """URL for the PostgreSQL admin DB used by these tests. Advisory locks + are cluster-wide (not per-DB), so the choice of DB doesn't affect lock + isolation — only the lock_id does. This file's _LOCK_ID is unique and + pytest runs serially within a session, so collisions aren't a concern.""" import os env = { "host": os.environ.get("DLW_TEST_PG_HOST", "localhost"), @@ -224,15 +242,21 @@ async def test_verify_returns_true_when_holding() -> None: @pytest.mark.slow async def test_verify_returns_false_after_connection_drop() -> None: - """verify() returns False if the lock connection has died, and cleans up - so the next try_acquire() opens a fresh connection.""" + """verify() must exercise its exception branch — when _conn is still set + but the underlying connection is dead, the SELECT 1 ping raises and + verify() cleans up + returns False. (Pre-NULLing _conn would short-circuit + on the `if self._conn is None: return False` guard and skip the path + that matters in production.)""" e = LeaderElector(_db_url(), _LOCK_ID) try: assert await e.try_acquire() is True assert e._conn is not None await e._conn.close() - e._conn = None + # _conn intentionally LEFT non-None — verify() must hit the + # SELECT-1-raises exception branch, not the early None-guard. assert await e.verify() is False + # And cleanup happened — _conn is None now: + assert e._conn is None finally: await e.release() ``` @@ -335,8 +359,8 @@ class LeaderElector: if self._conn is not None: try: await self._conn.close() - except Exception: - pass + except Exception as exc: + logger.debug("conn.close() during cleanup raised (ignored): %s", exc) self._conn = None ``` @@ -594,9 +618,42 @@ async def run_leader_loop( continue state = "active" set_state(state) - await on_active() + try: + await on_active() + except Exception: + logger.exception("leader: on_active failed; reverting to recovering for retry") + state = "recovering" + set_state(state) + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue logger.info("leader: promoted to active") - elif state in ("recovering", "active"): + elif state == "recovering": + # Lock is held but promotion hasn't completed. Verify lock still + # alive, then retry on_promote. + if not await elector.verify(): + logger.warning("leader: lost lock during recovery, stepping down") + await on_step_down() + state = "standby" + set_state(state) + continue + try: + await on_promote() + except Exception: + logger.exception("leader: recovery failed; will retry next tick") + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue + state = "active" + set_state(state) + try: + await on_active() + except Exception: + logger.exception("leader: on_active failed; reverting to recovering for retry") + state = "recovering" + set_state(state) + await _sleep_or_shutdown(shutdown, poll_interval_seconds) + continue + logger.info("leader: promoted to active (after retry)") + elif state == "active": if not await elector.verify(): logger.warning("leader: lost lock, stepping down to standby") await on_step_down()