53 changes: 50 additions & 3 deletions PROJECT_SPECS.md
@@ -948,9 +948,56 @@ Budget limits can be configured per user:
- API key extracted from `X-Governs-Key` header and forwarded to webhook events

### Rate Limiting
- 100 requests per minute per user
- Configurable limits and windows
- Redis-based rate limiting (optional)

Rate limits use sliding-window counters over one-minute buckets, enforced by
the `app.rate_limit_middleware` FastAPI middleware before any route handler
runs. Four dimensions are evaluated per authenticated request:

| Counter key | Default limit |
|------------------------------------|----------------------|
| `req:key:{key_hash}:{minute}` | 100 req/min |
| `tokens:key:{key_hash}:{minute}` | 100,000 tokens/min |
| `req:org:{org_id}:{minute}` | 1,000 req/min |
| `tokens:org:{org_id}:{minute}` | 1,000,000 tokens/min |

Token cost is estimated from the request `Content-Length` as `ceil(bytes / 4)`
(a standard rough heuristic of ~4 bytes per token) until §1.5d wires
policy-driven limits and real tokenizer counts.

All responses carry `X-RateLimit-Limit`, `X-RateLimit-Remaining`, and
`X-RateLimit-Reset` reflecting the most restrictive dimension. Denied
requests return HTTP 429 with a `Retry-After` header in seconds.
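One way "most restrictive dimension" could be computed, as a sketch with a
hypothetical helper (the dimension dicts and `window_reset` parameter are
illustrative, not the actual middleware API):

```python
def rate_limit_headers(dimensions: list[dict], window_reset: int) -> dict[str, str]:
    """Expose the dimension with the least remaining quota in response headers."""
    worst = min(dimensions, key=lambda d: d["limit"] - d["used"])
    remaining = max(0, worst["limit"] - worst["used"])
    return {
        "X-RateLimit-Limit": str(worst["limit"]),
        "X-RateLimit-Remaining": str(remaining),
        # Epoch seconds at the start of the next minute window.
        "X-RateLimit-Reset": str(window_reset),
    }
```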

Unauthenticated paths (`/api/v1/health`, `/api/v1/ready`, `/api/metrics`,
`/docs`, `/redoc`, `/openapi.json`, `/`) skip the limiter so probes cannot
consume quota.
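The exemption is presumably a simple path match, something like (sketch,
assuming exact-match semantics):

```python
# Paths that bypass the limiter entirely; probes must never consume quota.
EXEMPT_PATHS = {
    "/api/v1/health", "/api/v1/ready", "/api/metrics",
    "/docs", "/redoc", "/openapi.json", "/",
}


def is_exempt(path: str) -> bool:
    """True when the request path skips rate limiting."""
    return path in EXEMPT_PATHS
```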

#### Redis posture

`REDIS_URL` **must** use the `rediss://` TLS scheme and carry a password in
any non-debug environment. The `Settings` validator rejects plaintext or
passwordless URLs — this protects counters against on-path tampering and
co-tenant reads. Plaintext `redis://` is accepted only when `DEBUG=true`.

#### Redis outage behavior (`RATE_LIMIT_FAIL_MODE`)

When Redis is configured but unreachable at request time the limiter
evaluates `RATE_LIMIT_FAIL_MODE`:

* `closed` — default. The middleware returns HTTP 503
`rate limiter unavailable`. Safe under multi-replica deployments.
* `open` — requests are allowed without a counter check. Operators must
explicitly accept the quota-bypass risk.
* `local` — per-replica in-memory fallback. Across N replicas this
multiplies the effective quota by N, so `Settings` rejects it outside
debug mode. Intended for single-replica dev.

When `REDIS_URL` is unset entirely (dev/tests), the limiter runs purely
against in-memory buckets regardless of `RATE_LIMIT_FAIL_MODE`.
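The outage decision tree above can be sketched as follows (hypothetical
function; return values stand in for the middleware's actual control flow):

```python
def on_redis_unavailable(fail_mode: str, debug: bool):
    """What the limiter does when Redis is configured but unreachable."""
    if fail_mode == "closed":
        # Deny: caller emits HTTP 503 "rate limiter unavailable".
        return ("deny", 503)
    if fail_mode == "open":
        # Allow without counting -- operator-accepted quota-bypass risk.
        return ("allow", None)
    if fail_mode == "local":
        if not debug:
            # N replicas would each keep private buckets, multiplying quota by N.
            raise ValueError("RATE_LIMIT_FAIL_MODE=local is debug-only")
        return ("allow_local", None)
    raise ValueError(f"unknown RATE_LIMIT_FAIL_MODE: {fail_mode}")
```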

Rationale for the fail-closed default comes from Cipher's review on
precheck#31: a silent in-memory fallback on production replicas turns the
rate limit into a denial-of-quota *ceiling* rather than a *floor*.

### PII Protection
- Multiple redaction strategies
39 changes: 4 additions & 35 deletions app/api.py
@@ -27,15 +27,12 @@
)
from .models import DecisionResponse, PrePostCheckRequest
from .policies import evaluate, evaluate_with_payload_policy
from .rate_limit import rate_limiter
from .settings import settings
from .storage import APIKey, get_db

logger = logging.getLogger(__name__)

router = APIRouter()
RATE_LIMIT_REQUESTS = 100
RATE_LIMIT_WINDOW_SECONDS = 60


def _ensure_correlation_id(corr_id: Optional[str]) -> str:
@@ -306,22 +303,8 @@ async def precheck(
user_id = req.user_id
correlation_id = _ensure_correlation_id(req.corr_id)

# Rate limiting (100 requests per minute per user/api_key)
if user_id:
rate_limit_key = f"precheck:{user_id}"
else:
rate_limit_key = f"precheck:key:{api_key}"
if not rate_limiter.is_allowed(
rate_limit_key, limit=RATE_LIMIT_REQUESTS, window=RATE_LIMIT_WINDOW_SECONDS
):
retry_after = rate_limiter.retry_after(
rate_limit_key, limit=RATE_LIMIT_REQUESTS, window=RATE_LIMIT_WINDOW_SECONDS
)
raise HTTPException(
status_code=429,
detail="rate limit exceeded",
headers={"Retry-After": str(max(1, retry_after))},
)
# Rate limiting is enforced by app.rate_limit_middleware before this
# handler runs — see app/rate_limit_middleware.py.

# Metrics: Track active requests
set_active_requests("precheck", 1)
@@ -485,22 +468,8 @@ async def postcheck(
user_id = req.user_id
correlation_id = _ensure_correlation_id(req.corr_id)

# Rate limiting (100 requests per minute per user/api_key)
if user_id:
rate_limit_key = f"postcheck:{user_id}"
else:
rate_limit_key = f"postcheck:key:{api_key}"
if not rate_limiter.is_allowed(
rate_limit_key, limit=RATE_LIMIT_REQUESTS, window=RATE_LIMIT_WINDOW_SECONDS
):
retry_after = rate_limiter.retry_after(
rate_limit_key, limit=RATE_LIMIT_REQUESTS, window=RATE_LIMIT_WINDOW_SECONDS
)
raise HTTPException(
status_code=429,
detail="rate limit exceeded",
headers={"Retry-After": str(max(1, retry_after))},
)
# Rate limiting is enforced by app.rate_limit_middleware before this
# handler runs — see app/rate_limit_middleware.py.

# Metrics: Track active requests
set_active_requests("postcheck", 1)
6 changes: 6 additions & 0 deletions app/main.py
@@ -10,6 +10,7 @@
from fastapi.responses import JSONResponse

from .api import router
from .rate_limit_middleware import install_rate_limit_middleware
from .settings import settings
from .storage import create_tables

@@ -50,6 +51,11 @@ def create_app() -> FastAPI:
lifespan=lifespan,
)

# Middleware registration order is inside-out: the LAST decorator runs
# OUTERMOST. Install rate limiting first so request_id and response_time
# still apply to 429 / 503 responses.
install_rate_limit_middleware(app)

@app.middleware("http")
async def request_id_middleware(request: Request, call_next):
request_id = str(uuid.uuid4())