From 5603283f98c937a4b62e6f1841f914802ccd235c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 26 Apr 2026 16:26:35 -0700 Subject: [PATCH 1/2] Limit GLM sessions to 12 hours --- cli/src/components/waiting-room-screen.tsx | 4 ++-- cli/src/hooks/use-freebuff-session.ts | 2 +- common/src/types/freebuff-session.ts | 4 ++-- packages/internal/src/db/schema.ts | 2 +- .../free-session/__tests__/public-api.test.ts | 22 +++++++++---------- web/src/server/free-session/public-api.ts | 2 +- web/src/server/free-session/store.ts | 2 +- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cli/src/components/waiting-room-screen.tsx b/cli/src/components/waiting-room-screen.tsx index 3399786ec..32477a798 100644 --- a/cli/src/components/waiting-room-screen.tsx +++ b/cli/src/components/waiting-room-screen.tsx @@ -230,7 +230,7 @@ export const WaitingRoomScreen: React.FC = ({ Elapsed {formatElapsed(elapsedMs)} - {/* Per-model session quota (e.g. GLM 5.1 caps at 5/20h). Only + {/* Per-model session quota (e.g. GLM 5.1 caps at 5/12h). Only rendered for rate-limited models so the Minimax queue stays clutter-free. */} {session.rateLimit && ( @@ -298,7 +298,7 @@ export const WaitingRoomScreen: React.FC = ({ )} {/* Per-model session quota exhausted (e.g. 5+ GLM sessions in the - last 20h). Terminal for this run — the user can exit and come + last 12h). Terminal for this run — the user can exit and come back once the oldest session in the window rolls off. */} {session?.status === 'rate_limited' && ( <> diff --git a/cli/src/hooks/use-freebuff-session.ts b/cli/src/hooks/use-freebuff-session.ts index 5b5a205c8..119e769b8 100644 --- a/cli/src/hooks/use-freebuff-session.ts +++ b/cli/src/hooks/use-freebuff-session.ts @@ -101,7 +101,7 @@ async function callSession( } } // 429 from POST is the per-model session-quota reject (e.g. too many GLM - // sessions in the last 20h). Terminal for the current poll — the CLI shows + // sessions in the last 12h). Terminal for the current poll — the CLI shows // a screen explaining the limit and when the user can try again. The 429 // status (rather than 200) keeps older CLIs in their error path so they // back off instead of tight-polling an unrecognized 200 body. diff --git a/common/src/types/freebuff-session.ts b/common/src/types/freebuff-session.ts index 7b5fc0492..c7322b665 100644 --- a/common/src/types/freebuff-session.ts +++ b/common/src/types/freebuff-session.ts @@ -10,7 +10,7 @@ * Per-model usage counter surfaced to the CLI so the waiting-room UI can * render "N of M sessions used" alongside queue/active state. Present when * the joined model has a rate limit applied (today: GLM 5.1 with 5 admits - * per 20-hour window). `recentCount` is the number of admissions inside + * per 12-hour window). `recentCount` is the number of admissions inside * `windowHours` at the time the response was produced — see also the * standalone `rate_limited` status for the reject path. */ @@ -132,7 +132,7 @@ export type FreebuffSessionServerResponse = } | { /** User has used up their per-model admission quota in the rolling - * window (GLM 5.1: 5 one-hour sessions per 20h). Returned from POST + * window (GLM 5.1: 5 one-hour sessions per 12h). Returned from POST * /session before the user is placed in the queue. `retryAfterMs` is * the time until the oldest admission inside the window falls off * and one quota slot opens up — clients should show the user when diff --git a/packages/internal/src/db/schema.ts b/packages/internal/src/db/schema.ts index 2ead1fc6d..6fed8a703 100644 --- a/packages/internal/src/db/schema.ts +++ b/packages/internal/src/db/schema.ts @@ -873,7 +873,7 @@ export const freeSession = pgTable( /** * Audit log of every admission — one row per queued→active transition. Used - * to rate-limit heavy users (e.g. no more than 5 GLM sessions per 20h). + * to rate-limit heavy users (e.g. no more than 5 GLM sessions per 12h). * * Separate from `free_session` because that table is one-row-per-user (state, * not history); the UPSERT path there would otherwise destroy prior admissions. diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts index 44d516c12..f46a0f8c4 100644 --- a/web/src/server/free-session/__tests__/public-api.test.ts +++ b/web/src/server/free-session/__tests__/public-api.test.ts @@ -323,23 +323,23 @@ describe('requestSession', () => { expect(s3.status).toBe('active') }) - // Per-user rate limit (5 GLM admissions per 20h) — the wire limit is + // Per-user rate limit (5 GLM admissions per 12h) — the wire limit is // hard-coded in public-api.ts, so tests seed the fake admit log directly // rather than configuring it. GLM also has deployment-hours gating, so // these tests bump `now` into the open window (12pm ET on a weekday) // before issuing the request. const GLM_MODEL = 'z-ai/glm-5.1' const GLM_LIMIT = 5 - const GLM_WINDOW_HOURS = 20 + const GLM_WINDOW_HOURS = 12 const GLM_OPEN_TIME = new Date('2026-04-17T16:00:00Z') test('rate_limited: 5th GLM admit in window blocks the 6th attempt', async () => { deps._tick(GLM_OPEN_TIME) - // Seed 5 admits inside the 20h window, spaced so we can verify retryAfter + // Seed 5 admits inside the 12h window, spaced so we can verify retryAfter // points at the oldest one sliding off. const now = deps._now() - // Oldest: 19h ago (still in window). Next 4: 1h, 2h, 3h, 4h ago. - const ages = [19, 4, 3, 2, 1] + // Oldest: 11h ago (still in window). Next 4: 1h, 2h, 3h, 4h ago. + const ages = [11, 4, 3, 2, 1] for (const hoursAgo of ages) { deps.admits.push({ user_id: 'u1', @@ -359,15 +359,15 @@ describe('requestSession', () => { expect(state.limit).toBe(GLM_LIMIT) expect(state.windowHours).toBe(GLM_WINDOW_HOURS) expect(state.recentCount).toBe(GLM_LIMIT) - // Oldest admit is 19h ago; slot opens when it hits 20h, i.e. in 1h. + // Oldest admit is 11h ago; slot opens when it hits 12h, i.e. in 1h. expect(state.retryAfterMs).toBe(60 * 60 * 1000) // Blocked before any row is written — the user doesn't take a queue slot. expect(deps.rows.has('u1')).toBe(false) }) - test('rate_limited: admits outside the 20h window do not count', async () => { + test('rate_limited: admits outside the 12h window do not count', async () => { deps._tick(GLM_OPEN_TIME) - // 5 admits, each just over 20h old → all fall off the window. + // 5 admits, each just over 12h old → all fall off the window. const now = deps._now() for (let i = 0; i < 5; i++) { deps.admits.push({ @@ -446,7 +446,7 @@ describe('requestSession', () => { const now = deps._now() // Seed 5 prior admits (the cap), with the latest one matching the // active row we're about to install. - const ages = [19, 4, 3, 2, 0] + const ages = [11, 4, 3, 2, 0] for (const hoursAgo of ages) { deps.admits.push({ user_id: 'u1', @@ -527,7 +527,7 @@ describe('requestSession', () => { // must be blocked by the quota. deps._tick(GLM_OPEN_TIME) const now = deps._now() - const ages = [19, 4, 3, 2, 1] + const ages = [11, 4, 3, 2, 1] for (const hoursAgo of ages) { deps.admits.push({ user_id: 'u1', @@ -660,7 +660,7 @@ describe('getSessionState', () => { expect(state.rateLimit).toEqual({ model: 'z-ai/glm-5.1', limit: 5, - windowHours: 20, + windowHours: 12, recentCount: 1, }) }) diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts index 02c5c05c9..c963bb01a 100644 --- a/web/src/server/free-session/public-api.ts +++ b/web/src/server/free-session/public-api.ts @@ -41,7 +41,7 @@ import type { InternalSessionRow, SessionStateResponse } from './types' * queued/active responses — changing them is a deliberate, typed edit. */ const RATE_LIMITS: Record = { - 'z-ai/glm-5.1': { limit: 5, windowHours: 20 }, + 'z-ai/glm-5.1': { limit: 5, windowHours: 12 }, } /** Fetch the caller's current quota snapshot for `model`, or undefined if the diff --git a/web/src/server/free-session/store.ts b/web/src/server/free-session/store.ts index e84331b69..ee034cbd7 100644 --- a/web/src/server/free-session/store.ts +++ b/web/src/server/free-session/store.ts @@ -436,7 +436,7 @@ export async function promoteQueuedUser(params: { * so one query covers both the check and the reject path. * * Drives the per-user, per-model rate limit (e.g. at most 5 GLM sessions in - * the last 20h) enforced before `joinOrTakeOver`. + * the last 12h) enforced before `joinOrTakeOver`. */ export async function listRecentAdmits(params: { userId: string From d0e8a2ef00ce04db77915b7949dfb862f6da4ec4 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 26 Apr 2026 16:33:14 -0700 Subject: [PATCH 2/2] Avoid duplicate rate window math --- web/src/server/free-session/public-api.ts | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts index c963bb01a..422795e3a 100644 --- a/web/src/server/free-session/public-api.ts +++ b/web/src/server/free-session/public-api.ts @@ -47,19 +47,22 @@ const RATE_LIMITS: Record = { /** Fetch the caller's current quota snapshot for `model`, or undefined if the * model isn't rate-limited. Used by both POST (after admit) and GET polls so * the CLI's "N of M sessions used" line stays live instead of disappearing - * after the first poll. Also returns the oldest admit in-window so callers - * that need `retryAfterMs` don't have to re-query. */ + * after the first poll. Also returns the oldest admit in-window and the + * window duration so callers that need `retryAfterMs` don't have to re-query + * or duplicate the window math. */ async function fetchRateLimitSnapshot( userId: string, model: string, deps: SessionDeps, ): Promise< - { info: FreebuffSessionRateLimit; oldest: Date | null } | undefined + | { info: FreebuffSessionRateLimit; oldest: Date | null; windowMs: number } + | undefined > { const cfg = RATE_LIMITS[model] if (!cfg) return undefined const now = nowOf(deps) - const since = new Date(now.getTime() - cfg.windowHours * 60 * 60 * 1000) + const windowMs = cfg.windowHours * 60 * 60 * 1000 + const since = new Date(now.getTime() - windowMs) const admits = await deps.listRecentAdmits({ userId, model, @@ -74,6 +77,7 @@ async function fetchRateLimitSnapshot( recentCount: admits.length, }, oldest: admits[0] ?? null, + windowMs, } } @@ -271,10 +275,9 @@ export async function requestSession(params: { if (snapshot && snapshot.info.recentCount >= snapshot.info.limit) { // Oldest admit's window-anniversary is when one slot opens back up. // Clamped at 0 so a clock skew can't surface a negative retry-after. - const windowMs = snapshot.info.windowHours * 60 * 60 * 1000 const retryAfterMs = Math.max( 0, - (snapshot.oldest?.getTime() ?? 0) + windowMs - now.getTime(), + (snapshot.oldest?.getTime() ?? 0) + snapshot.windowMs - now.getTime(), ) return { status: 'rate_limited',