Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cli/src/components/waiting-room-screen.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ export const WaitingRoomScreen: React.FC<WaitingRoomScreenProps> = ({
<span>Elapsed </span>
{formatElapsed(elapsedMs)}
</text>
{/* Per-model session quota (e.g. GLM 5.1 caps at 5/20h). Only
{/* Per-model session quota (e.g. GLM 5.1 caps at 5/12h). Only
rendered for rate-limited models so the Minimax queue stays
clutter-free. */}
{session.rateLimit && (
Expand Down Expand Up @@ -298,7 +298,7 @@ export const WaitingRoomScreen: React.FC<WaitingRoomScreenProps> = ({
)}

{/* Per-model session quota exhausted (e.g. 5+ GLM sessions in the
last 20h). Terminal for this run — the user can exit and come
last 12h). Terminal for this run — the user can exit and come
back once the oldest session in the window rolls off. */}
{session?.status === 'rate_limited' && (
<>
Expand Down
2 changes: 1 addition & 1 deletion cli/src/hooks/use-freebuff-session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ async function callSession(
}
}
// 429 from POST is the per-model session-quota reject (e.g. too many GLM
// sessions in the last 20h). Terminal for the current poll — the CLI shows
// sessions in the last 12h). Terminal for the current poll — the CLI shows
// a screen explaining the limit and when the user can try again. The 429
// status (rather than 200) keeps older CLIs in their error path so they
// back off instead of tight-polling an unrecognized 200 body.
Expand Down
4 changes: 2 additions & 2 deletions common/src/types/freebuff-session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* Per-model usage counter surfaced to the CLI so the waiting-room UI can
* render "N of M sessions used" alongside queue/active state. Present when
* the joined model has a rate limit applied (today: GLM 5.1 with 5 admits
* per 20-hour window). `recentCount` is the number of admissions inside
* per 12-hour window). `recentCount` is the number of admissions inside
* `windowHours` at the time the response was produced — see also the
* standalone `rate_limited` status for the reject path.
*/
Expand Down Expand Up @@ -132,7 +132,7 @@ export type FreebuffSessionServerResponse =
}
| {
/** User has used up their per-model admission quota in the rolling
* window (GLM 5.1: 5 one-hour sessions per 20h). Returned from POST
* window (GLM 5.1: 5 one-hour sessions per 12h). Returned from POST
* /session before the user is placed in the queue. `retryAfterMs` is
* the time until the oldest admission inside the window falls off
* and one quota slot opens up — clients should show the user when
Expand Down
2 changes: 1 addition & 1 deletion packages/internal/src/db/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,7 @@ export const freeSession = pgTable(

/**
* Audit log of every admission — one row per queued→active transition. Used
* to rate-limit heavy users (e.g. no more than 5 GLM sessions per 20h).
* to rate-limit heavy users (e.g. no more than 5 GLM sessions per 12h).
*
* Separate from `free_session` because that table is one-row-per-user (state,
* not history); the UPSERT path there would otherwise destroy prior admissions.
Expand Down
22 changes: 11 additions & 11 deletions web/src/server/free-session/__tests__/public-api.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -323,23 +323,23 @@ describe('requestSession', () => {
expect(s3.status).toBe('active')
})

// Per-user rate limit (5 GLM admissions per 20h) — the wire limit is
// Per-user rate limit (5 GLM admissions per 12h) — the wire limit is
// hard-coded in public-api.ts, so tests seed the fake admit log directly
// rather than configuring it. GLM also has deployment-hours gating, so
// these tests bump `now` into the open window (12pm ET on a weekday)
// before issuing the request.
const GLM_MODEL = 'z-ai/glm-5.1'
const GLM_LIMIT = 5
const GLM_WINDOW_HOURS = 20
const GLM_WINDOW_HOURS = 12
const GLM_OPEN_TIME = new Date('2026-04-17T16:00:00Z')

test('rate_limited: 5th GLM admit in window blocks the 6th attempt', async () => {
deps._tick(GLM_OPEN_TIME)
// Seed 5 admits inside the 20h window, spaced so we can verify retryAfter
// Seed 5 admits inside the 12h window, spaced so we can verify retryAfter
// points at the oldest one sliding off.
const now = deps._now()
// Oldest: 19h ago (still in window). Next 4: 1h, 2h, 3h, 4h ago.
const ages = [19, 4, 3, 2, 1]
// Oldest: 11h ago (still in window). Next 4: 4h, 3h, 2h, 1h ago.
const ages = [11, 4, 3, 2, 1]
for (const hoursAgo of ages) {
deps.admits.push({
user_id: 'u1',
Expand All @@ -359,15 +359,15 @@ describe('requestSession', () => {
expect(state.limit).toBe(GLM_LIMIT)
expect(state.windowHours).toBe(GLM_WINDOW_HOURS)
expect(state.recentCount).toBe(GLM_LIMIT)
// Oldest admit is 19h ago; slot opens when it hits 20h, i.e. in 1h.
// Oldest admit is 11h ago; slot opens when it hits 12h, i.e. in 1h.
expect(state.retryAfterMs).toBe(60 * 60 * 1000)
// Blocked before any row is written — the user doesn't take a queue slot.
expect(deps.rows.has('u1')).toBe(false)
})

test('rate_limited: admits outside the 20h window do not count', async () => {
test('rate_limited: admits outside the 12h window do not count', async () => {
deps._tick(GLM_OPEN_TIME)
// 5 admits, each just over 20h old → all fall off the window.
// 5 admits, each just over 12h old → all fall off the window.
const now = deps._now()
for (let i = 0; i < 5; i++) {
deps.admits.push({
Expand Down Expand Up @@ -446,7 +446,7 @@ describe('requestSession', () => {
const now = deps._now()
// Seed 5 prior admits (the cap), with the latest one matching the
// active row we're about to install.
const ages = [19, 4, 3, 2, 0]
const ages = [11, 4, 3, 2, 0]
for (const hoursAgo of ages) {
deps.admits.push({
user_id: 'u1',
Expand Down Expand Up @@ -527,7 +527,7 @@ describe('requestSession', () => {
// must be blocked by the quota.
deps._tick(GLM_OPEN_TIME)
const now = deps._now()
const ages = [19, 4, 3, 2, 1]
const ages = [11, 4, 3, 2, 1]
for (const hoursAgo of ages) {
deps.admits.push({
user_id: 'u1',
Expand Down Expand Up @@ -660,7 +660,7 @@ describe('getSessionState', () => {
expect(state.rateLimit).toEqual({
model: 'z-ai/glm-5.1',
limit: 5,
windowHours: 20,
windowHours: 12,
recentCount: 1,
})
})
Expand Down
17 changes: 10 additions & 7 deletions web/src/server/free-session/public-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,25 +41,28 @@ import type { InternalSessionRow, SessionStateResponse } from './types'
* queued/active responses — changing them is a deliberate, typed edit.
*/
const RATE_LIMITS: Record<string, { limit: number; windowHours: number }> = {
'z-ai/glm-5.1': { limit: 5, windowHours: 20 },
'z-ai/glm-5.1': { limit: 5, windowHours: 12 },
}

/** Fetch the caller's current quota snapshot for `model`, or undefined if the
* model isn't rate-limited. Used by both POST (after admit) and GET polls so
* the CLI's "N of M sessions used" line stays live instead of disappearing
* after the first poll. Also returns the oldest admit in-window so callers
* that need `retryAfterMs` don't have to re-query. */
* after the first poll. Also returns the oldest admit in-window and the
* window duration so callers that need `retryAfterMs` don't have to re-query
* or duplicate the window math. */
async function fetchRateLimitSnapshot(
userId: string,
model: string,
deps: SessionDeps,
): Promise<
{ info: FreebuffSessionRateLimit; oldest: Date | null } | undefined
| { info: FreebuffSessionRateLimit; oldest: Date | null; windowMs: number }
| undefined
> {
const cfg = RATE_LIMITS[model]
if (!cfg) return undefined
const now = nowOf(deps)
const since = new Date(now.getTime() - cfg.windowHours * 60 * 60 * 1000)
const windowMs = cfg.windowHours * 60 * 60 * 1000
const since = new Date(now.getTime() - windowMs)
const admits = await deps.listRecentAdmits({
userId,
model,
Expand All @@ -74,6 +77,7 @@ async function fetchRateLimitSnapshot(
recentCount: admits.length,
},
oldest: admits[0] ?? null,
windowMs,
}
}

Expand Down Expand Up @@ -271,10 +275,9 @@ export async function requestSession(params: {
if (snapshot && snapshot.info.recentCount >= snapshot.info.limit) {
// Oldest admit's window-anniversary is when one slot opens back up.
// Clamped at 0 so clock skew can't surface a negative retry-after.
const windowMs = snapshot.info.windowHours * 60 * 60 * 1000
const retryAfterMs = Math.max(
0,
(snapshot.oldest?.getTime() ?? 0) + windowMs - now.getTime(),
(snapshot.oldest?.getTime() ?? 0) + snapshot.windowMs - now.getTime(),
)
return {
status: 'rate_limited',
Expand Down
2 changes: 1 addition & 1 deletion web/src/server/free-session/store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ export async function promoteQueuedUser(params: {
* so one query covers both the check and the reject path.
*
* Drives the per-user, per-model rate limit (e.g. at most 5 GLM sessions in
* the last 20h) enforced before `joinOrTakeOver`.
* the last 12h) enforced before `joinOrTakeOver`.
*/
export async function listRecentAdmits(params: {
userId: string
Expand Down
Loading