diff --git a/agents/__tests__/editor.test.ts b/agents/__tests__/editor.test.ts
index 030857c8d..36d6b75c5 100644
--- a/agents/__tests__/editor.test.ts
+++ b/agents/__tests__/editor.test.ts
@@ -67,6 +67,11 @@ describe('editor agent', () => {
expect(glmEditor.model).toBe('z-ai/glm-5.1')
})
+ test('creates minimax editor', () => {
+ const minimaxEditor = createCodeEditor({ model: 'minimax' })
+ expect(minimaxEditor.model).toBe('minimax/minimax-m2.7')
+ })
+
test('gpt-5 editor does not include think tags in instructions', () => {
const gpt5Editor = createCodeEditor({ model: 'gpt-5' })
expect(gpt5Editor.instructionsPrompt).not.toContain('')
@@ -79,6 +84,12 @@ describe('editor agent', () => {
expect(glmEditor.instructionsPrompt).not.toContain('')
})
+ test('minimax editor does not include think tags in instructions', () => {
+ const minimaxEditor = createCodeEditor({ model: 'minimax' })
+ expect(minimaxEditor.instructionsPrompt).not.toContain('')
+ expect(minimaxEditor.instructionsPrompt).not.toContain('')
+ })
+
test('opus editor includes think tags in instructions', () => {
const opusEditor = createCodeEditor({ model: 'opus' })
expect(opusEditor.instructionsPrompt).toContain('')
diff --git a/agents/editor/editor.ts b/agents/editor/editor.ts
index 3d208aa13..c98544d0f 100644
--- a/agents/editor/editor.ts
+++ b/agents/editor/editor.ts
@@ -4,7 +4,7 @@ import { publisher } from '../constants'
import type { AgentDefinition } from '../types/agent-definition'
export const createCodeEditor = (options: {
- model: 'gpt-5' | 'opus' | 'glm'
+ model: 'gpt-5' | 'opus' | 'glm' | 'minimax'
}): Omit => {
const { model } = options
return {
@@ -12,6 +12,8 @@ export const createCodeEditor = (options: {
model:
options.model === 'gpt-5'
? 'openai/gpt-5.1'
+ : options.model === 'minimax'
+ ? 'minimax/minimax-m2.7'
: options.model === 'glm'
? 'z-ai/glm-5.1'
: 'anthropic/claude-opus-4.7',
@@ -65,7 +67,7 @@ OR for new files or major rewrites:
}
-${model === 'gpt-5' || model === 'glm'
+${model === 'gpt-5' || model === 'glm' || model === 'minimax'
? ''
: `Before you start writing your implementation, you should use tags to think about the best way to implement the changes.
diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts
index b28a77c31..3608f3631 100644
--- a/agents/types/agent-definition.ts
+++ b/agents/types/agent-definition.ts
@@ -423,8 +423,6 @@ export type ModelName =
// Other open source models
| 'moonshotai/kimi-k2'
| 'moonshotai/kimi-k2:nitro'
- | 'moonshotai/kimi-k2.5'
- | 'moonshotai/kimi-k2.5:nitro'
| 'z-ai/glm-5'
| 'z-ai/glm-5.1'
| 'z-ai/glm-4.6'
diff --git a/cli/src/components/freebuff-model-selector.tsx b/cli/src/components/freebuff-model-selector.tsx
index a33d89540..5abaac272 100644
--- a/cli/src/components/freebuff-model-selector.tsx
+++ b/cli/src/components/freebuff-model-selector.tsx
@@ -3,9 +3,16 @@ import { useKeyboard } from '@opentui/react'
import React, { useCallback, useEffect, useMemo, useState } from 'react'
import { Button } from './button'
-import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models'
+import {
+ DEFAULT_FREEBUFF_MODEL_ID,
+ FREEBUFF_DEPLOYMENT_HOURS_LABEL,
+ FREEBUFF_GLM_MODEL_ID,
+ FREEBUFF_MODELS,
+ isFreebuffModelAvailable,
+} from '@codebuff/common/constants/freebuff-models'
import { joinFreebuffQueue } from '../hooks/use-freebuff-session'
+import { useNow } from '../hooks/use-now'
import { useFreebuffModelStore } from '../state/freebuff-model-store'
import { useFreebuffSessionStore } from '../state/freebuff-session-store'
import { useTerminalDimensions } from '../hooks/use-terminal-dimensions'
@@ -13,6 +20,11 @@ import { useTheme } from '../hooks/use-theme'
import type { KeyEvent } from '@opentui/core'
+const FREEBUFF_MODEL_SELECTOR_MODELS = [
+ ...FREEBUFF_MODELS.filter((model) => model.id === FREEBUFF_GLM_MODEL_ID),
+ ...FREEBUFF_MODELS.filter((model) => model.id !== FREEBUFF_GLM_MODEL_ID),
+]
+
/**
* Dual-purpose model picker:
* - Pre-chat landing (session 'none'): user hasn't joined any queue. Picking
@@ -33,7 +45,9 @@ export const FreebuffModelSelector: React.FC = () => {
const theme = useTheme()
const { terminalWidth } = useTerminalDimensions()
const selectedModel = useFreebuffModelStore((s) => s.selectedModel)
+ const setSelectedModel = useFreebuffModelStore((s) => s.setSelectedModel)
const session = useFreebuffSessionStore((s) => s.session)
+ const now = useNow(60_000)
const [pending, setPending] = useState(null)
const [hoveredId, setHoveredId] = useState(null)
// Keyboard cursor — separate from the actually-selected model so that
@@ -45,6 +59,15 @@ export const FreebuffModelSelector: React.FC = () => {
setFocusedId(selectedModel)
}, [selectedModel])
+ useEffect(() => {
+ if (
+ (session?.status === 'none' || !session) &&
+ !isFreebuffModelAvailable(selectedModel, new Date(now))
+ ) {
+ setSelectedModel(DEFAULT_FREEBUFF_MODEL_ID)
+ }
+ }, [now, selectedModel, session, setSelectedModel])
+
// Landing ('none'): depths come from the server snapshot, no "self" to
// subtract. In-queue ('queued'): for the user's queue, "ahead" is
// `position - 1` (themselves don't count); for every other queue, switching
@@ -85,18 +108,22 @@ export const FreebuffModelSelector: React.FC = () => {
)
// Decide row vs column layout based on whether both buttons actually fit
- // side-by-side. Each button's inner text is "● {displayName} · {tagline} {hint}",
+ // side-by-side. Each button's inner text is
+ // "● {displayName} · {tagline} · {hours} {hint}",
// plus 2 cols of border and 2 cols of padding. Buttons are separated by a
// gap of 2. If the total exceeds the terminal width, stack vertically.
const stackVertically = useMemo(() => {
const BUTTON_CHROME = 4 // 2 border + 2 padding
const GAP = 2
- const total = FREEBUFF_MODELS.reduce((sum, model, idx) => {
+ const total = FREEBUFF_MODEL_SELECTOR_MODELS.reduce((sum, model, idx) => {
const inner =
2 /* indicator + space */ +
model.displayName.length +
3 /* " · " */ +
model.tagline.length +
+ (model.availability === 'deployment_hours'
+ ? 3 + FREEBUFF_DEPLOYMENT_HOURS_LABEL.length
+ : 0) +
2 /* " " */ +
hintWidth
return sum + inner + BUTTON_CHROME + (idx > 0 ? GAP : 0)
@@ -115,10 +142,11 @@ export const FreebuffModelSelector: React.FC = () => {
(modelId: string) => {
if (pending) return
if (modelId === committedModelId) return
+ if (!isFreebuffModelAvailable(modelId, new Date(now))) return
setPending(modelId)
joinFreebuffQueue(modelId).finally(() => setPending(null))
},
- [pending, committedModelId],
+ [pending, committedModelId, now],
)
// Tab / Shift+Tab and arrow keys move the focus highlight only; Enter or
@@ -136,25 +164,30 @@ export const FreebuffModelSelector: React.FC = () => {
const isCommit = name === 'return' || name === 'enter' || name === 'space'
if (!isForward && !isBackward && !isCommit) return
if (isCommit) {
- if (focusedId !== committedModelId) {
+ if (
+ focusedId !== committedModelId &&
+ isFreebuffModelAvailable(focusedId, new Date(now))
+ ) {
key.preventDefault?.()
pick(focusedId)
}
return
}
- const currentIdx = FREEBUFF_MODELS.findIndex((m) => m.id === focusedId)
+ const currentIdx = FREEBUFF_MODEL_SELECTOR_MODELS.findIndex(
+ (m) => m.id === focusedId,
+ )
if (currentIdx === -1) return
- const len = FREEBUFF_MODELS.length
+ const len = FREEBUFF_MODEL_SELECTOR_MODELS.length
const nextIdx = isForward
? (currentIdx + 1) % len
: (currentIdx - 1 + len) % len
- const target = FREEBUFF_MODELS[nextIdx]
+ const target = FREEBUFF_MODEL_SELECTOR_MODELS[nextIdx]
if (target) {
key.preventDefault?.()
setFocusedId(target.id)
}
},
- [pending, pick, focusedId, committedModelId],
+ [pending, pick, focusedId, committedModelId, now],
),
)
@@ -173,7 +206,7 @@ export const FreebuffModelSelector: React.FC = () => {
alignItems: 'flex-start',
}}
>
- {FREEBUFF_MODELS.map((model) => {
+ {FREEBUFF_MODEL_SELECTOR_MODELS.map((model) => {
// 'Selected' means the dot is filled and the label is bold. On the
// landing screen ('none') this tracks the pre-focused pick; on the
// queued screen it tracks the model the server has us on. Either
@@ -181,15 +214,22 @@ export const FreebuffModelSelector: React.FC = () => {
const isSelected = model.id === selectedModel
const isHovered = hoveredId === model.id
const isFocused = focusedId === model.id && !isSelected
+ const isAvailable = isFreebuffModelAvailable(model.id, new Date(now))
const indicator = isSelected ? '●' : '○'
const indicatorColor = isSelected ? theme.primary : theme.muted
- const labelColor = isSelected ? theme.foreground : theme.muted
+ const labelColor = isSelected && isAvailable ? theme.foreground : theme.muted
// Clickable whenever picking would actually do something — i.e.
// anything except re-picking the queue we're already in.
- const interactable = !pending && model.id !== committedModelId
+ const interactable = !pending && isAvailable && model.id !== committedModelId
const ahead = aheadByModel?.[model.id]
const hint =
- ahead === undefined ? '' : ahead === 0 ? 'No wait' : `${ahead} ahead`
+ !isAvailable
+ ? 'Closed'
+ : ahead === undefined
+ ? ''
+ : ahead === 0
+ ? 'No wait'
+ : `${ahead} ahead`
const borderColor = isSelected
? theme.primary
@@ -202,7 +242,7 @@ export const FreebuffModelSelector: React.FC = () => {
key={model.id}
onClick={() => {
setFocusedId(model.id)
- pick(model.id)
+ if (isAvailable) pick(model.id)
}}
onMouseOver={() => interactable && setHoveredId(model.id)}
onMouseOut={() => setHoveredId((curr) => (curr === model.id ? null : curr))}
@@ -223,6 +263,9 @@ export const FreebuffModelSelector: React.FC = () => {
{model.displayName}
· {model.tagline}
+ {model.availability === 'deployment_hours' && (
+ · {FREEBUFF_DEPLOYMENT_HOURS_LABEL}
+ )}
{hint.padEnd(hintWidth)}
diff --git a/cli/src/components/waiting-room-screen.tsx b/cli/src/components/waiting-room-screen.tsx
index e67823f7a..251ca87c0 100644
--- a/cli/src/components/waiting-room-screen.tsx
+++ b/cli/src/components/waiting-room-screen.tsx
@@ -253,7 +253,7 @@ export const WaitingRoomScreen: React.FC = ({
⚠ Account unavailable
- This account can't use freebuff. If you think this is a
+ This account has been suspended and can't use freebuff. If you think this is a
mistake, contact support@codebuff.com. Press Ctrl+C to exit.
>
diff --git a/cli/src/hooks/use-freebuff-session.ts b/cli/src/hooks/use-freebuff-session.ts
index 79deea1cf..f24fba7b3 100644
--- a/cli/src/hooks/use-freebuff-session.ts
+++ b/cli/src/hooks/use-freebuff-session.ts
@@ -1,4 +1,5 @@
import { env } from '@codebuff/common/env'
+import { DEFAULT_FREEBUFF_MODEL_ID } from '@codebuff/common/constants/freebuff-models'
import { useEffect } from 'react'
import {
@@ -75,14 +76,18 @@ async function callSession(
return body
}
}
- // 409 from POST means the user picked a different model than their active
- // session is bound to. Surface as a non-throw `model_locked` so the UI can
- // show a confirmation prompt (DELETE then re-POST to switch).
+ // 409 from POST means the selected model cannot be joined right now, either
+ // because an active session is locked to a different model or because the
+ // model is outside its availability hours. Surface both model-switch
+ // conflicts and temporary availability closures as non-throw states.
if (resp.status === 409 && method === 'POST') {
const body = (await resp.json().catch(() => null)) as
| FreebuffSessionResponse
| null
- if (body && body.status === 'model_locked') {
+ if (
+ body &&
+ (body.status === 'model_locked' || body.status === 'model_unavailable')
+ ) {
return body
}
}
@@ -119,6 +124,7 @@ function nextDelayMs(next: FreebuffSessionResponse): number | null {
case 'country_blocked':
case 'banned':
case 'model_locked':
+ case 'model_unavailable':
return null
}
}
@@ -398,6 +404,12 @@ export function useFreebuffSession(): UseFreebuffSessionResult {
schedule(0)
return
}
+ if (next.status === 'model_unavailable') {
+ useFreebuffModelStore.getState().setSelectedModel(DEFAULT_FREEBUFF_MODEL_ID)
+ nextMethod = 'GET'
+ schedule(0)
+ return
+ }
// Startup takeover: the initial probe GET saw we already hold a seat
// (from a prior CLI instance). POST now to rotate our instance id so
diff --git a/cli/src/state/freebuff-model-store.ts b/cli/src/state/freebuff-model-store.ts
index 182a38831..1aa9f2db8 100644
--- a/cli/src/state/freebuff-model-store.ts
+++ b/cli/src/state/freebuff-model-store.ts
@@ -1,6 +1,6 @@
import {
DEFAULT_FREEBUFF_MODEL_ID,
- resolveFreebuffModel,
+ resolveAvailableFreebuffModel,
} from '@codebuff/common/constants/freebuff-models'
import { create } from 'zustand'
@@ -24,11 +24,11 @@ interface FreebuffModelStore {
}
export const useFreebuffModelStore = create((set) => ({
- selectedModel: resolveFreebuffModel(
+ selectedModel: resolveAvailableFreebuffModel(
loadFreebuffModelPreference() ?? DEFAULT_FREEBUFF_MODEL_ID,
),
setSelectedModel: (model) => {
- const resolved = resolveFreebuffModel(model)
+ const resolved = resolveAvailableFreebuffModel(model)
saveFreebuffModelPreference(resolved)
set({ selectedModel: resolved })
},
diff --git a/cli/src/utils/local-agent-registry.ts b/cli/src/utils/local-agent-registry.ts
index 59206eb84..6106b3928 100644
--- a/cli/src/utils/local-agent-registry.ts
+++ b/cli/src/utils/local-agent-registry.ts
@@ -370,7 +370,7 @@ export const loadAgentDefinitions = (): AgentDefinition[] => {
}
// Override the model of free-mode agents to match the user's pick from the
- // freebuff waiting room. Bundled definitions hardcode glm-5.1; we swap in
+ // freebuff waiting room. Bundled definitions hardcode a free model; we swap in
// whatever the user chose so the chat-completions request body carries the
// matching model and the server-side session gate doesn't reject it as a
// model mismatch.
diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts
index e44c74cc6..308e12df6 100644
--- a/common/src/constants/free-agents.ts
+++ b/common/src/constants/free-agents.ts
@@ -26,7 +26,10 @@ export const FREEBUFF_ROOT_AGENT_IDS = ['base2-free'] as const
*/
export const FREE_MODE_AGENT_MODELS: Record> = {
// Root orchestrator
- 'base2-free': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']),
+ 'base2-free': new Set([
+ 'minimax/minimax-m2.7',
+ 'z-ai/glm-5.1',
+ ]),
// File exploration agents
'file-picker': new Set(['google/gemini-2.5-flash-lite']),
@@ -41,10 +44,16 @@ export const FREE_MODE_AGENT_MODELS: Record> = {
'basher': new Set(['google/gemini-3.1-flash-lite-preview']),
// Editor for free mode
- 'editor-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']),
+ 'editor-lite': new Set([
+ 'minimax/minimax-m2.7',
+ 'z-ai/glm-5.1',
+ ]),
// Code reviewer for free mode
- 'code-reviewer-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']),
+ 'code-reviewer-lite': new Set([
+ 'minimax/minimax-m2.7',
+ 'z-ai/glm-5.1',
+ ]),
}
/**
diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts
index d71ebd619..f1019c6fb 100644
--- a/common/src/constants/freebuff-models.ts
+++ b/common/src/constants/freebuff-models.ts
@@ -13,18 +13,25 @@ export interface FreebuffModelOption {
displayName: string
/** One-line description shown next to the label. */
tagline: string
+ /** Availability policy for the selector and server-side admission. */
+ availability: 'always' | 'deployment_hours'
}
+export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT'
+export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1'
+
export const FREEBUFF_MODELS = [
- {
- id: 'z-ai/glm-5.1',
- displayName: 'GLM 5.1',
- tagline: 'Smartest',
- },
{
id: 'minimax/minimax-m2.7',
displayName: 'MiniMax M2.7',
tagline: 'Fastest',
+ availability: 'always',
+ },
+ {
+ id: FREEBUFF_GLM_MODEL_ID,
+ displayName: 'GLM 5.1',
+ tagline: 'Smartest',
+ availability: 'deployment_hours',
},
] as const satisfies readonly FreebuffModelOption[]
@@ -51,3 +58,49 @@ export function getFreebuffModel(id: string): FreebuffModelOption {
FREEBUFF_MODELS.find((m) => m.id === DEFAULT_FREEBUFF_MODEL_ID)!
)
}
+
+function getZonedParts(
+ date: Date,
+ timeZone: string,
+): { weekday: string; minutes: number } {
+ const parts = new Intl.DateTimeFormat('en-US', {
+ timeZone,
+ weekday: 'short',
+ hour: '2-digit',
+ minute: '2-digit',
+ hourCycle: 'h23',
+ }).formatToParts(date)
+ const value = (type: string) => parts.find((part) => part.type === type)?.value
+ const hour = Number(value('hour') ?? 0)
+ const minute = Number(value('minute') ?? 0)
+ return {
+ weekday: value('weekday') ?? '',
+ minutes: hour * 60 + minute,
+ }
+}
+
+export function isFreebuffDeploymentHours(now: Date = new Date()): boolean {
+ const eastern = getZonedParts(now, 'America/New_York')
+ const pacific = getZonedParts(now, 'America/Los_Angeles')
+ if (eastern.weekday === 'Sat' || eastern.weekday === 'Sun') return false
+ return eastern.minutes >= 9 * 60 && pacific.minutes < 17 * 60
+}
+
+export function isFreebuffModelAvailable(
+ id: string,
+ now: Date = new Date(),
+): boolean {
+ const model = FREEBUFF_MODELS.find((m) => m.id === id)
+ if (!model) return false
+ return model.availability === 'always' || isFreebuffDeploymentHours(now)
+}
+
+export function resolveAvailableFreebuffModel(
+ id: string | null | undefined,
+ now: Date = new Date(),
+): FreebuffModelId {
+ const resolved = resolveFreebuffModel(id)
+ return isFreebuffModelAvailable(resolved, now)
+ ? resolved
+ : DEFAULT_FREEBUFF_MODEL_ID
+}
diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts
index b28a77c31..3608f3631 100644
--- a/common/src/templates/initial-agents-dir/types/agent-definition.ts
+++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts
@@ -423,8 +423,6 @@ export type ModelName =
// Other open source models
| 'moonshotai/kimi-k2'
| 'moonshotai/kimi-k2:nitro'
- | 'moonshotai/kimi-k2.5'
- | 'moonshotai/kimi-k2.5:nitro'
| 'z-ai/glm-5'
| 'z-ai/glm-5.1'
| 'z-ai/glm-4.6'
diff --git a/common/src/types/freebuff-session.ts b/common/src/types/freebuff-session.ts
index e42d9f0be..d141000a4 100644
--- a/common/src/types/freebuff-session.ts
+++ b/common/src/types/freebuff-session.ts
@@ -92,6 +92,12 @@ export type FreebuffSessionServerResponse =
currentModel: string
requestedModel: string
}
+ | {
+ /** Requested model is valid but not selectable right now. */
+ status: 'model_unavailable'
+ requestedModel: string
+ availableHours: string
+ }
| {
/** Account is banned. Returned from every endpoint so banned bots can't
* join the queue at all (otherwise they inflate `queueDepth` until the
diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md
index b1384d7b6..353bfb046 100644
--- a/docs/freebuff-waiting-room.md
+++ b/docs/freebuff-waiting-room.md
@@ -5,7 +5,7 @@
The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs:
1. **Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs one tick (default `ADMISSION_TICK_MS`, 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones.
-2. **Gate on per-deployment health** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` admit that tick; a degraded minimax-m2.7 no longer stalls glm-5.1 admissions.
+2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; GLM 5.1 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available.
3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput.
Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session.
@@ -149,8 +149,8 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r
| Constant | Location | Default | Purpose |
|---|---|---|---|
| `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. |
-| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `glm-5.1`, `minimax-m2.7` | Selectable models; each gets its own queue and admission slot. |
-| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | glm-5.1 only | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. |
+| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `minimax-m2.7`, `glm-5.1` | Selectable models; each gets its own queue and admission slot. |
+| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `glm-5.1` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. |
| `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. |
| `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime |
| `FREEBUFF_SESSION_GRACE_MS` | env | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. |
@@ -180,12 +180,12 @@ Response shapes:
{
"status": "queued",
"instanceId": "e47…",
- "model": "z-ai/glm-5.1",
+ "model": "minimax/minimax-m2.7",
"position": 17, // 1-indexed within this model's queue
"queueDepth": 43, // size of this model's queue
"queueDepthByModel": { // snapshot of every model's queue — powers the
- "z-ai/glm-5.1": 43, // "N ahead" hint in the selector. Missing
- "minimax/minimax-m2.7": 4 // entries should be treated as 0.
+ "minimax/minimax-m2.7": 43, // "N ahead" hint in the selector. Missing
+ "z-ai/glm-5.1": 4 // entries should be treated as 0.
},
"estimatedWaitMs": 384000,
"queuedAt": "2026-04-17T12:00:00Z"
@@ -195,7 +195,7 @@ Response shapes:
{
"status": "active",
"instanceId": "e47…",
- "model": "z-ai/glm-5.1",
+ "model": "minimax/minimax-m2.7",
"admittedAt": "2026-04-17T12:00:00Z",
"expiresAt": "2026-04-17T13:00:00Z",
"remainingMs": 3600000
@@ -219,7 +219,7 @@ Response shapes:
// to actually switch.
{
"status": "model_locked",
- "currentModel": "z-ai/glm-5.1",
+ "currentModel": "minimax/minimax-m2.7",
"requestedModel": "minimax/minimax-m2.7"
}
```
@@ -285,7 +285,7 @@ waitMs = (position - 1) * 24_000
- Position 1 → 0 (next tick admits you)
- Position 2 → 24s, and so on.
-`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses (during a per-deployment Fireworks incident only the affected model's queue stalls; healthy models keep draining), so the real wait can be longer or shorter.
+`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a GLM Fireworks incident or outside 9am ET-5pm PT, only GLM's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter.
## CLI Integration (frontend-side contract)
@@ -324,7 +324,7 @@ The `disabled` response means the server has the waiting room turned off. CLI tr
| Spamming POST/GET to starve admission tick | Admission uses per-model Postgres advisory locks; DDoS protection is upstream (Next's global rate limits). Consider adding a per-user limiter on `/session` if traffic warrants. |
| Repeatedly POSTing different models to get across every queue | Single row per user (PK on `user_id`); switching models moves the row, never clones it. A user holds exactly one queue slot at any time. |
| Fireworks metrics endpoint down / slow | `getFleetHealth()` fails closed (timeout, non-OK, or missing API key) → every dedicated-deployment model is flagged `unhealthy` and its queue pauses. |
-| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded minimax-m2.7 doesn't block glm-5.1 admissions. |
+| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded GLM deployment doesn't block MiniMax admissions. |
| Zombie expired sessions holding capacity | Swept on every admission tick, even when upstream is unhealthy |
## Testing
diff --git a/scripts/test-fireworks-cache-intervals.ts b/scripts/test-fireworks-cache-intervals.ts
index 0ed71193f..8d4e86740 100644
--- a/scripts/test-fireworks-cache-intervals.ts
+++ b/scripts/test-fireworks-cache-intervals.ts
@@ -13,7 +13,6 @@
*
* Models:
* glm-5.1 (default) — z-ai/glm-5.1
- * kimi-k2.5 — moonshotai/kimi-k2.5
* minimax — minimax/minimax-m2.5
*
* Flags:
@@ -39,7 +38,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1'
type ModelConfig = {
id: string
standardModel: string
- deploymentModel: string
+ deploymentModel?: string
inputCostPerToken: number
cachedInputCostPerToken: number
outputCostPerToken: number
@@ -54,14 +53,6 @@ const MODEL_CONFIGS: Record = {
cachedInputCostPerToken: 0.26 / 1_000_000,
outputCostPerToken: 4.4 / 1_000_000,
},
- 'kimi-k2.5': {
- id: 'moonshotai/kimi-k2.5',
- standardModel: 'accounts/fireworks/models/kimi-k2p5',
- deploymentModel: 'accounts/james-65d217/deployments/mx8l5rq2',
- inputCostPerToken: 0.6 / 1_000_000,
- cachedInputCostPerToken: 0.1 / 1_000_000,
- outputCostPerToken: 3.0 / 1_000_000,
- },
minimax: {
id: 'minimax/minimax-m2.5',
standardModel: 'accounts/fireworks/models/minimax-m2p5',
@@ -117,8 +108,12 @@ function parseArgs(): {
const { modelKey, useDeployment: USE_DEPLOYMENT, intervals: INTERVALS_SEC } =
parseArgs()
const MODEL = MODEL_CONFIGS[modelKey]
+if (USE_DEPLOYMENT && !MODEL.deploymentModel) {
+ console.error(`❌ No custom deployment configured for ${MODEL.id}`)
+ process.exit(1)
+}
const FIREWORKS_MODEL = USE_DEPLOYMENT
- ? MODEL.deploymentModel
+ ? MODEL.deploymentModel!
: MODEL.standardModel
const INPUT_COST_PER_TOKEN = MODEL.inputCostPerToken
const CACHED_INPUT_COST_PER_TOKEN = MODEL.cachedInputCostPerToken
diff --git a/scripts/test-fireworks-long.ts b/scripts/test-fireworks-long.ts
index 67028228d..a1e4950f8 100644
--- a/scripts/test-fireworks-long.ts
+++ b/scripts/test-fireworks-long.ts
@@ -12,12 +12,17 @@
* Models:
* glm-5.1 (default) — z-ai/glm-5.1
* minimax — minimax/minimax-m2.5
+ * minimax-m2.7 — minimax/minimax-m2.7
*
* Flags:
* --deployment Use custom deployment instead of serverless (standard API)
* Serverless is the default
+ * Example:
+ * bun scripts/test-fireworks-long.ts glm-5.1 --deployment
*/
+import { FIREWORKS_DEPLOYMENT_MAP } from '../web/src/llm-api/fireworks-config'
+
export { }
const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1'
@@ -25,7 +30,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1'
type ModelConfig = {
id: string // OpenRouter-style ID (for display)
standardModel: string // Fireworks standard API model ID
- deploymentModel: string // Fireworks custom deployment model ID
+ deploymentModel?: string // Fireworks custom deployment model ID
inputCostPerToken: number
cachedInputCostPerToken: number
outputCostPerToken: number
@@ -35,19 +40,11 @@ const MODEL_CONFIGS: Record = {
'glm-5.1': {
id: 'z-ai/glm-5.1',
standardModel: 'accounts/fireworks/models/glm-5p1',
- deploymentModel: 'accounts/james-65d217/deployments/mjb4i7ea',
+ deploymentModel: FIREWORKS_DEPLOYMENT_MAP['z-ai/glm-5.1'],
inputCostPerToken: 1.40 / 1_000_000,
cachedInputCostPerToken: 0.26 / 1_000_000,
outputCostPerToken: 4.40 / 1_000_000,
},
- 'kimi-k2.5': {
- id: 'moonshotai/kimi-k2.5',
- standardModel: 'accounts/fireworks/models/kimi-k2p5',
- deploymentModel: 'accounts/james-65d217/deployments/mx8l5rq2',
- inputCostPerToken: 0.60 / 1_000_000,
- cachedInputCostPerToken: 0.10 / 1_000_000,
- outputCostPerToken: 3.00 / 1_000_000,
- },
minimax: {
id: 'minimax/minimax-m2.5',
standardModel: 'accounts/fireworks/models/minimax-m2p5',
@@ -67,9 +64,16 @@ const MODEL_CONFIGS: Record = {
}
const DEFAULT_MODEL = 'glm-5.1'
+const MODEL_ALIASES: Record = {
+ glm: 'glm-5.1',
+ 'z-ai/glm-5.1': 'glm-5.1',
+ 'minimax/minimax-m2.5': 'minimax',
+ 'minimax/minimax-m2.7': 'minimax-m2.7',
+}
function getModelConfig(modelArg?: string): ModelConfig {
- const key = modelArg ?? DEFAULT_MODEL
+ const rawKey = modelArg ?? DEFAULT_MODEL
+ const key = MODEL_ALIASES[rawKey] ?? rawKey
const config = MODEL_CONFIGS[key]
if (!config) {
console.error(`❌ Unknown model: "${key}". Available models: ${Object.keys(MODEL_CONFIGS).join(', ')}`)
@@ -83,7 +87,11 @@ const modelArg = process.argv.find((a, i) => i > 1 && !a.startsWith('-') && a !=
const MODEL = getModelConfig(modelArg)
// Default to serverless (standard API); use --deployment for custom deployment
-const FIREWORKS_MODEL = USE_DEPLOYMENT ? MODEL.deploymentModel : MODEL.standardModel
+if (USE_DEPLOYMENT && !MODEL.deploymentModel) {
+ console.error(`❌ No custom deployment configured for ${MODEL.id}`)
+ process.exit(1)
+}
+const FIREWORKS_MODEL = USE_DEPLOYMENT ? MODEL.deploymentModel! : MODEL.standardModel
const INPUT_COST_PER_TOKEN = MODEL.inputCostPerToken
const CACHED_INPUT_COST_PER_TOKEN = MODEL.cachedInputCostPerToken
const OUTPUT_COST_PER_TOKEN = MODEL.outputCostPerToken
@@ -455,4 +463,4 @@ async function main() {
console.log('Done!')
}
-main()
\ No newline at end of file
+main()
diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
index 51a3eb46b..1aac8800c 100644
--- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
+++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
@@ -1,6 +1,7 @@
import { afterEach, beforeEach, describe, expect, mock, it } from 'bun:test'
import { NextRequest } from 'next/server'
+import { isFreebuffDeploymentHours } from '@codebuff/common/constants/freebuff-models'
import { formatQuotaResetCountdown, postChatCompletions } from '../_post'
import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics'
@@ -528,7 +529,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
method: 'POST',
headers: { Authorization: 'Bearer test-api-key-new-free' },
body: JSON.stringify({
- model: 'z-ai/glm-5.1',
+ model: 'minimax/minimax-m2.7',
stream: false,
codebuff_metadata: {
run_id: 'run-free',
@@ -555,6 +556,76 @@ describe('/api/v1/chat/completions POST endpoint', () => {
expect(response.status).toBe(200)
})
+ it('lets freebuff use GLM 5.1 through Fireworks availability rules', async () => {
+ const fetchedBodies: Record[] = []
+ const fetchViaFireworks = mock(
+ async (_url: string | URL | Request, init?: RequestInit) => {
+ fetchedBodies.push(JSON.parse(init?.body as string))
+ return new Response(
+ JSON.stringify({
+ id: 'test-id',
+ model: 'accounts/james-65d217/deployments/mjb4i7ea',
+ choices: [{ message: { content: 'test response' } }],
+ usage: {
+ prompt_tokens: 10,
+ completion_tokens: 20,
+ total_tokens: 30,
+ },
+ }),
+ {
+ status: 200,
+ headers: { 'Content-Type': 'application/json' },
+ },
+ )
+ },
+ ) as unknown as typeof globalThis.fetch
+
+ const req = new NextRequest(
+ 'http://localhost:3000/api/v1/chat/completions',
+ {
+ method: 'POST',
+ headers: { Authorization: 'Bearer test-api-key-new-free' },
+ body: JSON.stringify({
+ model: 'z-ai/glm-5.1',
+ stream: false,
+ codebuff_metadata: {
+ run_id: 'run-free',
+ client_id: 'test-client-id-123',
+ cost_mode: 'free',
+ },
+ }),
+ },
+ )
+
+ const response = await postChatCompletions({
+ req,
+ getUserInfoFromApiKey: mockGetUserInfoFromApiKey,
+ logger: mockLogger,
+ trackEvent: mockTrackEvent,
+ getUserUsageData: mockGetUserUsageData,
+ getAgentRunFromId: mockGetAgentRunFromId,
+ fetch: fetchViaFireworks,
+ insertMessageBigquery: mockInsertMessageBigquery,
+ loggerWithContext: mockLoggerWithContext,
+ checkSessionAdmissible: mockCheckSessionAdmissibleAllow,
+ })
+
+ const body = await response.json()
+ if (isFreebuffDeploymentHours()) {
+ expect(response.status).toBe(200)
+ expect(fetchedBodies).toHaveLength(1)
+ expect(fetchedBodies[0].model).toBe(
+ 'accounts/james-65d217/deployments/mjb4i7ea',
+ )
+ expect(body.model).toBe('z-ai/glm-5.1')
+ expect(body.provider).toBe('Fireworks')
+ } else {
+ expect(response.status).toBe(503)
+ expect(fetchedBodies).toHaveLength(0)
+ expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS')
+ }
+ })
+
it('skips credit check when in FREE mode even with 0 credits', async () => {
const req = new NextRequest(
'http://localhost:3000/api/v1/chat/completions',
@@ -562,7 +633,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
method: 'POST',
headers: { Authorization: 'Bearer test-api-key-no-credits' },
body: JSON.stringify({
- model: 'z-ai/glm-5.1',
+ model: 'minimax/minimax-m2.7',
stream: false,
codebuff_metadata: {
run_id: 'run-free',
@@ -671,7 +742,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
method: 'POST',
headers: { Authorization: 'Bearer test-api-key-new-free' },
body: JSON.stringify({
- model: 'z-ai/glm-5.1',
+ model: 'minimax/minimax-m2.7',
stream: true,
codebuff_metadata: {
run_id: 'run-123',
@@ -853,7 +924,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
method: 'POST',
headers: { Authorization: 'Bearer test-api-key-123' },
body: JSON.stringify({
- model: 'z-ai/glm-5.1',
+ model: 'minimax/minimax-m2.7',
stream: false,
codebuff_metadata: {
run_id: 'run-free',
diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
index 657c17f6d..ffcb8fd36 100644
--- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
+++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
@@ -3,6 +3,7 @@ import { describe, expect, test } from 'bun:test'
import {
deleteFreebuffSession,
FREEBUFF_INSTANCE_HEADER,
+ FREEBUFF_MODEL_HEADER,
getFreebuffSession,
postFreebuffSession,
} from '../_handlers'
@@ -12,16 +13,17 @@ import type { SessionDeps } from '@/server/free-session/public-api'
import type { InternalSessionRow } from '@/server/free-session/types'
import type { NextRequest } from 'next/server'
-const DEFAULT_MODEL = 'z-ai/glm-5.1'
+const DEFAULT_MODEL = 'minimax/minimax-m2.7'
function makeReq(
apiKey: string | null,
- opts: { instanceId?: string; cfCountry?: string } = {},
+ opts: { instanceId?: string; cfCountry?: string; model?: string } = {},
): NextRequest {
const headers = new Headers()
if (apiKey) headers.set('Authorization', `Bearer ${apiKey}`)
if (opts.instanceId) headers.set(FREEBUFF_INSTANCE_HEADER, opts.instanceId)
if (opts.cfCountry) headers.set('cf-ipcountry', opts.cfCountry)
+ if (opts.model) headers.set(FREEBUFF_MODEL_HEADER, opts.model)
return {
headers,
} as unknown as NextRequest
@@ -153,6 +155,19 @@ describe('POST /api/v1/freebuff/session', () => {
expect(body.status).toBe('queued')
})
+ test('returns model_unavailable for GLM outside deployment hours', async () => {
+ const sessionDeps = makeSessionDeps()
+ const resp = await postFreebuffSession(
+ makeReq('ok', { model: 'z-ai/glm-5.1' }),
+ makeDeps(sessionDeps, 'u1'),
+ )
+ expect(resp.status).toBe(409)
+ const body = await resp.json()
+ expect(body.status).toBe('model_unavailable')
+ expect(body.availableHours).toBe('9am ET-5pm PT')
+ expect(sessionDeps.rows.size).toBe(0)
+ })
+
// Banned bots with valid API keys were POSTing every few seconds and
// inflating queueDepth between the 15s admission-tick sweeps. Rejecting at
// the HTTP layer with 403 (terminal, like country_blocked) keeps them out
diff --git a/web/src/app/api/v1/freebuff/session/_handlers.ts b/web/src/app/api/v1/freebuff/session/_handlers.ts
index ec17568a3..6f93e9282 100644
--- a/web/src/app/api/v1/freebuff/session/_handlers.ts
+++ b/web/src/app/api/v1/freebuff/session/_handlers.ts
@@ -138,12 +138,17 @@ export async function postFreebuffSession(
model: requestedModel,
deps: deps.sessionDeps,
})
- // model_locked is a 409 so it's distinguishable from a normal queued/active
- // response on the client. banned is a 403 (terminal, mirrors country_blocked)
- // so older CLIs that don't know the status fall into their `!resp.ok` error
- // path and back off instead of tight-polling on the unrecognized 200 body.
+ // model_locked / model_unavailable are 409 so they're distinguishable from
+ // normal queued/active responses on the client. banned is a 403 (terminal,
+ // mirrors country_blocked) so older CLIs that don't know the status fall
+ // into their `!resp.ok` error path and back off instead of tight-polling
+ // on the unrecognized 200 body.
const status =
- state.status === 'model_locked' ? 409 : state.status === 'banned' ? 403 : 200
+ state.status === 'model_locked' || state.status === 'model_unavailable'
+ ? 409
+ : state.status === 'banned'
+ ? 403
+ : 200
return NextResponse.json(state, { status })
} catch (error) {
return serverError(deps, 'POST', auth.userId, error)
diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts
index 9ed91fd0a..58863c674 100644
--- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts
+++ b/web/src/llm-api/__tests__/fireworks-deployment.test.ts
@@ -3,7 +3,7 @@ import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test'
import {
createFireworksRequestWithFallback,
DEPLOYMENT_COOLDOWN_MS,
- FireworksError,
+ isDeploymentHours,
isDeploymentCoolingDown,
markDeploymentScalingUp,
resetDeploymentCooldown,
@@ -13,6 +13,11 @@ import type { Logger } from '@codebuff/common/types/contracts/logger'
const STANDARD_MODEL_ID = 'accounts/fireworks/models/glm-5p1'
const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/mjb4i7ea'
+const IN_DEPLOYMENT_HOURS = new Date('2026-04-17T16:00:00Z') // Friday, 12pm ET / 9am PT
+const BEFORE_DEPLOYMENT_HOURS = new Date('2026-04-17T12:59:00Z') // Friday, 8:59am ET
+const AFTER_DEPLOYMENT_HOURS = new Date('2026-04-18T00:00:00Z') // Friday, 5pm PT
+const WEEKDAY_AFTER_DEPLOYMENT_HOURS = new Date('2026-04-21T00:01:00Z') // Monday, 5:01pm PT
+const WEEKEND_DEPLOYMENT_HOURS = new Date('2026-04-18T16:00:00Z') // Saturday
function createMockLogger(): Logger {
return {
@@ -23,18 +28,20 @@ function createMockLogger(): Logger {
}
}
-// Helper: create a Date at a specific ET hour using a known EDT date (June 2025, UTC-4)
-function dateAtEtHour(hour: number): Date {
- // June 15, 2025 is EDT (UTC-4), so ET hour H = UTC hour H+4
- const utcHour = hour + 4
- if (utcHour < 24) {
- return new Date(`2025-06-15T${String(utcHour).padStart(2, '0')}:30:00Z`)
- }
- // Wraps to next day
- return new Date(`2025-06-16T${String(utcHour - 24).padStart(2, '0')}:30:00Z`)
-}
-
describe('Fireworks deployment routing', () => {
+ describe('deployment hours', () => {
+ it('is active from 9am ET until before 5pm PT on weekdays', () => {
+ expect(isDeploymentHours(BEFORE_DEPLOYMENT_HOURS)).toBe(false)
+ expect(isDeploymentHours(IN_DEPLOYMENT_HOURS)).toBe(true)
+ expect(isDeploymentHours(AFTER_DEPLOYMENT_HOURS)).toBe(false)
+ expect(isDeploymentHours(WEEKDAY_AFTER_DEPLOYMENT_HOURS)).toBe(false)
+ })
+
+ it('is inactive on weekends', () => {
+ expect(isDeploymentHours(WEEKEND_DEPLOYMENT_HOURS)).toBe(false)
+ })
+ })
+
describe('deployment cooldown', () => {
beforeEach(() => {
resetDeploymentCooldown()
@@ -82,28 +89,6 @@ describe('Fireworks deployment routing', () => {
messages: [{ role: 'user' as const, content: 'test' }],
}
- function spyDeploymentHours(inHours: boolean) {
- // Control isDeploymentHours by mocking Date.prototype.toLocaleString
- // When called with the ET timezone options, return an hour inside or outside the window
- const original = Date.prototype.toLocaleString
- const spy = {
- restore: () => {
- Date.prototype.toLocaleString = original
- },
- }
- Date.prototype.toLocaleString = function (
- this: Date,
- ...args: Parameters
- ) {
- const options = args[1] as Intl.DateTimeFormatOptions | undefined
- if (options?.timeZone === 'America/New_York' && options?.hour === 'numeric') {
- return inHours ? '14' : '3'
- }
- return original.apply(this, args)
- }
- return spy
- }
-
it('uses standard API when custom deployment is disabled', async () => {
const fetchCalls: string[] = []
@@ -128,7 +113,6 @@ describe('Fireworks deployment routing', () => {
})
it('tries custom deployment during deployment hours', async () => {
- const spy = spyDeploymentHours(true)
const fetchCalls: string[] = []
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
@@ -137,160 +121,115 @@ describe('Fireworks deployment routing', () => {
return new Response(JSON.stringify({ ok: true }), { status: 200 })
}) as unknown as typeof globalThis.fetch
- try {
- const response = await createFireworksRequestWithFallback({
- body: minimalBody as never,
- originalModel: 'z-ai/glm-5.1',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- expect(response.status).toBe(200)
- expect(fetchCalls).toHaveLength(1)
- expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
- } finally {
- spy.restore()
- }
+ const response = await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: IN_DEPLOYMENT_HOURS,
+ })
+
+ expect(response.status).toBe(200)
+ expect(fetchCalls).toHaveLength(1)
+ expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
})
- it('falls back to standard API on 503 DEPLOYMENT_SCALING_UP', async () => {
- const spy = spyDeploymentHours(true)
+ it('returns deployment 503 on DEPLOYMENT_SCALING_UP without serverless fallback', async () => {
const fetchCalls: string[] = []
- let callCount = 0
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
const body = JSON.parse(init?.body as string)
fetchCalls.push(body.model)
- callCount++
-
- if (callCount === 1) {
- return new Response(
- JSON.stringify({
- error: {
- message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.',
- code: 'DEPLOYMENT_SCALING_UP',
- type: 'error',
- },
- }),
- { status: 503, statusText: 'Service Unavailable' },
- )
- }
-
- return new Response(JSON.stringify({ ok: true }), { status: 200 })
+ return new Response(
+ JSON.stringify({
+ error: {
+ message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.',
+ code: 'DEPLOYMENT_SCALING_UP',
+ type: 'error',
+ },
+ }),
+ { status: 503, statusText: 'Service Unavailable' },
+ )
}) as unknown as typeof globalThis.fetch
- try {
- const response = await createFireworksRequestWithFallback({
- body: minimalBody as never,
- originalModel: 'z-ai/glm-5.1',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- expect(response.status).toBe(200)
- expect(fetchCalls).toHaveLength(2)
- expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
- expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
- // Verify cooldown was activated
- expect(isDeploymentCoolingDown()).toBe(true)
- } finally {
- spy.restore()
- }
+ const response = await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: IN_DEPLOYMENT_HOURS,
+ })
+
+ expect(response.status).toBe(503)
+ expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID])
+ expect(isDeploymentCoolingDown()).toBe(true)
})
- it('falls back to standard API on non-scaling 503 from deployment', async () => {
- const spy = spyDeploymentHours(true)
+ it('returns non-scaling deployment 503 without serverless fallback', async () => {
const fetchCalls: string[] = []
- let callCount = 0
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
const body = JSON.parse(init?.body as string)
fetchCalls.push(body.model)
- callCount++
-
- if (callCount === 1) {
- return new Response(
- JSON.stringify({
- error: {
- message: 'Service temporarily unavailable',
- code: 'SERVICE_UNAVAILABLE',
- type: 'error',
- },
- }),
- { status: 503, statusText: 'Service Unavailable' },
- )
- }
-
- return new Response(JSON.stringify({ ok: true }), { status: 200 })
+ return new Response(
+ JSON.stringify({
+ error: {
+ message: 'Service temporarily unavailable',
+ code: 'SERVICE_UNAVAILABLE',
+ type: 'error',
+ },
+ }),
+ { status: 503, statusText: 'Service Unavailable' },
+ )
}) as unknown as typeof globalThis.fetch
- try {
- const response = await createFireworksRequestWithFallback({
- body: minimalBody as never,
- originalModel: 'z-ai/glm-5.1',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- expect(response.status).toBe(200)
- expect(fetchCalls).toHaveLength(2)
- expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
- expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
- // Non-scaling 503 should NOT activate the cooldown
- expect(isDeploymentCoolingDown()).toBe(false)
- } finally {
- spy.restore()
- }
+ const response = await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: IN_DEPLOYMENT_HOURS,
+ })
+
+ expect(response.status).toBe(503)
+ expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID])
+ expect(isDeploymentCoolingDown()).toBe(false)
})
- it('falls back to standard API on 500 Internal Error from deployment', async () => {
- const spy = spyDeploymentHours(true)
+ it('returns 500 Internal Error from deployment without serverless fallback', async () => {
const fetchCalls: string[] = []
- let callCount = 0
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
const body = JSON.parse(init?.body as string)
fetchCalls.push(body.model)
- callCount++
-
- if (callCount === 1) {
- return new Response(
- JSON.stringify({ error: 'Internal error' }),
- { status: 500, statusText: 'Internal Server Error' },
- )
- }
-
- return new Response(JSON.stringify({ ok: true }), { status: 200 })
+ return new Response(
+ JSON.stringify({ error: 'Internal error' }),
+ { status: 500, statusText: 'Internal Server Error' },
+ )
}) as unknown as typeof globalThis.fetch
- try {
- const response = await createFireworksRequestWithFallback({
- body: minimalBody as never,
- originalModel: 'z-ai/glm-5.1',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- expect(response.status).toBe(200)
- expect(fetchCalls).toHaveLength(2)
- expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
- expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
- expect(isDeploymentCoolingDown()).toBe(false)
- } finally {
- spy.restore()
- }
+ const response = await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: IN_DEPLOYMENT_HOURS,
+ })
+
+ expect(response.status).toBe(500)
+ expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID])
+ expect(isDeploymentCoolingDown()).toBe(false)
})
- it('skips deployment during cooldown and goes straight to standard API', async () => {
- const spy = spyDeploymentHours(true)
+ it('returns cooldown error without serverless fallback', async () => {
markDeploymentScalingUp()
const fetchCalls: string[] = []
@@ -300,26 +239,21 @@ describe('Fireworks deployment routing', () => {
return new Response(JSON.stringify({ ok: true }), { status: 200 })
}) as unknown as typeof globalThis.fetch
- try {
- const response = await createFireworksRequestWithFallback({
- body: minimalBody as never,
- originalModel: 'z-ai/glm-5.1',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- expect(response.status).toBe(200)
- expect(fetchCalls).toHaveLength(1)
- expect(fetchCalls[0]).toBe(STANDARD_MODEL_ID)
- } finally {
- spy.restore()
- }
+ const response = await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: IN_DEPLOYMENT_HOURS,
+ })
+
+ expect(response.status).toBe(503)
+ expect(fetchCalls).toHaveLength(0)
})
it('uses standard API for models without a custom deployment', async () => {
- const spy = spyDeploymentHours(true)
const fetchCalls: string[] = []
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
@@ -328,27 +262,43 @@ describe('Fireworks deployment routing', () => {
return new Response(JSON.stringify({ ok: true }), { status: 200 })
}) as unknown as typeof globalThis.fetch
- try {
- const response = await createFireworksRequestWithFallback({
- body: { ...minimalBody, model: 'some-other/model' } as never,
- originalModel: 'some-other/model',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- expect(response.status).toBe(200)
- expect(fetchCalls).toHaveLength(1)
- // Model without mapping falls through to the original model
- expect(fetchCalls[0]).toBe('some-other/model')
- } finally {
- spy.restore()
- }
+ const response = await createFireworksRequestWithFallback({
+ body: { ...minimalBody, model: 'some-other/model' } as never,
+ originalModel: 'some-other/model',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: BEFORE_DEPLOYMENT_HOURS,
+ })
+
+ expect(response.status).toBe(200)
+ expect(fetchCalls).toHaveLength(1)
+ // Model without mapping falls through to the original model
+ expect(fetchCalls[0]).toBe('some-other/model')
+ })
+
+ it('returns an availability error for deployment models outside hours', async () => {
+ const mockFetch = mock(async () => {
+ throw new Error('should not fetch outside deployment hours')
+ }) as unknown as typeof globalThis.fetch
+
+ const response = await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: BEFORE_DEPLOYMENT_HOURS,
+ })
+
+ expect(response.status).toBe(503)
+ const body = await response.json()
+ expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS')
})
it('returns non-5xx responses from deployment without fallback (e.g. 429)', async () => {
- const spy = spyDeploymentHours(true)
const fetchCalls: string[] = []
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
@@ -360,23 +310,20 @@ describe('Fireworks deployment routing', () => {
)
}) as unknown as typeof globalThis.fetch
- try {
- const response = await createFireworksRequestWithFallback({
- body: minimalBody as never,
- originalModel: 'z-ai/glm-5.1',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- // Non-5xx errors from deployment are returned as-is (caller handles them)
- expect(response.status).toBe(429)
- expect(fetchCalls).toHaveLength(1)
- expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
- } finally {
- spy.restore()
- }
+ const response = await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: IN_DEPLOYMENT_HOURS,
+ })
+
+ // Non-5xx errors from deployment are returned as-is (caller handles them)
+ expect(response.status).toBe(429)
+ expect(fetchCalls).toHaveLength(1)
+ expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
})
it('transforms reasoning to reasoning_effort (defaults to medium)', async () => {
@@ -535,41 +482,31 @@ describe('Fireworks deployment routing', () => {
expect(fetchedBodies[0].reasoning_effort).toBe('low')
})
- it('logs when trying deployment and when falling back on 5xx', async () => {
- const spy = spyDeploymentHours(true)
- let callCount = 0
-
+ it('logs when trying deployment and when deployment returns 5xx', async () => {
const mockFetch = mock(async () => {
- callCount++
- if (callCount === 1) {
- return new Response(
- JSON.stringify({
- error: {
- message: 'Scaling up',
- code: 'DEPLOYMENT_SCALING_UP',
- type: 'error',
- },
- }),
- { status: 503, statusText: 'Service Unavailable' },
- )
- }
- return new Response(JSON.stringify({ ok: true }), { status: 200 })
+ return new Response(
+ JSON.stringify({
+ error: {
+ message: 'Scaling up',
+ code: 'DEPLOYMENT_SCALING_UP',
+ type: 'error',
+ },
+ }),
+ { status: 503, statusText: 'Service Unavailable' },
+ )
}) as unknown as typeof globalThis.fetch
- try {
- await createFireworksRequestWithFallback({
- body: minimalBody as never,
- originalModel: 'z-ai/glm-5.1',
- fetch: mockFetch,
- logger,
- useCustomDeployment: true,
- sessionId: 'test-user-id',
- })
-
- expect(logger.info).toHaveBeenCalledTimes(2)
- } finally {
- spy.restore()
- }
+ await createFireworksRequestWithFallback({
+ body: minimalBody as never,
+ originalModel: 'z-ai/glm-5.1',
+ fetch: mockFetch,
+ logger,
+ useCustomDeployment: true,
+ sessionId: 'test-user-id',
+ now: IN_DEPLOYMENT_HOURS,
+ })
+
+ expect(logger.info).toHaveBeenCalledTimes(2)
})
})
})
diff --git a/web/src/llm-api/fireworks-config.ts b/web/src/llm-api/fireworks-config.ts
index fb6d59580..566728250 100644
--- a/web/src/llm-api/fireworks-config.ts
+++ b/web/src/llm-api/fireworks-config.ts
@@ -10,7 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217'
export const FIREWORKS_DEPLOYMENT_MAP: Record = {
// 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9',
- // 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2',
- // 'minimax/minimax-m2.7': 'accounts/james-65d217/deployments/nrdudqxd',
'z-ai/glm-5.1': 'accounts/james-65d217/deployments/mjb4i7ea',
+ // 'minimax/minimax-m2.7': 'accounts/james-65d217/deployments/nrdudqxd',
}
diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts
index 6e304638d..028ad4222 100644
--- a/web/src/llm-api/fireworks.ts
+++ b/web/src/llm-api/fireworks.ts
@@ -1,5 +1,9 @@
import { Agent } from 'undici'
+import {
+ FREEBUFF_DEPLOYMENT_HOURS_LABEL,
+ isFreebuffDeploymentHours,
+} from '@codebuff/common/constants/freebuff-models'
import { PROFIT_MARGIN } from '@codebuff/common/constants/limits'
import { getErrorObject } from '@codebuff/common/util/error'
import { env } from '@codebuff/internal/env'
@@ -32,15 +36,14 @@ const FIREWORKS_MODEL_MAP: Record = {
'minimax/minimax-m2.5': 'accounts/fireworks/models/minimax-m2p5',
'minimax/minimax-m2.7': 'accounts/fireworks/models/minimax-m2p7',
'z-ai/glm-5.1': 'accounts/fireworks/models/glm-5p1',
- 'moonshotai/kimi-k2.5': 'accounts/fireworks/models/kimi-k2p5',
}
/** Flag to enable custom Fireworks deployments (set to false to use global API only) */
const FIREWORKS_USE_CUSTOM_DEPLOYMENT = true
-/** Check if current time is within deployment hours (always enabled) */
-export function isDeploymentHours(_now: Date = new Date()): boolean {
- return true
+/** Check if current time is within deployment hours: Mon-Fri, 9am ET to 5pm PT. */
+export function isDeploymentHours(now: Date = new Date()): boolean {
+ return isFreebuffDeploymentHours(now)
}
/**
@@ -93,7 +96,7 @@ function createFireworksRequest(params: {
// Transform OpenRouter-style `reasoning` object into Fireworks' `reasoning_effort`.
// Unlike OpenAI, Fireworks supports reasoning_effort together with function tools
- // (e.g. GLM-4.5/5.1 and Kimi K2 are designed for interleaved reasoning + tool use).
+ // (e.g. GLM-4.5/5.1 are designed for interleaved reasoning + tool use).
if (fireworksBody.reasoning && typeof fireworksBody.reasoning === 'object') {
const reasoning = fireworksBody.reasoning as {
enabled?: boolean
@@ -165,15 +168,10 @@ const FIREWORKS_PRICING_MAP: Record = {
cachedInputCostPerToken: 0.26 / 1_000_000,
outputCostPerToken: 4.40 / 1_000_000,
},
- 'moonshotai/kimi-k2.5': {
- inputCostPerToken: 0.60 / 1_000_000,
- cachedInputCostPerToken: 0.10 / 1_000_000,
- outputCostPerToken: 3.00 / 1_000_000,
- },
}
function getFireworksPricing(model: string): FireworksPricing {
- return FIREWORKS_PRICING_MAP[model] ?? FIREWORKS_MODEL_MAP['z-ai/glm-5.1']
+ return FIREWORKS_PRICING_MAP[model] ?? FIREWORKS_PRICING_MAP['z-ai/glm-5.1']
}
function extractUsageAndCost(usage: Record | undefined | null, model: string): UsageData {
@@ -708,9 +706,10 @@ async function parseFireworksError(response: Response): Promise
}
/**
- * Tries the custom Fireworks deployment during business hours (10am–8pm ET),
- * falling back to the standard API if the deployment returns 503 DEPLOYMENT_SCALING_UP.
- * Outside deployment hours or during cooldown, goes straight to the standard API.
+ * Uses custom Fireworks deployments only during deployment hours. Deployment
+ * mapped models never fall back to the serverless API outside hours, during
+ * cooldown, or after deployment 5xxs; those states surface as provider errors
+ * so freebuff can offer MiniMax as the always-on option.
*/
export async function createFireworksRequestWithFallback(params: {
body: ChatCompletionRequestBody
@@ -719,17 +718,41 @@ export async function createFireworksRequestWithFallback(params: {
logger: Logger
useCustomDeployment?: boolean
sessionId: string
+ now?: Date
}): Promise {
const { body, originalModel, fetch, logger, sessionId } = params
+ const now = params.now ?? new Date()
const useCustomDeployment = params.useCustomDeployment ?? FIREWORKS_USE_CUSTOM_DEPLOYMENT
const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel]
- const shouldTryDeployment =
- useCustomDeployment &&
- deploymentModelId &&
- isDeploymentHours() &&
- !isDeploymentCoolingDown()
+ const hasDeployment = useCustomDeployment && Boolean(deploymentModelId)
+
+ if (hasDeployment && !isDeploymentHours(now)) {
+ return new Response(
+ JSON.stringify({
+ error: {
+ message: `${originalModel} is only available during ${FREEBUFF_DEPLOYMENT_HOURS_LABEL}. Use minimax/minimax-m2.7 outside those hours.`,
+ code: 'DEPLOYMENT_OUTSIDE_HOURS',
+ type: 'availability_error',
+ },
+ }),
+ { status: 503, statusText: 'Service Unavailable' },
+ )
+ }
- if (shouldTryDeployment) {
+ if (hasDeployment && isDeploymentCoolingDown()) {
+ return new Response(
+ JSON.stringify({
+ error: {
+ message: `${originalModel} deployment is temporarily unavailable. Use minimax/minimax-m2.7 while it recovers.`,
+ code: 'DEPLOYMENT_COOLDOWN',
+ type: 'availability_error',
+ },
+ }),
+ { status: 503, statusText: 'Service Unavailable' },
+ )
+ }
+
+ if (hasDeployment && deploymentModelId) {
logger.info(
{ model: originalModel, deploymentModel: deploymentModelId },
'Trying Fireworks custom deployment',
@@ -746,15 +769,18 @@ export async function createFireworksRequestWithFallback(params: {
const errorText = await response.text()
logger.info(
{ model: originalModel, status: response.status, errorText: errorText.slice(0, 200) },
- 'Fireworks custom deployment returned 5xx, falling back to standard API',
+ 'Fireworks custom deployment returned 5xx',
)
if (errorText.includes('DEPLOYMENT_SCALING_UP')) {
markDeploymentScalingUp()
}
- // Fall through to standard API request below
- } else {
- return response
+ return new Response(errorText, {
+ status: response.status,
+ statusText: response.statusText,
+ headers: response.headers,
+ })
}
+ return response
}
return createFireworksRequest({ body, originalModel, fetch, sessionId })
diff --git a/web/src/server/free-session/__tests__/config.test.ts b/web/src/server/free-session/__tests__/config.test.ts
new file mode 100644
index 000000000..93f5fdcf0
--- /dev/null
+++ b/web/src/server/free-session/__tests__/config.test.ts
@@ -0,0 +1,13 @@
+import { describe, expect, test } from 'bun:test'
+
+import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models'
+
+import { getInstantAdmitCapacity } from '../config'
+
+describe('free session config', () => {
+ test('every selectable freebuff model has instant-admit capacity', () => {
+ for (const model of FREEBUFF_MODELS) {
+ expect(getInstantAdmitCapacity(model.id)).toBeGreaterThan(0)
+ }
+ })
+})
diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts
index a824f6d22..a90bc800d 100644
--- a/web/src/server/free-session/__tests__/public-api.test.ts
+++ b/web/src/server/free-session/__tests__/public-api.test.ts
@@ -13,7 +13,7 @@ import type { InternalSessionRow } from '../types'
const SESSION_LEN = 60 * 60 * 1000
const GRACE_MS = 30 * 60 * 1000
-const DEFAULT_MODEL = 'z-ai/glm-5.1'
+const DEFAULT_MODEL = 'minimax/minimax-m2.7'
function makeDeps(overrides: Partial = {}): SessionDeps & {
rows: Map
@@ -177,19 +177,34 @@ describe('requestSession', () => {
expect(state.instanceId).toBe('inst-1')
})
+ test('deployment-hours-only model is unavailable outside deployment hours', async () => {
+ const state = await requestSession({
+ userId: 'u1',
+ model: 'z-ai/glm-5.1',
+ deps,
+ })
+ expect(state).toEqual({
+ status: 'model_unavailable',
+ requestedModel: 'z-ai/glm-5.1',
+ availableHours: '9am ET-5pm PT',
+ })
+ expect(deps.rows.size).toBe(0)
+ })
+
test('queued response includes a per-model depth snapshot for the selector', async () => {
- // Seed 2 users in glm + 1 in minimax so the returned map captures both.
+ deps._tick(new Date('2026-04-17T16:00:00Z'))
+ // Seed 2 users in MiniMax + 1 in GLM so the returned map captures both.
await requestSession({ userId: 'u1', model: DEFAULT_MODEL, deps })
deps._tick(new Date(deps._now().getTime() + 1000))
await requestSession({ userId: 'u2', model: DEFAULT_MODEL, deps })
deps._tick(new Date(deps._now().getTime() + 1000))
- await requestSession({ userId: 'u3', model: 'minimax/minimax-m2.7', deps })
+ await requestSession({ userId: 'u3', model: 'z-ai/glm-5.1', deps })
const state = await getSessionState({ userId: 'u1', deps })
if (state.status !== 'queued') throw new Error('unreachable')
expect(state.queueDepthByModel).toEqual({
[DEFAULT_MODEL]: 2,
- 'minimax/minimax-m2.7': 1,
+ 'z-ai/glm-5.1': 1,
})
})
@@ -264,11 +279,12 @@ describe('requestSession', () => {
})
test('instant-admit: per-model capacities are independent', async () => {
- // GLM saturated at 1 active, MiniMax still has room.
+ // MiniMax saturated at 1 active, GLM still has room.
const admitDeps = makeDeps({
getInstantAdmitCapacity: (model) =>
model === DEFAULT_MODEL ? 1 : 10,
})
+    admitDeps._tick(new Date('2026-04-17T16:00:00Z')) // move clock inside GLM deployment hours so the GLM request below is admissible
await requestSession({ userId: 'u1', model: DEFAULT_MODEL, deps: admitDeps })
const s2 = await requestSession({
userId: 'u2',
@@ -277,7 +293,7 @@ describe('requestSession', () => {
})
const s3 = await requestSession({
userId: 'u3',
- model: 'minimax/minimax-m2.7',
+ model: 'z-ai/glm-5.1',
deps: admitDeps,
})
expect(s2.status).toBe('queued')
diff --git a/web/src/server/free-session/admission.ts b/web/src/server/free-session/admission.ts
index 3f3c051d2..9f0b74c9f 100644
--- a/web/src/server/free-session/admission.ts
+++ b/web/src/server/free-session/admission.ts
@@ -1,4 +1,7 @@
-import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models'
+import {
+ FREEBUFF_MODELS,
+ isFreebuffModelAvailable,
+} from '@codebuff/common/constants/freebuff-models'
import {
ADMISSION_TICK_MS,
@@ -111,7 +114,10 @@ export async function runAdmissionTick(
// advisory locks and a single update each.
const perModel = await Promise.all(
models.map(async (model) => {
- const health = fleet[model] ?? 'healthy'
+ const isRegisteredModel = FREEBUFF_MODELS.some((m) => m.id === model)
+ const health = !isRegisteredModel || isFreebuffModelAvailable(model, now)
+ ? fleet[model] ?? 'healthy'
+ : 'unhealthy'
const { admitted, skipped } = await deps.admitFromQueue({
model,
sessionLengthMs: deps.sessionLengthMs,
diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts
index 450540443..7ea85f2e4 100644
--- a/web/src/server/free-session/public-api.ts
+++ b/web/src/server/free-session/public-api.ts
@@ -1,4 +1,6 @@
import {
+ FREEBUFF_DEPLOYMENT_HOURS_LABEL,
+ isFreebuffModelAvailable,
isFreebuffModelId as isSelectableFreebuffModel,
resolveFreebuffModel,
} from '@codebuff/common/constants/freebuff-models'
@@ -122,6 +124,11 @@ export type RequestSessionResult =
currentModel: string
requestedModel: string
}
+ | {
+ status: 'model_unavailable'
+ requestedModel: string
+ availableHours: string
+ }
/**
* Client calls this on CLI startup with the model they want to use.
@@ -152,6 +159,7 @@ export async function requestSession(params: {
}): Promise {
const deps = params.deps ?? defaultDeps
const model = resolveFreebuffModel(params.model)
+ const now = nowOf(deps)
if (params.userBanned) {
return { status: 'banned' }
}
@@ -161,13 +169,20 @@ export async function requestSession(params: {
) {
return { status: 'disabled' }
}
+ if (!isFreebuffModelAvailable(model, now)) {
+ return {
+ status: 'model_unavailable',
+ requestedModel: model,
+ availableHours: FREEBUFF_DEPLOYMENT_HOURS_LABEL,
+ }
+ }
let row: InternalSessionRow
try {
row = await deps.joinOrTakeOver({
userId: params.userId,
model,
- now: nowOf(deps),
+ now,
})
} catch (err) {
if (err instanceof FreeSessionModelLockedError) {
@@ -199,7 +214,7 @@ export async function requestSession(params: {
userId: params.userId,
model,
sessionLengthMs: deps.sessionLengthMs,
- now: nowOf(deps),
+ now,
})
if (promoted) row = promoted
}