diff --git a/agents/__tests__/editor.test.ts b/agents/__tests__/editor.test.ts index 030857c8d..36d6b75c5 100644 --- a/agents/__tests__/editor.test.ts +++ b/agents/__tests__/editor.test.ts @@ -67,6 +67,11 @@ describe('editor agent', () => { expect(glmEditor.model).toBe('z-ai/glm-5.1') }) + test('creates minimax editor', () => { + const minimaxEditor = createCodeEditor({ model: 'minimax' }) + expect(minimaxEditor.model).toBe('minimax/minimax-m2.7') + }) + test('gpt-5 editor does not include think tags in instructions', () => { const gpt5Editor = createCodeEditor({ model: 'gpt-5' }) expect(gpt5Editor.instructionsPrompt).not.toContain('') @@ -79,6 +84,12 @@ describe('editor agent', () => { expect(glmEditor.instructionsPrompt).not.toContain('') }) + test('minimax editor does not include think tags in instructions', () => { + const minimaxEditor = createCodeEditor({ model: 'minimax' }) + expect(minimaxEditor.instructionsPrompt).not.toContain('') + expect(minimaxEditor.instructionsPrompt).not.toContain('') + }) + test('opus editor includes think tags in instructions', () => { const opusEditor = createCodeEditor({ model: 'opus' }) expect(opusEditor.instructionsPrompt).toContain('') diff --git a/agents/editor/editor.ts b/agents/editor/editor.ts index 3d208aa13..c98544d0f 100644 --- a/agents/editor/editor.ts +++ b/agents/editor/editor.ts @@ -4,7 +4,7 @@ import { publisher } from '../constants' import type { AgentDefinition } from '../types/agent-definition' export const createCodeEditor = (options: { - model: 'gpt-5' | 'opus' | 'glm' + model: 'gpt-5' | 'opus' | 'glm' | 'minimax' }): Omit => { const { model } = options return { @@ -12,6 +12,8 @@ export const createCodeEditor = (options: { model: options.model === 'gpt-5' ? 'openai/gpt-5.1' + : options.model === 'minimax' + ? 'minimax/minimax-m2.7' : options.model === 'glm' ? 
'z-ai/glm-5.1' : 'anthropic/claude-opus-4.7', @@ -65,7 +67,7 @@ OR for new files or major rewrites: } -${model === 'gpt-5' || model === 'glm' +${model === 'gpt-5' || model === 'glm' || model === 'minimax' ? '' : `Before you start writing your implementation, you should use tags to think about the best way to implement the changes. diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts index b28a77c31..3608f3631 100644 --- a/agents/types/agent-definition.ts +++ b/agents/types/agent-definition.ts @@ -423,8 +423,6 @@ export type ModelName = // Other open source models | 'moonshotai/kimi-k2' | 'moonshotai/kimi-k2:nitro' - | 'moonshotai/kimi-k2.5' - | 'moonshotai/kimi-k2.5:nitro' | 'z-ai/glm-5' | 'z-ai/glm-5.1' | 'z-ai/glm-4.6' diff --git a/cli/src/components/freebuff-model-selector.tsx b/cli/src/components/freebuff-model-selector.tsx index a33d89540..5abaac272 100644 --- a/cli/src/components/freebuff-model-selector.tsx +++ b/cli/src/components/freebuff-model-selector.tsx @@ -3,9 +3,16 @@ import { useKeyboard } from '@opentui/react' import React, { useCallback, useEffect, useMemo, useState } from 'react' import { Button } from './button' -import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models' +import { + DEFAULT_FREEBUFF_MODEL_ID, + FREEBUFF_DEPLOYMENT_HOURS_LABEL, + FREEBUFF_GLM_MODEL_ID, + FREEBUFF_MODELS, + isFreebuffModelAvailable, +} from '@codebuff/common/constants/freebuff-models' import { joinFreebuffQueue } from '../hooks/use-freebuff-session' +import { useNow } from '../hooks/use-now' import { useFreebuffModelStore } from '../state/freebuff-model-store' import { useFreebuffSessionStore } from '../state/freebuff-session-store' import { useTerminalDimensions } from '../hooks/use-terminal-dimensions' @@ -13,6 +20,11 @@ import { useTheme } from '../hooks/use-theme' import type { KeyEvent } from '@opentui/core' +const FREEBUFF_MODEL_SELECTOR_MODELS = [ + ...FREEBUFF_MODELS.filter((model) => model.id === 
FREEBUFF_GLM_MODEL_ID), + ...FREEBUFF_MODELS.filter((model) => model.id !== FREEBUFF_GLM_MODEL_ID), +] + /** * Dual-purpose model picker: * - Pre-chat landing (session 'none'): user hasn't joined any queue. Picking @@ -33,7 +45,9 @@ export const FreebuffModelSelector: React.FC = () => { const theme = useTheme() const { terminalWidth } = useTerminalDimensions() const selectedModel = useFreebuffModelStore((s) => s.selectedModel) + const setSelectedModel = useFreebuffModelStore((s) => s.setSelectedModel) const session = useFreebuffSessionStore((s) => s.session) + const now = useNow(60_000) const [pending, setPending] = useState(null) const [hoveredId, setHoveredId] = useState(null) // Keyboard cursor — separate from the actually-selected model so that @@ -45,6 +59,15 @@ export const FreebuffModelSelector: React.FC = () => { setFocusedId(selectedModel) }, [selectedModel]) + useEffect(() => { + if ( + (session?.status === 'none' || !session) && + !isFreebuffModelAvailable(selectedModel, new Date(now)) + ) { + setSelectedModel(DEFAULT_FREEBUFF_MODEL_ID) + } + }, [now, selectedModel, session, setSelectedModel]) + // Landing ('none'): depths come from the server snapshot, no "self" to // subtract. In-queue ('queued'): for the user's queue, "ahead" is // `position - 1` (themselves don't count); for every other queue, switching @@ -85,18 +108,22 @@ export const FreebuffModelSelector: React.FC = () => { ) // Decide row vs column layout based on whether both buttons actually fit - // side-by-side. Each button's inner text is "● {displayName} · {tagline} {hint}", + // side-by-side. Each button's inner text is + // "● {displayName} · {tagline} · {hours} {hint}", // plus 2 cols of border and 2 cols of padding. Buttons are separated by a // gap of 2. If the total exceeds the terminal width, stack vertically. 
const stackVertically = useMemo(() => { const BUTTON_CHROME = 4 // 2 border + 2 padding const GAP = 2 - const total = FREEBUFF_MODELS.reduce((sum, model, idx) => { + const total = FREEBUFF_MODEL_SELECTOR_MODELS.reduce((sum, model, idx) => { const inner = 2 /* indicator + space */ + model.displayName.length + 3 /* " · " */ + model.tagline.length + + (model.availability === 'deployment_hours' + ? 3 + FREEBUFF_DEPLOYMENT_HOURS_LABEL.length + : 0) + 2 /* " " */ + hintWidth return sum + inner + BUTTON_CHROME + (idx > 0 ? GAP : 0) @@ -115,10 +142,11 @@ export const FreebuffModelSelector: React.FC = () => { (modelId: string) => { if (pending) return if (modelId === committedModelId) return + if (!isFreebuffModelAvailable(modelId, new Date(now))) return setPending(modelId) joinFreebuffQueue(modelId).finally(() => setPending(null)) }, - [pending, committedModelId], + [pending, committedModelId, now], ) // Tab / Shift+Tab and arrow keys move the focus highlight only; Enter or @@ -136,25 +164,30 @@ export const FreebuffModelSelector: React.FC = () => { const isCommit = name === 'return' || name === 'enter' || name === 'space' if (!isForward && !isBackward && !isCommit) return if (isCommit) { - if (focusedId !== committedModelId) { + if ( + focusedId !== committedModelId && + isFreebuffModelAvailable(focusedId, new Date(now)) + ) { key.preventDefault?.() pick(focusedId) } return } - const currentIdx = FREEBUFF_MODELS.findIndex((m) => m.id === focusedId) + const currentIdx = FREEBUFF_MODEL_SELECTOR_MODELS.findIndex( + (m) => m.id === focusedId, + ) if (currentIdx === -1) return - const len = FREEBUFF_MODELS.length + const len = FREEBUFF_MODEL_SELECTOR_MODELS.length const nextIdx = isForward ? 
(currentIdx + 1) % len : (currentIdx - 1 + len) % len - const target = FREEBUFF_MODELS[nextIdx] + const target = FREEBUFF_MODEL_SELECTOR_MODELS[nextIdx] if (target) { key.preventDefault?.() setFocusedId(target.id) } }, - [pending, pick, focusedId, committedModelId], + [pending, pick, focusedId, committedModelId, now], ), ) @@ -173,7 +206,7 @@ export const FreebuffModelSelector: React.FC = () => { alignItems: 'flex-start', }} > - {FREEBUFF_MODELS.map((model) => { + {FREEBUFF_MODEL_SELECTOR_MODELS.map((model) => { // 'Selected' means the dot is filled and the label is bold. On the // landing screen ('none') this tracks the pre-focused pick; on the // queued screen it tracks the model the server has us on. Either @@ -181,15 +214,22 @@ export const FreebuffModelSelector: React.FC = () => { const isSelected = model.id === selectedModel const isHovered = hoveredId === model.id const isFocused = focusedId === model.id && !isSelected + const isAvailable = isFreebuffModelAvailable(model.id, new Date(now)) const indicator = isSelected ? '●' : '○' const indicatorColor = isSelected ? theme.primary : theme.muted - const labelColor = isSelected ? theme.foreground : theme.muted + const labelColor = isSelected && isAvailable ? theme.foreground : theme.muted // Clickable whenever picking would actually do something — i.e. // anything except re-picking the queue we're already in. - const interactable = !pending && model.id !== committedModelId + const interactable = !pending && isAvailable && model.id !== committedModelId const ahead = aheadByModel?.[model.id] const hint = - ahead === undefined ? '' : ahead === 0 ? 'No wait' : `${ahead} ahead` + !isAvailable + ? 'Closed' + : ahead === undefined + ? '' + : ahead === 0 + ? 'No wait' + : `${ahead} ahead` const borderColor = isSelected ? 
theme.primary @@ -202,7 +242,7 @@ export const FreebuffModelSelector: React.FC = () => { key={model.id} onClick={() => { setFocusedId(model.id) - pick(model.id) + if (isAvailable) pick(model.id) }} onMouseOver={() => interactable && setHoveredId(model.id)} onMouseOut={() => setHoveredId((curr) => (curr === model.id ? null : curr))} @@ -223,6 +263,9 @@ export const FreebuffModelSelector: React.FC = () => { {model.displayName} · {model.tagline} + {model.availability === 'deployment_hours' && ( + · {FREEBUFF_DEPLOYMENT_HOURS_LABEL} + )} {hint.padEnd(hintWidth)} diff --git a/cli/src/components/waiting-room-screen.tsx b/cli/src/components/waiting-room-screen.tsx index e67823f7a..251ca87c0 100644 --- a/cli/src/components/waiting-room-screen.tsx +++ b/cli/src/components/waiting-room-screen.tsx @@ -253,7 +253,7 @@ export const WaitingRoomScreen: React.FC = ({ ⚠ Account unavailable - This account can't use freebuff. If you think this is a + This account has been suspended and can't use freebuff. If you think this is a mistake, contact support@codebuff.com. Press Ctrl+C to exit. diff --git a/cli/src/hooks/use-freebuff-session.ts b/cli/src/hooks/use-freebuff-session.ts index 79deea1cf..f24fba7b3 100644 --- a/cli/src/hooks/use-freebuff-session.ts +++ b/cli/src/hooks/use-freebuff-session.ts @@ -1,4 +1,5 @@ import { env } from '@codebuff/common/env' +import { DEFAULT_FREEBUFF_MODEL_ID } from '@codebuff/common/constants/freebuff-models' import { useEffect } from 'react' import { @@ -75,14 +76,18 @@ async function callSession( return body } } - // 409 from POST means the user picked a different model than their active - // session is bound to. Surface as a non-throw `model_locked` so the UI can - // show a confirmation prompt (DELETE then re-POST to switch). 
+ // 409 from POST means the selected model cannot be joined right now, either + // because an active session is locked to another model or because the model + // is temporarily closed. Surface both model-switch conflicts and availability + // closures as non-throw states. if (resp.status === 409 && method === 'POST') { const body = (await resp.json().catch(() => null)) as | FreebuffSessionResponse | null - if (body && body.status === 'model_locked') { + if ( + body && + (body.status === 'model_locked' || body.status === 'model_unavailable') + ) { return body } } @@ -119,6 +124,7 @@ function nextDelayMs(next: FreebuffSessionResponse): number | null { case 'country_blocked': case 'banned': case 'model_locked': + case 'model_unavailable': return null } } @@ -398,6 +404,12 @@ export function useFreebuffSession(): UseFreebuffSessionResult { schedule(0) return } + if (next.status === 'model_unavailable') { + useFreebuffModelStore.getState().setSelectedModel(DEFAULT_FREEBUFF_MODEL_ID) + nextMethod = 'GET' + schedule(0) + return + } // Startup takeover: the initial probe GET saw we already hold a seat // (from a prior CLI instance). POST now to rotate our instance id so diff --git a/cli/src/state/freebuff-model-store.ts b/cli/src/state/freebuff-model-store.ts index 182a38831..1aa9f2db8 100644 --- a/cli/src/state/freebuff-model-store.ts +++ b/cli/src/state/freebuff-model-store.ts @@ -1,6 +1,6 @@ import { DEFAULT_FREEBUFF_MODEL_ID, - resolveFreebuffModel, + resolveAvailableFreebuffModel, } from '@codebuff/common/constants/freebuff-models' import { create } from 'zustand' @@ -24,11 +24,11 @@ interface FreebuffModelStore { } export const useFreebuffModelStore = create((set) => ({ - selectedModel: resolveFreebuffModel( + selectedModel: resolveAvailableFreebuffModel( loadFreebuffModelPreference() ?? 
DEFAULT_FREEBUFF_MODEL_ID, ), setSelectedModel: (model) => { - const resolved = resolveFreebuffModel(model) + const resolved = resolveAvailableFreebuffModel(model) saveFreebuffModelPreference(resolved) set({ selectedModel: resolved }) }, diff --git a/cli/src/utils/local-agent-registry.ts b/cli/src/utils/local-agent-registry.ts index 59206eb84..6106b3928 100644 --- a/cli/src/utils/local-agent-registry.ts +++ b/cli/src/utils/local-agent-registry.ts @@ -370,7 +370,7 @@ export const loadAgentDefinitions = (): AgentDefinition[] => { } // Override the model of free-mode agents to match the user's pick from the - // freebuff waiting room. Bundled definitions hardcode glm-5.1; we swap in + // freebuff waiting room. Bundled definitions hardcode a free model; we swap in // whatever the user chose so the chat-completions request body carries the // matching model and the server-side session gate doesn't reject it as a // model mismatch. diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts index e44c74cc6..308e12df6 100644 --- a/common/src/constants/free-agents.ts +++ b/common/src/constants/free-agents.ts @@ -26,7 +26,10 @@ export const FREEBUFF_ROOT_AGENT_IDS = ['base2-free'] as const */ export const FREE_MODE_AGENT_MODELS: Record> = { // Root orchestrator - 'base2-free': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']), + 'base2-free': new Set([ + 'minimax/minimax-m2.7', + 'z-ai/glm-5.1', + ]), // File exploration agents 'file-picker': new Set(['google/gemini-2.5-flash-lite']), @@ -41,10 +44,16 @@ export const FREE_MODE_AGENT_MODELS: Record> = { 'basher': new Set(['google/gemini-3.1-flash-lite-preview']), // Editor for free mode - 'editor-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']), + 'editor-lite': new Set([ + 'minimax/minimax-m2.7', + 'z-ai/glm-5.1', + ]), // Code reviewer for free mode - 'code-reviewer-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']), + 'code-reviewer-lite': new Set([ + 'minimax/minimax-m2.7', + 
'z-ai/glm-5.1', + ]), } /** diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts index d71ebd619..f1019c6fb 100644 --- a/common/src/constants/freebuff-models.ts +++ b/common/src/constants/freebuff-models.ts @@ -13,18 +13,25 @@ export interface FreebuffModelOption { displayName: string /** One-line description shown next to the label. */ tagline: string + /** Availability policy for the selector and server-side admission. */ + availability: 'always' | 'deployment_hours' } +export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT' +export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1' + export const FREEBUFF_MODELS = [ - { - id: 'z-ai/glm-5.1', - displayName: 'GLM 5.1', - tagline: 'Smartest', - }, { id: 'minimax/minimax-m2.7', displayName: 'MiniMax M2.7', tagline: 'Fastest', + availability: 'always', + }, + { + id: FREEBUFF_GLM_MODEL_ID, + displayName: 'GLM 5.1', + tagline: 'Smartest', + availability: 'deployment_hours', }, ] as const satisfies readonly FreebuffModelOption[] @@ -51,3 +58,49 @@ export function getFreebuffModel(id: string): FreebuffModelOption { FREEBUFF_MODELS.find((m) => m.id === DEFAULT_FREEBUFF_MODEL_ID)! ) } + +function getZonedParts( + date: Date, + timeZone: string, +): { weekday: string; minutes: number } { + const parts = new Intl.DateTimeFormat('en-US', { + timeZone, + weekday: 'short', + hour: '2-digit', + minute: '2-digit', + hourCycle: 'h23', + }).formatToParts(date) + const value = (type: string) => parts.find((part) => part.type === type)?.value + const hour = Number(value('hour') ?? 0) + const minute = Number(value('minute') ?? 0) + return { + weekday: value('weekday') ?? 
'', + minutes: hour * 60 + minute, + } +} + +export function isFreebuffDeploymentHours(now: Date = new Date()): boolean { + const eastern = getZonedParts(now, 'America/New_York') + const pacific = getZonedParts(now, 'America/Los_Angeles') + if (eastern.weekday === 'Sat' || eastern.weekday === 'Sun') return false + return eastern.minutes >= 9 * 60 && pacific.minutes < 17 * 60 +} + +export function isFreebuffModelAvailable( + id: string, + now: Date = new Date(), +): boolean { + const model = FREEBUFF_MODELS.find((m) => m.id === id) + if (!model) return false + return model.availability === 'always' || isFreebuffDeploymentHours(now) +} + +export function resolveAvailableFreebuffModel( + id: string | null | undefined, + now: Date = new Date(), +): FreebuffModelId { + const resolved = resolveFreebuffModel(id) + return isFreebuffModelAvailable(resolved, now) + ? resolved + : DEFAULT_FREEBUFF_MODEL_ID +} diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts index b28a77c31..3608f3631 100644 --- a/common/src/templates/initial-agents-dir/types/agent-definition.ts +++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts @@ -423,8 +423,6 @@ export type ModelName = // Other open source models | 'moonshotai/kimi-k2' | 'moonshotai/kimi-k2:nitro' - | 'moonshotai/kimi-k2.5' - | 'moonshotai/kimi-k2.5:nitro' | 'z-ai/glm-5' | 'z-ai/glm-5.1' | 'z-ai/glm-4.6' diff --git a/common/src/types/freebuff-session.ts b/common/src/types/freebuff-session.ts index e42d9f0be..d141000a4 100644 --- a/common/src/types/freebuff-session.ts +++ b/common/src/types/freebuff-session.ts @@ -92,6 +92,12 @@ export type FreebuffSessionServerResponse = currentModel: string requestedModel: string } + | { + /** Requested model is valid but not selectable right now. */ + status: 'model_unavailable' + requestedModel: string + availableHours: string + } | { /** Account is banned. 
Returned from every endpoint so banned bots can't * join the queue at all (otherwise they inflate `queueDepth` until the diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md index b1384d7b6..353bfb046 100644 --- a/docs/freebuff-waiting-room.md +++ b/docs/freebuff-waiting-room.md @@ -5,7 +5,7 @@ The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs: 1. **Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs one tick (default `ADMISSION_TICK_MS`, 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones. -2. **Gate on per-deployment health** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` admit that tick; a degraded minimax-m2.7 no longer stalls glm-5.1 admissions. +2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; GLM 5.1 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available. 3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput. Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session. 
@@ -149,8 +149,8 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r | Constant | Location | Default | Purpose | |---|---|---|---| | `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. | -| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `glm-5.1`, `minimax-m2.7` | Selectable models; each gets its own queue and admission slot. | -| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | glm-5.1 only | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | +| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `minimax-m2.7`, `glm-5.1` | Selectable models; each gets its own queue and admission slot. | +| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `glm-5.1` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | | `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. | | `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime | | `FREEBUFF_SESSION_GRACE_MS` | env | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. | @@ -180,12 +180,12 @@ Response shapes: { "status": "queued", "instanceId": "e47…", - "model": "z-ai/glm-5.1", + "model": "minimax/minimax-m2.7", "position": 17, // 1-indexed within this model's queue "queueDepth": 43, // size of this model's queue "queueDepthByModel": { // snapshot of every model's queue — powers the - "z-ai/glm-5.1": 43, // "N ahead" hint in the selector. 
Missing - "minimax/minimax-m2.7": 4 // entries should be treated as 0. + "minimax/minimax-m2.7": 43, // "N ahead" hint in the selector. Missing + "z-ai/glm-5.1": 4 // entries should be treated as 0. }, "estimatedWaitMs": 384000, "queuedAt": "2026-04-17T12:00:00Z" @@ -195,7 +195,7 @@ Response shapes: { "status": "active", "instanceId": "e47…", - "model": "z-ai/glm-5.1", + "model": "minimax/minimax-m2.7", "admittedAt": "2026-04-17T12:00:00Z", "expiresAt": "2026-04-17T13:00:00Z", "remainingMs": 3600000 @@ -219,7 +219,7 @@ Response shapes: // to actually switch. { "status": "model_locked", - "currentModel": "z-ai/glm-5.1", + "currentModel": "minimax/minimax-m2.7", - "requestedModel": "minimax/minimax-m2.7" + "requestedModel": "z-ai/glm-5.1" } ``` @@ -285,7 +285,7 @@ waitMs = (position - 1) * 24_000 - Position 1 → 0 (next tick admits you) - Position 2 → 24s, and so on. -`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses (during a per-deployment Fireworks incident only the affected model's queue stalls; healthy models keep draining), so the real wait can be longer or shorter. +`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a GLM Fireworks incident or outside 9am ET-5pm PT, only GLM's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter. 
## CLI Integration (frontend-side contract) @@ -324,7 +324,7 @@ The `disabled` response means the server has the waiting room turned off. CLI tr | Spamming POST/GET to starve admission tick | Admission uses per-model Postgres advisory locks; DDoS protection is upstream (Next's global rate limits). Consider adding a per-user limiter on `/session` if traffic warrants. | | Repeatedly POSTing different models to get across every queue | Single row per user (PK on `user_id`); switching models moves the row, never clones it. A user holds exactly one queue slot at any time. | | Fireworks metrics endpoint down / slow | `getFleetHealth()` fails closed (timeout, non-OK, or missing API key) → every dedicated-deployment model is flagged `unhealthy` and its queue pauses. | -| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded minimax-m2.7 doesn't block glm-5.1 admissions. | +| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded GLM deployment doesn't block MiniMax admissions. 
| | Zombie expired sessions holding capacity | Swept on every admission tick, even when upstream is unhealthy | ## Testing diff --git a/scripts/test-fireworks-cache-intervals.ts b/scripts/test-fireworks-cache-intervals.ts index 0ed71193f..8d4e86740 100644 --- a/scripts/test-fireworks-cache-intervals.ts +++ b/scripts/test-fireworks-cache-intervals.ts @@ -13,7 +13,6 @@ * * Models: * glm-5.1 (default) — z-ai/glm-5.1 - * kimi-k2.5 — moonshotai/kimi-k2.5 * minimax — minimax/minimax-m2.5 * * Flags: @@ -39,7 +38,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' type ModelConfig = { id: string standardModel: string - deploymentModel: string + deploymentModel?: string inputCostPerToken: number cachedInputCostPerToken: number outputCostPerToken: number @@ -54,14 +53,6 @@ const MODEL_CONFIGS: Record = { cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.4 / 1_000_000, }, - 'kimi-k2.5': { - id: 'moonshotai/kimi-k2.5', - standardModel: 'accounts/fireworks/models/kimi-k2p5', - deploymentModel: 'accounts/james-65d217/deployments/mx8l5rq2', - inputCostPerToken: 0.6 / 1_000_000, - cachedInputCostPerToken: 0.1 / 1_000_000, - outputCostPerToken: 3.0 / 1_000_000, - }, minimax: { id: 'minimax/minimax-m2.5', standardModel: 'accounts/fireworks/models/minimax-m2p5', @@ -117,8 +108,12 @@ function parseArgs(): { const { modelKey, useDeployment: USE_DEPLOYMENT, intervals: INTERVALS_SEC } = parseArgs() const MODEL = MODEL_CONFIGS[modelKey] +if (USE_DEPLOYMENT && !MODEL.deploymentModel) { + console.error(`❌ No custom deployment configured for ${MODEL.id}`) + process.exit(1) +} const FIREWORKS_MODEL = USE_DEPLOYMENT - ? MODEL.deploymentModel + ? MODEL.deploymentModel! 
: MODEL.standardModel const INPUT_COST_PER_TOKEN = MODEL.inputCostPerToken const CACHED_INPUT_COST_PER_TOKEN = MODEL.cachedInputCostPerToken diff --git a/scripts/test-fireworks-long.ts b/scripts/test-fireworks-long.ts index 67028228d..a1e4950f8 100644 --- a/scripts/test-fireworks-long.ts +++ b/scripts/test-fireworks-long.ts @@ -12,12 +12,17 @@ * Models: * glm-5.1 (default) — z-ai/glm-5.1 * minimax — minimax/minimax-m2.5 + * minimax-m2.7 — minimax/minimax-m2.7 * * Flags: * --deployment Use custom deployment instead of serverless (standard API) * Serverless is the default + * Examples: + * bun scripts/test-fireworks-long.ts glm-5.1 --deployment */ +import { FIREWORKS_DEPLOYMENT_MAP } from '../web/src/llm-api/fireworks-config' + export { } const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' @@ -25,7 +30,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' type ModelConfig = { id: string // OpenRouter-style ID (for display) standardModel: string // Fireworks standard API model ID - deploymentModel: string // Fireworks custom deployment model ID + deploymentModel?: string // Fireworks custom deployment model ID inputCostPerToken: number cachedInputCostPerToken: number outputCostPerToken: number @@ -35,19 +40,11 @@ const MODEL_CONFIGS: Record = { 'glm-5.1': { id: 'z-ai/glm-5.1', standardModel: 'accounts/fireworks/models/glm-5p1', - deploymentModel: 'accounts/james-65d217/deployments/mjb4i7ea', + deploymentModel: FIREWORKS_DEPLOYMENT_MAP['z-ai/glm-5.1'], inputCostPerToken: 1.40 / 1_000_000, cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.40 / 1_000_000, }, - 'kimi-k2.5': { - id: 'moonshotai/kimi-k2.5', - standardModel: 'accounts/fireworks/models/kimi-k2p5', - deploymentModel: 'accounts/james-65d217/deployments/mx8l5rq2', - inputCostPerToken: 0.60 / 1_000_000, - cachedInputCostPerToken: 0.10 / 1_000_000, - outputCostPerToken: 3.00 / 1_000_000, - }, minimax: { id: 'minimax/minimax-m2.5', standardModel: 
'accounts/fireworks/models/minimax-m2p5', @@ -67,9 +64,16 @@ const MODEL_CONFIGS: Record = { } const DEFAULT_MODEL = 'glm-5.1' +const MODEL_ALIASES: Record = { + glm: 'glm-5.1', + 'z-ai/glm-5.1': 'glm-5.1', + 'minimax/minimax-m2.5': 'minimax', + 'minimax/minimax-m2.7': 'minimax-m2.7', +} function getModelConfig(modelArg?: string): ModelConfig { - const key = modelArg ?? DEFAULT_MODEL + const rawKey = modelArg ?? DEFAULT_MODEL + const key = MODEL_ALIASES[rawKey] ?? rawKey const config = MODEL_CONFIGS[key] if (!config) { console.error(`❌ Unknown model: "${key}". Available models: ${Object.keys(MODEL_CONFIGS).join(', ')}`) @@ -83,7 +87,11 @@ const modelArg = process.argv.find((a, i) => i > 1 && !a.startsWith('-') && a != const MODEL = getModelConfig(modelArg) // Default to serverless (standard API); use --deployment for custom deployment -const FIREWORKS_MODEL = USE_DEPLOYMENT ? MODEL.deploymentModel : MODEL.standardModel +if (USE_DEPLOYMENT && !MODEL.deploymentModel) { + console.error(`❌ No custom deployment configured for ${MODEL.id}`) + process.exit(1) +} +const FIREWORKS_MODEL = USE_DEPLOYMENT ? MODEL.deploymentModel! 
: MODEL.standardModel const INPUT_COST_PER_TOKEN = MODEL.inputCostPerToken const CACHED_INPUT_COST_PER_TOKEN = MODEL.cachedInputCostPerToken const OUTPUT_COST_PER_TOKEN = MODEL.outputCostPerToken @@ -455,4 +463,4 @@ async function main() { console.log('Done!') } -main() \ No newline at end of file +main() diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index 51a3eb46b..1aac8800c 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -1,6 +1,7 @@ import { afterEach, beforeEach, describe, expect, mock, it } from 'bun:test' import { NextRequest } from 'next/server' +import { isFreebuffDeploymentHours } from '@codebuff/common/constants/freebuff-models' import { formatQuotaResetCountdown, postChatCompletions } from '../_post' import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics' @@ -528,7 +529,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-new-free' }, body: JSON.stringify({ - model: 'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: false, codebuff_metadata: { run_id: 'run-free', @@ -555,6 +556,76 @@ describe('/api/v1/chat/completions POST endpoint', () => { expect(response.status).toBe(200) }) + it('lets freebuff use GLM 5.1 through Fireworks availability rules', async () => { + const fetchedBodies: Record[] = [] + const fetchViaFireworks = mock( + async (_url: string | URL | Request, init?: RequestInit) => { + fetchedBodies.push(JSON.parse(init?.body as string)) + return new Response( + JSON.stringify({ + id: 'test-id', + model: 'accounts/james-65d217/deployments/mjb4i7ea', + choices: [{ message: { content: 'test response' } }], + usage: { + prompt_tokens: 10, + completion_tokens: 20, + total_tokens: 30, + }, + }), + { + status: 200, + headers: { 'Content-Type': 
'application/json' }, + }, + ) + }, + ) as unknown as typeof globalThis.fetch + + const req = new NextRequest( + 'http://localhost:3000/api/v1/chat/completions', + { + method: 'POST', + headers: { Authorization: 'Bearer test-api-key-new-free' }, + body: JSON.stringify({ + model: 'z-ai/glm-5.1', + stream: false, + codebuff_metadata: { + run_id: 'run-free', + client_id: 'test-client-id-123', + cost_mode: 'free', + }, + }), + }, + ) + + const response = await postChatCompletions({ + req, + getUserInfoFromApiKey: mockGetUserInfoFromApiKey, + logger: mockLogger, + trackEvent: mockTrackEvent, + getUserUsageData: mockGetUserUsageData, + getAgentRunFromId: mockGetAgentRunFromId, + fetch: fetchViaFireworks, + insertMessageBigquery: mockInsertMessageBigquery, + loggerWithContext: mockLoggerWithContext, + checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + }) + + const body = await response.json() + if (isFreebuffDeploymentHours()) { + expect(response.status).toBe(200) + expect(fetchedBodies).toHaveLength(1) + expect(fetchedBodies[0].model).toBe( + 'accounts/james-65d217/deployments/mjb4i7ea', + ) + expect(body.model).toBe('z-ai/glm-5.1') + expect(body.provider).toBe('Fireworks') + } else { + expect(response.status).toBe(503) + expect(fetchedBodies).toHaveLength(0) + expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS') + } + }) + it('skips credit check when in FREE mode even with 0 credits', async () => { const req = new NextRequest( 'http://localhost:3000/api/v1/chat/completions', @@ -562,7 +633,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-no-credits' }, body: JSON.stringify({ - model: 'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: false, codebuff_metadata: { run_id: 'run-free', @@ -671,7 +742,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-new-free' }, body: JSON.stringify({ - model: 
'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: true, codebuff_metadata: { run_id: 'run-123', @@ -853,7 +924,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-123' }, body: JSON.stringify({ - model: 'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: false, codebuff_metadata: { run_id: 'run-free', diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts index 657c17f6d..ffcb8fd36 100644 --- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts +++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts @@ -3,6 +3,7 @@ import { describe, expect, test } from 'bun:test' import { deleteFreebuffSession, FREEBUFF_INSTANCE_HEADER, + FREEBUFF_MODEL_HEADER, getFreebuffSession, postFreebuffSession, } from '../_handlers' @@ -12,16 +13,17 @@ import type { SessionDeps } from '@/server/free-session/public-api' import type { InternalSessionRow } from '@/server/free-session/types' import type { NextRequest } from 'next/server' -const DEFAULT_MODEL = 'z-ai/glm-5.1' +const DEFAULT_MODEL = 'minimax/minimax-m2.7' function makeReq( apiKey: string | null, - opts: { instanceId?: string; cfCountry?: string } = {}, + opts: { instanceId?: string; cfCountry?: string; model?: string } = {}, ): NextRequest { const headers = new Headers() if (apiKey) headers.set('Authorization', `Bearer ${apiKey}`) if (opts.instanceId) headers.set(FREEBUFF_INSTANCE_HEADER, opts.instanceId) if (opts.cfCountry) headers.set('cf-ipcountry', opts.cfCountry) + if (opts.model) headers.set(FREEBUFF_MODEL_HEADER, opts.model) return { headers, } as unknown as NextRequest @@ -153,6 +155,19 @@ describe('POST /api/v1/freebuff/session', () => { expect(body.status).toBe('queued') }) + test('returns model_unavailable for GLM outside deployment hours', async () => { + const sessionDeps = makeSessionDeps() + const resp = await 
postFreebuffSession( + makeReq('ok', { model: 'z-ai/glm-5.1' }), + makeDeps(sessionDeps, 'u1'), + ) + expect(resp.status).toBe(409) + const body = await resp.json() + expect(body.status).toBe('model_unavailable') + expect(body.availableHours).toBe('9am ET-5pm PT') + expect(sessionDeps.rows.size).toBe(0) + }) + // Banned bots with valid API keys were POSTing every few seconds and // inflating queueDepth between the 15s admission-tick sweeps. Rejecting at // the HTTP layer with 403 (terminal, like country_blocked) keeps them out diff --git a/web/src/app/api/v1/freebuff/session/_handlers.ts b/web/src/app/api/v1/freebuff/session/_handlers.ts index ec17568a3..6f93e9282 100644 --- a/web/src/app/api/v1/freebuff/session/_handlers.ts +++ b/web/src/app/api/v1/freebuff/session/_handlers.ts @@ -138,12 +138,17 @@ export async function postFreebuffSession( model: requestedModel, deps: deps.sessionDeps, }) - // model_locked is a 409 so it's distinguishable from a normal queued/active - // response on the client. banned is a 403 (terminal, mirrors country_blocked) - // so older CLIs that don't know the status fall into their `!resp.ok` error - // path and back off instead of tight-polling on the unrecognized 200 body. + // model_locked / model_unavailable are 409 so they're distinguishable from + // normal queued/active responses on the client. banned is a 403 (terminal, + // mirrors country_blocked) so older CLIs that don't know the status fall + // into their `!resp.ok` error path and back off instead of tight-polling + // on the unrecognized 200 body. const status = - state.status === 'model_locked' ? 409 : state.status === 'banned' ? 403 : 200 + state.status === 'model_locked' || state.status === 'model_unavailable' + ? 409 + : state.status === 'banned' + ? 
403 + : 200 return NextResponse.json(state, { status }) } catch (error) { return serverError(deps, 'POST', auth.userId, error) diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts index 9ed91fd0a..58863c674 100644 --- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts +++ b/web/src/llm-api/__tests__/fireworks-deployment.test.ts @@ -3,7 +3,7 @@ import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test' import { createFireworksRequestWithFallback, DEPLOYMENT_COOLDOWN_MS, - FireworksError, + isDeploymentHours, isDeploymentCoolingDown, markDeploymentScalingUp, resetDeploymentCooldown, @@ -13,6 +13,11 @@ import type { Logger } from '@codebuff/common/types/contracts/logger' const STANDARD_MODEL_ID = 'accounts/fireworks/models/glm-5p1' const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/mjb4i7ea' +const IN_DEPLOYMENT_HOURS = new Date('2026-04-17T16:00:00Z') // Friday, 12pm ET / 9am PT +const BEFORE_DEPLOYMENT_HOURS = new Date('2026-04-17T12:59:00Z') // Friday, 8:59am ET +const AFTER_DEPLOYMENT_HOURS = new Date('2026-04-18T00:00:00Z') // Friday, 5pm PT +const WEEKDAY_AFTER_DEPLOYMENT_HOURS = new Date('2026-04-21T00:01:00Z') // Monday, 5:01pm PT +const WEEKEND_DEPLOYMENT_HOURS = new Date('2026-04-18T16:00:00Z') // Saturday function createMockLogger(): Logger { return { @@ -23,18 +28,20 @@ function createMockLogger(): Logger { } } -// Helper: create a Date at a specific ET hour using a known EDT date (June 2025, UTC-4) -function dateAtEtHour(hour: number): Date { - // June 15, 2025 is EDT (UTC-4), so ET hour H = UTC hour H+4 - const utcHour = hour + 4 - if (utcHour < 24) { - return new Date(`2025-06-15T${String(utcHour).padStart(2, '0')}:30:00Z`) - } - // Wraps to next day - return new Date(`2025-06-16T${String(utcHour - 24).padStart(2, '0')}:30:00Z`) -} - describe('Fireworks deployment routing', () => { + describe('deployment hours', () => { + it('is active from 9am ET 
until before 5pm PT on weekdays', () => { + expect(isDeploymentHours(BEFORE_DEPLOYMENT_HOURS)).toBe(false) + expect(isDeploymentHours(IN_DEPLOYMENT_HOURS)).toBe(true) + expect(isDeploymentHours(AFTER_DEPLOYMENT_HOURS)).toBe(false) + expect(isDeploymentHours(WEEKDAY_AFTER_DEPLOYMENT_HOURS)).toBe(false) + }) + + it('is inactive on weekends', () => { + expect(isDeploymentHours(WEEKEND_DEPLOYMENT_HOURS)).toBe(false) + }) + }) + describe('deployment cooldown', () => { beforeEach(() => { resetDeploymentCooldown() @@ -82,28 +89,6 @@ describe('Fireworks deployment routing', () => { messages: [{ role: 'user' as const, content: 'test' }], } - function spyDeploymentHours(inHours: boolean) { - // Control isDeploymentHours by mocking Date.prototype.toLocaleString - // When called with the ET timezone options, return an hour inside or outside the window - const original = Date.prototype.toLocaleString - const spy = { - restore: () => { - Date.prototype.toLocaleString = original - }, - } - Date.prototype.toLocaleString = function ( - this: Date, - ...args: Parameters - ) { - const options = args[1] as Intl.DateTimeFormatOptions | undefined - if (options?.timeZone === 'America/New_York' && options?.hour === 'numeric') { - return inHours ? 
'14' : '3' - } - return original.apply(this, args) - } - return spy - } - it('uses standard API when custom deployment is disabled', async () => { const fetchCalls: string[] = [] @@ -128,7 +113,6 @@ describe('Fireworks deployment routing', () => { }) it('tries custom deployment during deployment hours', async () => { - const spy = spyDeploymentHours(true) const fetchCalls: string[] = [] const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { @@ -137,160 +121,115 @@ describe('Fireworks deployment routing', () => { return new Response(JSON.stringify({ ok: true }), { status: 200 }) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(1) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toHaveLength(1) + expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) }) - it('falls back to standard API on 503 DEPLOYMENT_SCALING_UP', async () => { - const spy = spyDeploymentHours(true) + it('returns deployment 503 on DEPLOYMENT_SCALING_UP without serverless fallback', async () => { const fetchCalls: string[] = [] - let callCount = 0 const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { const body = JSON.parse(init?.body as string) fetchCalls.push(body.model) - callCount++ - - if (callCount === 1) { - return new Response( - JSON.stringify({ - error: { - message: 'Deployment is currently scaled to zero and is 
scaling up. Please retry your request in a few minutes.', - code: 'DEPLOYMENT_SCALING_UP', - type: 'error', - }, - }), - { status: 503, statusText: 'Service Unavailable' }, - ) - } - - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ + error: { + message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.', + code: 'DEPLOYMENT_SCALING_UP', + type: 'error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(2) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - // Verify cooldown was activated - expect(isDeploymentCoolingDown()).toBe(true) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID]) + expect(isDeploymentCoolingDown()).toBe(true) }) - it('falls back to standard API on non-scaling 503 from deployment', async () => { - const spy = spyDeploymentHours(true) + it('returns non-scaling deployment 503 without serverless fallback', async () => { const fetchCalls: string[] = [] - let callCount = 0 const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { const body = JSON.parse(init?.body as string) fetchCalls.push(body.model) - callCount++ - - if (callCount === 1) { - return new Response( - JSON.stringify({ - error: { - 
message: 'Service temporarily unavailable', - code: 'SERVICE_UNAVAILABLE', - type: 'error', - }, - }), - { status: 503, statusText: 'Service Unavailable' }, - ) - } - - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ + error: { + message: 'Service temporarily unavailable', + code: 'SERVICE_UNAVAILABLE', + type: 'error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(2) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - // Non-scaling 503 should NOT activate the cooldown - expect(isDeploymentCoolingDown()).toBe(false) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID]) + expect(isDeploymentCoolingDown()).toBe(false) }) - it('falls back to standard API on 500 Internal Error from deployment', async () => { - const spy = spyDeploymentHours(true) + it('returns 500 Internal Error from deployment without serverless fallback', async () => { const fetchCalls: string[] = [] - let callCount = 0 const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { const body = JSON.parse(init?.body as string) fetchCalls.push(body.model) - callCount++ - - if (callCount === 1) { - return new Response( - JSON.stringify({ error: 'Internal error' }), - { status: 500, statusText: 'Internal 
Server Error' }, - ) - } - - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ error: 'Internal error' }), + { status: 500, statusText: 'Internal Server Error' }, + ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(2) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - expect(isDeploymentCoolingDown()).toBe(false) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(500) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID]) + expect(isDeploymentCoolingDown()).toBe(false) }) - it('skips deployment during cooldown and goes straight to standard API', async () => { - const spy = spyDeploymentHours(true) + it('returns cooldown error without serverless fallback', async () => { markDeploymentScalingUp() const fetchCalls: string[] = [] @@ -300,26 +239,21 @@ describe('Fireworks deployment routing', () => { return new Response(JSON.stringify({ ok: true }), { status: 200 }) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(1) - expect(fetchCalls[0]).toBe(STANDARD_MODEL_ID) - } finally { - spy.restore() - } + const response = await 
createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + expect(fetchCalls).toHaveLength(0) }) it('uses standard API for models without a custom deployment', async () => { - const spy = spyDeploymentHours(true) const fetchCalls: string[] = [] const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { @@ -328,27 +262,43 @@ describe('Fireworks deployment routing', () => { return new Response(JSON.stringify({ ok: true }), { status: 200 }) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: { ...minimalBody, model: 'some-other/model' } as never, - originalModel: 'some-other/model', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(1) - // Model without mapping falls through to the original model - expect(fetchCalls[0]).toBe('some-other/model') - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: { ...minimalBody, model: 'some-other/model' } as never, + originalModel: 'some-other/model', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: BEFORE_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toHaveLength(1) + // Model without mapping falls through to the original model + expect(fetchCalls[0]).toBe('some-other/model') + }) + + it('returns an availability error for deployment models outside hours', async () => { + const mockFetch = mock(async () => { + throw new Error('should not fetch outside deployment hours') + }) as unknown as typeof globalThis.fetch + + const response = await createFireworksRequestWithFallback({ + body: minimalBody as 
never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: BEFORE_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + const body = await response.json() + expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS') }) it('returns non-5xx responses from deployment without fallback (e.g. 429)', async () => { - const spy = spyDeploymentHours(true) const fetchCalls: string[] = [] const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { @@ -360,23 +310,20 @@ describe('Fireworks deployment routing', () => { ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - // Non-5xx errors from deployment are returned as-is (caller handles them) - expect(response.status).toBe(429) - expect(fetchCalls).toHaveLength(1) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + // Non-5xx errors from deployment are returned as-is (caller handles them) + expect(response.status).toBe(429) + expect(fetchCalls).toHaveLength(1) + expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) }) it('transforms reasoning to reasoning_effort (defaults to medium)', async () => { @@ -535,41 +482,31 @@ describe('Fireworks deployment routing', () => { expect(fetchedBodies[0].reasoning_effort).toBe('low') }) - it('logs when trying deployment and when falling back on 5xx', async () => { - const spy = spyDeploymentHours(true) - let callCount = 0 - + it('logs when trying deployment and when deployment returns 5xx', async 
() => { const mockFetch = mock(async () => { - callCount++ - if (callCount === 1) { - return new Response( - JSON.stringify({ - error: { - message: 'Scaling up', - code: 'DEPLOYMENT_SCALING_UP', - type: 'error', - }, - }), - { status: 503, statusText: 'Service Unavailable' }, - ) - } - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ + error: { + message: 'Scaling up', + code: 'DEPLOYMENT_SCALING_UP', + type: 'error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) }) as unknown as typeof globalThis.fetch - try { - await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(logger.info).toHaveBeenCalledTimes(2) - } finally { - spy.restore() - } + await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(logger.info).toHaveBeenCalledTimes(2) }) }) }) diff --git a/web/src/llm-api/fireworks-config.ts b/web/src/llm-api/fireworks-config.ts index fb6d59580..566728250 100644 --- a/web/src/llm-api/fireworks-config.ts +++ b/web/src/llm-api/fireworks-config.ts @@ -10,7 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217' export const FIREWORKS_DEPLOYMENT_MAP: Record = { // 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9', - // 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2', - // 'minimax/minimax-m2.7': 'accounts/james-65d217/deployments/nrdudqxd', 'z-ai/glm-5.1': 'accounts/james-65d217/deployments/mjb4i7ea', + // 'minimax/minimax-m2.7': 'accounts/james-65d217/deployments/nrdudqxd', } diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 6e304638d..028ad4222 100644 --- a/web/src/llm-api/fireworks.ts 
+++ b/web/src/llm-api/fireworks.ts @@ -1,5 +1,9 @@ import { Agent } from 'undici' +import { + FREEBUFF_DEPLOYMENT_HOURS_LABEL, + isFreebuffDeploymentHours, +} from '@codebuff/common/constants/freebuff-models' import { PROFIT_MARGIN } from '@codebuff/common/constants/limits' import { getErrorObject } from '@codebuff/common/util/error' import { env } from '@codebuff/internal/env' @@ -32,15 +36,14 @@ const FIREWORKS_MODEL_MAP: Record = { 'minimax/minimax-m2.5': 'accounts/fireworks/models/minimax-m2p5', 'minimax/minimax-m2.7': 'accounts/fireworks/models/minimax-m2p7', 'z-ai/glm-5.1': 'accounts/fireworks/models/glm-5p1', - 'moonshotai/kimi-k2.5': 'accounts/fireworks/models/kimi-k2p5', } /** Flag to enable custom Fireworks deployments (set to false to use global API only) */ const FIREWORKS_USE_CUSTOM_DEPLOYMENT = true -/** Check if current time is within deployment hours (always enabled) */ -export function isDeploymentHours(_now: Date = new Date()): boolean { - return true +/** Check if current time is within deployment hours: Mon-Fri, 9am ET to 5pm PT. */ +export function isDeploymentHours(now: Date = new Date()): boolean { + return isFreebuffDeploymentHours(now) } /** @@ -93,7 +96,7 @@ function createFireworksRequest(params: { // Transform OpenRouter-style `reasoning` object into Fireworks' `reasoning_effort`. // Unlike OpenAI, Fireworks supports reasoning_effort together with function tools - // (e.g. GLM-4.5/5.1 and Kimi K2 are designed for interleaved reasoning + tool use). + // (e.g. GLM-4.5/5.1 are designed for interleaved reasoning + tool use). 
if (fireworksBody.reasoning && typeof fireworksBody.reasoning === 'object') { const reasoning = fireworksBody.reasoning as { enabled?: boolean @@ -165,15 +168,10 @@ const FIREWORKS_PRICING_MAP: Record = { cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.40 / 1_000_000, }, - 'moonshotai/kimi-k2.5': { - inputCostPerToken: 0.60 / 1_000_000, - cachedInputCostPerToken: 0.10 / 1_000_000, - outputCostPerToken: 3.00 / 1_000_000, - }, } function getFireworksPricing(model: string): FireworksPricing { - return FIREWORKS_PRICING_MAP[model] ?? FIREWORKS_MODEL_MAP['z-ai/glm-5.1'] + return FIREWORKS_PRICING_MAP[model] ?? FIREWORKS_PRICING_MAP['z-ai/glm-5.1'] } function extractUsageAndCost(usage: Record | undefined | null, model: string): UsageData { @@ -708,9 +706,10 @@ async function parseFireworksError(response: Response): Promise } /** - * Tries the custom Fireworks deployment during business hours (10am–8pm ET), - * falling back to the standard API if the deployment returns 503 DEPLOYMENT_SCALING_UP. - * Outside deployment hours or during cooldown, goes straight to the standard API. + * Uses custom Fireworks deployments only during deployment hours. Deployment + * mapped models never fall back to the serverless API outside hours, during + * cooldown, or after deployment 5xxs; those states surface as provider errors + * so freebuff can offer MiniMax as the always-on option. */ export async function createFireworksRequestWithFallback(params: { body: ChatCompletionRequestBody @@ -719,17 +718,41 @@ export async function createFireworksRequestWithFallback(params: { logger: Logger useCustomDeployment?: boolean sessionId: string + now?: Date }): Promise { const { body, originalModel, fetch, logger, sessionId } = params + const now = params.now ?? new Date() const useCustomDeployment = params.useCustomDeployment ?? 
FIREWORKS_USE_CUSTOM_DEPLOYMENT const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel] - const shouldTryDeployment = - useCustomDeployment && - deploymentModelId && - isDeploymentHours() && - !isDeploymentCoolingDown() + const hasDeployment = useCustomDeployment && Boolean(deploymentModelId) + + if (hasDeployment && !isDeploymentHours(now)) { + return new Response( + JSON.stringify({ + error: { + message: `${originalModel} is only available during ${FREEBUFF_DEPLOYMENT_HOURS_LABEL}. Use minimax/minimax-m2.7 outside those hours.`, + code: 'DEPLOYMENT_OUTSIDE_HOURS', + type: 'availability_error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) + } - if (shouldTryDeployment) { + if (hasDeployment && isDeploymentCoolingDown()) { + return new Response( + JSON.stringify({ + error: { + message: `${originalModel} deployment is temporarily unavailable. Use minimax/minimax-m2.7 while it recovers.`, + code: 'DEPLOYMENT_COOLDOWN', + type: 'availability_error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) + } + + if (hasDeployment && deploymentModelId) { logger.info( { model: originalModel, deploymentModel: deploymentModelId }, 'Trying Fireworks custom deployment', @@ -746,15 +769,18 @@ export async function createFireworksRequestWithFallback(params: { const errorText = await response.text() logger.info( { model: originalModel, status: response.status, errorText: errorText.slice(0, 200) }, - 'Fireworks custom deployment returned 5xx, falling back to standard API', + 'Fireworks custom deployment returned 5xx', ) if (errorText.includes('DEPLOYMENT_SCALING_UP')) { markDeploymentScalingUp() } - // Fall through to standard API request below - } else { - return response + return new Response(errorText, { + status: response.status, + statusText: response.statusText, + headers: response.headers, + }) } + return response } return createFireworksRequest({ body, originalModel, fetch, sessionId }) diff --git 
a/web/src/server/free-session/__tests__/config.test.ts b/web/src/server/free-session/__tests__/config.test.ts new file mode 100644 index 000000000..93f5fdcf0 --- /dev/null +++ b/web/src/server/free-session/__tests__/config.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, test } from 'bun:test' + +import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models' + +import { getInstantAdmitCapacity } from '../config' + +describe('free session config', () => { + test('every selectable freebuff model has instant-admit capacity', () => { + for (const model of FREEBUFF_MODELS) { + expect(getInstantAdmitCapacity(model.id)).toBeGreaterThan(0) + } + }) +}) diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts index a824f6d22..a90bc800d 100644 --- a/web/src/server/free-session/__tests__/public-api.test.ts +++ b/web/src/server/free-session/__tests__/public-api.test.ts @@ -13,7 +13,7 @@ import type { InternalSessionRow } from '../types' const SESSION_LEN = 60 * 60 * 1000 const GRACE_MS = 30 * 60 * 1000 -const DEFAULT_MODEL = 'z-ai/glm-5.1' +const DEFAULT_MODEL = 'minimax/minimax-m2.7' function makeDeps(overrides: Partial = {}): SessionDeps & { rows: Map @@ -177,19 +177,34 @@ describe('requestSession', () => { expect(state.instanceId).toBe('inst-1') }) + test('deployment-hours-only model is unavailable outside deployment hours', async () => { + const state = await requestSession({ + userId: 'u1', + model: 'z-ai/glm-5.1', + deps, + }) + expect(state).toEqual({ + status: 'model_unavailable', + requestedModel: 'z-ai/glm-5.1', + availableHours: '9am ET-5pm PT', + }) + expect(deps.rows.size).toBe(0) + }) + test('queued response includes a per-model depth snapshot for the selector', async () => { - // Seed 2 users in glm + 1 in minimax so the returned map captures both. + deps._tick(new Date('2026-04-17T16:00:00Z')) + // Seed 2 users in MiniMax + 1 in GLM so the returned map captures both. 
await requestSession({ userId: 'u1', model: DEFAULT_MODEL, deps }) deps._tick(new Date(deps._now().getTime() + 1000)) await requestSession({ userId: 'u2', model: DEFAULT_MODEL, deps }) deps._tick(new Date(deps._now().getTime() + 1000)) - await requestSession({ userId: 'u3', model: 'minimax/minimax-m2.7', deps }) + await requestSession({ userId: 'u3', model: 'z-ai/glm-5.1', deps }) const state = await getSessionState({ userId: 'u1', deps }) if (state.status !== 'queued') throw new Error('unreachable') expect(state.queueDepthByModel).toEqual({ [DEFAULT_MODEL]: 2, - 'minimax/minimax-m2.7': 1, + 'z-ai/glm-5.1': 1, }) }) @@ -264,11 +279,12 @@ describe('requestSession', () => { }) test('instant-admit: per-model capacities are independent', async () => { - // GLM saturated at 1 active, MiniMax still has room. + // MiniMax saturated at 1 active, GLM still has room. const admitDeps = makeDeps({ getInstantAdmitCapacity: (model) => model === DEFAULT_MODEL ? 1 : 10, }) + admitDeps._tick(new Date('2026-04-17T16:00:00Z')) await requestSession({ userId: 'u1', model: DEFAULT_MODEL, deps: admitDeps }) const s2 = await requestSession({ userId: 'u2', @@ -277,7 +293,7 @@ describe('requestSession', () => { }) const s3 = await requestSession({ userId: 'u3', - model: 'minimax/minimax-m2.7', + model: 'z-ai/glm-5.1', deps: admitDeps, }) expect(s2.status).toBe('queued') diff --git a/web/src/server/free-session/admission.ts b/web/src/server/free-session/admission.ts index 3f3c051d2..9f0b74c9f 100644 --- a/web/src/server/free-session/admission.ts +++ b/web/src/server/free-session/admission.ts @@ -1,4 +1,7 @@ -import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models' +import { + FREEBUFF_MODELS, + isFreebuffModelAvailable, +} from '@codebuff/common/constants/freebuff-models' import { ADMISSION_TICK_MS, @@ -111,7 +114,10 @@ export async function runAdmissionTick( // advisory locks and a single update each. 
const perModel = await Promise.all( models.map(async (model) => { - const health = fleet[model] ?? 'healthy' + const isRegisteredModel = FREEBUFF_MODELS.some((m) => m.id === model) + const health = !isRegisteredModel || isFreebuffModelAvailable(model, now) + ? fleet[model] ?? 'healthy' + : 'unhealthy' const { admitted, skipped } = await deps.admitFromQueue({ model, sessionLengthMs: deps.sessionLengthMs, diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts index 450540443..7ea85f2e4 100644 --- a/web/src/server/free-session/public-api.ts +++ b/web/src/server/free-session/public-api.ts @@ -1,4 +1,6 @@ import { + FREEBUFF_DEPLOYMENT_HOURS_LABEL, + isFreebuffModelAvailable, isFreebuffModelId as isSelectableFreebuffModel, resolveFreebuffModel, } from '@codebuff/common/constants/freebuff-models' @@ -122,6 +124,11 @@ export type RequestSessionResult = currentModel: string requestedModel: string } + | { + status: 'model_unavailable' + requestedModel: string + availableHours: string + } /** * Client calls this on CLI startup with the model they want to use. @@ -152,6 +159,7 @@ export async function requestSession(params: { }): Promise { const deps = params.deps ?? 
defaultDeps const model = resolveFreebuffModel(params.model) + const now = nowOf(deps) if (params.userBanned) { return { status: 'banned' } } @@ -161,13 +169,20 @@ export async function requestSession(params: { ) { return { status: 'disabled' } } + if (!isFreebuffModelAvailable(model, now)) { + return { + status: 'model_unavailable', + requestedModel: model, + availableHours: FREEBUFF_DEPLOYMENT_HOURS_LABEL, + } + } let row: InternalSessionRow try { row = await deps.joinOrTakeOver({ userId: params.userId, model, - now: nowOf(deps), + now, }) } catch (err) { if (err instanceof FreeSessionModelLockedError) { @@ -199,7 +214,7 @@ export async function requestSession(params: { userId: params.userId, model, sessionLengthMs: deps.sessionLengthMs, - now: nowOf(deps), + now, }) if (promoted) row = promoted }