From 350576c56d25982cfece0f5cf88eaedb92a9924c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 23 Apr 2026 21:01:31 -0700 Subject: [PATCH 1/3] glm 5.1 => kimi k2.5 --- agents/__tests__/editor.test.ts | 11 + agents/base2/base2.ts | 14 +- agents/editor/editor-lite.ts | 2 +- agents/editor/editor.ts | 6 +- agents/reviewer/code-reviewer-lite.ts | 2 +- .../components/freebuff-model-selector.tsx | 53 ++- cli/src/components/waiting-room-screen.tsx | 2 +- cli/src/hooks/use-freebuff-session.ts | 19 +- cli/src/state/freebuff-model-store.ts | 6 +- cli/src/utils/local-agent-registry.ts | 2 +- common/src/constants/free-agents.ts | 15 +- common/src/constants/freebuff-models.ts | 63 ++- common/src/types/freebuff-session.ts | 7 + docs/freebuff-waiting-room.md | 20 +- scripts/test-fireworks-cache-intervals.ts | 17 +- scripts/test-fireworks-long.ts | 31 +- .../completions/__tests__/completions.test.ts | 80 +++- .../session/__tests__/session.test.ts | 19 +- .../app/api/v1/freebuff/session/_handlers.ts | 15 +- .../__tests__/fireworks-deployment.test.ts | 443 ++++++++---------- web/src/llm-api/fireworks-config.ts | 3 +- web/src/llm-api/fireworks.ts | 66 ++- .../free-session/__tests__/config.test.ts | 13 + .../free-session/__tests__/public-api.test.ts | 28 +- web/src/server/free-session/admission.ts | 10 +- web/src/server/free-session/config.ts | 2 +- web/src/server/free-session/public-api.ts | 19 +- 27 files changed, 614 insertions(+), 354 deletions(-) create mode 100644 web/src/server/free-session/__tests__/config.test.ts diff --git a/agents/__tests__/editor.test.ts b/agents/__tests__/editor.test.ts index 030857c8dc..36d6b75c5c 100644 --- a/agents/__tests__/editor.test.ts +++ b/agents/__tests__/editor.test.ts @@ -67,6 +67,11 @@ describe('editor agent', () => { expect(glmEditor.model).toBe('z-ai/glm-5.1') }) + test('creates minimax editor', () => { + const minimaxEditor = createCodeEditor({ model: 'minimax' }) + expect(minimaxEditor.model).toBe('minimax/minimax-m2.7') + }) + 
test('gpt-5 editor does not include think tags in instructions', () => { const gpt5Editor = createCodeEditor({ model: 'gpt-5' }) expect(gpt5Editor.instructionsPrompt).not.toContain('') @@ -79,6 +84,12 @@ describe('editor agent', () => { expect(glmEditor.instructionsPrompt).not.toContain('') }) + test('minimax editor does not include think tags in instructions', () => { + const minimaxEditor = createCodeEditor({ model: 'minimax' }) + expect(minimaxEditor.instructionsPrompt).not.toContain('') + expect(minimaxEditor.instructionsPrompt).not.toContain('') + }) + test('opus editor includes think tags in instructions', () => { const opusEditor = createCodeEditor({ model: 'opus' }) expect(opusEditor.instructionsPrompt).toContain('') diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index 1a81f948bf..c6f7e15f8a 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -25,16 +25,18 @@ export function createBase2( const isFree = mode === 'free' || mode === 'lite' const isSonnet = false - const model = isFree ? 'z-ai/glm-5.1' : 'anthropic/claude-opus-4.7' + const model = isFree ? 'minimax/minimax-m2.7' : 'anthropic/claude-opus-4.7' return { publisher, model, - providerOptions: isFree ? { - data_collection: 'deny', - } : { - only: ['amazon-bedrock'], - }, + providerOptions: isFree + ? 
{ + data_collection: 'deny', + } + : { + only: ['amazon-bedrock'], + }, displayName: 'Buffy the Orchestrator', spawnerPrompt: 'Advanced base agent that orchestrates planning, editing, and reviewing for complex coding tasks', diff --git a/agents/editor/editor-lite.ts b/agents/editor/editor-lite.ts index 29225f0c29..9cb5675b5e 100644 --- a/agents/editor/editor-lite.ts +++ b/agents/editor/editor-lite.ts @@ -3,7 +3,7 @@ import { createCodeEditor } from './editor' import type { AgentDefinition } from '../types/agent-definition' const definition: AgentDefinition = { - ...createCodeEditor({ model: 'glm' }), + ...createCodeEditor({ model: 'minimax' }), id: 'editor-lite', } export default definition diff --git a/agents/editor/editor.ts b/agents/editor/editor.ts index 3d208aa13a..c98544d0f2 100644 --- a/agents/editor/editor.ts +++ b/agents/editor/editor.ts @@ -4,7 +4,7 @@ import { publisher } from '../constants' import type { AgentDefinition } from '../types/agent-definition' export const createCodeEditor = (options: { - model: 'gpt-5' | 'opus' | 'glm' + model: 'gpt-5' | 'opus' | 'glm' | 'minimax' }): Omit => { const { model } = options return { @@ -12,6 +12,8 @@ export const createCodeEditor = (options: { model: options.model === 'gpt-5' ? 'openai/gpt-5.1' + : options.model === 'minimax' + ? 'minimax/minimax-m2.7' : options.model === 'glm' ? 'z-ai/glm-5.1' : 'anthropic/claude-opus-4.7', @@ -65,7 +67,7 @@ OR for new files or major rewrites: } -${model === 'gpt-5' || model === 'glm' +${model === 'gpt-5' || model === 'glm' || model === 'minimax' ? '' : `Before you start writing your implementation, you should use tags to think about the best way to implement the changes. 
diff --git a/agents/reviewer/code-reviewer-lite.ts b/agents/reviewer/code-reviewer-lite.ts index feafb87c45..ee017c24e6 100644 --- a/agents/reviewer/code-reviewer-lite.ts +++ b/agents/reviewer/code-reviewer-lite.ts @@ -5,7 +5,7 @@ import { createReviewer } from './code-reviewer' const definition: SecretAgentDefinition = { id: 'code-reviewer-lite', publisher, - ...createReviewer('z-ai/glm-5.1'), + ...createReviewer('minimax/minimax-m2.7'), } export default definition diff --git a/cli/src/components/freebuff-model-selector.tsx b/cli/src/components/freebuff-model-selector.tsx index a33d89540a..1ba966fd22 100644 --- a/cli/src/components/freebuff-model-selector.tsx +++ b/cli/src/components/freebuff-model-selector.tsx @@ -3,9 +3,15 @@ import { useKeyboard } from '@opentui/react' import React, { useCallback, useEffect, useMemo, useState } from 'react' import { Button } from './button' -import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models' +import { + DEFAULT_FREEBUFF_MODEL_ID, + FREEBUFF_DEPLOYMENT_HOURS_LABEL, + FREEBUFF_MODELS, + isFreebuffModelAvailable, +} from '@codebuff/common/constants/freebuff-models' import { joinFreebuffQueue } from '../hooks/use-freebuff-session' +import { useNow } from '../hooks/use-now' import { useFreebuffModelStore } from '../state/freebuff-model-store' import { useFreebuffSessionStore } from '../state/freebuff-session-store' import { useTerminalDimensions } from '../hooks/use-terminal-dimensions' @@ -33,7 +39,9 @@ export const FreebuffModelSelector: React.FC = () => { const theme = useTheme() const { terminalWidth } = useTerminalDimensions() const selectedModel = useFreebuffModelStore((s) => s.selectedModel) + const setSelectedModel = useFreebuffModelStore((s) => s.setSelectedModel) const session = useFreebuffSessionStore((s) => s.session) + const now = useNow(60_000) const [pending, setPending] = useState(null) const [hoveredId, setHoveredId] = useState(null) // Keyboard cursor — separate from the actually-selected 
model so that @@ -45,6 +53,15 @@ export const FreebuffModelSelector: React.FC = () => { setFocusedId(selectedModel) }, [selectedModel]) + useEffect(() => { + if ( + (session?.status === 'none' || !session) && + !isFreebuffModelAvailable(selectedModel, new Date(now)) + ) { + setSelectedModel(DEFAULT_FREEBUFF_MODEL_ID) + } + }, [now, selectedModel, session, setSelectedModel]) + // Landing ('none'): depths come from the server snapshot, no "self" to // subtract. In-queue ('queued'): for the user's queue, "ahead" is // `position - 1` (themselves don't count); for every other queue, switching @@ -85,7 +102,8 @@ export const FreebuffModelSelector: React.FC = () => { ) // Decide row vs column layout based on whether both buttons actually fit - // side-by-side. Each button's inner text is "● {displayName} · {tagline} {hint}", + // side-by-side. Each button's inner text is + // "● {displayName} · {tagline} · {hours} {hint}", // plus 2 cols of border and 2 cols of padding. Buttons are separated by a // gap of 2. If the total exceeds the terminal width, stack vertically. const stackVertically = useMemo(() => { @@ -97,6 +115,9 @@ export const FreebuffModelSelector: React.FC = () => { model.displayName.length + 3 /* " · " */ + model.tagline.length + + (model.availability === 'deployment_hours' + ? 3 + FREEBUFF_DEPLOYMENT_HOURS_LABEL.length + : 0) + 2 /* " " */ + hintWidth return sum + inner + BUTTON_CHROME + (idx > 0 ? 
GAP : 0) @@ -115,10 +136,11 @@ export const FreebuffModelSelector: React.FC = () => { (modelId: string) => { if (pending) return if (modelId === committedModelId) return + if (!isFreebuffModelAvailable(modelId, new Date(now))) return setPending(modelId) joinFreebuffQueue(modelId).finally(() => setPending(null)) }, - [pending, committedModelId], + [pending, committedModelId, now], ) // Tab / Shift+Tab and arrow keys move the focus highlight only; Enter or @@ -136,7 +158,10 @@ export const FreebuffModelSelector: React.FC = () => { const isCommit = name === 'return' || name === 'enter' || name === 'space' if (!isForward && !isBackward && !isCommit) return if (isCommit) { - if (focusedId !== committedModelId) { + if ( + focusedId !== committedModelId && + isFreebuffModelAvailable(focusedId, new Date(now)) + ) { key.preventDefault?.() pick(focusedId) } @@ -154,7 +179,7 @@ export const FreebuffModelSelector: React.FC = () => { setFocusedId(target.id) } }, - [pending, pick, focusedId, committedModelId], + [pending, pick, focusedId, committedModelId, now], ), ) @@ -181,15 +206,22 @@ export const FreebuffModelSelector: React.FC = () => { const isSelected = model.id === selectedModel const isHovered = hoveredId === model.id const isFocused = focusedId === model.id && !isSelected + const isAvailable = isFreebuffModelAvailable(model.id, new Date(now)) const indicator = isSelected ? '●' : '○' const indicatorColor = isSelected ? theme.primary : theme.muted - const labelColor = isSelected ? theme.foreground : theme.muted + const labelColor = isSelected && isAvailable ? theme.foreground : theme.muted // Clickable whenever picking would actually do something — i.e. // anything except re-picking the queue we're already in. - const interactable = !pending && model.id !== committedModelId + const interactable = !pending && isAvailable && model.id !== committedModelId const ahead = aheadByModel?.[model.id] const hint = - ahead === undefined ? '' : ahead === 0 ? 
'No wait' : `${ahead} ahead` + !isAvailable + ? 'Closed' + : ahead === undefined + ? '' + : ahead === 0 + ? 'No wait' + : `${ahead} ahead` const borderColor = isSelected ? theme.primary @@ -202,7 +234,7 @@ export const FreebuffModelSelector: React.FC = () => { key={model.id} onClick={() => { setFocusedId(model.id) - pick(model.id) + if (isAvailable) pick(model.id) }} onMouseOver={() => interactable && setHoveredId(model.id)} onMouseOut={() => setHoveredId((curr) => (curr === model.id ? null : curr))} @@ -223,6 +255,9 @@ export const FreebuffModelSelector: React.FC = () => { {model.displayName} · {model.tagline} + {model.availability === 'deployment_hours' && ( + · {FREEBUFF_DEPLOYMENT_HOURS_LABEL} + )} {hint.padEnd(hintWidth)} diff --git a/cli/src/components/waiting-room-screen.tsx b/cli/src/components/waiting-room-screen.tsx index e67823f7a2..251ca87c0a 100644 --- a/cli/src/components/waiting-room-screen.tsx +++ b/cli/src/components/waiting-room-screen.tsx @@ -253,7 +253,7 @@ export const WaitingRoomScreen: React.FC = ({ ⚠ Account unavailable - This account can't use freebuff. If you think this is a + This account has been suspended and can't use freebuff. If you think this is a mistake, contact support@codebuff.com. Press Ctrl+C to exit. diff --git a/cli/src/hooks/use-freebuff-session.ts b/cli/src/hooks/use-freebuff-session.ts index 79deea1cfb..225eee2b24 100644 --- a/cli/src/hooks/use-freebuff-session.ts +++ b/cli/src/hooks/use-freebuff-session.ts @@ -1,4 +1,5 @@ import { env } from '@codebuff/common/env' +import { DEFAULT_FREEBUFF_MODEL_ID } from '@codebuff/common/constants/freebuff-models' import { useEffect } from 'react' import { @@ -75,14 +76,17 @@ async function callSession( return body } } - // 409 from POST means the user picked a different model than their active - // session is bound to. Surface as a non-throw `model_locked` so the UI can - // show a confirmation prompt (DELETE then re-POST to switch). 
+ // 409 from POST means the selected model cannot be joined right now, either + // because an active session is locked to another model or because a + // deployment-hours-only model is closed. Surface both as non-throw states. if (resp.status === 409 && method === 'POST') { const body = (await resp.json().catch(() => null)) as | FreebuffSessionResponse | null - if (body && body.status === 'model_locked') { + if ( + body && + (body.status === 'model_locked' || body.status === 'model_unavailable') + ) { return body } } @@ -119,6 +123,7 @@ function nextDelayMs(next: FreebuffSessionResponse): number | null { case 'country_blocked': case 'banned': case 'model_locked': + case 'model_unavailable': return null } } @@ -398,6 +403,12 @@ export function useFreebuffSession(): UseFreebuffSessionResult { schedule(0) return } + if (next.status === 'model_unavailable') { + useFreebuffModelStore.getState().setSelectedModel(DEFAULT_FREEBUFF_MODEL_ID) + nextMethod = 'GET' + schedule(0) + return + } // Startup takeover: the initial probe GET saw we already hold a seat // (from a prior CLI instance). POST now to rotate our instance id so diff --git a/cli/src/state/freebuff-model-store.ts b/cli/src/state/freebuff-model-store.ts index 182a38831f..1aa9f2db80 100644 --- a/cli/src/state/freebuff-model-store.ts +++ b/cli/src/state/freebuff-model-store.ts @@ -1,6 +1,6 @@ import { DEFAULT_FREEBUFF_MODEL_ID, - resolveFreebuffModel, + resolveAvailableFreebuffModel, } from '@codebuff/common/constants/freebuff-models' import { create } from 'zustand' @@ -24,11 +24,11 @@ interface FreebuffModelStore { } export const useFreebuffModelStore = create((set) => ({ - selectedModel: resolveFreebuffModel( + selectedModel: resolveAvailableFreebuffModel( loadFreebuffModelPreference() ?? 
DEFAULT_FREEBUFF_MODEL_ID, ), setSelectedModel: (model) => { - const resolved = resolveFreebuffModel(model) + const resolved = resolveAvailableFreebuffModel(model) saveFreebuffModelPreference(resolved) set({ selectedModel: resolved }) }, diff --git a/cli/src/utils/local-agent-registry.ts b/cli/src/utils/local-agent-registry.ts index 59206eb848..6106b3928e 100644 --- a/cli/src/utils/local-agent-registry.ts +++ b/cli/src/utils/local-agent-registry.ts @@ -370,7 +370,7 @@ export const loadAgentDefinitions = (): AgentDefinition[] => { } // Override the model of free-mode agents to match the user's pick from the - // freebuff waiting room. Bundled definitions hardcode glm-5.1; we swap in + // freebuff waiting room. Bundled definitions hardcode a free model; we swap in // whatever the user chose so the chat-completions request body carries the // matching model and the server-side session gate doesn't reject it as a // model mismatch. diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts index e44c74cc65..762202dcca 100644 --- a/common/src/constants/free-agents.ts +++ b/common/src/constants/free-agents.ts @@ -26,7 +26,10 @@ export const FREEBUFF_ROOT_AGENT_IDS = ['base2-free'] as const */ export const FREE_MODE_AGENT_MODELS: Record> = { // Root orchestrator - 'base2-free': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']), + 'base2-free': new Set([ + 'minimax/minimax-m2.7', + 'moonshotai/kimi-k2.5', + ]), // File exploration agents 'file-picker': new Set(['google/gemini-2.5-flash-lite']), @@ -41,10 +44,16 @@ export const FREE_MODE_AGENT_MODELS: Record> = { 'basher': new Set(['google/gemini-3.1-flash-lite-preview']), // Editor for free mode - 'editor-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']), + 'editor-lite': new Set([ + 'minimax/minimax-m2.7', + 'moonshotai/kimi-k2.5', + ]), // Code reviewer for free mode - 'code-reviewer-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']), + 'code-reviewer-lite': new Set([ + 
'minimax/minimax-m2.7', + 'moonshotai/kimi-k2.5', + ]), } /** diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts index d71ebd619d..d38d187ffc 100644 --- a/common/src/constants/freebuff-models.ts +++ b/common/src/constants/freebuff-models.ts @@ -13,18 +13,25 @@ export interface FreebuffModelOption { displayName: string /** One-line description shown next to the label. */ tagline: string + /** Availability policy for the selector and server-side admission. */ + availability: 'always' | 'deployment_hours' } +export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT' +export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.5' + export const FREEBUFF_MODELS = [ - { - id: 'z-ai/glm-5.1', - displayName: 'GLM 5.1', - tagline: 'Smartest', - }, { id: 'minimax/minimax-m2.7', displayName: 'MiniMax M2.7', tagline: 'Fastest', + availability: 'always', + }, + { + id: FREEBUFF_KIMI_MODEL_ID, + displayName: 'Kimi K2.5', + tagline: 'Balanced', + availability: 'deployment_hours', }, ] as const satisfies readonly FreebuffModelOption[] @@ -51,3 +58,49 @@ export function getFreebuffModel(id: string): FreebuffModelOption { FREEBUFF_MODELS.find((m) => m.id === DEFAULT_FREEBUFF_MODEL_ID)! ) } + +function getZonedParts( + date: Date, + timeZone: string, +): { weekday: string; minutes: number } { + const parts = new Intl.DateTimeFormat('en-US', { + timeZone, + weekday: 'short', + hour: '2-digit', + minute: '2-digit', + hourCycle: 'h23', + }).formatToParts(date) + const value = (type: string) => parts.find((part) => part.type === type)?.value + const hour = Number(value('hour') ?? 0) + const minute = Number(value('minute') ?? 0) + return { + weekday: value('weekday') ?? 
'', + minutes: hour * 60 + minute, + } +} + +export function isFreebuffDeploymentHours(now: Date = new Date()): boolean { + const eastern = getZonedParts(now, 'America/New_York') + const pacific = getZonedParts(now, 'America/Los_Angeles') + if (eastern.weekday === 'Sat' || eastern.weekday === 'Sun') return false + return eastern.minutes >= 9 * 60 && pacific.minutes < 17 * 60 +} + +export function isFreebuffModelAvailable( + id: string, + now: Date = new Date(), +): boolean { + const model = FREEBUFF_MODELS.find((m) => m.id === id) + if (!model) return false + return model.availability === 'always' || isFreebuffDeploymentHours(now) +} + +export function resolveAvailableFreebuffModel( + id: string | null | undefined, + now: Date = new Date(), +): FreebuffModelId { + const resolved = resolveFreebuffModel(id) + return isFreebuffModelAvailable(resolved, now) + ? resolved + : DEFAULT_FREEBUFF_MODEL_ID +} diff --git a/common/src/types/freebuff-session.ts b/common/src/types/freebuff-session.ts index e42d9f0bee..43cd3eaa25 100644 --- a/common/src/types/freebuff-session.ts +++ b/common/src/types/freebuff-session.ts @@ -92,6 +92,13 @@ export type FreebuffSessionServerResponse = currentModel: string requestedModel: string } + | { + /** Requested model is valid but not selectable right now. Currently + * used for deployment-hours-only models such as Kimi K2.5. */ + status: 'model_unavailable' + requestedModel: string + availableHours: string + } | { /** Account is banned. Returned from every endpoint so banned bots can't * join the queue at all (otherwise they inflate `queueDepth` until the diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md index b1384d7b60..73fa779270 100644 --- a/docs/freebuff-waiting-room.md +++ b/docs/freebuff-waiting-room.md @@ -5,7 +5,7 @@ The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs: 1. 
**Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs one tick (default `ADMISSION_TICK_MS`, 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones. -2. **Gate on per-deployment health** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` admit that tick; a degraded minimax-m2.7 no longer stalls glm-5.1 admissions. +2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; Kimi K2.5 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available. 3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput. Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session. @@ -149,8 +149,8 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r | Constant | Location | Default | Purpose | |---|---|---|---| | `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. | -| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `glm-5.1`, `minimax-m2.7` | Selectable models; each gets its own queue and admission slot. 
| -| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | glm-5.1 only | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | +| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `minimax-m2.7`, `kimi-k2.5` | Selectable models; each gets its own queue and admission slot. | +| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `kimi-k2.5` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | | `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. | | `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime | | `FREEBUFF_SESSION_GRACE_MS` | env | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. | @@ -180,12 +180,12 @@ Response shapes: { "status": "queued", "instanceId": "e47…", - "model": "z-ai/glm-5.1", + "model": "minimax/minimax-m2.7", "position": 17, // 1-indexed within this model's queue "queueDepth": 43, // size of this model's queue "queueDepthByModel": { // snapshot of every model's queue — powers the - "z-ai/glm-5.1": 43, // "N ahead" hint in the selector. Missing - "minimax/minimax-m2.7": 4 // entries should be treated as 0. + "minimax/minimax-m2.7": 43, // "N ahead" hint in the selector. Missing + "moonshotai/kimi-k2.5": 4 // entries should be treated as 0. 
}, "estimatedWaitMs": 384000, "queuedAt": "2026-04-17T12:00:00Z" @@ -195,7 +195,7 @@ Response shapes: { "status": "active", "instanceId": "e47…", - "model": "z-ai/glm-5.1", + "model": "minimax/minimax-m2.7", "admittedAt": "2026-04-17T12:00:00Z", "expiresAt": "2026-04-17T13:00:00Z", "remainingMs": 3600000 @@ -219,7 +219,7 @@ Response shapes: // to actually switch. { "status": "model_locked", - "currentModel": "z-ai/glm-5.1", + "currentModel": "minimax/minimax-m2.7", "requestedModel": "minimax/minimax-m2.7" } ``` @@ -285,7 +285,7 @@ waitMs = (position - 1) * 24_000 - Position 1 → 0 (next tick admits you) - Position 2 → 24s, and so on. -`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses (during a per-deployment Fireworks incident only the affected model's queue stalls; healthy models keep draining), so the real wait can be longer or shorter. +`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `moonshotai/kimi-k2.5` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a Kimi Fireworks incident or outside 9am ET-5pm PT, only Kimi's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter. ## CLI Integration (frontend-side contract) @@ -324,7 +324,7 @@ The `disabled` response means the server has the waiting room turned off. 
CLI tr | Spamming POST/GET to starve admission tick | Admission uses per-model Postgres advisory locks; DDoS protection is upstream (Next's global rate limits). Consider adding a per-user limiter on `/session` if traffic warrants. | | Repeatedly POSTing different models to get across every queue | Single row per user (PK on `user_id`); switching models moves the row, never clones it. A user holds exactly one queue slot at any time. | | Fireworks metrics endpoint down / slow | `getFleetHealth()` fails closed (timeout, non-OK, or missing API key) → every dedicated-deployment model is flagged `unhealthy` and its queue pauses. | -| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded minimax-m2.7 doesn't block glm-5.1 admissions. | +| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded Kimi deployment doesn't block MiniMax admissions. 
| | Zombie expired sessions holding capacity | Swept on every admission tick, even when upstream is unhealthy | ## Testing diff --git a/scripts/test-fireworks-cache-intervals.ts b/scripts/test-fireworks-cache-intervals.ts index 0ed71193fd..92d7ac49e3 100644 --- a/scripts/test-fireworks-cache-intervals.ts +++ b/scripts/test-fireworks-cache-intervals.ts @@ -25,11 +25,11 @@ * # Default glm-5.1 serverless with default intervals * bun scripts/test-fireworks-cache-intervals.ts * - * # Custom GLM deployment with a faster sweep - * bun scripts/test-fireworks-cache-intervals.ts glm-5.1 --deployment --intervals=30,60,120,300,600 + * # Custom Kimi deployment with a faster sweep + * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.5 --deployment --intervals=30,60,120,300,600 * * # Long sweep up to 1 hour - * bun scripts/test-fireworks-cache-intervals.ts glm-5.1 --deployment --intervals=60,300,600,1200,1800,2700,3600 + * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.5 --deployment --intervals=60,300,600,1200,1800,2700,3600 */ export {} @@ -39,7 +39,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' type ModelConfig = { id: string standardModel: string - deploymentModel: string + deploymentModel?: string inputCostPerToken: number cachedInputCostPerToken: number outputCostPerToken: number @@ -49,7 +49,6 @@ const MODEL_CONFIGS: Record = { 'glm-5.1': { id: 'z-ai/glm-5.1', standardModel: 'accounts/fireworks/models/glm-5p1', - deploymentModel: 'accounts/james-65d217/deployments/mjb4i7ea', inputCostPerToken: 1.4 / 1_000_000, cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.4 / 1_000_000, @@ -57,7 +56,7 @@ const MODEL_CONFIGS: Record = { 'kimi-k2.5': { id: 'moonshotai/kimi-k2.5', standardModel: 'accounts/fireworks/models/kimi-k2p5', - deploymentModel: 'accounts/james-65d217/deployments/mx8l5rq2', + deploymentModel: 'accounts/james-65d217/deployments/y5b3z17u', inputCostPerToken: 0.6 / 1_000_000, cachedInputCostPerToken: 0.1 / 1_000_000, 
outputCostPerToken: 3.0 / 1_000_000, @@ -117,8 +116,12 @@ function parseArgs(): { const { modelKey, useDeployment: USE_DEPLOYMENT, intervals: INTERVALS_SEC } = parseArgs() const MODEL = MODEL_CONFIGS[modelKey] +if (USE_DEPLOYMENT && !MODEL.deploymentModel) { + console.error(`❌ No custom deployment configured for ${MODEL.id}`) + process.exit(1) +} const FIREWORKS_MODEL = USE_DEPLOYMENT - ? MODEL.deploymentModel + ? MODEL.deploymentModel! : MODEL.standardModel const INPUT_COST_PER_TOKEN = MODEL.inputCostPerToken const CACHED_INPUT_COST_PER_TOKEN = MODEL.cachedInputCostPerToken diff --git a/scripts/test-fireworks-long.ts b/scripts/test-fireworks-long.ts index 67028228da..e506ccf022 100644 --- a/scripts/test-fireworks-long.ts +++ b/scripts/test-fireworks-long.ts @@ -11,13 +11,19 @@ * * Models: * glm-5.1 (default) — z-ai/glm-5.1 + * kimi-k2.5 — moonshotai/kimi-k2.5 * minimax — minimax/minimax-m2.5 + * minimax-m2.7 — minimax/minimax-m2.7 * * Flags: * --deployment Use custom deployment instead of serverless (standard API) * Serverless is the default + * Examples: + * bun scripts/test-fireworks-long.ts kimi-k2.5 --deployment */ +import { FIREWORKS_DEPLOYMENT_MAP } from '../web/src/llm-api/fireworks-config' + export { } const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' @@ -25,7 +31,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' type ModelConfig = { id: string // OpenRouter-style ID (for display) standardModel: string // Fireworks standard API model ID - deploymentModel: string // Fireworks custom deployment model ID + deploymentModel?: string // Fireworks custom deployment model ID inputCostPerToken: number cachedInputCostPerToken: number outputCostPerToken: number @@ -35,7 +41,6 @@ const MODEL_CONFIGS: Record = { 'glm-5.1': { id: 'z-ai/glm-5.1', standardModel: 'accounts/fireworks/models/glm-5p1', - deploymentModel: 'accounts/james-65d217/deployments/mjb4i7ea', inputCostPerToken: 1.40 / 1_000_000, cachedInputCostPerToken: 0.26 / 
1_000_000, outputCostPerToken: 4.40 / 1_000_000, @@ -43,7 +48,7 @@ const MODEL_CONFIGS: Record = { 'kimi-k2.5': { id: 'moonshotai/kimi-k2.5', standardModel: 'accounts/fireworks/models/kimi-k2p5', - deploymentModel: 'accounts/james-65d217/deployments/mx8l5rq2', + deploymentModel: FIREWORKS_DEPLOYMENT_MAP['moonshotai/kimi-k2.5'], inputCostPerToken: 0.60 / 1_000_000, cachedInputCostPerToken: 0.10 / 1_000_000, outputCostPerToken: 3.00 / 1_000_000, @@ -67,9 +72,19 @@ const MODEL_CONFIGS: Record = { } const DEFAULT_MODEL = 'glm-5.1' +const MODEL_ALIASES: Record = { + glm: 'glm-5.1', + 'z-ai/glm-5.1': 'glm-5.1', + kimi: 'kimi-k2.5', + 'kimi-k2': 'kimi-k2.5', + 'moonshotai/kimi-k2.5': 'kimi-k2.5', + 'minimax/minimax-m2.5': 'minimax', + 'minimax/minimax-m2.7': 'minimax-m2.7', +} function getModelConfig(modelArg?: string): ModelConfig { - const key = modelArg ?? DEFAULT_MODEL + const rawKey = modelArg ?? DEFAULT_MODEL + const key = MODEL_ALIASES[rawKey] ?? rawKey const config = MODEL_CONFIGS[key] if (!config) { console.error(`❌ Unknown model: "${key}". Available models: ${Object.keys(MODEL_CONFIGS).join(', ')}`) @@ -83,7 +98,11 @@ const modelArg = process.argv.find((a, i) => i > 1 && !a.startsWith('-') && a != const MODEL = getModelConfig(modelArg) // Default to serverless (standard API); use --deployment for custom deployment -const FIREWORKS_MODEL = USE_DEPLOYMENT ? MODEL.deploymentModel : MODEL.standardModel +if (USE_DEPLOYMENT && !MODEL.deploymentModel) { + console.error(`❌ No custom deployment configured for ${MODEL.id}`) + process.exit(1) +} +const FIREWORKS_MODEL = USE_DEPLOYMENT ? MODEL.deploymentModel! 
: MODEL.standardModel const INPUT_COST_PER_TOKEN = MODEL.inputCostPerToken const CACHED_INPUT_COST_PER_TOKEN = MODEL.cachedInputCostPerToken const OUTPUT_COST_PER_TOKEN = MODEL.outputCostPerToken @@ -455,4 +474,4 @@ async function main() { console.log('Done!') } -main() \ No newline at end of file +main() diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index 51a3eb46be..04f9b570cd 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -1,6 +1,8 @@ import { afterEach, beforeEach, describe, expect, mock, it } from 'bun:test' import { NextRequest } from 'next/server' +import { isFreebuffDeploymentHours } from '@codebuff/common/constants/freebuff-models' + import { formatQuotaResetCountdown, postChatCompletions } from '../_post' import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics' @@ -528,7 +530,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-new-free' }, body: JSON.stringify({ - model: 'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: false, codebuff_metadata: { run_id: 'run-free', @@ -555,6 +557,76 @@ describe('/api/v1/chat/completions POST endpoint', () => { expect(response.status).toBe(200) }) + it('lets freebuff use Kimi K2.5 through Fireworks availability rules', async () => { + const fetchedBodies: Record[] = [] + const fetchViaFireworks = mock( + async (_url: string | URL | Request, init?: RequestInit) => { + fetchedBodies.push(JSON.parse(init?.body as string)) + return new Response( + JSON.stringify({ + id: 'test-id', + model: 'accounts/james-65d217/deployments/y5b3z17u', + choices: [{ message: { content: 'test response' } }], + usage: { + prompt_tokens: 10, + completion_tokens: 20, + total_tokens: 30, + }, + }), + { + status: 200, + headers: { 
'Content-Type': 'application/json' }, + }, + ) + }, + ) as unknown as typeof globalThis.fetch + + const req = new NextRequest( + 'http://localhost:3000/api/v1/chat/completions', + { + method: 'POST', + headers: { Authorization: 'Bearer test-api-key-new-free' }, + body: JSON.stringify({ + model: 'moonshotai/kimi-k2.5', + stream: false, + codebuff_metadata: { + run_id: 'run-free', + client_id: 'test-client-id-123', + cost_mode: 'free', + }, + }), + }, + ) + + const response = await postChatCompletions({ + req, + getUserInfoFromApiKey: mockGetUserInfoFromApiKey, + logger: mockLogger, + trackEvent: mockTrackEvent, + getUserUsageData: mockGetUserUsageData, + getAgentRunFromId: mockGetAgentRunFromId, + fetch: fetchViaFireworks, + insertMessageBigquery: mockInsertMessageBigquery, + loggerWithContext: mockLoggerWithContext, + checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + }) + + const body = await response.json() + if (isFreebuffDeploymentHours()) { + expect(response.status).toBe(200) + expect(fetchedBodies).toHaveLength(1) + expect(fetchedBodies[0].model).toBe( + 'accounts/james-65d217/deployments/y5b3z17u', + ) + expect(body.model).toBe('moonshotai/kimi-k2.5') + expect(body.provider).toBe('Fireworks') + } else { + expect(response.status).toBe(503) + expect(fetchedBodies).toHaveLength(0) + expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS') + } + }) + it('skips credit check when in FREE mode even with 0 credits', async () => { const req = new NextRequest( 'http://localhost:3000/api/v1/chat/completions', @@ -562,7 +634,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-no-credits' }, body: JSON.stringify({ - model: 'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: false, codebuff_metadata: { run_id: 'run-free', @@ -671,7 +743,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-new-free' }, body: 
JSON.stringify({ - model: 'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: true, codebuff_metadata: { run_id: 'run-123', @@ -853,7 +925,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-123' }, body: JSON.stringify({ - model: 'z-ai/glm-5.1', + model: 'minimax/minimax-m2.7', stream: false, codebuff_metadata: { run_id: 'run-free', diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts index 657c17f6da..2d33a1ae09 100644 --- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts +++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts @@ -3,6 +3,7 @@ import { describe, expect, test } from 'bun:test' import { deleteFreebuffSession, FREEBUFF_INSTANCE_HEADER, + FREEBUFF_MODEL_HEADER, getFreebuffSession, postFreebuffSession, } from '../_handlers' @@ -12,16 +13,17 @@ import type { SessionDeps } from '@/server/free-session/public-api' import type { InternalSessionRow } from '@/server/free-session/types' import type { NextRequest } from 'next/server' -const DEFAULT_MODEL = 'z-ai/glm-5.1' +const DEFAULT_MODEL = 'minimax/minimax-m2.7' function makeReq( apiKey: string | null, - opts: { instanceId?: string; cfCountry?: string } = {}, + opts: { instanceId?: string; cfCountry?: string; model?: string } = {}, ): NextRequest { const headers = new Headers() if (apiKey) headers.set('Authorization', `Bearer ${apiKey}`) if (opts.instanceId) headers.set(FREEBUFF_INSTANCE_HEADER, opts.instanceId) if (opts.cfCountry) headers.set('cf-ipcountry', opts.cfCountry) + if (opts.model) headers.set(FREEBUFF_MODEL_HEADER, opts.model) return { headers, } as unknown as NextRequest @@ -153,6 +155,19 @@ describe('POST /api/v1/freebuff/session', () => { expect(body.status).toBe('queued') }) + test('returns model_unavailable for Kimi outside deployment hours', async () => { + const sessionDeps = makeSessionDeps() + 
const resp = await postFreebuffSession( + makeReq('ok', { model: 'moonshotai/kimi-k2.5' }), + makeDeps(sessionDeps, 'u1'), + ) + expect(resp.status).toBe(409) + const body = await resp.json() + expect(body.status).toBe('model_unavailable') + expect(body.availableHours).toBe('9am ET-5pm PT') + expect(sessionDeps.rows.size).toBe(0) + }) + // Banned bots with valid API keys were POSTing every few seconds and // inflating queueDepth between the 15s admission-tick sweeps. Rejecting at // the HTTP layer with 403 (terminal, like country_blocked) keeps them out diff --git a/web/src/app/api/v1/freebuff/session/_handlers.ts b/web/src/app/api/v1/freebuff/session/_handlers.ts index ec17568a33..6f93e92825 100644 --- a/web/src/app/api/v1/freebuff/session/_handlers.ts +++ b/web/src/app/api/v1/freebuff/session/_handlers.ts @@ -138,12 +138,17 @@ export async function postFreebuffSession( model: requestedModel, deps: deps.sessionDeps, }) - // model_locked is a 409 so it's distinguishable from a normal queued/active - // response on the client. banned is a 403 (terminal, mirrors country_blocked) - // so older CLIs that don't know the status fall into their `!resp.ok` error - // path and back off instead of tight-polling on the unrecognized 200 body. + // model_locked / model_unavailable are 409 so they're distinguishable from + // normal queued/active responses on the client. banned is a 403 (terminal, + // mirrors country_blocked) so older CLIs that don't know the status fall + // into their `!resp.ok` error path and back off instead of tight-polling + // on the unrecognized 200 body. const status = - state.status === 'model_locked' ? 409 : state.status === 'banned' ? 403 : 200 + state.status === 'model_locked' || state.status === 'model_unavailable' + ? 409 + : state.status === 'banned' + ? 
403 + : 200 return NextResponse.json(state, { status }) } catch (error) { return serverError(deps, 'POST', auth.userId, error) diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts index 9ed91fd0a6..99078f5284 100644 --- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts +++ b/web/src/llm-api/__tests__/fireworks-deployment.test.ts @@ -3,7 +3,7 @@ import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test' import { createFireworksRequestWithFallback, DEPLOYMENT_COOLDOWN_MS, - FireworksError, + isDeploymentHours, isDeploymentCoolingDown, markDeploymentScalingUp, resetDeploymentCooldown, @@ -11,8 +11,12 @@ import { import type { Logger } from '@codebuff/common/types/contracts/logger' -const STANDARD_MODEL_ID = 'accounts/fireworks/models/glm-5p1' -const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/mjb4i7ea' +const STANDARD_MODEL_ID = 'accounts/fireworks/models/kimi-k2p5' +const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/y5b3z17u' +const IN_DEPLOYMENT_HOURS = new Date('2026-04-17T16:00:00Z') // Friday, 12pm ET / 9am PT +const BEFORE_DEPLOYMENT_HOURS = new Date('2026-04-17T12:59:00Z') // Friday, 8:59am ET +const AFTER_DEPLOYMENT_HOURS = new Date('2026-04-18T00:00:00Z') // Friday, 5pm PT +const WEEKEND_DEPLOYMENT_HOURS = new Date('2026-04-18T16:00:00Z') // Saturday function createMockLogger(): Logger { return { @@ -23,18 +27,19 @@ function createMockLogger(): Logger { } } -// Helper: create a Date at a specific ET hour using a known EDT date (June 2025, UTC-4) -function dateAtEtHour(hour: number): Date { - // June 15, 2025 is EDT (UTC-4), so ET hour H = UTC hour H+4 - const utcHour = hour + 4 - if (utcHour < 24) { - return new Date(`2025-06-15T${String(utcHour).padStart(2, '0')}:30:00Z`) - } - // Wraps to next day - return new Date(`2025-06-16T${String(utcHour - 24).padStart(2, '0')}:30:00Z`) -} - describe('Fireworks deployment routing', () => { + 
describe('deployment hours', () => { + it('is active from 9am ET until before 5pm PT on weekdays', () => { + expect(isDeploymentHours(BEFORE_DEPLOYMENT_HOURS)).toBe(false) + expect(isDeploymentHours(IN_DEPLOYMENT_HOURS)).toBe(true) + expect(isDeploymentHours(AFTER_DEPLOYMENT_HOURS)).toBe(false) + }) + + it('is inactive on weekends', () => { + expect(isDeploymentHours(WEEKEND_DEPLOYMENT_HOURS)).toBe(false) + }) + }) + describe('deployment cooldown', () => { beforeEach(() => { resetDeploymentCooldown() @@ -78,32 +83,10 @@ describe('Fireworks deployment routing', () => { }) const minimalBody = { - model: 'z-ai/glm-5.1', + model: 'moonshotai/kimi-k2.5', messages: [{ role: 'user' as const, content: 'test' }], } - function spyDeploymentHours(inHours: boolean) { - // Control isDeploymentHours by mocking Date.prototype.toLocaleString - // When called with the ET timezone options, return an hour inside or outside the window - const original = Date.prototype.toLocaleString - const spy = { - restore: () => { - Date.prototype.toLocaleString = original - }, - } - Date.prototype.toLocaleString = function ( - this: Date, - ...args: Parameters - ) { - const options = args[1] as Intl.DateTimeFormatOptions | undefined - if (options?.timeZone === 'America/New_York' && options?.hour === 'numeric') { - return inHours ? 
'14' : '3' - } - return original.apply(this, args) - } - return spy - } - it('uses standard API when custom deployment is disabled', async () => { const fetchCalls: string[] = [] @@ -115,7 +98,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', + originalModel: 'moonshotai/kimi-k2.5', fetch: mockFetch, logger, useCustomDeployment: false, @@ -128,7 +111,6 @@ describe('Fireworks deployment routing', () => { }) it('tries custom deployment during deployment hours', async () => { - const spy = spyDeploymentHours(true) const fetchCalls: string[] = [] const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { @@ -137,160 +119,115 @@ describe('Fireworks deployment routing', () => { return new Response(JSON.stringify({ ok: true }), { status: 200 }) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(1) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toHaveLength(1) + expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) }) - it('falls back to standard API on 503 DEPLOYMENT_SCALING_UP', async () => { - const spy = spyDeploymentHours(true) + it('returns deployment 503 on DEPLOYMENT_SCALING_UP without serverless fallback', async () => { const fetchCalls: string[] = [] - let callCount = 0 const mockFetch = 
mock(async (_url: string | URL | Request, init?: RequestInit) => { const body = JSON.parse(init?.body as string) fetchCalls.push(body.model) - callCount++ - - if (callCount === 1) { - return new Response( - JSON.stringify({ - error: { - message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.', - code: 'DEPLOYMENT_SCALING_UP', - type: 'error', - }, - }), - { status: 503, statusText: 'Service Unavailable' }, - ) - } - - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ + error: { + message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.', + code: 'DEPLOYMENT_SCALING_UP', + type: 'error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(2) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - // Verify cooldown was activated - expect(isDeploymentCoolingDown()).toBe(true) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID]) + expect(isDeploymentCoolingDown()).toBe(true) }) - it('falls back to standard API on non-scaling 503 from deployment', async () => { - const spy = spyDeploymentHours(true) + it('returns non-scaling deployment 503 without serverless fallback', async () => { const 
fetchCalls: string[] = [] - let callCount = 0 const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { const body = JSON.parse(init?.body as string) fetchCalls.push(body.model) - callCount++ - - if (callCount === 1) { - return new Response( - JSON.stringify({ - error: { - message: 'Service temporarily unavailable', - code: 'SERVICE_UNAVAILABLE', - type: 'error', - }, - }), - { status: 503, statusText: 'Service Unavailable' }, - ) - } - - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ + error: { + message: 'Service temporarily unavailable', + code: 'SERVICE_UNAVAILABLE', + type: 'error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(2) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - // Non-scaling 503 should NOT activate the cooldown - expect(isDeploymentCoolingDown()).toBe(false) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID]) + expect(isDeploymentCoolingDown()).toBe(false) }) - it('falls back to standard API on 500 Internal Error from deployment', async () => { - const spy = spyDeploymentHours(true) + it('returns 500 Internal Error from deployment without serverless fallback', async () => { const fetchCalls: string[] = [] - let callCount = 0 
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { const body = JSON.parse(init?.body as string) fetchCalls.push(body.model) - callCount++ - - if (callCount === 1) { - return new Response( - JSON.stringify({ error: 'Internal error' }), - { status: 500, statusText: 'Internal Server Error' }, - ) - } - - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ error: 'Internal error' }), + { status: 500, statusText: 'Internal Server Error' }, + ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(2) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - expect(isDeploymentCoolingDown()).toBe(false) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(500) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID]) + expect(isDeploymentCoolingDown()).toBe(false) }) - it('skips deployment during cooldown and goes straight to standard API', async () => { - const spy = spyDeploymentHours(true) + it('returns cooldown error without serverless fallback', async () => { markDeploymentScalingUp() const fetchCalls: string[] = [] @@ -300,26 +237,21 @@ describe('Fireworks deployment routing', () => { return new Response(JSON.stringify({ ok: true }), { status: 200 }) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - 
originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(1) - expect(fetchCalls[0]).toBe(STANDARD_MODEL_ID) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + expect(fetchCalls).toHaveLength(0) }) it('uses standard API for models without a custom deployment', async () => { - const spy = spyDeploymentHours(true) const fetchCalls: string[] = [] const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { @@ -328,27 +260,43 @@ describe('Fireworks deployment routing', () => { return new Response(JSON.stringify({ ok: true }), { status: 200 }) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: { ...minimalBody, model: 'some-other/model' } as never, - originalModel: 'some-other/model', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(response.status).toBe(200) - expect(fetchCalls).toHaveLength(1) - // Model without mapping falls through to the original model - expect(fetchCalls[0]).toBe('some-other/model') - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: { ...minimalBody, model: 'some-other/model' } as never, + originalModel: 'some-other/model', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: BEFORE_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toHaveLength(1) + // Model without mapping falls through to the original model + expect(fetchCalls[0]).toBe('some-other/model') + }) + + 
it('returns an availability error for deployment models outside hours', async () => { + const mockFetch = mock(async () => { + throw new Error('should not fetch outside deployment hours') + }) as unknown as typeof globalThis.fetch + + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: BEFORE_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(503) + const body = await response.json() + expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS') }) it('returns non-5xx responses from deployment without fallback (e.g. 429)', async () => { - const spy = spyDeploymentHours(true) const fetchCalls: string[] = [] const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { @@ -360,23 +308,20 @@ describe('Fireworks deployment routing', () => { ) }) as unknown as typeof globalThis.fetch - try { - const response = await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - // Non-5xx errors from deployment are returned as-is (caller handles them) - expect(response.status).toBe(429) - expect(fetchCalls).toHaveLength(1) - expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) - } finally { - spy.restore() - } + const response = await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + // Non-5xx errors from deployment are returned as-is (caller handles them) + expect(response.status).toBe(429) + expect(fetchCalls).toHaveLength(1) + expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) }) it('transforms reasoning to reasoning_effort (defaults to medium)', async () => { @@ 
-393,7 +338,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { enabled: true }, } as never, - originalModel: 'z-ai/glm-5.1', + originalModel: 'moonshotai/kimi-k2.5', fetch: mockFetch, logger, useCustomDeployment: false, @@ -419,7 +364,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { effort: 'high' }, } as never, - originalModel: 'z-ai/glm-5.1', + originalModel: 'moonshotai/kimi-k2.5', fetch: mockFetch, logger, useCustomDeployment: false, @@ -445,7 +390,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { enabled: false, effort: 'high' }, } as never, - originalModel: 'z-ai/glm-5.1', + originalModel: 'moonshotai/kimi-k2.5', fetch: mockFetch, logger, useCustomDeployment: false, @@ -472,7 +417,7 @@ describe('Fireworks deployment routing', () => { reasoning: { effort: 'high' }, tools: [{ type: 'function', function: { name: 'test', arguments: '{}' } }], } as never, - originalModel: 'z-ai/glm-5.1', + originalModel: 'moonshotai/kimi-k2.5', fetch: mockFetch, logger, useCustomDeployment: false, @@ -498,7 +443,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning_effort: 'low', } as never, - originalModel: 'z-ai/glm-5.1', + originalModel: 'moonshotai/kimi-k2.5', fetch: mockFetch, logger, useCustomDeployment: false, @@ -524,7 +469,7 @@ describe('Fireworks deployment routing', () => { reasoning_effort: 'low', tools: [{ type: 'function', function: { name: 'test', arguments: '{}' } }], } as never, - originalModel: 'z-ai/glm-5.1', + originalModel: 'moonshotai/kimi-k2.5', fetch: mockFetch, logger, useCustomDeployment: false, @@ -535,41 +480,31 @@ describe('Fireworks deployment routing', () => { expect(fetchedBodies[0].reasoning_effort).toBe('low') }) - it('logs when trying deployment and when falling back on 5xx', async () => { - const spy = spyDeploymentHours(true) - let callCount = 0 - + it('logs when trying deployment and when deployment returns 5xx', async 
() => { const mockFetch = mock(async () => { - callCount++ - if (callCount === 1) { - return new Response( - JSON.stringify({ - error: { - message: 'Scaling up', - code: 'DEPLOYMENT_SCALING_UP', - type: 'error', - }, - }), - { status: 503, statusText: 'Service Unavailable' }, - ) - } - return new Response(JSON.stringify({ ok: true }), { status: 200 }) + return new Response( + JSON.stringify({ + error: { + message: 'Scaling up', + code: 'DEPLOYMENT_SCALING_UP', + type: 'error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) }) as unknown as typeof globalThis.fetch - try { - await createFireworksRequestWithFallback({ - body: minimalBody as never, - originalModel: 'z-ai/glm-5.1', - fetch: mockFetch, - logger, - useCustomDeployment: true, - sessionId: 'test-user-id', - }) - - expect(logger.info).toHaveBeenCalledTimes(2) - } finally { - spy.restore() - } + await createFireworksRequestWithFallback({ + body: minimalBody as never, + originalModel: 'moonshotai/kimi-k2.5', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(logger.info).toHaveBeenCalledTimes(2) }) }) }) diff --git a/web/src/llm-api/fireworks-config.ts b/web/src/llm-api/fireworks-config.ts index fb6d595801..6856f3f347 100644 --- a/web/src/llm-api/fireworks-config.ts +++ b/web/src/llm-api/fireworks-config.ts @@ -10,7 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217' export const FIREWORKS_DEPLOYMENT_MAP: Record = { // 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9', - // 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2', + 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/y5b3z17u', // 'minimax/minimax-m2.7': 'accounts/james-65d217/deployments/nrdudqxd', - 'z-ai/glm-5.1': 'accounts/james-65d217/deployments/mjb4i7ea', } diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 6e304638d7..138671c8aa 100644 --- 
a/web/src/llm-api/fireworks.ts +++ b/web/src/llm-api/fireworks.ts @@ -1,5 +1,9 @@ import { Agent } from 'undici' +import { + FREEBUFF_DEPLOYMENT_HOURS_LABEL, + isFreebuffDeploymentHours, +} from '@codebuff/common/constants/freebuff-models' import { PROFIT_MARGIN } from '@codebuff/common/constants/limits' import { getErrorObject } from '@codebuff/common/util/error' import { env } from '@codebuff/internal/env' @@ -38,9 +42,9 @@ const FIREWORKS_MODEL_MAP: Record = { /** Flag to enable custom Fireworks deployments (set to false to use global API only) */ const FIREWORKS_USE_CUSTOM_DEPLOYMENT = true -/** Check if current time is within deployment hours (always enabled) */ -export function isDeploymentHours(_now: Date = new Date()): boolean { - return true +/** Check if current time is within deployment hours: Mon-Fri, 9am ET to 5pm PT. */ +export function isDeploymentHours(now: Date = new Date()): boolean { + return isFreebuffDeploymentHours(now) } /** @@ -173,7 +177,7 @@ const FIREWORKS_PRICING_MAP: Record = { } function getFireworksPricing(model: string): FireworksPricing { - return FIREWORKS_PRICING_MAP[model] ?? FIREWORKS_MODEL_MAP['z-ai/glm-5.1'] + return FIREWORKS_PRICING_MAP[model] ?? FIREWORKS_PRICING_MAP['z-ai/glm-5.1'] } function extractUsageAndCost(usage: Record | undefined | null, model: string): UsageData { @@ -708,9 +712,10 @@ async function parseFireworksError(response: Response): Promise } /** - * Tries the custom Fireworks deployment during business hours (10am–8pm ET), - * falling back to the standard API if the deployment returns 503 DEPLOYMENT_SCALING_UP. - * Outside deployment hours or during cooldown, goes straight to the standard API. + * Uses custom Fireworks deployments only during deployment hours. Deployment + * mapped models never fall back to the serverless API outside hours, during + * cooldown, or after deployment 5xxs; those states surface as provider errors + * so freebuff can offer MiniMax as the always-on option. 
*/ export async function createFireworksRequestWithFallback(params: { body: ChatCompletionRequestBody @@ -719,17 +724,41 @@ export async function createFireworksRequestWithFallback(params: { logger: Logger useCustomDeployment?: boolean sessionId: string + now?: Date }): Promise { const { body, originalModel, fetch, logger, sessionId } = params + const now = params.now ?? new Date() const useCustomDeployment = params.useCustomDeployment ?? FIREWORKS_USE_CUSTOM_DEPLOYMENT const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel] - const shouldTryDeployment = - useCustomDeployment && - deploymentModelId && - isDeploymentHours() && - !isDeploymentCoolingDown() + const hasDeployment = useCustomDeployment && Boolean(deploymentModelId) + + if (hasDeployment && !isDeploymentHours(now)) { + return new Response( + JSON.stringify({ + error: { + message: `${originalModel} is only available during ${FREEBUFF_DEPLOYMENT_HOURS_LABEL}. Use minimax/minimax-m2.7 outside those hours.`, + code: 'DEPLOYMENT_OUTSIDE_HOURS', + type: 'availability_error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) + } - if (shouldTryDeployment) { + if (hasDeployment && isDeploymentCoolingDown()) { + return new Response( + JSON.stringify({ + error: { + message: `${originalModel} deployment is temporarily unavailable. 
Use minimax/minimax-m2.7 while it recovers.`, + code: 'DEPLOYMENT_COOLDOWN', + type: 'availability_error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) + } + + if (hasDeployment && deploymentModelId) { logger.info( { model: originalModel, deploymentModel: deploymentModelId }, 'Trying Fireworks custom deployment', @@ -746,15 +775,18 @@ export async function createFireworksRequestWithFallback(params: { const errorText = await response.text() logger.info( { model: originalModel, status: response.status, errorText: errorText.slice(0, 200) }, - 'Fireworks custom deployment returned 5xx, falling back to standard API', + 'Fireworks custom deployment returned 5xx', ) if (errorText.includes('DEPLOYMENT_SCALING_UP')) { markDeploymentScalingUp() } - // Fall through to standard API request below - } else { - return response + return new Response(errorText, { + status: response.status, + statusText: response.statusText, + headers: response.headers, + }) } + return response } return createFireworksRequest({ body, originalModel, fetch, sessionId }) diff --git a/web/src/server/free-session/__tests__/config.test.ts b/web/src/server/free-session/__tests__/config.test.ts new file mode 100644 index 0000000000..93f5fdcf04 --- /dev/null +++ b/web/src/server/free-session/__tests__/config.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, test } from 'bun:test' + +import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models' + +import { getInstantAdmitCapacity } from '../config' + +describe('free session config', () => { + test('every selectable freebuff model has instant-admit capacity', () => { + for (const model of FREEBUFF_MODELS) { + expect(getInstantAdmitCapacity(model.id)).toBeGreaterThan(0) + } + }) +}) diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts index a824f6d22b..e0e0aa956b 100644 --- a/web/src/server/free-session/__tests__/public-api.test.ts +++ 
b/web/src/server/free-session/__tests__/public-api.test.ts @@ -13,7 +13,7 @@ import type { InternalSessionRow } from '../types' const SESSION_LEN = 60 * 60 * 1000 const GRACE_MS = 30 * 60 * 1000 -const DEFAULT_MODEL = 'z-ai/glm-5.1' +const DEFAULT_MODEL = 'minimax/minimax-m2.7' function makeDeps(overrides: Partial = {}): SessionDeps & { rows: Map @@ -177,19 +177,34 @@ describe('requestSession', () => { expect(state.instanceId).toBe('inst-1') }) + test('deployment-hours-only model is unavailable outside deployment hours', async () => { + const state = await requestSession({ + userId: 'u1', + model: 'moonshotai/kimi-k2.5', + deps, + }) + expect(state).toEqual({ + status: 'model_unavailable', + requestedModel: 'moonshotai/kimi-k2.5', + availableHours: '9am ET-5pm PT', + }) + expect(deps.rows.size).toBe(0) + }) + test('queued response includes a per-model depth snapshot for the selector', async () => { - // Seed 2 users in glm + 1 in minimax so the returned map captures both. + deps._tick(new Date('2026-04-17T16:00:00Z')) + // Seed 2 users in MiniMax + 1 in Kimi so the returned map captures both. await requestSession({ userId: 'u1', model: DEFAULT_MODEL, deps }) deps._tick(new Date(deps._now().getTime() + 1000)) await requestSession({ userId: 'u2', model: DEFAULT_MODEL, deps }) deps._tick(new Date(deps._now().getTime() + 1000)) - await requestSession({ userId: 'u3', model: 'minimax/minimax-m2.7', deps }) + await requestSession({ userId: 'u3', model: 'moonshotai/kimi-k2.5', deps }) const state = await getSessionState({ userId: 'u1', deps }) if (state.status !== 'queued') throw new Error('unreachable') expect(state.queueDepthByModel).toEqual({ [DEFAULT_MODEL]: 2, - 'minimax/minimax-m2.7': 1, + 'moonshotai/kimi-k2.5': 1, }) }) @@ -264,11 +279,12 @@ describe('requestSession', () => { }) test('instant-admit: per-model capacities are independent', async () => { - // GLM saturated at 1 active, MiniMax still has room. + // MiniMax saturated at 1 active, Kimi still has room. 
const admitDeps = makeDeps({ getInstantAdmitCapacity: (model) => model === DEFAULT_MODEL ? 1 : 10, }) + admitDeps._tick(new Date('2026-04-17T16:00:00Z')) await requestSession({ userId: 'u1', model: DEFAULT_MODEL, deps: admitDeps }) const s2 = await requestSession({ userId: 'u2', @@ -277,7 +293,7 @@ describe('requestSession', () => { }) const s3 = await requestSession({ userId: 'u3', - model: 'minimax/minimax-m2.7', + model: 'moonshotai/kimi-k2.5', deps: admitDeps, }) expect(s2.status).toBe('queued') diff --git a/web/src/server/free-session/admission.ts b/web/src/server/free-session/admission.ts index 3f3c051d2a..9f0b74c9f9 100644 --- a/web/src/server/free-session/admission.ts +++ b/web/src/server/free-session/admission.ts @@ -1,4 +1,7 @@ -import { FREEBUFF_MODELS } from '@codebuff/common/constants/freebuff-models' +import { + FREEBUFF_MODELS, + isFreebuffModelAvailable, +} from '@codebuff/common/constants/freebuff-models' import { ADMISSION_TICK_MS, @@ -111,7 +114,10 @@ export async function runAdmissionTick( // advisory locks and a single update each. const perModel = await Promise.all( models.map(async (model) => { - const health = fleet[model] ?? 'healthy' + const isRegisteredModel = FREEBUFF_MODELS.some((m) => m.id === model) + const health = !isRegisteredModel || isFreebuffModelAvailable(model, now) + ? fleet[model] ?? 'healthy' + : 'unhealthy' const { admitted, skipped } = await deps.admitFromQueue({ model, sessionLengthMs: deps.sessionLengthMs, diff --git a/web/src/server/free-session/config.ts b/web/src/server/free-session/config.ts index 85bba7fa6f..c0b4d84c66 100644 --- a/web/src/server/free-session/config.ts +++ b/web/src/server/free-session/config.ts @@ -48,7 +48,7 @@ export function getSessionGraceMs(): number { * queue). 
*/ const INSTANT_ADMIT_CAPACITY: Record = { - 'z-ai/glm-5.1': 50, + 'moonshotai/kimi-k2.5': 100, 'minimax/minimax-m2.7': 200, } diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts index 4505404436..7ea85f2e48 100644 --- a/web/src/server/free-session/public-api.ts +++ b/web/src/server/free-session/public-api.ts @@ -1,4 +1,6 @@ import { + FREEBUFF_DEPLOYMENT_HOURS_LABEL, + isFreebuffModelAvailable, isFreebuffModelId as isSelectableFreebuffModel, resolveFreebuffModel, } from '@codebuff/common/constants/freebuff-models' @@ -122,6 +124,11 @@ export type RequestSessionResult = currentModel: string requestedModel: string } + | { + status: 'model_unavailable' + requestedModel: string + availableHours: string + } /** * Client calls this on CLI startup with the model they want to use. @@ -152,6 +159,7 @@ export async function requestSession(params: { }): Promise { const deps = params.deps ?? defaultDeps const model = resolveFreebuffModel(params.model) + const now = nowOf(deps) if (params.userBanned) { return { status: 'banned' } } @@ -161,13 +169,20 @@ export async function requestSession(params: { ) { return { status: 'disabled' } } + if (!isFreebuffModelAvailable(model, now)) { + return { + status: 'model_unavailable', + requestedModel: model, + availableHours: FREEBUFF_DEPLOYMENT_HOURS_LABEL, + } + } let row: InternalSessionRow try { row = await deps.joinOrTakeOver({ userId: params.userId, model, - now: nowOf(deps), + now, }) } catch (err) { if (err instanceof FreeSessionModelLockedError) { @@ -199,7 +214,7 @@ export async function requestSession(params: { userId: params.userId, model, sessionLengthMs: deps.sessionLengthMs, - now: nowOf(deps), + now, }) if (promoted) row = promoted } From 6043ee25a8afd965ec85f3ae41c6c9412385546c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 24 Apr 2026 12:23:31 -0700 Subject: [PATCH 2/3] switch to kimi k2.6, 9am ET to 5pm PT --- agents/base2/base2.ts | 14 +++---- 
agents/reviewer/code-reviewer-lite.ts | 2 +- agents/types/agent-definition.ts | 4 +- cli/src/hooks/use-freebuff-session.ts | 3 +- common/src/constants/free-agents.ts | 6 +-- common/src/constants/freebuff-models.ts | 6 +-- .../types/agent-definition.ts | 4 +- common/src/types/freebuff-session.ts | 3 +- docs/freebuff-waiting-room.md | 10 ++--- scripts/test-fireworks-cache-intervals.ts | 14 +++---- scripts/test-fireworks-long.ts | 18 ++++----- .../completions/__tests__/completions.test.ts | 28 +++++--------- .../session/__tests__/session.test.ts | 2 +- .../__tests__/fireworks-deployment.test.ts | 38 ++++++++++--------- web/src/llm-api/fireworks-config.ts | 2 +- web/src/llm-api/fireworks.ts | 4 +- .../free-session/__tests__/public-api.test.ts | 10 ++--- web/src/server/free-session/config.ts | 2 +- 18 files changed, 81 insertions(+), 89 deletions(-) diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index c6f7e15f8a..b1e24efff6 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -25,18 +25,16 @@ export function createBase2( const isFree = mode === 'free' || mode === 'lite' const isSonnet = false - const model = isFree ? 'minimax/minimax-m2.7' : 'anthropic/claude-opus-4.7' + const model = isFree ? 'moonshotai/kimi-k2.6' : 'anthropic/claude-opus-4.7' return { publisher, model, - providerOptions: isFree - ? { - data_collection: 'deny', - } - : { - only: ['amazon-bedrock'], - }, + providerOptions: isFree ? 
{
+    data_collection: 'deny',
+  } : {
+    only: ['amazon-bedrock'],
+  },
   displayName: 'Buffy the Orchestrator',
   spawnerPrompt:
     'Advanced base agent that orchestrates planning, editing, and reviewing for complex coding tasks',
diff --git a/agents/reviewer/code-reviewer-lite.ts b/agents/reviewer/code-reviewer-lite.ts
index ee017c24e6..888cadf4f7 100644
--- a/agents/reviewer/code-reviewer-lite.ts
+++ b/agents/reviewer/code-reviewer-lite.ts
@@ -5,7 +5,7 @@ import { createReviewer } from './code-reviewer'
 const definition: SecretAgentDefinition = {
   id: 'code-reviewer-lite',
   publisher,
-  ...createReviewer('minimax/minimax-m2.7'),
+  ...createReviewer('moonshotai/kimi-k2.6'),
 }
 
 export default definition
diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts
index b28a77c311..2fbfed0a49 100644
--- a/agents/types/agent-definition.ts
+++ b/agents/types/agent-definition.ts
@@ -423,8 +423,8 @@ export type ModelName =
   // Other open source models
   | 'moonshotai/kimi-k2'
   | 'moonshotai/kimi-k2:nitro'
-  | 'moonshotai/kimi-k2.5'
-  | 'moonshotai/kimi-k2.5:nitro'
+  | 'moonshotai/kimi-k2.6'
+  | 'moonshotai/kimi-k2.6:nitro'
   | 'z-ai/glm-5'
   | 'z-ai/glm-5.1'
   | 'z-ai/glm-4.6'
diff --git a/cli/src/hooks/use-freebuff-session.ts b/cli/src/hooks/use-freebuff-session.ts
index 225eee2b24..f24fba7b30 100644
--- a/cli/src/hooks/use-freebuff-session.ts
+++ b/cli/src/hooks/use-freebuff-session.ts
@@ -78,7 +78,8 @@ async function callSession(
   }
   // 409 from POST means the selected model cannot be joined right now, either
   // because an active session is locked to another model or because a
-  // deployment-hours-only model is closed. Surface both as non-throw states.
+  // deployment-hours-only model is temporarily closed. Surface model-switch
+  // conflicts and availability closures as non-throw states.
if (resp.status === 409 && method === 'POST') { const body = (await resp.json().catch(() => null)) as | FreebuffSessionResponse diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts index 762202dcca..4a2a4a147e 100644 --- a/common/src/constants/free-agents.ts +++ b/common/src/constants/free-agents.ts @@ -28,7 +28,7 @@ export const FREE_MODE_AGENT_MODELS: Record> = { // Root orchestrator 'base2-free': new Set([ 'minimax/minimax-m2.7', - 'moonshotai/kimi-k2.5', + 'moonshotai/kimi-k2.6', ]), // File exploration agents @@ -46,13 +46,13 @@ export const FREE_MODE_AGENT_MODELS: Record> = { // Editor for free mode 'editor-lite': new Set([ 'minimax/minimax-m2.7', - 'moonshotai/kimi-k2.5', + 'moonshotai/kimi-k2.6', ]), // Code reviewer for free mode 'code-reviewer-lite': new Set([ 'minimax/minimax-m2.7', - 'moonshotai/kimi-k2.5', + 'moonshotai/kimi-k2.6', ]), } diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts index d38d187ffc..2f6da2ce0b 100644 --- a/common/src/constants/freebuff-models.ts +++ b/common/src/constants/freebuff-models.ts @@ -18,7 +18,7 @@ export interface FreebuffModelOption { } export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT' -export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.5' +export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.6' export const FREEBUFF_MODELS = [ { @@ -29,7 +29,7 @@ export const FREEBUFF_MODELS = [ }, { id: FREEBUFF_KIMI_MODEL_ID, - displayName: 'Kimi K2.5', + displayName: 'Kimi K2.6', tagline: 'Balanced', availability: 'deployment_hours', }, @@ -83,7 +83,7 @@ export function isFreebuffDeploymentHours(now: Date = new Date()): boolean { const eastern = getZonedParts(now, 'America/New_York') const pacific = getZonedParts(now, 'America/Los_Angeles') if (eastern.weekday === 'Sat' || eastern.weekday === 'Sun') return false - return eastern.minutes >= 9 * 60 && pacific.minutes < 24 * 60 + return eastern.minutes >= 9 * 60 && pacific.minutes < 
17 * 60 } export function isFreebuffModelAvailable( diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts index b28a77c311..2fbfed0a49 100644 --- a/common/src/templates/initial-agents-dir/types/agent-definition.ts +++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts @@ -423,8 +423,8 @@ export type ModelName = // Other open source models | 'moonshotai/kimi-k2' | 'moonshotai/kimi-k2:nitro' - | 'moonshotai/kimi-k2.5' - | 'moonshotai/kimi-k2.5:nitro' + | 'moonshotai/kimi-k2.6' + | 'moonshotai/kimi-k2.6:nitro' | 'z-ai/glm-5' | 'z-ai/glm-5.1' | 'z-ai/glm-4.6' diff --git a/common/src/types/freebuff-session.ts b/common/src/types/freebuff-session.ts index 43cd3eaa25..d141000a40 100644 --- a/common/src/types/freebuff-session.ts +++ b/common/src/types/freebuff-session.ts @@ -93,8 +93,7 @@ export type FreebuffSessionServerResponse = requestedModel: string } | { - /** Requested model is valid but not selectable right now. Currently - * used for deployment-hours-only models such as Kimi K2.5. */ + /** Requested model is valid but not selectable right now. */ status: 'model_unavailable' requestedModel: string availableHours: string diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md index 73fa779270..153487897a 100644 --- a/docs/freebuff-waiting-room.md +++ b/docs/freebuff-waiting-room.md @@ -5,7 +5,7 @@ The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs: 1. **Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs one tick (default `ADMISSION_TICK_MS`, 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones. -2. 
**Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; Kimi K2.5 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available. +2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; Kimi K2.6 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available. 3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput. Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session. @@ -149,8 +149,8 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r | Constant | Location | Default | Purpose | |---|---|---|---| | `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. | -| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `minimax-m2.7`, `kimi-k2.5` | Selectable models; each gets its own queue and admission slot. | -| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `kimi-k2.5` | Models with dedicated Fireworks deployments. 
Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | +| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `minimax-m2.7`, `kimi-k2.6` | Selectable models; each gets its own queue and admission slot. | +| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `kimi-k2.6` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | | `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. | | `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime | | `FREEBUFF_SESSION_GRACE_MS` | env | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. | @@ -185,7 +185,7 @@ Response shapes: "queueDepth": 43, // size of this model's queue "queueDepthByModel": { // snapshot of every model's queue — powers the "minimax/minimax-m2.7": 43, // "N ahead" hint in the selector. Missing - "moonshotai/kimi-k2.5": 4 // entries should be treated as 0. + "moonshotai/kimi-k2.6": 4 // entries should be treated as 0. }, "estimatedWaitMs": 384000, "queuedAt": "2026-04-17T12:00:00Z" @@ -285,7 +285,7 @@ waitMs = (position - 1) * 24_000 - Position 1 → 0 (next tick admits you) - Position 2 → 24s, and so on. -`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `moonshotai/kimi-k2.5` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. 
Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a Kimi Fireworks incident or outside 9am ET-5pm PT, only Kimi's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter. +`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `moonshotai/kimi-k2.6` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a Kimi Fireworks incident or outside 9am ET-5pm PT, only Kimi's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter. ## CLI Integration (frontend-side contract) diff --git a/scripts/test-fireworks-cache-intervals.ts b/scripts/test-fireworks-cache-intervals.ts index 92d7ac49e3..44bffd4b75 100644 --- a/scripts/test-fireworks-cache-intervals.ts +++ b/scripts/test-fireworks-cache-intervals.ts @@ -13,7 +13,7 @@ * * Models: * glm-5.1 (default) — z-ai/glm-5.1 - * kimi-k2.5 — moonshotai/kimi-k2.5 + * kimi-k2.6 — moonshotai/kimi-k2.6 * minimax — minimax/minimax-m2.5 * * Flags: @@ -26,10 +26,10 @@ * bun scripts/test-fireworks-cache-intervals.ts * * # Custom Kimi deployment with a faster sweep - * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.5 --deployment --intervals=30,60,120,300,600 + * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.6 --deployment --intervals=30,60,120,300,600 * * # Long sweep up to 1 hour - * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.5 --deployment --intervals=60,300,600,1200,1800,2700,3600 + * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.6 --deployment --intervals=60,300,600,1200,1800,2700,3600 */ export {} @@ -53,10 +53,10 @@ const MODEL_CONFIGS: Record = { cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.4 / 
1_000_000, }, - 'kimi-k2.5': { - id: 'moonshotai/kimi-k2.5', - standardModel: 'accounts/fireworks/models/kimi-k2p5', - deploymentModel: 'accounts/james-65d217/deployments/y5b3z17u', + 'kimi-k2.6': { + id: 'moonshotai/kimi-k2.6', + standardModel: 'accounts/fireworks/models/kimi-k2p6', + deploymentModel: 'accounts/james-65d217/deployments/j8ar2x0y', inputCostPerToken: 0.6 / 1_000_000, cachedInputCostPerToken: 0.1 / 1_000_000, outputCostPerToken: 3.0 / 1_000_000, diff --git a/scripts/test-fireworks-long.ts b/scripts/test-fireworks-long.ts index e506ccf022..45561fbc42 100644 --- a/scripts/test-fireworks-long.ts +++ b/scripts/test-fireworks-long.ts @@ -11,7 +11,7 @@ * * Models: * glm-5.1 (default) — z-ai/glm-5.1 - * kimi-k2.5 — moonshotai/kimi-k2.5 + * kimi-k2.6 — moonshotai/kimi-k2.6 * minimax — minimax/minimax-m2.5 * minimax-m2.7 — minimax/minimax-m2.7 * @@ -19,7 +19,7 @@ * --deployment Use custom deployment instead of serverless (standard API) * Serverless is the default * Examples: - * bun scripts/test-fireworks-long.ts kimi-k2.5 --deployment + * bun scripts/test-fireworks-long.ts kimi-k2.6 --deployment */ import { FIREWORKS_DEPLOYMENT_MAP } from '../web/src/llm-api/fireworks-config' @@ -45,10 +45,10 @@ const MODEL_CONFIGS: Record = { cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.40 / 1_000_000, }, - 'kimi-k2.5': { - id: 'moonshotai/kimi-k2.5', - standardModel: 'accounts/fireworks/models/kimi-k2p5', - deploymentModel: FIREWORKS_DEPLOYMENT_MAP['moonshotai/kimi-k2.5'], + 'kimi-k2.6': { + id: 'moonshotai/kimi-k2.6', + standardModel: 'accounts/fireworks/models/kimi-k2p6', + deploymentModel: FIREWORKS_DEPLOYMENT_MAP['moonshotai/kimi-k2.6'], inputCostPerToken: 0.60 / 1_000_000, cachedInputCostPerToken: 0.10 / 1_000_000, outputCostPerToken: 3.00 / 1_000_000, @@ -75,9 +75,9 @@ const DEFAULT_MODEL = 'glm-5.1' const MODEL_ALIASES: Record = { glm: 'glm-5.1', 'z-ai/glm-5.1': 'glm-5.1', - kimi: 'kimi-k2.5', - 'kimi-k2': 'kimi-k2.5', - 'moonshotai/kimi-k2.5': 
'kimi-k2.5', + kimi: 'kimi-k2.6', + 'kimi-k2': 'kimi-k2.6', + 'moonshotai/kimi-k2.6': 'kimi-k2.6', 'minimax/minimax-m2.5': 'minimax', 'minimax/minimax-m2.7': 'minimax-m2.7', } diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index 04f9b570cd..5f4490ff2a 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -1,8 +1,6 @@ import { afterEach, beforeEach, describe, expect, mock, it } from 'bun:test' import { NextRequest } from 'next/server' -import { isFreebuffDeploymentHours } from '@codebuff/common/constants/freebuff-models' - import { formatQuotaResetCountdown, postChatCompletions } from '../_post' import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics' @@ -557,7 +555,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { expect(response.status).toBe(200) }) - it('lets freebuff use Kimi K2.5 through Fireworks availability rules', async () => { + it('lets freebuff use Kimi K2.6 through Fireworks availability rules', async () => { const fetchedBodies: Record[] = [] const fetchViaFireworks = mock( async (_url: string | URL | Request, init?: RequestInit) => { @@ -565,7 +563,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { return new Response( JSON.stringify({ id: 'test-id', - model: 'accounts/james-65d217/deployments/y5b3z17u', + model: 'accounts/james-65d217/deployments/j8ar2x0y', choices: [{ message: { content: 'test response' } }], usage: { prompt_tokens: 10, @@ -587,7 +585,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-new-free' }, body: JSON.stringify({ - model: 'moonshotai/kimi-k2.5', + model: 'moonshotai/kimi-k2.6', stream: false, codebuff_metadata: { run_id: 'run-free', @@ -612,19 +610,13 @@ describe('/api/v1/chat/completions POST endpoint', () 
=> { }) const body = await response.json() - if (isFreebuffDeploymentHours()) { - expect(response.status).toBe(200) - expect(fetchedBodies).toHaveLength(1) - expect(fetchedBodies[0].model).toBe( - 'accounts/james-65d217/deployments/y5b3z17u', - ) - expect(body.model).toBe('moonshotai/kimi-k2.5') - expect(body.provider).toBe('Fireworks') - } else { - expect(response.status).toBe(503) - expect(fetchedBodies).toHaveLength(0) - expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS') - } + expect(response.status).toBe(200) + expect(fetchedBodies).toHaveLength(1) + expect(fetchedBodies[0].model).toBe( + 'accounts/james-65d217/deployments/j8ar2x0y', + ) + expect(body.model).toBe('moonshotai/kimi-k2.6') + expect(body.provider).toBe('Fireworks') }) it('skips credit check when in FREE mode even with 0 credits', async () => { diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts index 2d33a1ae09..bbe31b64e0 100644 --- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts +++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts @@ -158,7 +158,7 @@ describe('POST /api/v1/freebuff/session', () => { test('returns model_unavailable for Kimi outside deployment hours', async () => { const sessionDeps = makeSessionDeps() const resp = await postFreebuffSession( - makeReq('ok', { model: 'moonshotai/kimi-k2.5' }), + makeReq('ok', { model: 'moonshotai/kimi-k2.6' }), makeDeps(sessionDeps, 'u1'), ) expect(resp.status).toBe(409) diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts index 99078f5284..7e213e9e66 100644 --- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts +++ b/web/src/llm-api/__tests__/fireworks-deployment.test.ts @@ -11,11 +11,12 @@ import { import type { Logger } from '@codebuff/common/types/contracts/logger' -const STANDARD_MODEL_ID = 'accounts/fireworks/models/kimi-k2p5' -const 
DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/y5b3z17u' +const STANDARD_MODEL_ID = 'accounts/fireworks/models/kimi-k2p6' +const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/j8ar2x0y' const IN_DEPLOYMENT_HOURS = new Date('2026-04-17T16:00:00Z') // Friday, 12pm ET / 9am PT const BEFORE_DEPLOYMENT_HOURS = new Date('2026-04-17T12:59:00Z') // Friday, 8:59am ET const AFTER_DEPLOYMENT_HOURS = new Date('2026-04-18T00:00:00Z') // Friday, 5pm PT +const WEEKDAY_AFTER_DEPLOYMENT_HOURS = new Date('2026-04-21T00:01:00Z') // Monday, 5:01pm PT const WEEKEND_DEPLOYMENT_HOURS = new Date('2026-04-18T16:00:00Z') // Saturday function createMockLogger(): Logger { @@ -33,6 +34,7 @@ describe('Fireworks deployment routing', () => { expect(isDeploymentHours(BEFORE_DEPLOYMENT_HOURS)).toBe(false) expect(isDeploymentHours(IN_DEPLOYMENT_HOURS)).toBe(true) expect(isDeploymentHours(AFTER_DEPLOYMENT_HOURS)).toBe(false) + expect(isDeploymentHours(WEEKDAY_AFTER_DEPLOYMENT_HOURS)).toBe(false) }) it('is inactive on weekends', () => { @@ -83,7 +85,7 @@ describe('Fireworks deployment routing', () => { }) const minimalBody = { - model: 'moonshotai/kimi-k2.5', + model: 'moonshotai/kimi-k2.6', messages: [{ role: 'user' as const, content: 'test' }], } @@ -98,7 +100,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: false, @@ -121,7 +123,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, @@ -154,7 +156,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - 
originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, @@ -187,7 +189,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, @@ -214,7 +216,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, @@ -239,7 +241,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, @@ -283,7 +285,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, @@ -310,7 +312,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, @@ -338,7 +340,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { enabled: true }, } as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: false, @@ -364,7 +366,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { effort: 'high' }, } as never, - 
originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: false, @@ -390,7 +392,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { enabled: false, effort: 'high' }, } as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: false, @@ -417,7 +419,7 @@ describe('Fireworks deployment routing', () => { reasoning: { effort: 'high' }, tools: [{ type: 'function', function: { name: 'test', arguments: '{}' } }], } as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: false, @@ -443,7 +445,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning_effort: 'low', } as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: false, @@ -469,7 +471,7 @@ describe('Fireworks deployment routing', () => { reasoning_effort: 'low', tools: [{ type: 'function', function: { name: 'test', arguments: '{}' } }], } as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: false, @@ -496,7 +498,7 @@ describe('Fireworks deployment routing', () => { await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.5', + originalModel: 'moonshotai/kimi-k2.6', fetch: mockFetch, logger, useCustomDeployment: true, diff --git a/web/src/llm-api/fireworks-config.ts b/web/src/llm-api/fireworks-config.ts index 6856f3f347..ff08822426 100644 --- a/web/src/llm-api/fireworks-config.ts +++ b/web/src/llm-api/fireworks-config.ts @@ -10,6 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217' export const FIREWORKS_DEPLOYMENT_MAP: Record = { // 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9', - 
'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/y5b3z17u', + 'moonshotai/kimi-k2.6': 'accounts/james-65d217/deployments/j8ar2x0y', // 'minimax/minimax-m2.7': 'accounts/james-65d217/deployments/nrdudqxd', } diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 138671c8aa..96d3510917 100644 --- a/web/src/llm-api/fireworks.ts +++ b/web/src/llm-api/fireworks.ts @@ -36,7 +36,7 @@ const FIREWORKS_MODEL_MAP: Record = { 'minimax/minimax-m2.5': 'accounts/fireworks/models/minimax-m2p5', 'minimax/minimax-m2.7': 'accounts/fireworks/models/minimax-m2p7', 'z-ai/glm-5.1': 'accounts/fireworks/models/glm-5p1', - 'moonshotai/kimi-k2.5': 'accounts/fireworks/models/kimi-k2p5', + 'moonshotai/kimi-k2.6': 'accounts/fireworks/models/kimi-k2p6', } /** Flag to enable custom Fireworks deployments (set to false to use global API only) */ @@ -169,7 +169,7 @@ const FIREWORKS_PRICING_MAP: Record = { cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.40 / 1_000_000, }, - 'moonshotai/kimi-k2.5': { + 'moonshotai/kimi-k2.6': { inputCostPerToken: 0.60 / 1_000_000, cachedInputCostPerToken: 0.10 / 1_000_000, outputCostPerToken: 3.00 / 1_000_000, diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts index e0e0aa956b..0a8b0744b9 100644 --- a/web/src/server/free-session/__tests__/public-api.test.ts +++ b/web/src/server/free-session/__tests__/public-api.test.ts @@ -180,12 +180,12 @@ describe('requestSession', () => { test('deployment-hours-only model is unavailable outside deployment hours', async () => { const state = await requestSession({ userId: 'u1', - model: 'moonshotai/kimi-k2.5', + model: 'moonshotai/kimi-k2.6', deps, }) expect(state).toEqual({ status: 'model_unavailable', - requestedModel: 'moonshotai/kimi-k2.5', + requestedModel: 'moonshotai/kimi-k2.6', availableHours: '9am ET-5pm PT', }) expect(deps.rows.size).toBe(0) @@ -198,13 +198,13 @@ describe('requestSession', () => 
{ deps._tick(new Date(deps._now().getTime() + 1000)) await requestSession({ userId: 'u2', model: DEFAULT_MODEL, deps }) deps._tick(new Date(deps._now().getTime() + 1000)) - await requestSession({ userId: 'u3', model: 'moonshotai/kimi-k2.5', deps }) + await requestSession({ userId: 'u3', model: 'moonshotai/kimi-k2.6', deps }) const state = await getSessionState({ userId: 'u1', deps }) if (state.status !== 'queued') throw new Error('unreachable') expect(state.queueDepthByModel).toEqual({ [DEFAULT_MODEL]: 2, - 'moonshotai/kimi-k2.5': 1, + 'moonshotai/kimi-k2.6': 1, }) }) @@ -293,7 +293,7 @@ describe('requestSession', () => { }) const s3 = await requestSession({ userId: 'u3', - model: 'moonshotai/kimi-k2.5', + model: 'moonshotai/kimi-k2.6', deps: admitDeps, }) expect(s2.status).toBe('queued') diff --git a/web/src/server/free-session/config.ts b/web/src/server/free-session/config.ts index c0b4d84c66..7d1c16c1f1 100644 --- a/web/src/server/free-session/config.ts +++ b/web/src/server/free-session/config.ts @@ -48,7 +48,7 @@ export function getSessionGraceMs(): number { * queue). 
*/ const INSTANT_ADMIT_CAPACITY: Record = { - 'moonshotai/kimi-k2.5': 100, + 'moonshotai/kimi-k2.6': 100, 'minimax/minimax-m2.7': 200, } From 8a2ae6bd2eeb959d22e02a6cb2c5729568231a75 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 24 Apr 2026 14:58:18 -0700 Subject: [PATCH 3/3] feat: replace Kimi K2.6 with GLM 5.1 as freebuff deployment-hours model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Switch base2-free, editor-lite, code-reviewer-lite agents from kimi-k2.6 to z-ai/glm-5.1 - Update FREEBUFF_KIMI_MODEL_ID → FREEBUFF_GLM_MODEL_ID constant - Update Fireworks deployment map (mjb4i7ea), model map, and pricing - Remove moonshotai/kimi-k2.6 and kimi-k2.6:nitro from ModelName type - Update freebuff model selector to show GLM first with 'Smartest' tagline - Update all test files with new model IDs and deployment IDs - Update docs and scripts to reference GLM instead of Kimi --- agents/base2/base2.ts | 2 +- agents/editor/editor-lite.ts | 2 +- agents/reviewer/code-reviewer-lite.ts | 2 +- agents/types/agent-definition.ts | 2 -- .../components/freebuff-model-selector.tsx | 18 +++++++--- common/src/constants/free-agents.ts | 6 ++-- common/src/constants/freebuff-models.ts | 8 ++--- .../types/agent-definition.ts | 2 -- docs/freebuff-waiting-room.md | 12 +++---- scripts/test-fireworks-cache-intervals.ts | 16 +++------ scripts/test-fireworks-long.ts | 15 ++------ .../completions/__tests__/completions.test.ts | 27 ++++++++------ .../session/__tests__/session.test.ts | 4 +-- .../__tests__/fireworks-deployment.test.ts | 36 +++++++++---------- web/src/llm-api/fireworks-config.ts | 2 +- web/src/llm-api/fireworks.ts | 8 +---- .../free-session/__tests__/public-api.test.ts | 14 ++++---- web/src/server/free-session/config.ts | 2 +- 18 files changed, 82 insertions(+), 96 deletions(-) diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index b1e24efff6..1a81f948bf 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts 
@@ -25,7 +25,7 @@ export function createBase2( const isFree = mode === 'free' || mode === 'lite' const isSonnet = false - const model = isFree ? 'moonshotai/kimi-k2.6' : 'anthropic/claude-opus-4.7' + const model = isFree ? 'z-ai/glm-5.1' : 'anthropic/claude-opus-4.7' return { publisher, diff --git a/agents/editor/editor-lite.ts b/agents/editor/editor-lite.ts index 9cb5675b5e..29225f0c29 100644 --- a/agents/editor/editor-lite.ts +++ b/agents/editor/editor-lite.ts @@ -3,7 +3,7 @@ import { createCodeEditor } from './editor' import type { AgentDefinition } from '../types/agent-definition' const definition: AgentDefinition = { - ...createCodeEditor({ model: 'minimax' }), + ...createCodeEditor({ model: 'glm' }), id: 'editor-lite', } export default definition diff --git a/agents/reviewer/code-reviewer-lite.ts b/agents/reviewer/code-reviewer-lite.ts index 888cadf4f7..feafb87c45 100644 --- a/agents/reviewer/code-reviewer-lite.ts +++ b/agents/reviewer/code-reviewer-lite.ts @@ -5,7 +5,7 @@ import { createReviewer } from './code-reviewer' const definition: SecretAgentDefinition = { id: 'code-reviewer-lite', publisher, - ...createReviewer('moonshotai/kimi-k2.6'), + ...createReviewer('z-ai/glm-5.1'), } export default definition diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts index 2fbfed0a49..3608f36315 100644 --- a/agents/types/agent-definition.ts +++ b/agents/types/agent-definition.ts @@ -423,8 +423,6 @@ export type ModelName = // Other open source models | 'moonshotai/kimi-k2' | 'moonshotai/kimi-k2:nitro' - | 'moonshotai/kimi-k2.6' - | 'moonshotai/kimi-k2.6:nitro' | 'z-ai/glm-5' | 'z-ai/glm-5.1' | 'z-ai/glm-4.6' diff --git a/cli/src/components/freebuff-model-selector.tsx b/cli/src/components/freebuff-model-selector.tsx index 1ba966fd22..5abaac2724 100644 --- a/cli/src/components/freebuff-model-selector.tsx +++ b/cli/src/components/freebuff-model-selector.tsx @@ -6,6 +6,7 @@ import { Button } from './button' import { DEFAULT_FREEBUFF_MODEL_ID, 
FREEBUFF_DEPLOYMENT_HOURS_LABEL, + FREEBUFF_GLM_MODEL_ID, FREEBUFF_MODELS, isFreebuffModelAvailable, } from '@codebuff/common/constants/freebuff-models' @@ -19,6 +20,11 @@ import { useTheme } from '../hooks/use-theme' import type { KeyEvent } from '@opentui/core' +const FREEBUFF_MODEL_SELECTOR_MODELS = [ + ...FREEBUFF_MODELS.filter((model) => model.id === FREEBUFF_GLM_MODEL_ID), + ...FREEBUFF_MODELS.filter((model) => model.id !== FREEBUFF_GLM_MODEL_ID), +] + /** * Dual-purpose model picker: * - Pre-chat landing (session 'none'): user hasn't joined any queue. Picking @@ -109,7 +115,7 @@ export const FreebuffModelSelector: React.FC = () => { const stackVertically = useMemo(() => { const BUTTON_CHROME = 4 // 2 border + 2 padding const GAP = 2 - const total = FREEBUFF_MODELS.reduce((sum, model, idx) => { + const total = FREEBUFF_MODEL_SELECTOR_MODELS.reduce((sum, model, idx) => { const inner = 2 /* indicator + space */ + model.displayName.length + @@ -167,13 +173,15 @@ export const FreebuffModelSelector: React.FC = () => { } return } - const currentIdx = FREEBUFF_MODELS.findIndex((m) => m.id === focusedId) + const currentIdx = FREEBUFF_MODEL_SELECTOR_MODELS.findIndex( + (m) => m.id === focusedId, + ) if (currentIdx === -1) return - const len = FREEBUFF_MODELS.length + const len = FREEBUFF_MODEL_SELECTOR_MODELS.length const nextIdx = isForward ? (currentIdx + 1) % len : (currentIdx - 1 + len) % len - const target = FREEBUFF_MODELS[nextIdx] + const target = FREEBUFF_MODEL_SELECTOR_MODELS[nextIdx] if (target) { key.preventDefault?.() setFocusedId(target.id) @@ -198,7 +206,7 @@ export const FreebuffModelSelector: React.FC = () => { alignItems: 'flex-start', }} > - {FREEBUFF_MODELS.map((model) => { + {FREEBUFF_MODEL_SELECTOR_MODELS.map((model) => { // 'Selected' means the dot is filled and the label is bold. On the // landing screen ('none') this tracks the pre-focused pick; on the // queued screen it tracks the model the server has us on. 
Either diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts index 4a2a4a147e..308e12df6d 100644 --- a/common/src/constants/free-agents.ts +++ b/common/src/constants/free-agents.ts @@ -28,7 +28,7 @@ export const FREE_MODE_AGENT_MODELS: Record> = { // Root orchestrator 'base2-free': new Set([ 'minimax/minimax-m2.7', - 'moonshotai/kimi-k2.6', + 'z-ai/glm-5.1', ]), // File exploration agents @@ -46,13 +46,13 @@ export const FREE_MODE_AGENT_MODELS: Record> = { // Editor for free mode 'editor-lite': new Set([ 'minimax/minimax-m2.7', - 'moonshotai/kimi-k2.6', + 'z-ai/glm-5.1', ]), // Code reviewer for free mode 'code-reviewer-lite': new Set([ 'minimax/minimax-m2.7', - 'moonshotai/kimi-k2.6', + 'z-ai/glm-5.1', ]), } diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts index 2f6da2ce0b..f1019c6fbf 100644 --- a/common/src/constants/freebuff-models.ts +++ b/common/src/constants/freebuff-models.ts @@ -18,7 +18,7 @@ export interface FreebuffModelOption { } export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT' -export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.6' +export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1' export const FREEBUFF_MODELS = [ { @@ -28,9 +28,9 @@ export const FREEBUFF_MODELS = [ availability: 'always', }, { - id: FREEBUFF_KIMI_MODEL_ID, - displayName: 'Kimi K2.6', - tagline: 'Balanced', + id: FREEBUFF_GLM_MODEL_ID, + displayName: 'GLM 5.1', + tagline: 'Smartest', availability: 'deployment_hours', }, ] as const satisfies readonly FreebuffModelOption[] diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts index 2fbfed0a49..3608f36315 100644 --- a/common/src/templates/initial-agents-dir/types/agent-definition.ts +++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts @@ -423,8 +423,6 @@ export type ModelName = // Other open source models | 'moonshotai/kimi-k2' | 
'moonshotai/kimi-k2:nitro' - | 'moonshotai/kimi-k2.6' - | 'moonshotai/kimi-k2.6:nitro' | 'z-ai/glm-5' | 'z-ai/glm-5.1' | 'z-ai/glm-4.6' diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md index 153487897a..353bfb046b 100644 --- a/docs/freebuff-waiting-room.md +++ b/docs/freebuff-waiting-room.md @@ -5,7 +5,7 @@ The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs: 1. **Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs one tick (default `ADMISSION_TICK_MS`, 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones. -2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; Kimi K2.6 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available. +2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; GLM 5.1 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available. 3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput. Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. 
Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session. @@ -149,8 +149,8 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r | Constant | Location | Default | Purpose | |---|---|---|---| | `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. | -| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `minimax-m2.7`, `kimi-k2.6` | Selectable models; each gets its own queue and admission slot. | -| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `kimi-k2.6` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | +| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `minimax-m2.7`, `glm-5.1` | Selectable models; each gets its own queue and admission slot. | +| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `glm-5.1` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. | | `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. | | `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime | | `FREEBUFF_SESSION_GRACE_MS` | env | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. | @@ -185,7 +185,7 @@ Response shapes: "queueDepth": 43, // size of this model's queue "queueDepthByModel": { // snapshot of every model's queue — powers the "minimax/minimax-m2.7": 43, // "N ahead" hint in the selector. 
Missing - "moonshotai/kimi-k2.6": 4 // entries should be treated as 0. + "z-ai/glm-5.1": 4 // entries should be treated as 0. }, "estimatedWaitMs": 384000, "queuedAt": "2026-04-17T12:00:00Z" @@ -285,7 +285,7 @@ waitMs = (position - 1) * 24_000 - Position 1 → 0 (next tick admits you) - Position 2 → 24s, and so on. -`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `moonshotai/kimi-k2.6` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a Kimi Fireworks incident or outside 9am ET-5pm PT, only Kimi's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter. +`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a GLM Fireworks incident or outside 9am ET-5pm PT, only GLM's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter. ## CLI Integration (frontend-side contract) @@ -324,7 +324,7 @@ The `disabled` response means the server has the waiting room turned off. CLI tr | Spamming POST/GET to starve admission tick | Admission uses per-model Postgres advisory locks; DDoS protection is upstream (Next's global rate limits). Consider adding a per-user limiter on `/session` if traffic warrants. | | Repeatedly POSTing different models to get across every queue | Single row per user (PK on `user_id`); switching models moves the row, never clones it. 
A user holds exactly one queue slot at any time. | | Fireworks metrics endpoint down / slow | `getFleetHealth()` fails closed (timeout, non-OK, or missing API key) → every dedicated-deployment model is flagged `unhealthy` and its queue pauses. | -| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded Kimi deployment doesn't block MiniMax admissions. | +| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded GLM deployment doesn't block MiniMax admissions. | | Zombie expired sessions holding capacity | Swept on every admission tick, even when upstream is unhealthy | ## Testing diff --git a/scripts/test-fireworks-cache-intervals.ts b/scripts/test-fireworks-cache-intervals.ts index 44bffd4b75..8d4e867406 100644 --- a/scripts/test-fireworks-cache-intervals.ts +++ b/scripts/test-fireworks-cache-intervals.ts @@ -13,7 +13,6 @@ * * Models: * glm-5.1 (default) — z-ai/glm-5.1 - * kimi-k2.6 — moonshotai/kimi-k2.6 * minimax — minimax/minimax-m2.5 * * Flags: @@ -25,11 +24,11 @@ * # Default glm-5.1 serverless with default intervals * bun scripts/test-fireworks-cache-intervals.ts * - * # Custom Kimi deployment with a faster sweep - * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.6 --deployment --intervals=30,60,120,300,600 + * # Custom GLM deployment with a faster sweep + * bun scripts/test-fireworks-cache-intervals.ts glm-5.1 --deployment --intervals=30,60,120,300,600 * * # Long sweep up to 1 hour - * bun scripts/test-fireworks-cache-intervals.ts kimi-k2.6 --deployment --intervals=60,300,600,1200,1800,2700,3600 + * bun scripts/test-fireworks-cache-intervals.ts glm-5.1 --deployment --intervals=60,300,600,1200,1800,2700,3600 */ export {} @@ -49,18 +48,11 @@ const MODEL_CONFIGS: Record = { 'glm-5.1': { id: 'z-ai/glm-5.1', standardModel: 'accounts/fireworks/models/glm-5p1', + deploymentModel: 
'accounts/james-65d217/deployments/mjb4i7ea', inputCostPerToken: 1.4 / 1_000_000, cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.4 / 1_000_000, }, - 'kimi-k2.6': { - id: 'moonshotai/kimi-k2.6', - standardModel: 'accounts/fireworks/models/kimi-k2p6', - deploymentModel: 'accounts/james-65d217/deployments/j8ar2x0y', - inputCostPerToken: 0.6 / 1_000_000, - cachedInputCostPerToken: 0.1 / 1_000_000, - outputCostPerToken: 3.0 / 1_000_000, - }, minimax: { id: 'minimax/minimax-m2.5', standardModel: 'accounts/fireworks/models/minimax-m2p5', diff --git a/scripts/test-fireworks-long.ts b/scripts/test-fireworks-long.ts index 45561fbc42..a1e4950f8f 100644 --- a/scripts/test-fireworks-long.ts +++ b/scripts/test-fireworks-long.ts @@ -11,7 +11,6 @@ * * Models: * glm-5.1 (default) — z-ai/glm-5.1 - * kimi-k2.6 — moonshotai/kimi-k2.6 * minimax — minimax/minimax-m2.5 * minimax-m2.7 — minimax/minimax-m2.7 * @@ -19,7 +18,7 @@ * --deployment Use custom deployment instead of serverless (standard API) * Serverless is the default * Examples: - * bun scripts/test-fireworks-long.ts kimi-k2.6 --deployment + * bun scripts/test-fireworks-long.ts glm-5.1 --deployment */ import { FIREWORKS_DEPLOYMENT_MAP } from '../web/src/llm-api/fireworks-config' @@ -41,18 +40,11 @@ const MODEL_CONFIGS: Record = { 'glm-5.1': { id: 'z-ai/glm-5.1', standardModel: 'accounts/fireworks/models/glm-5p1', + deploymentModel: FIREWORKS_DEPLOYMENT_MAP['z-ai/glm-5.1'], inputCostPerToken: 1.40 / 1_000_000, cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.40 / 1_000_000, }, - 'kimi-k2.6': { - id: 'moonshotai/kimi-k2.6', - standardModel: 'accounts/fireworks/models/kimi-k2p6', - deploymentModel: FIREWORKS_DEPLOYMENT_MAP['moonshotai/kimi-k2.6'], - inputCostPerToken: 0.60 / 1_000_000, - cachedInputCostPerToken: 0.10 / 1_000_000, - outputCostPerToken: 3.00 / 1_000_000, - }, minimax: { id: 'minimax/minimax-m2.5', standardModel: 'accounts/fireworks/models/minimax-m2p5', @@ -75,9 +67,6 @@ const 
DEFAULT_MODEL = 'glm-5.1' const MODEL_ALIASES: Record = { glm: 'glm-5.1', 'z-ai/glm-5.1': 'glm-5.1', - kimi: 'kimi-k2.6', - 'kimi-k2': 'kimi-k2.6', - 'moonshotai/kimi-k2.6': 'kimi-k2.6', 'minimax/minimax-m2.5': 'minimax', 'minimax/minimax-m2.7': 'minimax-m2.7', } diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index 5f4490ff2a..1aac8800cd 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -1,6 +1,7 @@ import { afterEach, beforeEach, describe, expect, mock, it } from 'bun:test' import { NextRequest } from 'next/server' +import { isFreebuffDeploymentHours } from '@codebuff/common/constants/freebuff-models' import { formatQuotaResetCountdown, postChatCompletions } from '../_post' import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics' @@ -555,7 +556,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { expect(response.status).toBe(200) }) - it('lets freebuff use Kimi K2.6 through Fireworks availability rules', async () => { + it('lets freebuff use GLM 5.1 through Fireworks availability rules', async () => { const fetchedBodies: Record[] = [] const fetchViaFireworks = mock( async (_url: string | URL | Request, init?: RequestInit) => { @@ -563,7 +564,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { return new Response( JSON.stringify({ id: 'test-id', - model: 'accounts/james-65d217/deployments/j8ar2x0y', + model: 'accounts/james-65d217/deployments/mjb4i7ea', choices: [{ message: { content: 'test response' } }], usage: { prompt_tokens: 10, @@ -585,7 +586,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { method: 'POST', headers: { Authorization: 'Bearer test-api-key-new-free' }, body: JSON.stringify({ - model: 'moonshotai/kimi-k2.6', + model: 'z-ai/glm-5.1', stream: false, codebuff_metadata: { run_id: 'run-free', 
@@ -610,13 +611,19 @@ describe('/api/v1/chat/completions POST endpoint', () => { }) const body = await response.json() - expect(response.status).toBe(200) - expect(fetchedBodies).toHaveLength(1) - expect(fetchedBodies[0].model).toBe( - 'accounts/james-65d217/deployments/j8ar2x0y', - ) - expect(body.model).toBe('moonshotai/kimi-k2.6') - expect(body.provider).toBe('Fireworks') + if (isFreebuffDeploymentHours()) { + expect(response.status).toBe(200) + expect(fetchedBodies).toHaveLength(1) + expect(fetchedBodies[0].model).toBe( + 'accounts/james-65d217/deployments/mjb4i7ea', + ) + expect(body.model).toBe('z-ai/glm-5.1') + expect(body.provider).toBe('Fireworks') + } else { + expect(response.status).toBe(503) + expect(fetchedBodies).toHaveLength(0) + expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS') + } }) it('skips credit check when in FREE mode even with 0 credits', async () => { diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts index bbe31b64e0..ffcb8fd364 100644 --- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts +++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts @@ -155,10 +155,10 @@ describe('POST /api/v1/freebuff/session', () => { expect(body.status).toBe('queued') }) - test('returns model_unavailable for Kimi outside deployment hours', async () => { + test('returns model_unavailable for GLM outside deployment hours', async () => { const sessionDeps = makeSessionDeps() const resp = await postFreebuffSession( - makeReq('ok', { model: 'moonshotai/kimi-k2.6' }), + makeReq('ok', { model: 'z-ai/glm-5.1' }), makeDeps(sessionDeps, 'u1'), ) expect(resp.status).toBe(409) diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts index 7e213e9e66..58863c6742 100644 --- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts +++ 
b/web/src/llm-api/__tests__/fireworks-deployment.test.ts @@ -11,8 +11,8 @@ import { import type { Logger } from '@codebuff/common/types/contracts/logger' -const STANDARD_MODEL_ID = 'accounts/fireworks/models/kimi-k2p6' -const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/j8ar2x0y' +const STANDARD_MODEL_ID = 'accounts/fireworks/models/glm-5p1' +const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/mjb4i7ea' const IN_DEPLOYMENT_HOURS = new Date('2026-04-17T16:00:00Z') // Friday, 12pm ET / 9am PT const BEFORE_DEPLOYMENT_HOURS = new Date('2026-04-17T12:59:00Z') // Friday, 8:59am ET const AFTER_DEPLOYMENT_HOURS = new Date('2026-04-18T00:00:00Z') // Friday, 5pm PT @@ -85,7 +85,7 @@ describe('Fireworks deployment routing', () => { }) const minimalBody = { - model: 'moonshotai/kimi-k2.6', + model: 'z-ai/glm-5.1', messages: [{ role: 'user' as const, content: 'test' }], } @@ -100,7 +100,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: false, @@ -123,7 +123,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: true, @@ -156,7 +156,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: true, @@ -189,7 +189,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, 
logger, useCustomDeployment: true, @@ -216,7 +216,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: true, @@ -241,7 +241,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: true, @@ -285,7 +285,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: true, @@ -312,7 +312,7 @@ describe('Fireworks deployment routing', () => { const response = await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: true, @@ -340,7 +340,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { enabled: true }, } as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: false, @@ -366,7 +366,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { effort: 'high' }, } as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: false, @@ -392,7 +392,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning: { enabled: false, effort: 'high' }, } as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: false, @@ -419,7 +419,7 @@ describe('Fireworks 
deployment routing', () => { reasoning: { effort: 'high' }, tools: [{ type: 'function', function: { name: 'test', arguments: '{}' } }], } as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: false, @@ -445,7 +445,7 @@ describe('Fireworks deployment routing', () => { ...minimalBody, reasoning_effort: 'low', } as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: false, @@ -471,7 +471,7 @@ describe('Fireworks deployment routing', () => { reasoning_effort: 'low', tools: [{ type: 'function', function: { name: 'test', arguments: '{}' } }], } as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: false, @@ -498,7 +498,7 @@ describe('Fireworks deployment routing', () => { await createFireworksRequestWithFallback({ body: minimalBody as never, - originalModel: 'moonshotai/kimi-k2.6', + originalModel: 'z-ai/glm-5.1', fetch: mockFetch, logger, useCustomDeployment: true, diff --git a/web/src/llm-api/fireworks-config.ts b/web/src/llm-api/fireworks-config.ts index ff08822426..5667282505 100644 --- a/web/src/llm-api/fireworks-config.ts +++ b/web/src/llm-api/fireworks-config.ts @@ -10,6 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217' export const FIREWORKS_DEPLOYMENT_MAP: Record = { // 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9', - 'moonshotai/kimi-k2.6': 'accounts/james-65d217/deployments/j8ar2x0y', + 'z-ai/glm-5.1': 'accounts/james-65d217/deployments/mjb4i7ea', // 'minimax/minimax-m2.7': 'accounts/james-65d217/deployments/nrdudqxd', } diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 96d3510917..028ad42228 100644 --- a/web/src/llm-api/fireworks.ts +++ b/web/src/llm-api/fireworks.ts @@ -36,7 +36,6 @@ const FIREWORKS_MODEL_MAP: Record = { 'minimax/minimax-m2.5': 
'accounts/fireworks/models/minimax-m2p5', 'minimax/minimax-m2.7': 'accounts/fireworks/models/minimax-m2p7', 'z-ai/glm-5.1': 'accounts/fireworks/models/glm-5p1', - 'moonshotai/kimi-k2.6': 'accounts/fireworks/models/kimi-k2p6', } /** Flag to enable custom Fireworks deployments (set to false to use global API only) */ @@ -97,7 +96,7 @@ function createFireworksRequest(params: { // Transform OpenRouter-style `reasoning` object into Fireworks' `reasoning_effort`. // Unlike OpenAI, Fireworks supports reasoning_effort together with function tools - // (e.g. GLM-4.5/5.1 and Kimi K2 are designed for interleaved reasoning + tool use). + // (e.g. GLM-4.5/5.1 are designed for interleaved reasoning + tool use). if (fireworksBody.reasoning && typeof fireworksBody.reasoning === 'object') { const reasoning = fireworksBody.reasoning as { enabled?: boolean @@ -169,11 +168,6 @@ const FIREWORKS_PRICING_MAP: Record = { cachedInputCostPerToken: 0.26 / 1_000_000, outputCostPerToken: 4.40 / 1_000_000, }, - 'moonshotai/kimi-k2.6': { - inputCostPerToken: 0.60 / 1_000_000, - cachedInputCostPerToken: 0.10 / 1_000_000, - outputCostPerToken: 3.00 / 1_000_000, - }, } function getFireworksPricing(model: string): FireworksPricing { diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts index 0a8b0744b9..a90bc800d4 100644 --- a/web/src/server/free-session/__tests__/public-api.test.ts +++ b/web/src/server/free-session/__tests__/public-api.test.ts @@ -180,12 +180,12 @@ describe('requestSession', () => { test('deployment-hours-only model is unavailable outside deployment hours', async () => { const state = await requestSession({ userId: 'u1', - model: 'moonshotai/kimi-k2.6', + model: 'z-ai/glm-5.1', deps, }) expect(state).toEqual({ status: 'model_unavailable', - requestedModel: 'moonshotai/kimi-k2.6', + requestedModel: 'z-ai/glm-5.1', availableHours: '9am ET-5pm PT', }) expect(deps.rows.size).toBe(0) @@ -193,18 +193,18 @@ 
describe('requestSession', () => { test('queued response includes a per-model depth snapshot for the selector', async () => { deps._tick(new Date('2026-04-17T16:00:00Z')) - // Seed 2 users in MiniMax + 1 in Kimi so the returned map captures both. + // Seed 2 users in MiniMax + 1 in GLM so the returned map captures both. await requestSession({ userId: 'u1', model: DEFAULT_MODEL, deps }) deps._tick(new Date(deps._now().getTime() + 1000)) await requestSession({ userId: 'u2', model: DEFAULT_MODEL, deps }) deps._tick(new Date(deps._now().getTime() + 1000)) - await requestSession({ userId: 'u3', model: 'moonshotai/kimi-k2.6', deps }) + await requestSession({ userId: 'u3', model: 'z-ai/glm-5.1', deps }) const state = await getSessionState({ userId: 'u1', deps }) if (state.status !== 'queued') throw new Error('unreachable') expect(state.queueDepthByModel).toEqual({ [DEFAULT_MODEL]: 2, - 'moonshotai/kimi-k2.6': 1, + 'z-ai/glm-5.1': 1, }) }) @@ -279,7 +279,7 @@ describe('requestSession', () => { }) test('instant-admit: per-model capacities are independent', async () => { - // MiniMax saturated at 1 active, Kimi still has room. + // MiniMax saturated at 1 active, GLM still has room. const admitDeps = makeDeps({ getInstantAdmitCapacity: (model) => model === DEFAULT_MODEL ? 1 : 10, @@ -293,7 +293,7 @@ describe('requestSession', () => { }) const s3 = await requestSession({ userId: 'u3', - model: 'moonshotai/kimi-k2.6', + model: 'z-ai/glm-5.1', deps: admitDeps, }) expect(s2.status).toBe('queued') diff --git a/web/src/server/free-session/config.ts b/web/src/server/free-session/config.ts index 7d1c16c1f1..85bba7fa6f 100644 --- a/web/src/server/free-session/config.ts +++ b/web/src/server/free-session/config.ts @@ -48,7 +48,7 @@ export function getSessionGraceMs(): number { * queue). */ const INSTANT_ADMIT_CAPACITY: Record = { - 'moonshotai/kimi-k2.6': 100, + 'z-ai/glm-5.1': 50, 'minimax/minimax-m2.7': 200, }