diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts index 58863c674..be17a6e2e 100644 --- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts +++ b/web/src/llm-api/__tests__/fireworks-deployment.test.ts @@ -88,6 +88,10 @@ describe('Fireworks deployment routing', () => { model: 'z-ai/glm-5.1', messages: [{ role: 'user' as const, content: 'test' }], } + const liteBody = { + ...minimalBody, + codebuff_metadata: { cost_mode: 'lite' }, + } it('uses standard API when custom deployment is disabled', async () => { const fetchCalls: string[] = [] @@ -298,6 +302,29 @@ describe('Fireworks deployment routing', () => { expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS') }) + it('falls back to the standard Fireworks API in lite mode outside deployment hours', async () => { + const fetchCalls: string[] = [] + + const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { + const body = JSON.parse(init?.body as string) + fetchCalls.push(body.model) + return new Response(JSON.stringify({ ok: true }), { status: 200 }) + }) as unknown as typeof globalThis.fetch + + const response = await createFireworksRequestWithFallback({ + body: liteBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: BEFORE_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toEqual([STANDARD_MODEL_ID]) + }) + it('returns non-5xx responses from deployment without fallback (e.g. 429)', async () => { const fetchCalls: string[] = [] @@ -508,5 +535,92 @@ describe('Fireworks deployment routing', () => { expect(logger.info).toHaveBeenCalledTimes(2) }) + + it('falls back to the standard Fireworks API in lite mode after deployment scaling 503', async () => { + const fetchCalls: string[] = [] + + const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { + const body = JSON.parse(init?.body as string) + fetchCalls.push(body.model) + if (fetchCalls.length === 1) { + return new Response( + JSON.stringify({ + error: { + message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.', + code: 'DEPLOYMENT_SCALING_UP', + type: 'error', + }, + }), + { status: 503, statusText: 'Service Unavailable' }, + ) + } + return new Response(JSON.stringify({ ok: true }), { status: 200 }) + }) as unknown as typeof globalThis.fetch + + const response = await createFireworksRequestWithFallback({ + body: liteBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID, STANDARD_MODEL_ID]) + expect(isDeploymentCoolingDown()).toBe(true) + }) + + it('falls back to the standard Fireworks API in lite mode during deployment cooldown', async () => { + markDeploymentScalingUp() + + const fetchCalls: string[] = [] + const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { + const body = JSON.parse(init?.body as string) + fetchCalls.push(body.model) + return new Response(JSON.stringify({ ok: true }), { status: 200 }) + }) as unknown as typeof globalThis.fetch + + const response = await createFireworksRequestWithFallback({ + body: liteBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toEqual([STANDARD_MODEL_ID]) + }) + + it('falls back to the standard Fireworks API in lite mode when the deployment request throws', async () => { + const fetchCalls: string[] = [] + + const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { + const body = JSON.parse(init?.body as string) + fetchCalls.push(body.model) + if (fetchCalls.length === 1) { + throw new Error('socket hang up') + } + return new Response(JSON.stringify({ ok: true }), { status: 200 }) + }) as unknown as typeof globalThis.fetch + + const response = await createFireworksRequestWithFallback({ + body: liteBody as never, + originalModel: 'z-ai/glm-5.1', + fetch: mockFetch, + logger, + useCustomDeployment: true, + sessionId: 'test-user-id', + now: IN_DEPLOYMENT_HOURS, + }) + + expect(response.status).toBe(200) + expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID, STANDARD_MODEL_ID]) + expect(logger.warn).toHaveBeenCalledTimes(1) + }) }) }) diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 028ad4222..a2f4f80a8 100644 --- a/web/src/llm-api/fireworks.ts +++ b/web/src/llm-api/fireworks.ts @@ -725,8 +725,19 @@ export async function createFireworksRequestWithFallback(params: { const useCustomDeployment = params.useCustomDeployment ?? FIREWORKS_USE_CUSTOM_DEPLOYMENT const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel] const hasDeployment = useCustomDeployment && Boolean(deploymentModelId) + const shouldFallbackToStandardApi = body.codebuff_metadata?.cost_mode === 'lite' + + const createStandardApiRequest = () => + createFireworksRequest({ body, originalModel, fetch, sessionId }) if (hasDeployment && !isDeploymentHours(now)) { + if (shouldFallbackToStandardApi) { + logger.info( + { model: originalModel }, + 'Falling back to Fireworks standard API outside deployment hours', + ) + return createStandardApiRequest() + } return new Response( JSON.stringify({ error: { @@ -740,6 +751,13 @@ export async function createFireworksRequestWithFallback(params: { } if (hasDeployment && isDeploymentCoolingDown()) { + if (shouldFallbackToStandardApi) { + logger.info( + { model: originalModel }, + 'Falling back to Fireworks standard API during deployment cooldown', + ) + return createStandardApiRequest() + } return new Response( JSON.stringify({ error: { @@ -757,13 +775,25 @@ export async function createFireworksRequestWithFallback(params: { { model: originalModel, deploymentModel: deploymentModelId }, 'Trying Fireworks custom deployment', ) - const response = await createFireworksRequest({ - body, - originalModel, - fetch, - modelIdOverride: deploymentModelId, - sessionId, - }) + let response: Response + try { + response = await createFireworksRequest({ + body, + originalModel, + fetch, + modelIdOverride: deploymentModelId, + sessionId, + }) + } catch (error) { + if (shouldFallbackToStandardApi) { + logger.warn( + { model: originalModel, error: getErrorObject(error) }, + 'Fireworks custom deployment request failed, falling back to standard API', + ) + return createStandardApiRequest() + } + throw error + } if (response.status >= 500) { const errorText = await response.text() @@ -774,6 +804,13 @@ export async function createFireworksRequestWithFallback(params: { if (errorText.includes('DEPLOYMENT_SCALING_UP')) { markDeploymentScalingUp() } + if (shouldFallbackToStandardApi) { + logger.info( + { model: originalModel, status: response.status }, + 'Falling back to Fireworks standard API after deployment 5xx', + ) + return createStandardApiRequest() + } return new Response(errorText, { status: response.status, statusText: response.statusText, @@ -783,7 +820,7 @@ export async function createFireworksRequestWithFallback(params: { return response } - return createFireworksRequest({ body, originalModel, fetch, sessionId }) + return createStandardApiRequest() } function creditsToFakeCost(credits: number): number {