Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions web/src/llm-api/__tests__/fireworks-deployment.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ describe('Fireworks deployment routing', () => {
model: 'z-ai/glm-5.1',
messages: [{ role: 'user' as const, content: 'test' }],
}
// Same request payload as `minimalBody`, additionally tagged with
// `codebuff_metadata.cost_mode = 'lite'`. The lite-mode tests pass this body
// when they expect a fallback to the standard Fireworks API.
const liteBody = {
...minimalBody,
codebuff_metadata: { cost_mode: 'lite' },
}

it('uses standard API when custom deployment is disabled', async () => {
const fetchCalls: string[] = []
Expand Down Expand Up @@ -298,6 +302,29 @@ describe('Fireworks deployment routing', () => {
expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS')
})

it('falls back to the standard Fireworks API in lite mode outside deployment hours', async () => {
  // Model id of every outgoing request, in the order the requests were made.
  const requestedModels: string[] = []

  // Stub fetch: note which model was requested and always answer 200.
  const recordingFetch = mock(
    async (_url: string | URL | Request, init?: RequestInit) => {
      const { model } = JSON.parse(init?.body as string)
      requestedModels.push(model)
      return new Response(JSON.stringify({ ok: true }), { status: 200 })
    },
  ) as unknown as typeof globalThis.fetch

  const result = await createFireworksRequestWithFallback({
    body: liteBody as never,
    originalModel: 'z-ai/glm-5.1',
    fetch: recordingFetch,
    logger,
    useCustomDeployment: true,
    sessionId: 'test-user-id',
    now: BEFORE_DEPLOYMENT_HOURS,
  })

  // Exactly one request is expected, and it must target the standard model.
  expect(result.status).toBe(200)
  expect(requestedModels).toEqual([STANDARD_MODEL_ID])
})

it('returns non-5xx responses from deployment without fallback (e.g. 429)', async () => {
const fetchCalls: string[] = []

Expand Down Expand Up @@ -508,5 +535,92 @@ describe('Fireworks deployment routing', () => {

expect(logger.info).toHaveBeenCalledTimes(2)
})

it('falls back to the standard Fireworks API in lite mode after deployment scaling 503', async () => {
  // Model id of every outgoing request, in the order the requests were made.
  const requestedModels: string[] = []

  // Stub fetch: the first request gets a DEPLOYMENT_SCALING_UP 503,
  // every later request succeeds with 200.
  const recordingFetch = mock(
    async (_url: string | URL | Request, init?: RequestInit) => {
      const { model } = JSON.parse(init?.body as string)
      requestedModels.push(model)

      if (requestedModels.length !== 1) {
        return new Response(JSON.stringify({ ok: true }), { status: 200 })
      }

      const scalingUpPayload = {
        error: {
          message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.',
          code: 'DEPLOYMENT_SCALING_UP',
          type: 'error',
        },
      }
      return new Response(JSON.stringify(scalingUpPayload), {
        status: 503,
        statusText: 'Service Unavailable',
      })
    },
  ) as unknown as typeof globalThis.fetch

  const result = await createFireworksRequestWithFallback({
    body: liteBody as never,
    originalModel: 'z-ai/glm-5.1',
    fetch: recordingFetch,
    logger,
    useCustomDeployment: true,
    sessionId: 'test-user-id',
    now: IN_DEPLOYMENT_HOURS,
  })

  // The deployment is tried first, then the standard API; the scaling-up
  // response is also expected to leave the deployment in cooldown.
  expect(result.status).toBe(200)
  expect(requestedModels).toEqual([DEPLOYMENT_MODEL_ID, STANDARD_MODEL_ID])
  expect(isDeploymentCoolingDown()).toBe(true)
})

it('falls back to the standard Fireworks API in lite mode during deployment cooldown', async () => {
  // Arrange: flag the deployment as scaling up before any request is made,
  // so the call below runs while the cooldown is expected to be active.
  markDeploymentScalingUp()

  // Model id of every outgoing request, in the order the requests were made.
  const requestedModels: string[] = []

  // Stub fetch: note which model was requested and always answer 200.
  const recordingFetch = mock(
    async (_url: string | URL | Request, init?: RequestInit) => {
      const { model } = JSON.parse(init?.body as string)
      requestedModels.push(model)
      return new Response(JSON.stringify({ ok: true }), { status: 200 })
    },
  ) as unknown as typeof globalThis.fetch

  const result = await createFireworksRequestWithFallback({
    body: liteBody as never,
    originalModel: 'z-ai/glm-5.1',
    fetch: recordingFetch,
    logger,
    useCustomDeployment: true,
    sessionId: 'test-user-id',
    now: IN_DEPLOYMENT_HOURS,
  })

  // Only the standard model should have been requested.
  expect(result.status).toBe(200)
  expect(requestedModels).toEqual([STANDARD_MODEL_ID])
})

it('falls back to the standard Fireworks API in lite mode when the deployment request throws', async () => {
  // Model id of every outgoing request, in the order the requests were made.
  const requestedModels: string[] = []

  // Stub fetch: the first request fails at the transport level (throws),
  // every later request succeeds with 200.
  const recordingFetch = mock(
    async (_url: string | URL | Request, init?: RequestInit) => {
      const { model } = JSON.parse(init?.body as string)
      requestedModels.push(model)

      const isFirstAttempt = requestedModels.length === 1
      if (isFirstAttempt) {
        throw new Error('socket hang up')
      }
      return new Response(JSON.stringify({ ok: true }), { status: 200 })
    },
  ) as unknown as typeof globalThis.fetch

  const result = await createFireworksRequestWithFallback({
    body: liteBody as never,
    originalModel: 'z-ai/glm-5.1',
    fetch: recordingFetch,
    logger,
    useCustomDeployment: true,
    sessionId: 'test-user-id',
    now: IN_DEPLOYMENT_HOURS,
  })

  // The deployment is tried first, then the standard API; the thrown error
  // is expected to produce exactly one warning log.
  expect(result.status).toBe(200)
  expect(requestedModels).toEqual([DEPLOYMENT_MODEL_ID, STANDARD_MODEL_ID])
  expect(logger.warn).toHaveBeenCalledTimes(1)
})
})
})
53 changes: 45 additions & 8 deletions web/src/llm-api/fireworks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -725,8 +725,19 @@ export async function createFireworksRequestWithFallback(params: {
const useCustomDeployment = params.useCustomDeployment ?? FIREWORKS_USE_CUSTOM_DEPLOYMENT
const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel]
const hasDeployment = useCustomDeployment && Boolean(deploymentModelId)
const shouldFallbackToStandardApi = body.codebuff_metadata?.cost_mode === 'lite'

const createStandardApiRequest = () =>
createFireworksRequest({ body, originalModel, fetch, sessionId })

if (hasDeployment && !isDeploymentHours(now)) {
if (shouldFallbackToStandardApi) {
logger.info(
{ model: originalModel },
'Falling back to Fireworks standard API outside deployment hours',
)
return createStandardApiRequest()
}
return new Response(
JSON.stringify({
error: {
Expand All @@ -740,6 +751,13 @@ export async function createFireworksRequestWithFallback(params: {
}

if (hasDeployment && isDeploymentCoolingDown()) {
if (shouldFallbackToStandardApi) {
logger.info(
{ model: originalModel },
'Falling back to Fireworks standard API during deployment cooldown',
)
return createStandardApiRequest()
}
return new Response(
JSON.stringify({
error: {
Expand All @@ -757,13 +775,25 @@ export async function createFireworksRequestWithFallback(params: {
{ model: originalModel, deploymentModel: deploymentModelId },
'Trying Fireworks custom deployment',
)
const response = await createFireworksRequest({
body,
originalModel,
fetch,
modelIdOverride: deploymentModelId,
sessionId,
})
let response: Response
try {
response = await createFireworksRequest({
body,
originalModel,
fetch,
modelIdOverride: deploymentModelId,
sessionId,
})
} catch (error) {
if (shouldFallbackToStandardApi) {
logger.warn(
{ model: originalModel, error: getErrorObject(error) },
'Fireworks custom deployment request failed, falling back to standard API',
)
return createStandardApiRequest()
}
throw error
}

if (response.status >= 500) {
const errorText = await response.text()
Expand All @@ -774,6 +804,13 @@ export async function createFireworksRequestWithFallback(params: {
if (errorText.includes('DEPLOYMENT_SCALING_UP')) {
markDeploymentScalingUp()
}
if (shouldFallbackToStandardApi) {
logger.info(
{ model: originalModel, status: response.status },
'Falling back to Fireworks standard API after deployment 5xx',
)
return createStandardApiRequest()
}
return new Response(errorText, {
status: response.status,
statusText: response.statusText,
Expand All @@ -783,7 +820,7 @@ export async function createFireworksRequestWithFallback(params: {
return response
}

return createFireworksRequest({ body, originalModel, fetch, sessionId })
return createStandardApiRequest()
}

function creditsToFakeCost(credits: number): number {
Expand Down
Loading