CodebuffAI · jahooma · Apr 25, 2026 · Apr 25, 2026
@@ -88,6 +88,10 @@ describe('Fireworks deployment routing', () => {
       model: 'z-ai/glm-5.1',
       messages: [{ role: 'user' as const, content: 'test' }],
     }
+    const liteBody = { // minimalBody plus codebuff_metadata.cost_mode = 'lite'; used by the lite-mode fallback tests
+      ...minimalBody,
+      codebuff_metadata: { cost_mode: 'lite' },
+    }
 
     it('uses standard API when custom deployment is disabled', async () => {
       const fetchCalls: string[] = []
@@ -298,6 +302,29 @@ describe('Fireworks deployment routing', () => {
       expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS')
     })
 
+    it('falls back to the standard Fireworks API in lite mode outside deployment hours', async () => { // Scenario: lite body, custom deployment enabled, `now` set to BEFORE_DEPLOYMENT_HOURS
+      const fetchCalls: string[] = [] // `model` field of every outgoing request body, in call order
+
+      const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
+        const body = JSON.parse(init?.body as string)
+        fetchCalls.push(body.model) // record which model id this request targeted
+        return new Response(JSON.stringify({ ok: true }), { status: 200 }) // every call succeeds
+      }) as unknown as typeof globalThis.fetch
+
+      const response = await createFireworksRequestWithFallback({
+        body: liteBody as never,
+        originalModel: 'z-ai/glm-5.1',
+        fetch: mockFetch,
+        logger,
+        useCustomDeployment: true,
+        sessionId: 'test-user-id',
+        now: BEFORE_DEPLOYMENT_HOURS, // presumably a time outside the deployment window; constant is defined outside this view
+      })
+
+      expect(response.status).toBe(200)
+      expect(fetchCalls).toEqual([STANDARD_MODEL_ID]) // exactly one request, sent with the standard model id (no deployment attempt)
+    })
+
     it('returns non-5xx responses from deployment without fallback (e.g. 429)', async () => {
       const fetchCalls: string[] = []
 
@@ -508,5 +535,92 @@ describe('Fireworks deployment routing', () => {
 
       expect(logger.info).toHaveBeenCalledTimes(2)
     })
+
+    it('falls back to the standard Fireworks API in lite mode after deployment scaling 503', async () => { // Scenario: lite body, first request answers 503 DEPLOYMENT_SCALING_UP, second answers 200
+      const fetchCalls: string[] = [] // `model` field of every outgoing request body, in call order
+
+      const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
+        const body = JSON.parse(init?.body as string)
+        fetchCalls.push(body.model) // record which model id this request targeted
+        if (fetchCalls.length === 1) { // only the first request fails
+          return new Response(
+            JSON.stringify({
+              error: {
+                message: 'Deployment is currently scaled to zero and is scaling up. Please retry your request in a few minutes.',
+                code: 'DEPLOYMENT_SCALING_UP',
+                type: 'error',
+              },
+            }),
+            { status: 503, statusText: 'Service Unavailable' },
+          )
+        }
+        return new Response(JSON.stringify({ ok: true }), { status: 200 }) // any later request succeeds
+      }) as unknown as typeof globalThis.fetch
+
+      const response = await createFireworksRequestWithFallback({
+        body: liteBody as never,
+        originalModel: 'z-ai/glm-5.1',
+        fetch: mockFetch,
+        logger,
+        useCustomDeployment: true,
+        sessionId: 'test-user-id',
+        now: IN_DEPLOYMENT_HOURS, // presumably a time inside the deployment window; constant is defined outside this view
+      })
+
+      expect(response.status).toBe(200) // caller sees the second (successful) response, not the 503
+      expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID, STANDARD_MODEL_ID]) // deployment tried first, then the standard model id
+      expect(isDeploymentCoolingDown()).toBe(true) // the scaling-up 503 must still leave the deployment in cooldown
+    })
+
+    it('falls back to the standard Fireworks API in lite mode during deployment cooldown', async () => { // Scenario: lite body, deployment already marked as scaling up before the request
+      markDeploymentScalingUp() // NOTE(review): mutates state read by isDeploymentCoolingDown(); assumes a reset between tests that is not visible here
+
+      const fetchCalls: string[] = [] // `model` field of every outgoing request body, in call order
+      const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
+        const body = JSON.parse(init?.body as string)
+        fetchCalls.push(body.model) // record which model id this request targeted
+        return new Response(JSON.stringify({ ok: true }), { status: 200 }) // every call succeeds
+      }) as unknown as typeof globalThis.fetch
+
+      const response = await createFireworksRequestWithFallback({
+        body: liteBody as never,
+        originalModel: 'z-ai/glm-5.1',
+        fetch: mockFetch,
+        logger,
+        useCustomDeployment: true,
+        sessionId: 'test-user-id',
+        now: IN_DEPLOYMENT_HOURS, // presumably a time inside the deployment window; constant is defined outside this view
+      })
+
+      expect(response.status).toBe(200)
+      expect(fetchCalls).toEqual([STANDARD_MODEL_ID]) // exactly one request, sent with the standard model id (deployment skipped)
+    })
+
+    it('falls back to the standard Fireworks API in lite mode when the deployment request throws', async () => { // Scenario: lite body, first fetch rejects, second fetch answers 200
+      const fetchCalls: string[] = [] // `model` field of every outgoing request body, in call order
+
+      const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
+        const body = JSON.parse(init?.body as string)
+        fetchCalls.push(body.model) // recorded before the throw, so the failed attempt is counted too
+        if (fetchCalls.length === 1) { // only the first request fails
+          throw new Error('socket hang up')
+        }
+        return new Response(JSON.stringify({ ok: true }), { status: 200 }) // any later request succeeds
+      }) as unknown as typeof globalThis.fetch
+
+      const response = await createFireworksRequestWithFallback({
+        body: liteBody as never,
+        originalModel: 'z-ai/glm-5.1',
+        fetch: mockFetch,
+        logger,
+        useCustomDeployment: true,
+        sessionId: 'test-user-id',
+        now: IN_DEPLOYMENT_HOURS, // presumably a time inside the deployment window; constant is defined outside this view
+      })
+
+      expect(response.status).toBe(200) // the thrown error is not propagated to the caller
+      expect(fetchCalls).toEqual([DEPLOYMENT_MODEL_ID, STANDARD_MODEL_ID]) // deployment tried first, then the standard model id
+      expect(logger.warn).toHaveBeenCalledTimes(1) // assumes logger mocks are reset between tests — TODO confirm in the setup hooks
+    })
   })
 })
@@ -725,8 +725,19 @@ export async function createFireworksRequestWithFallback(params: {
   const useCustomDeployment = params.useCustomDeployment ?? FIREWORKS_USE_CUSTOM_DEPLOYMENT
   const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel]
   const hasDeployment = useCustomDeployment && Boolean(deploymentModelId)
+  const shouldFallbackToStandardApi = body.codebuff_metadata?.cost_mode === 'lite'
+
+  const createStandardApiRequest = () =>
+    createFireworksRequest({ body, originalModel, fetch, sessionId })
 
   if (hasDeployment && !isDeploymentHours(now)) {
+    if (shouldFallbackToStandardApi) {
+      logger.info(
+        { model: originalModel },
+        'Falling back to Fireworks standard API outside deployment hours',
+      )
+      return createStandardApiRequest()
+    }
     return new Response(
       JSON.stringify({
         error: {
@@ -740,6 +751,13 @@ export async function createFireworksRequestWithFallback(params: {
   }
 
   if (hasDeployment && isDeploymentCoolingDown()) {
+    if (shouldFallbackToStandardApi) {
+      logger.info(
+        { model: originalModel },
+        'Falling back to Fireworks standard API during deployment cooldown',
+      )
+      return createStandardApiRequest()
+    }
     return new Response(
       JSON.stringify({
         error: {
@@ -757,13 +775,25 @@ export async function createFireworksRequestWithFallback(params: {
       { model: originalModel, deploymentModel: deploymentModelId },
       'Trying Fireworks custom deployment',
     )
-    const response = await createFireworksRequest({
-      body,
-      originalModel,
-      fetch,
-      modelIdOverride: deploymentModelId,
-      sessionId,
-    })
+    let response: Response
+    try {
+      response = await createFireworksRequest({
+        body,
+        originalModel,
+        fetch,
+        modelIdOverride: deploymentModelId,
+        sessionId,
+      })
+    } catch (error) {
+      if (shouldFallbackToStandardApi) {
+        logger.warn(
+          { model: originalModel, error: getErrorObject(error) },
+          'Fireworks custom deployment request failed, falling back to standard API',
+        )
+        return createStandardApiRequest()
+      }
+      throw error
+    }
 
     if (response.status >= 500) {
       const errorText = await response.text()
@@ -774,6 +804,13 @@ export async function createFireworksRequestWithFallback(params: {
       if (errorText.includes('DEPLOYMENT_SCALING_UP')) {
         markDeploymentScalingUp()
       }
+      if (shouldFallbackToStandardApi) {
+        logger.info(
+          { model: originalModel, status: response.status },
+          'Falling back to Fireworks standard API after deployment 5xx',
+        )
+        return createStandardApiRequest()
+      }
       return new Response(errorText, {
         status: response.status,
         statusText: response.statusText,
@@ -783,7 +820,7 @@ export async function createFireworksRequestWithFallback(params: {
     return response
   }
 
-  return createFireworksRequest({ body, originalModel, fetch, sessionId })
+  return createStandardApiRequest()
 }
 
 function creditsToFakeCost(credits: number): number {