From 164cf55423ac61953caf465f799786b256dbb4de Mon Sep 17 00:00:00 2001 From: Evgeny Shurakov Date: Thu, 7 May 2026 09:35:45 +0200 Subject: [PATCH] fix(cloud-agent-next): destroy sandbox on prep 500 - Tighten 500 regexes with \b to avoid matching 5000/50012/etc. - Walk cause chain in recovery classifier so wrapped SandboxError 500s are still detected (mkdir error wrappers previously stripped identity, preventing destroy on control-plane 500s) - Skip cleanupWorkspace on cold-start resume when sandbox was destroyed (exec against a destroyed sandbox is a no-op that emits misleading warnings) - Drop redundant WRAPPER_START_FAILED branch; trailing cause recursion already covers it - Add sandbox_500_detected/destroyed/destroy_failed logTags to measure hit rate and destroy success per phase in Axiom --- .../src/execution/orchestrator.ts | 104 ++++--- .../src/persistence/async-preparation.test.ts | 47 ++- .../src/persistence/async-preparation.ts | 269 +++++++++--------- .../src/router/handlers/session-prepare.ts | 241 +++++++++------- .../src/sandbox-recovery.test.ts | 123 ++++++++ .../cloud-agent-next/src/sandbox-recovery.ts | 167 +++++++++++ .../cloud-agent-next/src/session-service.ts | 31 +- .../cloud-agent-next/src/workspace.test.ts | 10 + services/cloud-agent-next/src/workspace.ts | 22 +- 9 files changed, 722 insertions(+), 292 deletions(-) create mode 100644 services/cloud-agent-next/src/sandbox-recovery.test.ts create mode 100644 services/cloud-agent-next/src/sandbox-recovery.ts diff --git a/services/cloud-agent-next/src/execution/orchestrator.ts b/services/cloud-agent-next/src/execution/orchestrator.ts index 7a0676df6a..cf25dc93c6 100644 --- a/services/cloud-agent-next/src/execution/orchestrator.ts +++ b/services/cloud-agent-next/src/execution/orchestrator.ts @@ -26,6 +26,7 @@ import { withDORetry } from '../utils/do-retry.js'; import { normalizeAgentMode } from '../schema.js'; import { buildImagePromptParts, downloadImagePromptParts } from 
'./image-prompt-parts.js'; import { withTimeout } from '@kilocode/worker-utils'; +import { withSandboxInternalServerErrorRecovery } from '../sandbox-recovery.js'; /** Maximum time allowed for workspace preparation (resume, init, fast path). */ const PREPARE_WORKSPACE_TIMEOUT_MS = 10 * 60 * 1000; @@ -137,56 +138,71 @@ export class ExecutionOrchestrator { ); } - // 2. Workspace preparation (may throw WORKSPACE_SETUP_FAILED) - const prepared = await this.prepareWorkspace(sandbox, plan, options?.onProgress); + const prepareExecution = async () => { + // 2. Workspace preparation (may throw WORKSPACE_SETUP_FAILED) + const prepared = await this.prepareWorkspace(sandbox, plan, options?.onProgress); - // 3. Update git remote token if needed (resume path with token overrides) - if (!workspace.shouldPrepare) { - const resumeContext = workspace.resumeContext; - if (resumeContext.githubToken || resumeContext.gitToken) { - await this.updateTokenOverrides(prepared, workspace); + // 3. Update git remote token if needed (resume path with token overrides) + if (!workspace.shouldPrepare) { + const resumeContext = workspace.resumeContext; + if (resumeContext.githubToken || resumeContext.gitToken) { + await this.updateTokenOverrides(prepared, workspace); + } } - } - // 4. Ensure wrapper is running (starts kilo server in-process) - let wrapperClient: WrapperClient; - let kiloSessionId: string; - try { - const result = await WrapperClient.ensureWrapper(sandbox, prepared.session, { - agentSessionId: sessionId, - userId, - workspacePath: prepared.context.workspacePath, - sessionId: wrapper.kiloSessionId, + // 4. 
Ensure wrapper is running (starts kilo server in-process) + let wrapperClient: WrapperClient; + let kiloSessionId: string; + try { + const result = await WrapperClient.ensureWrapper(sandbox, prepared.session, { + agentSessionId: sessionId, + userId, + workspacePath: prepared.context.workspacePath, + sessionId: wrapper.kiloSessionId, + }); + wrapperClient = result.client; + kiloSessionId = result.sessionId; + } catch (error) { + throw ExecutionError.wrapperStartFailed( + `Failed to start wrapper: ${error instanceof Error ? error.message : String(error)}`, + error + ); + } + + // 5. Record activity for idle timeout tracking + try { + await withDORetry( + () => this.deps.getSessionStub(userId, sessionId), + stub => stub.recordKiloServerActivity(), + 'recordKiloServerActivity' + ); + } catch { + // Non-fatal - log but continue + logger.warn('Failed to record kilo server activity'); + } + + // 6. Download images from R2 to sandbox if provided + const fileParts = await downloadImagePromptParts({ + env: this.deps.env, + session: prepared.session, + userId: plan.userId, + images: plan.images, + createdOnPlatform: this.getCreatedOnPlatform(plan), }); - wrapperClient = result.client; - kiloSessionId = result.sessionId; - } catch (error) { - throw ExecutionError.wrapperStartFailed( - `Failed to start wrapper: ${error instanceof Error ? error.message : String(error)}`, - error - ); - } - // 5. Record activity for idle timeout tracking - try { - await withDORetry( - () => this.deps.getSessionStub(userId, sessionId), - stub => stub.recordKiloServerActivity(), - 'recordKiloServerActivity' - ); - } catch { - // Non-fatal - log but continue - logger.warn('Failed to record kilo server activity'); - } + return { prepared, wrapperClient, kiloSessionId, fileParts }; + }; - // 6. 
Download images from R2 to sandbox if provided - const fileParts = await downloadImagePromptParts({ - env: this.deps.env, - session: prepared.session, - userId: plan.userId, - images: plan.images, - createdOnPlatform: this.getCreatedOnPlatform(plan), - }); + const { prepared, wrapperClient, kiloSessionId, fileParts } = + await withSandboxInternalServerErrorRecovery( + { + sandbox, + sandboxId, + sessionId, + phase: 'executionWorkspacePreparation', + }, + prepareExecution + ); // 7. Send prompt with execution binding (async - returns messageId immediately) const ingestUrl = this.deps.getIngestUrl(sessionId, userId); diff --git a/services/cloud-agent-next/src/persistence/async-preparation.test.ts b/services/cloud-agent-next/src/persistence/async-preparation.test.ts index a13a3daf49..7a6f2cb625 100644 --- a/services/cloud-agent-next/src/persistence/async-preparation.test.ts +++ b/services/cloud-agent-next/src/persistence/async-preparation.test.ts @@ -2,12 +2,17 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { Env } from '../types.js'; import type { PreparationInput } from './schemas.js'; +const { ensureWrapperMock } = vi.hoisted(() => ({ + ensureWrapperMock: vi.fn(), +})); + const fakeSession = { exec: vi.fn().mockResolvedValue({ exitCode: 0, stdout: '', stderr: '' }), }; const fakeSandbox = { writeFile: vi.fn().mockResolvedValue(undefined), + destroy: vi.fn().mockResolvedValue(undefined), }; vi.mock('@cloudflare/sandbox', () => ({ @@ -45,16 +50,23 @@ vi.mock('../session-service.js', () => ({ vi.mock('../kilo/wrapper-client.js', () => ({ WrapperClient: { - ensureWrapper: vi.fn().mockResolvedValue({ sessionId: 'ses_wrapper_123' }), + ensureWrapper: ensureWrapperMock, }, })); import { cloneGitHubRepo, cloneGitRepo } from '../workspace.js'; import { executePreparationSteps } from './async-preparation.js'; +const wrapperResult = { + client: {}, + sessionId: 'ses_wrapper_123', +}; + describe('executePreparationSteps', () => { beforeEach(() => 
{ vi.clearAllMocks(); + fakeSandbox.destroy.mockResolvedValue(undefined); + ensureWrapperMock.mockResolvedValue(wrapperResult); }); it('skips managed GitLab token resolution when caller already resolved it', async () => { @@ -182,4 +194,37 @@ describe('executePreparationSteps', () => { undefined ); }); + + it('destroys the sandbox when preparation hits a sandbox 500', async () => { + const env = { + Sandbox: {} as Env['Sandbox'], + SandboxSmall: {} as Env['SandboxSmall'], + GIT_TOKEN_SERVICE: { + getTokenForRepo: vi.fn(), + }, + PER_SESSION_SANDBOX_ORG_IDS: '', + GITHUB_APP_SLUG: 'kilo-connect', + GITHUB_APP_BOT_USER_ID: '12345', + } as unknown as Env; + const emitProgress = vi.fn(); + const input = { + sessionId: 'agent_test', + userId: 'test-user', + orgId: 'test-org', + authToken: 'kilo-token', + githubRepo: 'acme/repo', + githubToken: 'github-token', + prompt: 'Fix bug', + mode: 'code', + model: 'kilo/test-model', + autoInitiate: false, + } satisfies PreparationInput; + const error = new Error('HTTP error! 
status: 500'); + Object.assign(error, { name: 'SandboxError' }); + ensureWrapperMock.mockRejectedValueOnce(error); + + await expect(executePreparationSteps(input, env, emitProgress)).rejects.toBe(error); + + expect(fakeSandbox.destroy).toHaveBeenCalledOnce(); + }); }); diff --git a/services/cloud-agent-next/src/persistence/async-preparation.ts b/services/cloud-agent-next/src/persistence/async-preparation.ts index 5847b03255..1db338d7aa 100644 --- a/services/cloud-agent-next/src/persistence/async-preparation.ts +++ b/services/cloud-agent-next/src/persistence/async-preparation.ts @@ -25,6 +25,7 @@ import { WrapperClient } from '../kilo/wrapper-client.js'; import type { PreparingStep } from '../shared/protocol.js'; import type { PreparationInput } from './schemas.js'; import { readProfileBundle } from '../session-profile.js'; +import { withSandboxInternalServerErrorRecovery } from '../sandbox-recovery.js'; import type { Env as WorkerEnv, SandboxId, SessionId as AgentSessionId } from '../types.js'; type EmitProgress = (step: PreparingStep, message: string) => void; @@ -118,143 +119,153 @@ export async function executePreparationSteps( const sandbox = getSandbox(getSandboxNamespace(env, sandboxId), sandboxId, { sleepAfter: SANDBOX_SLEEP_AFTER_SECONDS, }); - await checkDiskAndCleanBeforeSetup(sandbox, input.orgId, input.userId, input.sessionId); + return withSandboxInternalServerErrorRecovery( + { + sandbox, + sandboxId, + sessionId: input.sessionId, + phase: 'asyncPreparation', + }, + async () => { + await checkDiskAndCleanBeforeSetup(sandbox, input.orgId, input.userId, input.sessionId); - // 3. Workspace setup - emitProgress('workspace_setup', 'Setting up workspace…'); - const { workspacePath, sessionHome } = await setupWorkspace( - sandbox, - input.userId, - input.orgId, - input.sessionId - ); + // 3. 
Workspace setup + emitProgress('workspace_setup', 'Setting up workspace…'); + const { workspacePath, sessionHome } = await setupWorkspace( + sandbox, + input.userId, + input.orgId, + input.sessionId + ); - // 4. Clone repository - emitProgress('cloning', 'Cloning repository…'); - const branchName = determineBranchName(input.sessionId, input.upstreamBranch); - const sessionId = input.sessionId as AgentSessionId; - const context = sessionService.buildContext({ - sandboxId, - orgId: input.orgId, - userId: input.userId, - sessionId, - workspacePath, - sessionHome, - githubRepo: input.githubRepo, - githubToken: resolvedGithubToken, - gitUrl: input.gitUrl, - gitToken: resolvedGitToken, - platform: input.platform, - upstreamBranch: input.upstreamBranch, - botId: input.botId, - }); + // 4. Clone repository + emitProgress('cloning', 'Cloning repository…'); + const branchName = determineBranchName(input.sessionId, input.upstreamBranch); + const sessionId = input.sessionId as AgentSessionId; + const context = sessionService.buildContext({ + sandboxId, + orgId: input.orgId, + userId: input.userId, + sessionId, + workspacePath, + sessionHome, + githubRepo: input.githubRepo, + githubToken: resolvedGithubToken, + gitUrl: input.gitUrl, + gitToken: resolvedGitToken, + platform: input.platform, + upstreamBranch: input.upstreamBranch, + botId: input.botId, + }); - const session = await sessionService.getOrCreateSession({ - sandbox, - context, - env, - originalToken: input.authToken, - kilocodeModel: input.model, - originalOrgId: input.orgId, - createdOnPlatform: input.createdOnPlatform, - appendSystemPrompt: input.appendSystemPrompt, - profile: readProfileBundle(input), - }); + const session = await sessionService.getOrCreateSession({ + sandbox, + context, + env, + originalToken: input.authToken, + kilocodeModel: input.model, + originalOrgId: input.orgId, + createdOnPlatform: input.createdOnPlatform, + appendSystemPrompt: input.appendSystemPrompt, + profile: readProfileBundle(input), 
+ }); - const cloneOptions = input.shallow ? { shallow: true } : undefined; - if (input.gitUrl) { - await cloneGitRepo( - session, - workspacePath, - input.gitUrl, - resolvedGitToken, - undefined, - cloneOptions - ); - } else if (input.githubRepo) { - await cloneGitHubRepo( - session, - workspacePath, - input.githubRepo, - resolvedGithubToken, - { - GITHUB_APP_SLUG: env.GITHUB_APP_SLUG, - GITHUB_APP_BOT_USER_ID: env.GITHUB_APP_BOT_USER_ID, - }, - cloneOptions - ); - } + const cloneOptions = input.shallow ? { shallow: true } : undefined; + if (input.gitUrl) { + await cloneGitRepo( + session, + workspacePath, + input.gitUrl, + resolvedGitToken, + undefined, + cloneOptions + ); + } else if (input.githubRepo) { + await cloneGitHubRepo( + session, + workspacePath, + input.githubRepo, + resolvedGithubToken, + { + GITHUB_APP_SLUG: env.GITHUB_APP_SLUG, + GITHUB_APP_BOT_USER_ID: env.GITHUB_APP_BOT_USER_ID, + }, + cloneOptions + ); + } - // 5. Branch management - emitProgress('branch', 'Setting up branch…'); - await manageBranch(session, workspacePath, branchName, !!input.upstreamBranch); + // 5. Branch management + emitProgress('branch', 'Setting up branch…'); + await manageBranch(session, workspacePath, branchName, !!input.upstreamBranch); - // 6. Setup commands - const inputSetupCommands = readProfileBundle(input).setupCommands; - if (inputSetupCommands && inputSetupCommands.length > 0) { - emitProgress('setup_commands', 'Running setup commands…'); - await runSetupCommands(session, context, inputSetupCommands, true); - } + // 6. Setup commands + const inputSetupCommands = readProfileBundle(input).setupCommands; + if (inputSetupCommands && inputSetupCommands.length > 0) { + emitProgress('setup_commands', 'Running setup commands…'); + await runSetupCommands(session, context, inputSetupCommands, true); + } - // 7. 
Write auth file and global rules (runtime skills are written by getOrCreateSession above) - await writeAuthFile(sandbox, sessionHome, input.authToken); - await writeGlobalRules(sandbox, sessionHome, input.sessionId); + // 7. Write auth file and global rules (runtime skills are written by getOrCreateSession above) + await writeAuthFile(sandbox, sessionHome, input.authToken); + await writeGlobalRules(sandbox, sessionHome, input.sessionId); - // 8. Import pre-generated session into CLI's SQLite so the wrapper picks it up - if (input.kiloSessionId) { - emitProgress('kilo_session', 'Importing session…'); - const now = Date.now(); - const defaultTitle = 'New session - ' + new Date(now).toISOString(); - const minimalSessionJson = JSON.stringify({ - info: { - id: input.kiloSessionId, - slug: '', - projectID: '', - directory: '', - title: defaultTitle, - version: '2', - time: { created: now, updated: now }, - }, - messages: [], - }); - const importFilePath = `/tmp/kilo-empty-session-${input.kiloSessionId}.json`; - await sandbox.writeFile(importFilePath, minimalSessionJson); - const escapedFile = importFilePath.replaceAll("'", "'\\''"); - const escapedId = input.kiloSessionId.replaceAll("'", "'\\''"); - const escapedWorkspace = workspacePath.replaceAll("'", "'\\''"); - const restoreResult = await session.exec( - `bun /usr/local/bin/kilo-restore-session.js --file '${escapedFile}' '${escapedId}' '${escapedWorkspace}'`, - { cwd: dirname(workspacePath) } - ); - if (restoreResult.exitCode !== 0) { - const stdout = restoreResult.stdout?.trim() ?? ''; - logger - .withFields({ exitCode: restoreResult.exitCode, stdout }) - .error('Session import failed'); - emitProgress('failed', `Session import failed (exit ${restoreResult.exitCode})`); - return undefined; - } - } + // 8. 
Import pre-generated session into CLI's SQLite so the wrapper picks it up + if (input.kiloSessionId) { + emitProgress('kilo_session', 'Importing session…'); + const now = Date.now(); + const defaultTitle = 'New session - ' + new Date(now).toISOString(); + const minimalSessionJson = JSON.stringify({ + info: { + id: input.kiloSessionId, + slug: '', + projectID: '', + directory: '', + title: defaultTitle, + version: '2', + time: { created: now, updated: now }, + }, + messages: [], + }); + const importFilePath = `/tmp/kilo-empty-session-${input.kiloSessionId}.json`; + await sandbox.writeFile(importFilePath, minimalSessionJson); + const escapedFile = importFilePath.replaceAll("'", "'\\''"); + const escapedId = input.kiloSessionId.replaceAll("'", "'\\''"); + const escapedWorkspace = workspacePath.replaceAll("'", "'\\''"); + const restoreResult = await session.exec( + `bun /usr/local/bin/kilo-restore-session.js --file '${escapedFile}' '${escapedId}' '${escapedWorkspace}'`, + { cwd: dirname(workspacePath) } + ); + if (restoreResult.exitCode !== 0) { + const stdout = restoreResult.stdout?.trim() ?? ''; + logger + .withFields({ exitCode: restoreResult.exitCode, stdout }) + .error('Session import failed'); + emitProgress('failed', `Session import failed (exit ${restoreResult.exitCode})`); + return undefined; + } + } - // 9. Start wrapper (with --session-id if pre-imported) - emitProgress('kilo_server', 'Starting Kilo…'); - const { sessionId: wrapperSessionId } = await WrapperClient.ensureWrapper(sandbox, session, { - agentSessionId: input.sessionId, - userId: input.userId, - workspacePath, - sessionId: input.kiloSessionId, - }); + // 9. 
Start wrapper (with --session-id if pre-imported) + emitProgress('kilo_server', 'Starting Kilo…'); + const { sessionId: wrapperSessionId } = await WrapperClient.ensureWrapper(sandbox, session, { + agentSessionId: input.sessionId, + userId: input.userId, + workspacePath, + sessionId: input.kiloSessionId, + }); - return { - sandboxId, - workspacePath, - sessionHome, - branchName, - kiloSessionId: input.kiloSessionId ?? wrapperSessionId, - resolvedInstallationId, - resolvedGithubAppType, - resolvedGithubToken: input.githubRepo ? resolvedGithubToken : undefined, - resolvedGitToken, - gitlabTokenManaged, - }; + return { + sandboxId, + workspacePath, + sessionHome, + branchName, + kiloSessionId: input.kiloSessionId ?? wrapperSessionId, + resolvedInstallationId, + resolvedGithubAppType, + resolvedGithubToken: input.githubRepo ? resolvedGithubToken : undefined, + resolvedGitToken, + gitlabTokenManaged, + }; + } + ); } diff --git a/services/cloud-agent-next/src/router/handlers/session-prepare.ts b/services/cloud-agent-next/src/router/handlers/session-prepare.ts index e1fb3c0b06..acee5bb4bf 100644 --- a/services/cloud-agent-next/src/router/handlers/session-prepare.ts +++ b/services/cloud-agent-next/src/router/handlers/session-prepare.ts @@ -47,6 +47,7 @@ import { } from '../../services/git-token-service-client.js'; import { getPgDb } from '../../db/pg.js'; import { repoFullNameFromGitUrl } from '@kilocode/worker-utils/git-url'; +import { destroySandboxAfterInternalServerError } from '../../sandbox-recovery.js'; type SessionPrepareHandlers = { prepareSession: typeof prepareSessionHandler; @@ -421,125 +422,151 @@ const prepareSessionHandler = internalApiProtectedProcedure sleepAfter: SANDBOX_SLEEP_AFTER_SECONDS, }); - // 4. Check disk space before creating directories; clean stale workspaces if low - await checkDiskAndCleanBeforeSetup( - sandbox, - input.kilocodeOrganizationId, - ctx.userId, - cloudAgentSessionId - ); - - // 5. 
Setup workspace directories - logger.info('Setting up workspace directories'); - const { workspacePath, sessionHome } = await setupWorkspace( - sandbox, - ctx.userId, - input.kilocodeOrganizationId, - cloudAgentSessionId - ); - - // 6. Build context and create execution session - const branchName = determineBranchName(cloudAgentSessionId, input.upstreamBranch); - const context = sessionService.buildContext({ - sandboxId, - orgId: input.kilocodeOrganizationId, - userId: ctx.userId, - sessionId: cloudAgentSessionId, - workspacePath, - sessionHome, - githubRepo: input.githubRepo, - githubToken: resolvedGithubToken, // Use resolved token (from input or generated from installation) - gitUrl: input.gitUrl, - gitToken: resolvedGitToken, - platform: input.platform, - upstreamBranch: input.upstreamBranch, - botId: ctx.botId, - }); - - logger.info('Creating execution session'); - const session = await sessionService.getOrCreateSession({ - sandbox, - context, - env: ctx.env, - originalToken: ctx.authToken, - kilocodeModel: input.model, - originalOrgId: input.kilocodeOrganizationId, - createdOnPlatform: input.createdOnPlatform, - appendSystemPrompt: input.appendSystemPrompt, - profile: effective, - }); + const prepareWorkspace = async () => { + // 4. Check disk space before creating directories; clean stale workspaces if low + await checkDiskAndCleanBeforeSetup( + sandbox, + input.kilocodeOrganizationId, + ctx.userId, + cloudAgentSessionId + ); - // 7. Clone repository - const cloneOptions = input.shallow ? { shallow: true } : undefined; - logger.info('Cloning repository'); - if (input.gitUrl) { - await cloneGitRepo( - session, - workspacePath, - input.gitUrl, - resolvedGitToken, - undefined, - cloneOptions + // 5. 
Setup workspace directories + logger.info('Setting up workspace directories'); + const { workspacePath, sessionHome } = await setupWorkspace( + sandbox, + ctx.userId, + input.kilocodeOrganizationId, + cloudAgentSessionId ); - } else if (input.githubRepo) { - await cloneGitHubRepo( - session, + + // 6. Build context and create execution session + const branchName = determineBranchName(cloudAgentSessionId, input.upstreamBranch); + const context = sessionService.buildContext({ + sandboxId, + orgId: input.kilocodeOrganizationId, + userId: ctx.userId, + sessionId: cloudAgentSessionId, workspacePath, - input.githubRepo, - resolvedGithubToken, - { - GITHUB_APP_SLUG: ctx.env.GITHUB_APP_SLUG, - GITHUB_APP_BOT_USER_ID: ctx.env.GITHUB_APP_BOT_USER_ID, - }, - cloneOptions - ); - } else { - throw new TRPCError({ - code: 'BAD_REQUEST', - message: 'Either githubRepo or gitUrl must be provided', + sessionHome, + githubRepo: input.githubRepo, + githubToken: resolvedGithubToken, // Use resolved token (from input or generated from installation) + gitUrl: input.gitUrl, + gitToken: resolvedGitToken, + platform: input.platform, + upstreamBranch: input.upstreamBranch, + botId: ctx.botId, }); - } - // 8. 
Branch management - logger - .withFields({ branchName, upstreamBranch: input.upstreamBranch }) - .info('Managing branch'); - if (input.upstreamBranch) { - // For upstream branches, use manageBranch (verifies exists remotely) - await manageBranch(session, workspacePath, branchName, true); - } else { - // For session branches, create directly (can't exist remotely with UUID-based name) - const result = await session.exec(`cd ${workspacePath} && git checkout -b '${branchName}'`); - if (result.exitCode !== 0) { + logger.info('Creating execution session'); + const session = await sessionService.getOrCreateSession({ + sandbox, + context, + env: ctx.env, + originalToken: ctx.authToken, + kilocodeModel: input.model, + originalOrgId: input.kilocodeOrganizationId, + createdOnPlatform: input.createdOnPlatform, + appendSystemPrompt: input.appendSystemPrompt, + profile: effective, + }); + + // 7. Clone repository + const cloneOptions = input.shallow ? { shallow: true } : undefined; + logger.info('Cloning repository'); + if (input.gitUrl) { + await cloneGitRepo( + session, + workspacePath, + input.gitUrl, + resolvedGitToken, + undefined, + cloneOptions + ); + } else if (input.githubRepo) { + await cloneGitHubRepo( + session, + workspacePath, + input.githubRepo, + resolvedGithubToken, + { + GITHUB_APP_SLUG: ctx.env.GITHUB_APP_SLUG, + GITHUB_APP_BOT_USER_ID: ctx.env.GITHUB_APP_BOT_USER_ID, + }, + cloneOptions + ); + } else { throw new TRPCError({ - code: 'INTERNAL_SERVER_ERROR', - message: `Failed to create branch ${branchName}: ${result.stderr || result.stdout}`, + code: 'BAD_REQUEST', + message: 'Either githubRepo or gitUrl must be provided', }); } - } - // 9. Run setup commands - if (effective.setupCommands && effective.setupCommands.length > 0) { - logger.withFields({ count: effective.setupCommands.length }).info('Running setup commands'); - await runSetupCommands(session, context, effective.setupCommands, true); // fail-fast - } + // 8. 
Branch management + logger + .withFields({ branchName, upstreamBranch: input.upstreamBranch }) + .info('Managing branch'); + if (input.upstreamBranch) { + // For upstream branches, use manageBranch (verifies exists remotely) + await manageBranch(session, workspacePath, branchName, true); + } else { + // For session branches, create directly (can't exist remotely with UUID-based name) + const result = await session.exec( + `cd ${workspacePath} && git checkout -b '${branchName}'` + ); + if (result.exitCode !== 0) { + throw new TRPCError({ + code: 'INTERNAL_SERVER_ERROR', + message: `Failed to create branch ${branchName}: ${result.stderr || result.stdout}`, + }); + } + } - // 10. Write auth file for session ingest, plus global rules. - // (runtime skills were written by getOrCreateSession above) - await writeAuthFile(sandbox, sessionHome, ctx.authToken); - await writeGlobalRules(sandbox, sessionHome, cloudAgentSessionId); + // 9. Run setup commands + if (effective.setupCommands && effective.setupCommands.length > 0) { + logger + .withFields({ count: effective.setupCommands.length }) + .info('Running setup commands'); + await runSetupCommands(session, context, effective.setupCommands, true); // fail-fast + } - // 11. Start wrapper (which starts kilo server in-process and creates session) - logger.info('Starting wrapper'); - const { client: _wrapperClient, sessionId: kiloSessionId } = - await WrapperClient.ensureWrapper(sandbox, session, { - agentSessionId: cloudAgentSessionId, - userId: ctx.userId, - workspacePath, - }); + // 10. Write auth file for session ingest, plus global rules. + // (runtime skills were written by getOrCreateSession above) + await writeAuthFile(sandbox, sessionHome, ctx.authToken); + await writeGlobalRules(sandbox, sessionHome, cloudAgentSessionId); + + // 11. 
Start wrapper (which starts kilo server in-process and creates session) + logger.info('Starting wrapper'); + const { client: _wrapperClient, sessionId: kiloSessionId } = + await WrapperClient.ensureWrapper(sandbox, session, { + agentSessionId: cloudAgentSessionId, + userId: ctx.userId, + workspacePath, + }); + + logger.setTags({ kiloSessionId }); + logger.info('Wrapper started, kilo session created'); + + return { workspacePath, sessionHome, branchName, kiloSessionId }; + }; + + let preparedWorkspace: Awaited<ReturnType<typeof prepareWorkspace>>; + try { + preparedWorkspace = await prepareWorkspace(); + } catch (error) { + await destroySandboxAfterInternalServerError( + { + sandbox, + sandboxId, + sessionId: cloudAgentSessionId, + phase: 'prepareSession', + }, + error + ); + throw error; + } - logger.setTags({ kiloSessionId }); - logger.info('Wrapper started, kilo session created'); + const { workspacePath, sessionHome, branchName, kiloSessionId } = preparedWorkspace; // 13. Create cli_sessions_v2 record via session-ingest RPC (blocking) logger.info('Creating cli_sessions_v2 record via session-ingest'); diff --git a/services/cloud-agent-next/src/sandbox-recovery.test.ts b/services/cloud-agent-next/src/sandbox-recovery.test.ts new file mode 100644 index 0000000000..eee1a98696 --- /dev/null +++ b/services/cloud-agent-next/src/sandbox-recovery.test.ts @@ -0,0 +1,123 @@ +import { describe, expect, it, vi } from 'vitest'; + +const { mockError, mockInfo, mockWithFields } = vi.hoisted(() => { + const error = vi.fn(); + const info = vi.fn(); + const withFields = vi.fn(() => ({ error, info })); + return { mockError: error, mockInfo: info, mockWithFields: withFields }; +}); + +vi.mock('./logger.js', () => ({ + logger: { + withFields: mockWithFields, + }, +})); + +import { + destroySandboxAfterInternalServerError, + isSandboxInternalServerError, + withSandboxInternalServerErrorRecovery, +} from './sandbox-recovery.js'; + +describe('sandbox recovery', () => { + it('classifies sandbox SDK internal server errors', 
() => { + const error = new Error('control plane failed'); + Object.assign(error, { + name: 'SandboxError', + code: 'INTERNAL_ERROR', + httpStatus: 500, + errorResponse: { + code: 'INTERNAL_ERROR', + httpStatus: 500, + }, + }); + + expect(isSandboxInternalServerError(error)).toBe(true); + }); + + it('classifies nested wrapper failures caused by sandbox 500s', () => { + const cause = new Error('HTTP error! status: 500'); + Object.assign(cause, { name: 'SandboxError' }); + const error = new Error('Failed to start wrapper: HTTP error! status: 500', { cause }); + Object.assign(error, { + name: 'ExecutionError', + code: 'WRAPPER_START_FAILED', + }); + + expect(isSandboxInternalServerError(error)).toBe(true); + }); + + it('does not classify execution errors by wrapper message alone', () => { + const error = new Error('Failed to start wrapper: HTTP error! status: 500'); + Object.assign(error, { + name: 'ExecutionError', + code: 'WRAPPER_START_FAILED', + }); + + expect(isSandboxInternalServerError(error)).toBe(false); + }); + + it('does not classify regular internal errors as sandbox 500s', () => { + expect(isSandboxInternalServerError(new Error('Internal server error'))).toBe(false); + expect(isSandboxInternalServerError(new Error('Git clone failed'))).toBe(false); + }); + + it('does not classify workspace execution wrappers around non-sandbox 500s', () => { + const cause = new Error('Upstream API failed with HTTP 500'); + const error = new Error('Failed to prepare workspace: Upstream API failed with HTTP 500', { + cause, + }); + Object.assign(error, { + name: 'ExecutionError', + code: 'WORKSPACE_SETUP_FAILED', + }); + + expect(isSandboxInternalServerError(error)).toBe(false); + }); + + it('does not classify plain HTTP 500 messages without sandbox context', () => { + expect(isSandboxInternalServerError(new Error('HTTP error! 
status: 500'))).toBe(false); + }); + + it('destroys sandbox when a preparation operation throws a sandbox 500', async () => { + const sandbox = { destroy: vi.fn().mockResolvedValue(undefined) }; + const error = new Error('HTTP error! status: 500'); + Object.assign(error, { name: 'SandboxError' }); + + await expect( + withSandboxInternalServerErrorRecovery( + { + sandbox, + sandboxId: 'ses-test', + sessionId: 'agent_test', + phase: 'asyncPreparation', + }, + async () => { + throw error; + } + ) + ).rejects.toBe(error); + + expect(sandbox.destroy).toHaveBeenCalledOnce(); + expect(mockError).toHaveBeenCalledWith( + 'Sandbox returned 500 during workspace preparation; destroying sandbox' + ); + expect(mockInfo).toHaveBeenCalledWith('Destroyed sandbox after workspace preparation 500'); + }); + + it('does not destroy sandbox for unrelated errors', async () => { + const sandbox = { destroy: vi.fn().mockResolvedValue(undefined) }; + const destroyed = await destroySandboxAfterInternalServerError( + { + sandbox, + sandboxId: 'ses-test', + sessionId: 'agent_test', + phase: 'asyncPreparation', + }, + new Error('Git clone failed') + ); + + expect(destroyed).toBe(false); + expect(sandbox.destroy).not.toHaveBeenCalled(); + }); +}); diff --git a/services/cloud-agent-next/src/sandbox-recovery.ts b/services/cloud-agent-next/src/sandbox-recovery.ts new file mode 100644 index 0000000000..8e4529548c --- /dev/null +++ b/services/cloud-agent-next/src/sandbox-recovery.ts @@ -0,0 +1,167 @@ +import { logger } from './logger.js'; + +type DestroyableSandbox = { + destroy(): Promise<void>; +}; + +type RecoveryContext = { + sandbox: DestroyableSandbox; + sandboxId: string; + sessionId?: string; + phase: string; +}; + +function isRecord(value: unknown): value is Record<string, unknown> { + return typeof value === 'object' && value !== null; +} + +function getStringProperty(value: unknown, key: string): string | undefined { + if (!isRecord(value)) return undefined; + + const property = value[key]; + return typeof 
property === 'string' ? property : undefined; +} + +function getNumberProperty(value: unknown, key: string): number | undefined { + if (!isRecord(value)) return undefined; + + const property = value[key]; + return typeof property === 'number' ? property : undefined; +} + +function getNestedProperty(value: unknown, key: string): unknown { + if (!isRecord(value)) return undefined; + return value[key]; +} + +function getErrorMessage(error: unknown): string { + if (error instanceof Error) return error.message; + return String(error); +} + +function messageLooksLikeSandboxInternalServerError(message: string): boolean { + return ( + /http\s+error!\s+status:\s*500\b/i.test(message) || + /http\s*500\b/i.test(message) || + /status:\s*500\b/i.test(message) || + (/internal server error/i.test(message) && /(sandbox|container|cloudflare)/i.test(message)) + ); +} + +function isSandboxErrorObject(value: unknown): boolean { + const name = getStringProperty(value, 'name'); + const code = getStringProperty(value, 'code'); + + return name === 'SandboxError' || code === 'INTERNAL_ERROR'; +} + +function hasInternalServerStatus(value: unknown): boolean { + if (getNumberProperty(value, 'httpStatus') === 500) return true; + + const errorResponse = getNestedProperty(value, 'errorResponse'); + if (getNumberProperty(errorResponse, 'httpStatus') === 500) return true; + + return ( + getNumberProperty(value, 'status') === 500 && + (isSandboxErrorObject(value) || isSandboxErrorObject(errorResponse)) + ); +} + +function isSandboxInternalServerErrorWithSeen(error: unknown, seen: WeakSet): boolean { + if (typeof error === 'string') { + return messageLooksLikeSandboxInternalServerError(error); + } + + if (!isRecord(error)) { + return false; + } + + if (seen.has(error)) { + return false; + } + seen.add(error); + + if (hasInternalServerStatus(error)) { + return true; + } + + const sandboxErrorObject = isSandboxErrorObject(error); + const message = getStringProperty(error, 'message') ?? 
getErrorMessage(error); + if (messageLooksLikeSandboxInternalServerError(message) && sandboxErrorObject) { + return true; + } + + // Wrapped errors (e.g. ExecutionError with code WRAPPER_START_FAILED, or + // workspace setup wrappers) are classified by walking errorResponse and cause + // so we recover whenever the underlying SandboxError is a 500. + const errorResponse = getNestedProperty(error, 'errorResponse'); + if (isSandboxInternalServerErrorWithSeen(errorResponse, seen)) { + return true; + } + + const cause = getNestedProperty(error, 'cause'); + return isSandboxInternalServerErrorWithSeen(cause, seen); +} + +export function isSandboxInternalServerError(error: unknown): boolean { + return isSandboxInternalServerErrorWithSeen(error, new WeakSet()); +} + +export async function destroySandboxAfterInternalServerError( + context: RecoveryContext, + error: unknown +): Promise { + if (!isSandboxInternalServerError(error)) { + return false; + } + + const errorMessage = getErrorMessage(error); + logger + .withFields({ + sandboxId: context.sandboxId, + sessionId: context.sessionId, + phase: context.phase, + error: errorMessage, + logTag: 'sandbox_500_detected', + }) + .error('Sandbox returned 500 during workspace preparation; destroying sandbox'); + + try { + await context.sandbox.destroy(); + logger + .withFields({ + sandboxId: context.sandboxId, + sessionId: context.sessionId, + phase: context.phase, + logTag: 'sandbox_500_destroyed', + }) + .info('Destroyed sandbox after workspace preparation 500'); + return true; + } catch (destroyError) { + logger + .withFields({ + sandboxId: context.sandboxId, + sessionId: context.sessionId, + phase: context.phase, + originalError: errorMessage, + destroyError: getErrorMessage(destroyError), + logTag: 'sandbox_500_destroy_failed', + }) + .error('Failed to destroy sandbox after workspace preparation 500'); + return false; + } +} + +export async function withSandboxInternalServerErrorRecovery( + context: RecoveryContext, + 
operation: () => Promise +): Promise { + try { + return await operation(); + } catch (error) { + const cause = getNestedProperty(error, 'cause'); + const recoveryError = isSandboxInternalServerError(cause) ? cause : error; + await destroySandboxAfterInternalServerError(context, recoveryError); + throw error; + } +} diff --git a/services/cloud-agent-next/src/session-service.ts b/services/cloud-agent-next/src/session-service.ts index a90b03fcae..f08c489b28 100644 --- a/services/cloud-agent-next/src/session-service.ts +++ b/services/cloud-agent-next/src/session-service.ts @@ -36,6 +36,7 @@ import { decryptWithPrivateKey, mergeEnvVarsWithSecrets } from './utils/encrypti import type { MCPSecretValue } from './router/schemas.js'; import type { SessionProfileBundle } from './session-profile.js'; import { readProfileBundle } from './session-profile.js'; +import { destroySandboxAfterInternalServerError } from './sandbox-recovery.js'; const SETUP_COMMAND_TIMEOUT_SECONDS = 300; // 5 minutes const SANDBOX_RETRY_DEFAULTS = { @@ -1679,15 +1680,29 @@ export class SessionService { // Wrapper will be (re)started by the orchestrator after we return onProgress?.('kilo_server', 'Starting Kilo…'); } catch (error) { - // Remove the workspace and sessionHome so the next retry sees a true - // cold start and re-runs the full restore from scratch. - logger - .withFields({ + const sandboxDestroyed = await destroySandboxAfterInternalServerError( + { + sandbox, + sandboxId: context.sandboxId, sessionId, - error: error instanceof Error ? error.message : String(error), - }) - .warn('Cold-start resume step failed; removing workspace for clean retry'); - await cleanupWorkspace(session, context.workspacePath, context.sessionHome); + phase: 'coldStartResume', + }, + error + ); + + // If we destroyed the sandbox, the workspace is gone with it and the next + // retry will start from a fresh container. 
Otherwise, remove the workspace + // and sessionHome so the next retry sees a true cold start and re-runs the + // full restore from scratch. + if (!sandboxDestroyed) { + logger + .withFields({ + sessionId, + error: error instanceof Error ? error.message : String(error), + }) + .warn('Cold-start resume step failed; removing workspace for clean retry'); + await cleanupWorkspace(session, context.workspacePath, context.sessionHome); + } throw error; } diff --git a/services/cloud-agent-next/src/workspace.test.ts b/services/cloud-agent-next/src/workspace.test.ts index 42de83d7ba..b6a1caa08b 100644 --- a/services/cloud-agent-next/src/workspace.test.ts +++ b/services/cloud-agent-next/src/workspace.test.ts @@ -590,6 +590,16 @@ describe('disk space checking', () => { ); expect(mockTimeoutWarn).toHaveBeenCalledWith('Sandbox operation timed out'); }); + + it('preserves sandbox 500 errors for recovery handling', async () => { + const error = new Error('HTTP error! status: 500'); + Object.assign(error, { name: 'SandboxError' }); + mockGitCheckout.mockRejectedValueOnce(error); + + await expect( + cloneGitRepo(fakeSession, '/workspace', 'https://example.com/repo.git') + ).rejects.toBe(error); + }); }); describe('updateGitRemoteToken', () => { diff --git a/services/cloud-agent-next/src/workspace.ts b/services/cloud-agent-next/src/workspace.ts index c61a2897ba..2d44fdec54 100644 --- a/services/cloud-agent-next/src/workspace.ts +++ b/services/cloud-agent-next/src/workspace.ts @@ -12,6 +12,7 @@ import { withSandboxOperationTimeoutLog, } from './sandbox-timeout-logging.js'; import { withTimeout } from '@kilocode/worker-utils'; +import { isSandboxInternalServerError } from './sandbox-recovery.js'; /** * Minimal interface for running shell commands. @@ -121,7 +122,8 @@ export type SessionPaths = { /** * Check disk space and clean up stale workspaces if low, using the sandbox * directly so it can run before any session or workspace directory exists. 
- * Errors are caught and logged — never rethrown — so cleanup failure never blocks setup. + * Errors are caught and logged so cleanup failure never blocks setup, except + * sandbox 500s which indicate a bad container that should be destroyed. */ export async function checkDiskAndCleanBeforeSetup( sandbox: SandboxInstance, @@ -136,6 +138,13 @@ export async function checkDiskAndCleanBeforeSetup( await cleanupStaleWorkspaces(sandbox, getBaseWorkspacePath(orgId, userId), sessionId); } } catch (error) { + if (isSandboxInternalServerError(error)) { + logger + .withFields({ error: error instanceof Error ? error.message : String(error) }) + .error('Pre-setup disk check hit sandbox 500, aborting workspace setup'); + throw error; + } + // Log and continue — a failed disk check should not block workspace setup. // The worst case is that mkdir fails (which it would have anyway without cleanup). logger @@ -157,7 +166,8 @@ export async function setupWorkspace( await sandbox.mkdir(sessionWorkspacePath, { recursive: true }); } catch (error) { throw new Error( - `Failed to create workspace directory: ${error instanceof Error ? error.message : String(error)}` + `Failed to create workspace directory: ${error instanceof Error ? error.message : String(error)}`, + { cause: error } ); } @@ -165,7 +175,8 @@ export async function setupWorkspace( await sandbox.mkdir(sessionHome, { recursive: true }); } catch (error) { throw new Error( - `Failed to prepare session home: ${error instanceof Error ? error.message : String(error)}` + `Failed to prepare session home: ${error instanceof Error ? error.message : String(error)}`, + { cause: error } ); } @@ -550,6 +561,11 @@ export async function cloneGitRepo( error: err instanceof Error ? err.message : String(err), gitUrl: sanitizedGitUrl, }); + + if (isSandboxInternalServerError(err)) { + throw err; + } + // Throw generic error to avoid leaking token in response throw new Error(`Failed to clone repository from ${sanitizedGitUrl}`); }