-
Notifications
You must be signed in to change notification settings - Fork 38
fix(gastown): bug fixes — org billing (#1756), reconciler spam (#1364), stuck MR beads (#1632), session leak (#1341) #1862
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
efac6ed
0c942dd
09a22c6
c421917
d85a726
fca2b59
8223168
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -620,6 +620,14 @@ export async function startAgent( | |
| console.log( | ||
| `${MANAGER_LOG} startAgent: stopping existing session for ${request.agentId} (status=${existing.status})` | ||
| ); | ||
|
|
||
| // If the agent is still starting, abort the in-flight startup to prevent | ||
| // an orphaned session from being created after stopAgent returns. | ||
| if (existing.status === 'starting' && existing.startupAbortController) { | ||
| console.log(`${MANAGER_LOG} startAgent: aborting in-flight startup for ${request.agentId}`); | ||
| existing.startupAbortController.abort(); | ||
| } | ||
|
|
||
| await stopAgent(request.agentId).catch(err => { | ||
| console.warn( | ||
| `${MANAGER_LOG} startAgent: failed to stop existing session for ${request.agentId}`, | ||
|
|
@@ -629,6 +637,7 @@ export async function startAgent( | |
| } | ||
|
|
||
| const now = new Date().toISOString(); | ||
| const startupAbortController = new AbortController(); | ||
| const agent: ManagedAgent = { | ||
| agentId: request.agentId, | ||
| rigId: request.rigId, | ||
|
|
@@ -653,15 +662,22 @@ export async function startAgent( | |
| completionCallbackUrl: request.envVars?.GASTOWN_COMPLETION_CALLBACK_URL ?? null, | ||
| model: request.model ?? null, | ||
| startupEnv: env, | ||
| startupAbortController, | ||
| }; | ||
| agents.set(request.agentId, agent); | ||
|
|
||
| const { signal } = startupAbortController; | ||
| let sessionCounted = false; | ||
| try { | ||
| // 1. Ensure SDK server is running for this workdir | ||
| const { client, port } = await ensureSDKServer(workdir, env); | ||
| agent.serverPort = port; | ||
|
|
||
| // Check if startup was cancelled while waiting for the SDK server | ||
| if (signal.aborted) { | ||
| throw new StartupAbortedError(request.agentId); | ||
| } | ||
|
|
||
| // Track session count on the SDK instance | ||
| const instance = sdkInstances.get(workdir); | ||
| if (instance) { | ||
|
|
@@ -671,6 +687,10 @@ export async function startAgent( | |
|
|
||
| // 2. Create a session | ||
| const sessionResult = await client.session.create({ body: {} }); | ||
|
|
||
| // Parse and store the session ID immediately so the catch block can | ||
| // abort an orphaned session if startupAbortController fires during | ||
| // the await above. | ||
| const rawSession: unknown = sessionResult.data ?? sessionResult; | ||
| const parsed = SessionResponse.safeParse(rawSession); | ||
| if (!parsed.success) { | ||
|
|
@@ -684,6 +704,12 @@ export async function startAgent( | |
| const sessionId = parsed.data.id; | ||
| agent.sessionId = sessionId; | ||
|
|
||
| // Now check if startup was cancelled while creating the session. | ||
| // agent.sessionId is already set, so the catch block will abort it. | ||
| if (signal.aborted) { | ||
| throw new StartupAbortedError(request.agentId); | ||
| } | ||
|
|
||
| // 3. Subscribe to events (async, runs in background) | ||
| void subscribeToEvents(client, agent, request); | ||
|
|
||
|
|
@@ -705,6 +731,11 @@ export async function startAgent( | |
| modelParam = { providerID: 'kilo', modelID: request.model }; | ||
| } | ||
|
|
||
| // Final abort check before sending the prompt | ||
| if (signal.aborted) { | ||
| throw new StartupAbortedError(request.agentId); | ||
| } | ||
|
|
||
| await client.session.prompt({ | ||
| path: { id: sessionId }, | ||
| body: { | ||
|
|
@@ -722,6 +753,7 @@ export async function startAgent( | |
| sessionCounted = false; | ||
| throw new Error('Event stream failed during initial prompt'); | ||
| } | ||
| agent.startupAbortController = null; | ||
|
|
||
| agent.messageCount = 1; | ||
|
|
||
|
|
@@ -735,7 +767,39 @@ export async function startAgent( | |
|
|
||
| return agent; | ||
| } catch (err) { | ||
| // On abort, clean up silently — the new startAgent invocation will | ||
| // proceed with a fresh entry. | ||
| if (err instanceof StartupAbortedError) { | ||
| console.log(`${MANAGER_LOG} startAgent: startup aborted for ${request.agentId}, cleaning up`); | ||
| if (sessionCounted) { | ||
| const instance = sdkInstances.get(workdir); | ||
| if (instance) { | ||
| // Abort the orphaned session if one was created before the abort | ||
| if (agent.sessionId) { | ||
| try { | ||
| await instance.client.session.abort({ path: { id: agent.sessionId } }); | ||
| } catch (abortErr) { | ||
| console.error( | ||
| `${MANAGER_LOG} startAgent: failed to abort orphaned session ${agent.sessionId}:`, | ||
| abortErr | ||
| ); | ||
| } | ||
| } | ||
| instance.sessionCount--; | ||
| if (instance.sessionCount <= 0) { | ||
| instance.server.close(); | ||
| sdkInstances.delete(workdir); | ||
| } | ||
| } | ||
| } | ||
| if (agents.get(request.agentId) === agent) { | ||
| agents.delete(request.agentId); | ||
| } | ||
| throw err; | ||
| } | ||
|
|
||
| agent.status = 'failed'; | ||
| agent.startupAbortController = null; | ||
| agent.exitReason = err instanceof Error ? err.message : String(err); | ||
| if (sessionCounted) { | ||
| const instance = sdkInstances.get(workdir); | ||
|
|
@@ -745,6 +809,18 @@ export async function startAgent( | |
| } | ||
| } | ||
|
|
||
/**
 * Thrown when a startup sequence is cancelled via AbortController.
 *
 * `startAgent` throws this after observing `signal.aborted` at one of its
 * checkpoints. It is a distinct type so the catch block can recognise it
 * with `instanceof`, clean up, and rethrow without marking the agent as
 * failed (a new startup is taking over).
 */
class StartupAbortedError extends Error {
  /** ID of the agent whose startup was cancelled. */
  readonly agentId: string;

  /**
   * @param agentId - ID of the agent whose in-flight startup was aborted;
   *   embedded in the error message and exposed as `agentId`.
   */
  constructor(agentId: string) {
    super(`Startup aborted for agent ${agentId}`);
    // Restore the prototype chain explicitly: when a build down-levels
    // classes to ES5, subclasses of Error otherwise lose their prototype and
    // `err instanceof StartupAbortedError` evaluates to false.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = 'StartupAbortedError';
    this.agentId = agentId;
  }
}
|
|
||
| /** | ||
| * Stop an agent by aborting its session. | ||
| */ | ||
|
|
@@ -753,6 +829,13 @@ export async function stopAgent(agentId: string): Promise<void> { | |
| if (!agent) throw new Error(`Agent ${agentId} not found`); | ||
| if (agent.status !== 'running' && agent.status !== 'starting') return; | ||
|
|
||
| // If still starting, abort the in-flight startup so session.create() | ||
| // doesn't produce an orphaned session after we return. | ||
| if (agent.startupAbortController) { | ||
| agent.startupAbortController.abort(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. WARNING: Restart cleanup can release the same session twice. After aborting the startup controller here, |
||
| agent.startupAbortController = null; | ||
| } | ||
|
|
||
| agent.status = 'stopping'; | ||
|
|
||
| // Cancel any pending idle timer | ||
|
|
@@ -839,6 +922,12 @@ export async function sendMessage(agentId: string, prompt: string): Promise<void | |
| * by `buildKiloConfigContent` at agent startup. | ||
| */ | ||
| function extractOrganizationId(): string | undefined { | ||
| // Primary source: standalone env var set by control-server on /agents/start | ||
| // and updated on every PATCH /model via X-Town-Config. | ||
| const envOrgId = process.env.GASTOWN_ORGANIZATION_ID; | ||
| if (envOrgId) return envOrgId; | ||
|
|
||
| // Fallback: extract from KILO_CONFIG_CONTENT (legacy path) | ||
| const raw = process.env.KILO_CONFIG_CONTENT; | ||
| if (!raw) return undefined; | ||
| try { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.