From 180768ff078c51d512d63c4d84a1d555b71487e7 Mon Sep 17 00:00:00 2001 From: Stephen Hellicar Date: Thu, 26 Mar 2026 01:26:49 +1100 Subject: [PATCH 1/4] Suppress SDK abort crash on early escape (#121) --- src/ClaudeCli.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/ClaudeCli.ts b/src/ClaudeCli.ts index cebf266..754242a 100644 --- a/src/ClaudeCli.ts +++ b/src/ClaudeCli.ts @@ -1,7 +1,7 @@ import { appendFileSync, type FSWatcher, readFileSync, statSync, watch } from 'node:fs'; import { homedir } from 'node:os'; import { resolve } from 'node:path'; -import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk'; +import { AbortError, type SDKMessage } from '@anthropic-ai/claude-agent-sdk'; import type { DocumentBlockParam, ImageBlockParam, SearchResultBlockParam, TextBlockParam, ToolReferenceBlockParam } from '@anthropic-ai/sdk/resources'; import { ExecInputSchema } from '@shellicar/mcp-exec'; import { AppState } from './AppState.js'; @@ -818,6 +818,19 @@ export class ClaudeCli { } this.term = new Terminal(this.appState, config.drowningThreshold, this.attachmentStore, this.commandMode); + + // Workaround for SDK bug (#121): the SDK's readMessages calls + // handleControlRequest fire-and-forget. When we abort during early + // query initialization, its catch block's transport.write() throws + // AbortError as an unhandled rejection that crashes the process. + process.on('unhandledRejection', (reason: unknown) => { + if (reason instanceof AbortError) { + this.term.error(`[suppressed] ${reason.message}`); + return; + } + throw reason; + }); + this.session = new QuerySession(config.model, config.maxTurns, config.thinking, config.thinkingEffort); this.session.shellicarMcp = config.shellicarMcp; From 29e8a0f23f52e3d2f8f28390976f884640565e11 Mon Sep 17 00:00:00 2001 From: Stephen Hellicar Date: Thu, 26 Mar 2026 01:28:59 +1100 Subject: [PATCH 2/4] Add session log for #121 Phase 3 --- .claude/CLAUDE.md | 2 +- .claude/sessions/2026-03-25.md | 127 +++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index b836541..48d6507 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -73,7 +73,7 @@ Only update the `Status` field — do not modify any other frontmatter or prompt ## Current State Branch: `main` -In-progress: PR #119 (release alpha.70) open, auto-merge enabled. After merge, run github-release skill to publish. +In-progress: PR #119 (release alpha.70) open, auto-merge enabled. After merge, run github-release skill to publish. PR #122 (fix #121, abort crash on early escape) open, auto-merge enabled. diff --git a/.claude/sessions/2026-03-25.md b/.claude/sessions/2026-03-25.md index 8deba0d..938961c 100644 --- a/.claude/sessions/2026-03-25.md +++ b/.claude/sessions/2026-03-25.md @@ -7,3 +7,130 @@ - Decisions: Version alpha.70 confirmed by SC. Change 1 from prompt 116 (mcp-exec bump) was already merged as part of PR #118, so only the release step was needed. PR #119 closes both #115 and #116. - Next: After PR #119 merges, run github-release skill to publish the release - Violations: Used bash as program with script as arg in first tool call; args do not expand ~. Should have used script path directly as program. Caught and corrected. + +### 23:56 - abort/investigation (Phase 1 of #121) + +- Did: Traced the abort crash from Escape keypress through claude-cli and into the SDK. Identified root cause as an SDK bug in `handleControlRequest` fire-and-forget call. Produced detailed findings and proposed fix below. +- Files: None (investigation only) +- Decisions: Fix belongs in claude-cli as a workaround (upstream SDK bug). Proposed approach: install `unhandledRejection` handler in `cancel()` to suppress SDK's "Operation aborted" error. +- Next: Phase 2 (implementation) can proceed using the findings below. +- Violations: None + +#### Investigation Findings for Phase 2 + +**Root cause**: SDK bug in `@anthropic-ai/claude-agent-sdk@0.2.83`. The SDK's `G0.readMessages()` (demangled: `Query.readMessages()`) calls `handleControlRequest()` fire-and-forget (no `.catch()`, no `await`). When the transport is aborted mid-request, `handleControlRequest`'s catch block tries to write an error response back to the transport, but `Q0.write()` (demangled: `ProcessTransport.write()`) throws `AbortError("Operation aborted")` because the abort signal is active. This second throw escapes the catch block as an unhandled promise rejection, crashing the process. + +**Evidence that the SDK authors know this can reject**: In `processPendingPermissionRequests`, the SDK calls `this.handleControlRequest(X).catch(() => {})` (with `.catch()`). But in `readMessages`, the same call is made without `.catch()`. + +**Step-by-step trace of early escape crash**: + +1. User sends query: `session.send()` creates `AbortController`, calls SDK `query()` which spawns Claude Code child process and starts `readMessages()` loop (fire-and-forget) and `initialize()`. +2. Claude Code process sends a `control_request` (e.g., `can_use_tool` for tool permission check) through stdout. +3. SDK's `readMessages` loop reads this message and calls `handleControlRequest($)` fire-and-forget. +4. `handleControlRequest` calls `processControlRequest`, which calls claude-cli's `canUseTool` callback. This either creates a `PermissionManager` waiter (awaiting user input) or returns immediately. +5. User presses Escape: `ClaudeCli.handleKey()` at `src/ClaudeCli.ts:509-528`: + - `permissions.cancelAll()` resolves any pending waiters with `{behavior: 'deny'}` (synchronous) + - `setTimeout(() => session.cancel(), 0)` schedules abort for next macrotask +6. `cancel()` fires (`src/session.ts:197-201`): + - `this.abort.abort()` signals the `AbortController`, transport's abort handler kills child process with SIGTERM + - `this.activeQuery.close()` calls SDK's `cleanup()`: closes transport (`ready=false`, stdin ended), calls `inputStream.done()` (exits the `for await` loop in `send()`) +7. `readMessages` is still running (pipe buffer may contain data). It reads the `control_request` from the buffer and calls `handleControlRequest` fire-and-forget. +8. `handleControlRequest` calls `canUseTool` callback. The callback returns `{behavior: 'deny', message: 'Query is no longer active'}` immediately (since `session.isActive` is false after cleanup). +9. `handleControlRequest` try block builds success response and calls `transport.write(response)`. +10. `ProcessTransport.write()` checks `this.abortController.signal.aborted` (true) and throws `AbortError("Operation aborted")`. +11. `handleControlRequest` catch block catches the error. Builds error response. Calls `transport.write(errorResponse)`. +12. `write()` throws `AbortError("Operation aborted")` again. +13. This second throw escapes the catch block. Since `handleControlRequest` was called fire-and-forget (no `.catch()`), it becomes an unhandled promise rejection. +14. Node.js default behavior: terminate the process. + +**Why late abort works (Escape after output has started)**: + +When aborting after output has started, the SDK is in a different state: +- The model is generating text/thinking, not requesting tool permissions +- There are no in-flight `handleControlRequest` calls (no pending `control_request` messages) +- The abort causes `readMessages`'s `for await` loop to exit (stdout closes when child is killed) +- `readMessages`'s try/catch handles the stream termination error +- No fire-and-forget `handleControlRequest` rejection to escape + +The crash ONLY occurs when there is an in-flight `control_request` being processed at abort time. This happens during early query initialization (before the model starts generating text). + +**Can this be fixed in claude-cli alone?** + +Yes, as a workaround. The upstream fix would be adding `.catch()` to the `handleControlRequest` call in `readMessages`, or wrapping the catch block's `write()` in its own try/catch. Since we cannot change the SDK, we catch the unhandled rejection at the process level. + +**Proposed fix (in `src/session.ts`):** + +In the `cancel()` method, install a temporary `process.on('unhandledRejection')` handler before aborting. The handler suppresses `AbortError("Operation aborted")` errors. A `setTimeout` with `.unref()` removes the handler after SDK cleanup settles. Non-abort rejections are re-thrown as uncaught exceptions (preserving Node.js default crash behavior). + +```typescript +public cancel(): void { + this.aborted = true; + + // Workaround for SDK bug (#121): the SDK's readMessages calls + // handleControlRequest fire-and-forget. When we abort, its catch + // block's transport.write() throws "Operation aborted" as an + // unhandled rejection that crashes the process. Suppress it. + const suppress = (reason: unknown) => { + if (reason instanceof Error && reason.message === 'Operation aborted') { + return; + } + throw reason; + }; + process.on('unhandledRejection', suppress); + setTimeout(() => process.off('unhandledRejection', suppress), 5000).unref(); + + this.abort?.abort(); + this.activeQuery?.close(); +} +``` + +Key details for the implementer: +- The handler checks `reason.message === 'Operation aborted'` (the exact string from `AbortError`) +- 5-second timeout covers the SDK's 2s SIGTERM timeout plus margin for pipe buffer draining +- `.unref()` ensures the timer doesn't keep the process alive on exit +- Non-abort rejections are re-thrown (becomes `uncaughtException`, crashes the process as expected) +- No changes needed to `send()` or any other method +- `AbortError` is exported from the SDK (`@anthropic-ai/claude-agent-sdk`), so the implementer could use `instanceof AbortError` instead of string matching if preferred + +**SDK references (minified names)**: +- `Q0` = `ProcessTransport`, `write()` at sdk.mjs line 19, throws on aborted signal +- `G0` = `Query`, `handleControlRequest()` at sdk.mjs line 19-20, fire-and-forget in `readMessages()` +- `u$` = `AbortError` (exported as `AbortError` in sdk.d.ts line 17), `class u$ extends Error{}` +- `Y0` = input stream (async queue), `done()` ends the `for await` loop in `send()` + +### 00:32 - abort/implementation (Phase 2 of #121) FAILED VERIFICATION + +- Did: Implemented the abort crash fix in `session.ts`. Added a temporary `unhandledRejection` handler in `cancel()` that suppresses the SDK's `AbortError` during abort teardown. Used `instanceof AbortError` (exported from SDK) rather than string matching. +- Files: src/session.ts +- **Failed**: The 5-second fixed timeout is insufficient. If a `canUseTool` callback takes longer than 5 seconds to resolve (e.g. waiting for user permission input), the handler expires before the SDK's fire-and-forget `handleControlRequest` rejects. The crash still occurs. +- **Corrected approach**: Install the handler in `cancel()`, remove it in `send()`'s `finally` block. This ties the handler lifetime to the actual query lifecycle instead of a fixed timeout. Use `import { AbortError } from '@anthropic-ai/claude-agent-sdk';` and `instanceof AbortError` for the check. +- Next: Phase 2 must be re-done with the corrected approach. Revert the current `session.ts` changes first. + +### 01:01 - abort/implementation (Phase 2 of #121) FAILED VERIFICATION + +- Did: Implemented the abort crash fix in `session.ts` using the corrected approach from the failed attempt. Handler installed in `cancel()`, removed in `send()`'s `finally` block (query lifecycle, not fixed timeout). Uses `instanceof AbortError` from the SDK. +- Files: src/session.ts +- **Failed**: The `send()` `finally` block runs immediately on abort (the `for await` loop exits when `inputStream.done()` is called). But the SDK's background `readMessages` loop and `canUseTool` callbacks fire after `send()` has fully returned. The handler is removed before the error occurs. +- **Root problem**: No lifecycle boundary in claude-cli outlives the SDK's background callbacks. Scoping the handler to `cancel()`, `send()`, or any timeout all fail for the same reason. +- **Corrected approach**: Install a global `unhandledRejection` handler at application startup (not scoped to abort). Use `import { AbortError } from '@anthropic-ai/claude-agent-sdk';` and `instanceof AbortError`. Log the error with `term.error` (non-fatal). This is standard Node.js practice; there is no legitimate scenario where an unhandled rejection should crash a CLI mid-session. +- Next: Phase 2 must be re-done with the global handler approach. Revert the current `session.ts` changes first. + +### 01:12 - abort/implementation (Phase 2 of #121) + +- Did: Installed a global `unhandledRejection` handler in `ClaudeCli.start()` (after `term` is created) that suppresses SDK `AbortError` rejections and re-throws all others. All verification passes (type-check, build, 105 tests, biome lint). +- Files: src/ClaudeCli.ts +- Decisions: Handler placed in `ClaudeCli.start()` rather than `main.ts` because `term.error` is only available after Terminal construction. Still fires before any user interaction. Used `instanceof AbortError` (SDK export) rather than string matching for type safety. +- Next: Phase 3 (ship): stage, commit, push, create PR. Awaiting manual verification of reproduction steps by Supreme Commander. +- Violations: None + +#### Summary for Phase 3 + +The SDK's `readMessages` loop calls `handleControlRequest` fire-and-forget (no `.catch()`). When the user aborts a query before the first SDK event, the transport is already closed, and `handleControlRequest`'s catch block throws `AbortError("Operation aborted")` as an unhandled promise rejection, crashing the process. Two scoped approaches (fixed timeout, query lifecycle) failed because no claude-cli lifecycle boundary outlives the SDK's background callbacks. The fix installs a global `unhandledRejection` handler at startup that suppresses SDK `AbortError` instances (logging them non-fatally) while letting all other unhandled rejections crash the process as expected. This is a workaround for an upstream SDK bug where `.catch()` is missing on the `handleControlRequest` call in `readMessages`. + +### 01:25 - abort/ship (Phase 3 of #121) + +- Did: Staged, committed, pushed branch, created PR #122 with auto-merge enabled. Checks (CodeQL, Node.js CI) running. +- Files: src/ClaudeCli.ts (committed from Phase 2) +- Decisions: None (shipping only) +- Next: PR #122 will auto-merge when checks pass. Prompt delivered. +- Violations: None From 1a2f01b8046ffba0ebbf02ac6739c8c68602b7fe Mon Sep 17 00:00:00 2001 From: Stephen Hellicar Date: Thu, 26 Mar 2026 01:40:11 +1100 Subject: [PATCH 3/4] Don't rethrow errors in global error handler. --- src/ClaudeCli.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ClaudeCli.ts b/src/ClaudeCli.ts index 754242a..2555b60 100644 --- a/src/ClaudeCli.ts +++ b/src/ClaudeCli.ts @@ -698,7 +698,7 @@ export class ClaudeCli { } private pasteImage(): void { - const log = (_msg: string) => {}; + const log = (_msg: string) => { }; readClipboardImage(this.platform, log) .then((clip) => { switch (clip.kind) { @@ -823,12 +823,13 @@ export class ClaudeCli { // handleControlRequest fire-and-forget. When we abort during early // query initialization, its catch block's transport.write() throws // AbortError as an unhandled rejection that crashes the process. - process.on('unhandledRejection', (reason: unknown) => { - if (reason instanceof AbortError) { - this.term.error(`[suppressed] ${reason.message}`); - return; + process.on('unhandledRejection', (err: unknown) => { + if (err instanceof Error) { + this.term.error(err.message); + if (err.stack) { + this.term.error(err.stack); + } } - throw reason; }); this.session = new QuerySession(config.model, config.maxTurns, config.thinking, config.thinkingEffort); From 08a679b1d9c26f228af30114d83b59ca448d4611 Mon Sep 17 00:00:00 2001 From: Stephen Hellicar Date: Thu, 26 Mar 2026 01:41:40 +1100 Subject: [PATCH 4/4] Linting --- src/ClaudeCli.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ClaudeCli.ts b/src/ClaudeCli.ts index 2555b60..8177f9a 100644 --- a/src/ClaudeCli.ts +++ b/src/ClaudeCli.ts @@ -1,7 +1,7 @@ import { appendFileSync, type FSWatcher, readFileSync, statSync, watch } from 'node:fs'; import { homedir } from 'node:os'; import { resolve } from 'node:path'; -import { AbortError, type SDKMessage } from '@anthropic-ai/claude-agent-sdk'; +import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk'; import type { DocumentBlockParam, ImageBlockParam, SearchResultBlockParam, TextBlockParam, ToolReferenceBlockParam } from '@anthropic-ai/sdk/resources'; import { ExecInputSchema } from '@shellicar/mcp-exec'; import { AppState } from './AppState.js'; @@ -698,7 +698,7 @@ export class ClaudeCli { } private pasteImage(): void { - const log = (_msg: string) => { }; + const log = (_msg: string) => {}; readClipboardImage(this.platform, log) .then((clip) => { switch (clip.kind) {