diff --git a/src/backends/claude-code/index.ts b/src/backends/claude-code/index.ts index 20cc3d30..33ed8812 100644 --- a/src/backends/claude-code/index.ts +++ b/src/backends/claude-code/index.ts @@ -1,3 +1,4 @@ +import { randomUUID } from 'node:crypto'; import { constants, accessSync, existsSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import { rm } from 'node:fs/promises'; import { homedir } from 'node:os'; @@ -9,6 +10,7 @@ import type { SDKResultSuccess, SDKStatusMessage, SDKSystemMessage, + SDKUserMessage, } from '@anthropic-ai/claude-agent-sdk'; import { getEngineSettings } from '../../config/engineSettings.js'; import { logger } from '../../utils/logging.js'; @@ -24,7 +26,7 @@ import { import { cleanupContextFiles } from '../contextFiles.js'; import { buildSystemPrompt, buildTaskPrompt } from '../nativeTools.js'; import { logLlmCall } from '../shared/llmCallLogger.js'; -import type { AgentEngine, AgentEngineResult, AgentExecutionPlan } from '../types.js'; +import type { AgentEngine, AgentEngineResult, AgentExecutionPlan, ContextImage } from '../types.js'; import { buildClaudeEnv } from './env.js'; import { buildHooks } from './hooks.js'; import { CLAUDE_CODE_MODEL_IDS, DEFAULT_CLAUDE_CODE_MODEL } from './models.js'; @@ -64,6 +66,74 @@ export function ensureOnboardingFlag(): void { } } +const CLAUDE_SUPPORTED_IMAGE_TYPES = new Set([ + 'image/jpeg', + 'image/png', + 'image/gif', + 'image/webp', +]); + +/** + * Build an AsyncIterable of SDKUserMessages that delivers the task prompt text along + * with any work-item images as native SDK image content blocks. + * + * Used by the Claude Code engine to inject images directly into the first conversation + * turn rather than writing them to disk and hoping the agent reads the files. 
+ */ +export async function* buildPromptWithImages( + text: string, + images: ContextImage[], +): AsyncIterable<SDKUserMessage> { + const imageBlocks = images + .filter((img) => CLAUDE_SUPPORTED_IMAGE_TYPES.has(img.mimeType)) + .map((img) => ({ + type: 'image' as const, + source: { + type: 'base64' as const, + media_type: img.mimeType as 'image/jpeg' | 'image/png' | 'image/gif' | 'image/webp', + data: img.base64Data, + }, + })); + + yield { + type: 'user', + message: { role: 'user', content: [{ type: 'text', text }, ...imageBlocks] }, + parent_tool_use_id: null, + session_id: randomUUID(), + }; +} + +/** + * Filter context images to those supported by the Claude SDK. + * Logs an INFO message when images will be injected, and a WARN for any skipped MIME types. + */ +function filterContextImages( + contextInjections: AgentExecutionPlan['contextInjections'], + logWriter: AgentExecutionPlan['logWriter'], +): ContextImage[] { + const allImages = contextInjections.flatMap((inj) => inj.images ?? []); + const supported = allImages.filter((img) => CLAUDE_SUPPORTED_IMAGE_TYPES.has(img.mimeType)); + const skipped = allImages.length - supported.length; + if (supported.length > 0) { + logWriter('INFO', 'Injecting work item images as SDK content blocks', { + count: supported.length, + }); + } + if (skipped > 0) { + logWriter('WARN', 'Skipped unsupported image MIME types', { + skipped, + types: [ + ...new Set( + allImages + .filter((img) => !CLAUDE_SUPPORTED_IMAGE_TYPES.has(img.mimeType)) + .map((img) => img.mimeType), + ), + ], + }); + } + return supported; +} + /** * Extract a GitHub PR URL from assistant messages (tool results containing create-pr output). 
 */ @@ -525,9 +595,15 @@ export class ClaudeCodeEngine implements AgentEngine { async execute(input: AgentExecutionPlan): Promise<AgentEngineResult> { const startTime = Date.now(); const systemPrompt = buildSystemPrompt(input.systemPrompt, input.availableTools); + + // Collect supported images for native SDK delivery; strip from injections so + // offloadLargeContext does not also write them to disk (redundant for this engine). + const supportedImages = filterContextImages(input.contextInjections, input.logWriter); + const injectionsForPrompt = input.contextInjections.map(({ images: _images, ...rest }) => rest); + const { prompt: taskPrompt, hasOffloadedContext } = await buildTaskPrompt( input.taskPrompt, - input.contextInjections, + injectionsForPrompt, input.repoDir, ); // Resolve model again here for backward compatibility: execute() may be called @@ -569,7 +645,10 @@ export class ClaudeCodeEngine implements AgentEngine { const maxContinuationTurns = input.completionRequirements?.maxContinuationTurns ?? 0; let continuationTurns = 0; - let promptText = taskPrompt; + // Use AsyncIterable prompt for the first turn when images are present; string otherwise. + // Continuation turns always use a plain string prompt. + let promptInput: string | AsyncIterable<SDKUserMessage> = + supportedImages.length > 0 ? 
buildPromptWithImages(taskPrompt, supportedImages) : taskPrompt; let isContinuation = false; let turnCount = 0; let totalCost: number | undefined; @@ -577,7 +656,7 @@ export class ClaudeCodeEngine implements AgentEngine { for (;;) { const stderrChunks: string[] = []; const stream = query({ - prompt: promptText, + prompt: promptInput, options: { model, systemPrompt, @@ -641,7 +720,7 @@ export class ClaudeCodeEngine implements AgentEngine { if (decision.done) return decision.result; continuationTurns++; - promptText = decision.promptText; + promptInput = decision.promptText; isContinuation = true; } } diff --git a/src/backends/shared/contextFiles.ts b/src/backends/shared/contextFiles.ts index 5e082c79..5ad02be1 100644 --- a/src/backends/shared/contextFiles.ts +++ b/src/backends/shared/contextFiles.ts @@ -6,8 +6,10 @@ * to read them on-demand using its built-in Read tool. * * When context injections contain images, each image is written as a binary - * file to `.cascade/context/images/` so native-tool engines (Claude Code, - * OpenCode, Codex) can read them with their built-in Read tool. + * file to `.cascade/context/images/` so native-tool engines (Codex, OpenCode) + * can read them with their built-in Read tool. The Claude Code engine receives + * images natively as SDK ImageBlockParam content blocks instead (see + * src/backends/claude-code/index.ts buildPromptWithImages). */ import { mkdir, rm, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; @@ -104,7 +106,7 @@ function generateReadInstructions(files: OffloadedFile[], images: OffloadedImage if (images.length > 0) { if (files.length > 0) lines.push(''); lines.push( - `The following context images have been saved to \`${CONTEXT_OFFLOAD_CONFIG.contextDir}/${IMAGES_SUBDIR}/\`:`, + 'The following work item images were pre-downloaded using authenticated credentials. 
Use the Read tool on the file paths below — do NOT curl or HTTP-fetch the original attachment URLs:', ); lines.push(''); for (const img of images) { diff --git a/src/backends/shared/nativeToolPrompts.ts b/src/backends/shared/nativeToolPrompts.ts index 6809c895..f3283c2c 100644 --- a/src/backends/shared/nativeToolPrompts.ts +++ b/src/backends/shared/nativeToolPrompts.ts @@ -6,12 +6,13 @@ const NATIVE_TOOL_EXECUTION_RULES = `## Native Tool Execution Rules You are operating in a native-tool environment, not a gadget/function-call environment. - Never write pseudo tool calls such as \`[tool_call: ...]\`, \`ReadFile(...)\`, \`RipGrep(...)\`, \`Tmux(...)\`, \`CreatePR(...)\`, or similar function-call text in your assistant response. -- Use actual OpenCode/Codex tool invocations instead: +- Use your built-in tools instead: - use built-in file/search tools or the shell tool for repository exploration - use the edit tool for file modifications - use the shell tool for all \`cascade-tools ...\`, \`git ...\`, \`rg ...\`, \`fd ...\`, test, lint, and build commands - When the task instructions mention gadget names like \`CreatePR\`, \`PostComment\`, \`UpdateChecklistItem\`, \`Finish\`, \`ReadWorkItem\`, \`TodoUpsert\`, or \`TodoUpdateStatus\`, treat that as a request to run the equivalent real command or tool action, not to print the gadget name. -- If you catch yourself composing a pseudo tool call in plain text, stop and use the real tool instead.`; +- If you catch yourself composing a pseudo tool call in plain text, stop and use the real tool instead. +- Trello, JIRA, and GitHub attachment URLs require backend authentication. NEVER curl, wget, or HTTP-fetch them — they return an authorization error. Work item images are pre-fetched and available either as images in your conversation context or as files under \`.cascade/context/images/\` — use whichever is present; never fetch the original URLs.`; /** * Format a single CLI parameter for tool guidance documentation. 
diff --git a/src/backends/types.ts b/src/backends/types.ts index d15ebaa9..d445be3b 100644 --- a/src/backends/types.ts +++ b/src/backends/types.ts @@ -5,6 +5,7 @@ import type { CompletionRequirements } from './completion.js'; // Re-export shared contracts so downstream code that imports from here continues to work. export type { + ContextImage, ContextInjection, LogWriter, ProgressReporter, diff --git a/tests/unit/backends/claude-code-imagePrompt.test.ts b/tests/unit/backends/claude-code-imagePrompt.test.ts new file mode 100644 index 00000000..23c7af05 --- /dev/null +++ b/tests/unit/backends/claude-code-imagePrompt.test.ts @@ -0,0 +1,112 @@ +import type { SDKUserMessage } from '@anthropic-ai/claude-agent-sdk'; +import type { ContentBlockParam } from '@anthropic-ai/sdk/resources'; +import { describe, expect, it, vi } from 'vitest'; +import type { ContextImage } from '../../../src/agents/contracts/index.js'; +import { buildPromptWithImages } from '../../../src/backends/claude-code/index.js'; + +vi.mock('@anthropic-ai/claude-agent-sdk', () => ({ + query: vi.fn(), +})); + +vi.mock('../../../src/utils/logging.js', () => ({ + logger: { warn: vi.fn(), info: vi.fn(), error: vi.fn(), debug: vi.fn() }, +})); + +vi.mock('../../../src/db/repositories/runsRepository.js', () => ({ + storeLlmCall: vi.fn().mockResolvedValue(undefined), +})); + +async function collect<T>(iterable: AsyncIterable<T>): Promise<T[]> { + const results: T[] = []; + for await (const item of iterable) results.push(item); + return results; +} + +const PNG_IMAGE: ContextImage = { + base64Data: 'aGVsbG8=', + mimeType: 'image/png', + altText: 'A diagram', +}; + +const JPEG_IMAGE: ContextImage = { + base64Data: 'dGVzdA==', + mimeType: 'image/jpeg', +}; + +const TIFF_IMAGE: ContextImage = { + base64Data: 'dGlmZg==', + mimeType: 'image/tiff', // unsupported +}; + +describe('buildPromptWithImages', () => { + it('yields one SDKUserMessage with text block + image block', async () => { + const msgs = await 
collect(buildPromptWithImages('do task', [PNG_IMAGE])); + + expect(msgs).toHaveLength(1); + const msg = msgs[0]; + expect(msg.type).toBe('user'); + expect(msg.message.role).toBe('user'); + + const content = msg.message.content as ContentBlockParam[]; + expect(content).toHaveLength(2); + expect(content[0]).toEqual({ type: 'text', text: 'do task' }); + expect(content[1].type).toBe('image'); + const imageBlock = content[1] as { + type: 'image'; + source: { type: string; media_type: string; data: string }; + }; + expect(imageBlock.source.type).toBe('base64'); + expect(imageBlock.source.media_type).toBe('image/png'); + expect(imageBlock.source.data).toBe('aGVsbG8='); + }); + + it('sets a non-empty session_id and null parent_tool_use_id', async () => { + const msgs = await collect(buildPromptWithImages('task', [PNG_IMAGE])); + const msg = msgs[0]; + expect(msg.session_id).toBeTruthy(); + expect(msg.parent_tool_use_id).toBeNull(); + }); + + it('includes multiple images as separate image blocks', async () => { + const msgs = await collect(buildPromptWithImages('task', [PNG_IMAGE, JPEG_IMAGE])); + const content = msgs[0].message.content as ContentBlockParam[]; + expect(content).toHaveLength(3); // text + 2 images + expect(content[0].type).toBe('text'); + expect(content[1].type).toBe('image'); + expect(content[2].type).toBe('image'); + }); + + it('filters out unsupported MIME types', async () => { + const msgs = await collect(buildPromptWithImages('task', [PNG_IMAGE, TIFF_IMAGE])); + const content = msgs[0].message.content as ContentBlockParam[]; + // text + 1 image (TIFF filtered out) + expect(content).toHaveLength(2); + expect(content[0].type).toBe('text'); + expect(content[1].type).toBe('image'); + }); + + it('yields text-only message when all images are unsupported', async () => { + const msgs = await collect(buildPromptWithImages('task', [TIFF_IMAGE])); + const content = msgs[0].message.content as ContentBlockParam[]; + expect(content).toHaveLength(1); + 
expect(content[0]).toEqual({ type: 'text', text: 'task' }); + }); + + it('yields text-only message when images array is empty', async () => { + const msgs = await collect(buildPromptWithImages('task', [])); + const content = msgs[0].message.content as ContentBlockParam[]; + expect(content).toHaveLength(1); + expect(content[0]).toEqual({ type: 'text', text: 'task' }); + }); + + it('yields exactly one message', async () => { + const msgs = await collect(buildPromptWithImages('task', [PNG_IMAGE, JPEG_IMAGE, TIFF_IMAGE])); + expect(msgs).toHaveLength(1); + }); + + it('each call produces a unique session_id', async () => { + const msgs1 = await collect(buildPromptWithImages('task', [PNG_IMAGE])); + const msgs2 = await collect(buildPromptWithImages('task', [PNG_IMAGE])); + expect(msgs1[0].session_id).not.toBe(msgs2[0].session_id); + }); +}); diff --git a/tests/unit/backends/claude-code.test.ts b/tests/unit/backends/claude-code.test.ts index de13c412..51ea8869 100644 --- a/tests/unit/backends/claude-code.test.ts +++ b/tests/unit/backends/claude-code.test.ts @@ -895,6 +895,65 @@ describe('execute', () => { await Promise.resolve(); expect(mockStoreLlmCall).not.toHaveBeenCalled(); }); + + it('passes AsyncIterable prompt to query() when contextInjections has images', async () => { + mockStream([ + { type: 'result', subtype: 'success', result: 'Done', total_cost_usd: 0, num_turns: 1 }, + ]); + + const input = makeInput({ + contextInjections: [ + { + toolName: 'ReadWorkItem', + params: {}, + result: 'card content', + description: 'Work item', + images: [{ base64Data: 'abc', mimeType: 'image/png' }], + }, + ], + }); + + await new ClaudeCodeEngine().execute(input); + + const promptArg = mockQuery.mock.calls[0][0].prompt; + expect(typeof promptArg).not.toBe('string'); + expect(promptArg[Symbol.asyncIterator]).toBeDefined(); + }); + + it('logs image injection and strips images before buildTaskPrompt', async () => { + mockStream([ + { type: 'result', subtype: 'success', result: 
'Done', total_cost_usd: 0, num_turns: 1 }, + ]); + + const input = makeInput({ + contextInjections: [ + { + toolName: 'ReadWorkItem', + params: {}, + result: 'card content', + description: 'Work item', + images: [{ base64Data: 'imagedata123', mimeType: 'image/png' }], + }, + ], + }); + + await new ClaudeCodeEngine().execute(input); + + expect(input.logWriter).toHaveBeenCalledWith( + 'INFO', + 'Injecting work item images as SDK content blocks', + { count: 1 }, + ); + + // Collect text from the AsyncIterable prompt + const promptArg = mockQuery.mock.calls[0][0].prompt as AsyncIterable<{ + message: { content: { type: string; text?: string }[] }; + }>; + const msgs: { message: { content: { type: string; text?: string }[] } }[] = []; + for await (const m of promptArg) msgs.push(m); + const textBlock = msgs[0].message.content.find((b) => b.type === 'text'); + expect(textBlock?.text).not.toContain('imagedata123'); + }); }); describe('continuation loop', () => {