diff --git a/src/agent.ts b/src/agent.ts index dcf38ce..47340dc 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -229,7 +229,7 @@ export class Agent { * Run a query with streaming events. */ async *query( - prompt: string, + prompt: string | any[], overrides?: Partial, ): AsyncGenerator { await this.setupDone @@ -504,7 +504,7 @@ export function createAgent(options: AgentOptions = {}): Agent { * The agent is created, used, and cleaned up automatically. */ export async function* query(params: { - prompt: string + prompt: string | any[] options?: AgentOptions }): AsyncGenerator { const ephemeral = createAgent(params.options) diff --git a/src/providers/openai.ts b/src/providers/openai.ts index 81aa68b..50a403f 100644 --- a/src/providers/openai.ts +++ b/src/providers/openai.ts @@ -23,11 +23,15 @@ import type { interface OpenAIChatMessage { role: 'system' | 'user' | 'assistant' | 'tool' - content?: string | null + content?: string | OpenAIContentPart[] | null tool_calls?: OpenAIToolCall[] tool_call_id?: string } +type OpenAIContentPart = + | { type: 'text'; text: string } + | { type: 'image_url'; image_url: { url: string; detail?: 'auto' | 'low' | 'high' } } + interface OpenAIToolCall { id: string type: 'function' @@ -153,13 +157,30 @@ export class OpenAIProvider implements LLMProvider { return } - // Content blocks may contain text and/or tool_result blocks - const textParts: string[] = [] + // Content blocks may contain text, image, and/or tool_result blocks + const contentParts: OpenAIContentPart[] = [] const toolResults: Array<{ tool_use_id: string; content: string }> = [] for (const block of msg.content) { if (block.type === 'text') { - textParts.push(block.text) + contentParts.push({ type: 'text', text: block.text }) + } else if (block.type === 'image') { + const source = block.source + let imageUrl: string + if (source?.type === 'base64' && source.data) { + const mediaType = source.media_type || 'image/png' + imageUrl = `data:${mediaType};base64,${source.data}` + } else if (source?.type === 'url' && source.url) { + imageUrl = source.url + } else if (typeof source === 'string') { + imageUrl = source + } else { + continue + } + contentParts.push({ + type: 'image_url', + image_url: { url: imageUrl, detail: 'high' }, + }) } else if (block.type === 'tool_result') { toolResults.push({ tool_use_id: block.tool_use_id, @@ -177,9 +198,18 @@ export class OpenAIProvider implements LLMProvider { }) } - // Text parts become a user message - if (textParts.length > 0) { - result.push({ role: 'user', content: textParts.join('\n') }) + // Content parts become a user message + if (contentParts.length > 0) { + const hasImages = contentParts.some(p => p.type === 'image_url') + if (!hasImages) { + const text = contentParts + .filter((p): p is { type: 'text'; text: string } => p.type === 'text') + .map(p => p.text) + .join('\n') + result.push({ role: 'user', content: text }) + } else { + result.push({ role: 'user', content: contentParts }) + } } }