diff --git a/backend/src/adapters/request/anthropic.ts b/backend/src/adapters/request/anthropic.ts index 12bfea0..ffe9e57 100644 --- a/backend/src/adapters/request/anthropic.ts +++ b/backend/src/adapters/request/anthropic.ts @@ -4,6 +4,7 @@ */ import type { + ImageContentBlock, InternalContentBlock, InternalMessage, InternalRequest, @@ -25,9 +26,10 @@ interface AnthropicContentBlock { text?: string; thinking?: string; source?: { - type: "base64"; - media_type: string; - data: string; + type: "base64" | "url"; + media_type?: string; + data?: string; + url?: string; }; id?: string; name?: string; @@ -137,7 +139,28 @@ function convertContentBlock( } case "image": - // Images not supported in MVP + // Handle both base64 and URL source types + if (block.source?.type === "url" && block.source.url) { + return { + type: "image", + source: { + type: "url", + url: block.source.url, + }, + } as ImageContentBlock; + } + // Handle base64 - only if type matches and data is present + if (block.source?.type === "base64" && block.source.data) { + return { + type: "image", + source: { + type: "base64", + mediaType: block.source.media_type, + data: block.source.data, + }, + } as ImageContentBlock; + } + // Skip images with missing data return null; default: diff --git a/backend/src/adapters/request/openai-chat.ts b/backend/src/adapters/request/openai-chat.ts index db3802b..208d664 100644 --- a/backend/src/adapters/request/openai-chat.ts +++ b/backend/src/adapters/request/openai-chat.ts @@ -4,6 +4,8 @@ */ import type { + ImageContentBlock, + ImageSource, InternalContentBlock, InternalMessage, InternalRequest, @@ -14,6 +16,35 @@ import type { ToolUseContentBlock, } from "../types"; +// ============================================================================= +// Helper Functions +// ============================================================================= + +/** + * Parse a data URL into base64 source, or return URL source for regular URLs + * Data URL format: data:[][;base64], + */ +function parseImageUrl(url: string): ImageSource { + if (url.startsWith("data:")) { + // Parse data URL: data:image/jpeg;base64,/9j/4AAQ... + const match = url.match(/^data:([^;,]+)?(?:;base64)?,(.*)$/); + if (match) { + const mediaType = match[1] || "image/jpeg"; + const data = match[2] || ""; + return { + type: "base64", + mediaType, + data, + }; + } + } + // Regular URL + return { + type: "url", + url, + }; +} + // ============================================================================= // OpenAI Chat Request Types // ============================================================================= @@ -122,13 +153,20 @@ function convertContent( if (typeof content === "string") { return content; } - // Array of content parts - currently only support text (images not supported yet) + // Array of content parts - support text and image_url const blocks: InternalContentBlock[] = []; for (const part of content) { if (part.type === "text" && part.text) { blocks.push({ type: "text", text: part.text }); + } else if (part.type === "image_url" && part.image_url?.url) { + // Parse URL - handles both regular URLs and data URLs (base64) + const source = parseImageUrl(part.image_url.url); + blocks.push({ + type: "image", + source, + detail: part.image_url.detail, + } as ImageContentBlock); } - // Skip image_url parts for now (not supported in MVP) } const [firstBlock] = blocks; diff --git a/backend/src/adapters/request/openai-response.ts b/backend/src/adapters/request/openai-response.ts index c211a34..296ed7e 100644 --- a/backend/src/adapters/request/openai-response.ts +++ b/backend/src/adapters/request/openai-response.ts @@ -4,6 +4,9 @@ */ import type { + ImageContentBlock, + ImageSource, + InternalContentBlock, InternalMessage, InternalRequest, InternalToolDefinition, @@ -12,6 +15,35 @@ import type { ToolResultContentBlock, } from "../types"; +// ============================================================================= +// Helper Functions +// ============================================================================= + +/** + * Parse a data URL into base64 source, or return URL source for regular URLs + * Data URL format: data:[][;base64], + */ +function parseImageUrl(url: string): ImageSource { + if (url.startsWith("data:")) { + // Parse data URL: data:image/jpeg;base64,/9j/4AAQ... + const match = url.match(/^data:([^;,]+)?(?:;base64)?,(.*)$/); + if (match) { + const mediaType = match[1] || "image/jpeg"; + const data = match[2] || ""; + return { + type: "base64", + mediaType, + data, + }; + } + } + // Regular URL + return { + type: "url", + url, + }; +} + // ============================================================================= // OpenAI Response API Request Types // ============================================================================= @@ -102,13 +134,38 @@ const KNOWN_FIELDS = new Set([ // ============================================================================= /** - * Convert Response API content parts to string + * Convert Response API content parts to string or content blocks */ -function convertContentParts(parts: ResponseApiContentPart[]): string { - return parts - .filter((p) => p.type === "input_text" || p.type === "text") - .map((p) => p.text || "") - .join(""); +function convertContentParts( + parts: ResponseApiContentPart[], +): string | InternalContentBlock[] { + const hasImages = parts.some((p) => p.type === "input_image"); + + if (!hasImages) { + // Simple case: text only + return parts + .filter((p) => p.type === "input_text" || p.type === "text") + .map((p) => p.text || "") + .join(""); + } + + // Complex case: includes images + const blocks: InternalContentBlock[] = []; + for (const part of parts) { + if (part.type === "input_text" || part.type === "text") { + if (part.text) { + blocks.push({ type: "text", text: part.text }); + } + } else if (part.type === "input_image" && part.image_url) { + // Parse URL - handles both regular URLs and data URLs (base64) + const source = parseImageUrl(part.image_url); + blocks.push({ + type: "image", + source, + } as ImageContentBlock); + } + } + return blocks; } /** diff --git a/backend/src/adapters/response/anthropic.ts b/backend/src/adapters/response/anthropic.ts index be19c8d..cf323af 100644 --- a/backend/src/adapters/response/anthropic.ts +++ b/backend/src/adapters/response/anthropic.ts @@ -94,6 +94,9 @@ function convertContentBlock( case "tool_result": // Tool results are not included in Anthropic assistant responses return null; + case "image": + // Images are not included in assistant responses (only in requests) + return null; } } diff --git a/backend/src/adapters/types.ts b/backend/src/adapters/types.ts index 04e3400..0d13fa4 100644 --- a/backend/src/adapters/types.ts +++ b/backend/src/adapters/types.ts @@ -45,6 +45,29 @@ export interface ToolResultContentBlock { isError?: boolean; } +/** + * Image source types - discriminated union for type safety + */ +export type ImageSource = + | { + type: "base64"; + mediaType?: string; // "image/jpeg", "image/png", etc. + data: string; + } + | { + type: "url"; + url: string; + }; + +/** + * Image content block - represents an image input for vision models + */ +export interface ImageContentBlock { + type: "image"; + source: ImageSource; + detail?: "auto" | "low" | "high"; // OpenAI vision detail level +} + /** * Union type for all content blocks */ @@ -52,7 +75,8 @@ export type InternalContentBlock = | TextContentBlock | ThinkingContentBlock | ToolUseContentBlock - | ToolResultContentBlock; + | ToolResultContentBlock + | ImageContentBlock; // ============================================================================= // Message Types diff --git a/backend/src/adapters/upstream/anthropic.ts b/backend/src/adapters/upstream/anthropic.ts index df66db0..4e5d20c 100644 --- a/backend/src/adapters/upstream/anthropic.ts +++ b/backend/src/adapters/upstream/anthropic.ts @@ -26,6 +26,12 @@ interface AnthropicContentBlock { type: "text" | "image" | "tool_use" | "tool_result" | "thinking"; text?: string; thinking?: string; + source?: { + type: "base64" | "url"; + media_type?: string; + data?: string; + url?: string; + }; id?: string; name?: string; input?: Record; @@ -195,6 +201,27 @@ function convertMessage(msg: InternalMessage): AnthropicMessage | null { text: block.text, cache_control: block.cacheControl, }); + } else if (block.type === "image") { + // Only push image blocks with valid data + if (block.source.type === "base64" && block.source.data) { + content.push({ + type: "image", + source: { + type: "base64", + media_type: block.source.mediaType || "image/jpeg", + data: block.source.data, + }, + }); + } else if (block.source.type === "url" && block.source.url) { + // Anthropic also supports URL source type + content.push({ + type: "image", + source: { + type: "url", + url: block.source.url, + }, + }); + } } } diff --git a/backend/src/adapters/upstream/openai-responses.ts b/backend/src/adapters/upstream/openai-responses.ts index fea801e..5fe6c93 100644 --- a/backend/src/adapters/upstream/openai-responses.ts +++ b/backend/src/adapters/upstream/openai-responses.ts @@ -16,14 +16,17 @@ import type { ToolUseContentBlock, UpstreamAdapter, } from "../types"; +import { convertImageToUrl, hasImages } from "./utils"; // ============================================================================= // Response API Types // ============================================================================= interface ResponseApiContentPart { - type: "input_text" | "output_text" | "refusal"; + type: "input_text" | "output_text" | "refusal" | "input_image"; text?: string; + image_url?: string; + detail?: "auto" | "low" | "high"; } interface ResponseApiInputItem { @@ -126,14 +129,54 @@ function convertMessage(msg: InternalMessage): ResponseApiInputItem | null { }; } - // Regular messages - const content = - typeof msg.content === "string" - ? msg.content - : msg.content - .filter((b) => b.type === "text") - .map((b) => b.text) - .join(""); + // Handle string content + if (typeof msg.content === "string") { + return { + type: "message", + role: msg.role, + content: msg.content, + }; + } + + // Handle content array - check if it contains images + if (hasImages(msg.content)) { + // Build content array with input_text and input_image parts + const contentParts: ResponseApiContentPart[] = []; + for (const block of msg.content) { + if (block.type === "text") { + contentParts.push({ type: "input_text", text: block.text }); + } else if (block.type === "image") { + // Only include images with valid data + const imageUrl = convertImageToUrl(block); + if (imageUrl) { + contentParts.push({ + type: "input_image", + image_url: imageUrl, + detail: block.detail, + }); + } + } + } + // Ensure we don't send empty content array to API + if (contentParts.length === 0) { + return { + type: "message", + role: msg.role, + content: "", + }; + } + return { + type: "message", + role: msg.role, + content: contentParts, + }; + } + + // Text-only content - join as string + const content = msg.content + .filter((b) => b.type === "text") + .map((b) => b.text) + .join(""); return { type: "message", diff --git a/backend/src/adapters/upstream/openai.ts b/backend/src/adapters/upstream/openai.ts index 32c8c80..f04b928 100644 --- a/backend/src/adapters/upstream/openai.ts +++ b/backend/src/adapters/upstream/openai.ts @@ -18,14 +18,24 @@ import type { ToolUseContentBlock, UpstreamAdapter, } from "../types"; +import { convertImageToUrl, hasImages } from "./utils"; // ============================================================================= // OpenAI Request/Response Types // ============================================================================= +interface OpenAIContentPart { + type: "text" | "image_url"; + text?: string; + image_url?: { + url: string; + detail?: "auto" | "low" | "high"; + }; +} + interface OpenAIMessage { role: "system" | "user" | "assistant" | "tool"; - content: string | null; + content: string | OpenAIContentPart[] | null; name?: string; tool_calls?: OpenAIToolCall[]; tool_call_id?: string; @@ -168,14 +178,53 @@ function convertMessage(msg: InternalMessage): OpenAIMessage { }; } - // Regular messages - const content = - typeof msg.content === "string" - ? msg.content - : msg.content - .filter((b) => b.type === "text") - .map((b) => b.text) - .join(""); + // Handle string content + if (typeof msg.content === "string") { + return { + role: msg.role, + content: msg.content, + }; + } + + // Handle content array - check if it contains images + if (hasImages(msg.content)) { + // Build content array with text and image_url parts + const contentParts: OpenAIContentPart[] = []; + for (const block of msg.content) { + if (block.type === "text") { + contentParts.push({ type: "text", text: block.text }); + } else if (block.type === "image") { + // Only include images with valid data + const imageUrl = convertImageToUrl(block); + if (imageUrl) { + contentParts.push({ + type: "image_url", + image_url: { + url: imageUrl, + detail: block.detail, + }, + }); + } + } + } + // Ensure we don't send empty content array to API + if (contentParts.length === 0) { + return { + role: msg.role, + content: "", + }; + } + return { + role: msg.role, + content: contentParts, + }; + } + + // Text-only content array - join as string + const content = msg.content + .filter((b) => b.type === "text") + .map((b) => b.text) + .join(""); return { role: msg.role, diff --git a/backend/src/adapters/upstream/utils.ts b/backend/src/adapters/upstream/utils.ts new file mode 100644 index 0000000..41ebb18 --- /dev/null +++ b/backend/src/adapters/upstream/utils.ts @@ -0,0 +1,26 @@ +/** + * Shared utility functions for upstream adapters + */ + +import type { ImageContentBlock, InternalContentBlock } from "../types"; + +/** + * Convert image source to URL format (data URL for base64, direct URL for url type) + */ +export function convertImageToUrl(block: ImageContentBlock): string { + if (block.source.type === "url") { + return block.source.url; + } + // Convert base64 to data URL + if (block.source.type === "base64") { + return `data:${block.source.mediaType || "image/jpeg"};base64,${block.source.data}`; + } + return ""; +} + +/** + * Check if content blocks contain any images + */ +export function hasImages(content: InternalContentBlock[]): boolean { + return content.some((b) => b.type === "image"); +} diff --git a/backend/src/api/v1/completions.ts b/backend/src/api/v1/completions.ts index db36840..068829e 100644 --- a/backend/src/api/v1/completions.ts +++ b/backend/src/api/v1/completions.ts @@ -72,11 +72,34 @@ const tToolChoice = t.Union([ }), ]); +// Content part schema - supports text and image_url +const tContentPart = t.Union([ + t.Object({ + type: t.Literal("text"), + text: t.String(), + }), + t.Object({ + type: t.Literal("image_url"), + image_url: t.Object({ + url: t.String(), + detail: t.Optional(t.Union([ + t.Literal("auto"), + t.Literal("low"), + t.Literal("high"), + ])), + }), + }), +]); + // Message schema - supports various message types const tMessage = t.Object( { role: t.String(), - content: t.Optional(t.Union([t.String(), t.Null()])), + content: t.Optional(t.Union([ + t.String(), + t.Null(), + t.Array(tContentPart), + ])), tool_calls: t.Optional(t.Array(t.Object({ id: t.String(), type: t.Literal("function"), diff --git a/frontend/src/pages/requests/detail-panel/pretty-view.tsx b/frontend/src/pages/requests/detail-panel/pretty-view.tsx index 9c84e66..25fb92e 100644 --- a/frontend/src/pages/requests/detail-panel/pretty-view.tsx +++ b/frontend/src/pages/requests/detail-panel/pretty-view.tsx @@ -5,6 +5,7 @@ import { CopyIcon, ForwardIcon, HelpCircleIcon, + ImageIcon, ReplyIcon, WrenchIcon, TerminalIcon, @@ -47,6 +48,15 @@ interface ToolDefinition { } } +// Image content part type +interface ImageContentPart { + type: 'image_url' + image_url: { + url: string + detail?: 'auto' | 'low' | 'high' + } +} + export function MessagesPrettyView() { const { t } = useTranslation() @@ -132,6 +142,9 @@ function MessageContent({ message }: { message: RequestMessage }) { // Check if this is an assistant message with tool calls const toolCalls = extendedMessage.tool_calls + // Extract images from content array + const images = getMessageImages(message) + const { content, reasoning } = match(message) .with({ role: 'assistant' }, () => extractReasoning(messageText)) .otherwise(() => ({ reasoning: null, content: messageText })) @@ -147,6 +160,20 @@ function MessageContent({ message }: { message: RequestMessage }) { {reasoning && } {content && } + {images.length > 0 && ( +
+
+ + {t('pages.requests.detail-panel.pretty-view.Images', { defaultValue: 'Images' })} + {images.length} +
+
+ {images.map((image, index) => ( + + ))} +
+
+ )} {toolCalls && toolCalls.length > 0 && (
@@ -463,6 +490,74 @@ function getMessageText(message: RequestMessage): string { .otherwise(() => '') } +/** + * Extract image content parts from a message + */ +function getMessageImages(message: RequestMessage): ImageContentPart[] { + // Handle case where content is an array + const content = (message as { content?: unknown }).content + if (!content || typeof content === 'string' || !Array.isArray(content)) { + return [] + } + + const images: ImageContentPart[] = [] + for (const part of content) { + if ( + part && + typeof part === 'object' && + 'type' in part && + part.type === 'image_url' && + 'image_url' in part && + part.image_url && + typeof part.image_url === 'object' && + 'url' in part.image_url + ) { + images.push({ + type: 'image_url', + image_url: { + url: String(part.image_url.url), + detail: (part.image_url as { detail?: 'auto' | 'low' | 'high' }).detail, + }, + }) + } + } + return images +} + +/** + * Component to display an image from a message + */ +function ImageContentDisplay({ image }: { image: ImageContentPart }) { + const { t } = useTranslation() + const { url, detail } = image.image_url + + // Check if it's a data URL (base64) + const isDataUrl = url.startsWith('data:') + + return ( +
+ + {t('pages.requests.detail-panel.pretty-view.UserImage', + + {detail && ( +
+ {t('pages.requests.detail-panel.pretty-view.ImageDetail', { defaultValue: 'Detail' })}: {detail} +
+ )} +
+ ) +} + /** * Component to display a single tool call */