diff --git a/src/apps/desktop/src/api/image_analysis_api.rs b/src/apps/desktop/src/api/image_analysis_api.rs
deleted file mode 100644
index 961df1fb..00000000
--- a/src/apps/desktop/src/api/image_analysis_api.rs
+++ /dev/null
@@ -1,109 +0,0 @@
-//! Image Analysis API
-
-use crate::api::app_state::AppState;
-use bitfun_core::agentic::coordination::{
-    DialogScheduler, DialogSubmissionPolicy, DialogTriggerSource,
-};
-use bitfun_core::agentic::image_analysis::{
-    resolve_vision_model_from_ai_config, AnalyzeImagesRequest, ImageAnalysisResult, ImageAnalyzer,
-    MessageEnhancer, SendEnhancedMessageRequest,
-};
-use log::error;
-use std::path::PathBuf;
-use std::sync::Arc;
-use tauri::State;
-
-fn resolve_session_workspace_path(
-    request: &AnalyzeImagesRequest,
-) -> Result<Option<PathBuf>, String> {
-    if let Some(workspace_path) = request.workspace_path.as_deref() {
-        if !workspace_path.trim().is_empty() {
-            return Ok(Some(PathBuf::from(workspace_path)));
-        }
-    }
-
-    let coordinator = bitfun_core::agentic::coordination::get_global_coordinator()
-        .ok_or_else(|| "Coordinator not initialized".to_string())?;
-
-    Ok(coordinator
-        .get_session_manager()
-        .get_session(&request.session_id)
-        .and_then(|session| session.config.workspace_path.clone())
-        .filter(|workspace_path| !workspace_path.is_empty())
-        .map(PathBuf::from))
-}
-
-#[tauri::command]
-pub async fn analyze_images(
-    request: AnalyzeImagesRequest,
-    state: State<'_, AppState>,
-) -> Result<Vec<ImageAnalysisResult>, String> {
-    let ai_config: bitfun_core::service::config::types::AIConfig = state
-        .config_service
-        .get_config(Some("ai"))
-        .await
-        .map_err(|e| {
-            error!("Failed to get AI config: error={}", e);
-            format!("Failed to get AI config: {}", e)
-        })?;
-
-    let image_model = resolve_vision_model_from_ai_config(&ai_config).map_err(|e| {
-        error!(
-            "Image understanding model resolution failed: available_models={:?}, error={}",
-            ai_config.models.iter().map(|m| &m.id).collect::<Vec<_>>(),
-            e
-        );
-        format!(
-            "Image understanding model is not configured.\n\n\
-             Please select a model for [Settings → Default Model Config → Image Understanding Model].\n\n\
-             Details: {}",
-            e
-        )
-    })?;
-
-    let workspace_path = resolve_session_workspace_path(&request)?;
-
-    let ai_client = state
-        .ai_client_factory
-        .get_client_by_id(&image_model.id)
-        .await
-        .map_err(|e| format!("Failed to create AI client: {}", e))?;
-
-    let analyzer = ImageAnalyzer::new(workspace_path, ai_client);
-
-    let results = analyzer
-        .analyze_images(request, &image_model)
-        .await
-        .map_err(|e| format!("Image analysis failed: {}", e))?;
-
-    Ok(results)
-}
-
-#[tauri::command]
-pub async fn send_enhanced_message(
-    request: SendEnhancedMessageRequest,
-    scheduler: State<'_, Arc<DialogScheduler>>,
-    _state: State<'_, AppState>,
-) -> Result<(), String> {
-    let enhanced_message = MessageEnhancer::enhance_with_image_analysis(
-        &request.original_message,
-        &request.image_analyses,
-        &request.other_contexts,
-    );
-
-    scheduler
-        .submit(
-            request.session_id.clone(),
-            enhanced_message,
-            Some(request.original_message.clone()),
-            Some(request.dialog_turn_id.clone()),
-            request.agent_type.clone(),
-            None,
-            DialogSubmissionPolicy::for_source(DialogTriggerSource::DesktopApi),
-            None,
-        )
-        .await
-        .map_err(|e| format!("Failed to send enhanced message: {}", e))?;
-
-    Ok(())
-}
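With `analyze_images` and `send_enhanced_message` deleted, the only desktop submission path left is `DialogScheduler::submit`. A minimal sketch of that surviving path, with the argument order taken from the deleted call site above (parameter types and the caller shape are assumptions):

```rust
use std::sync::Arc;
use bitfun_core::agentic::coordination::{
    DialogScheduler, DialogSubmissionPolicy, DialogTriggerSource,
};

// Hypothetical caller: submit the raw user message; any image handling now happens
// downstream in the coordinator, not via a MessageEnhancer pre-pass.
async fn submit_plain_message(
    scheduler: Arc<DialogScheduler>,
    session_id: String,
    user_message: String,
    agent_type: Option<String>, // assumed type; the deleted code only shows `.clone()`
) -> Result<(), String> {
    scheduler
        .submit(
            session_id,
            user_message,
            None, // no separate "original" message
            None, // let the scheduler allocate the dialog turn id
            agent_type,
            None,
            DialogSubmissionPolicy::for_source(DialogTriggerSource::DesktopApi),
            None,
        )
        .await
        .map_err(|e| format!("Failed to submit message: {}", e))
}
```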
diff --git a/src/apps/desktop/src/api/mod.rs b/src/apps/desktop/src/api/mod.rs
index bdc521a5..a672bddb 100644
--- a/src/apps/desktop/src/api/mod.rs
+++ b/src/apps/desktop/src/api/mod.rs
@@ -15,7 +15,6 @@ pub mod dto;
 pub mod git_agent_api;
 pub mod git_api;
 pub mod i18n_api;
-pub mod image_analysis_api;
 pub mod lsp_api;
 pub mod lsp_workspace_api;
 pub mod mcp_api;
diff --git a/src/apps/desktop/src/lib.rs b/src/apps/desktop/src/lib.rs
index 0ad2f237..55842416 100644
--- a/src/apps/desktop/src/lib.rs
+++ b/src/apps/desktop/src/lib.rs
@@ -311,8 +311,6 @@ pub async fn run() {
             api::btw_api::btw_ask,
             api::btw_api::btw_ask_stream,
             api::btw_api::btw_cancel,
-            api::image_analysis_api::analyze_images,
-            api::image_analysis_api::send_enhanced_message,
             api::context_upload_api::upload_image_contexts,
             get_all_tools_info,
             get_readonly_tools_info,
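Removing the module also means removing its commands from `generate_handler`; a frontend that still calls `invoke('analyze_images', …)` now fails at runtime rather than at compile time. A reduced sketch of the registration pattern used in `lib.rs` (the real builder configures far more; this is an assumption-laden reduction):

```rust
// Reduced sketch of the Tauri wiring in lib.rs: only commands listed in
// generate_handler! are invokable from the frontend.
tauri::Builder::default()
    .invoke_handler(tauri::generate_handler![
        api::btw_api::btw_ask,
        api::btw_api::btw_ask_stream,
        api::btw_api::btw_cancel,
        api::context_upload_api::upload_image_contexts,
        // api::image_analysis_api::* entries are gone along with the module.
    ])
    .run(tauri::generate_context!())
    .expect("error while running tauri application");
```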
diff --git a/src/crates/core/src/agentic/coordination/coordinator.rs b/src/crates/core/src/agentic/coordination/coordinator.rs
index f692b403..a6f080c4 100644
--- a/src/crates/core/src/agentic/coordination/coordinator.rs
+++ b/src/crates/core/src/agentic/coordination/coordinator.rs
@@ -143,6 +143,19 @@ impl ConversationCoordinator {
         }
     }
 
+    async fn is_chinese_locale() -> bool {
+        use crate::service::config::get_global_config_service;
+        use crate::service::config::types::AppConfig;
+        let Ok(config_service) = get_global_config_service().await else {
+            return false;
+        };
+        let app: AppConfig = config_service
+            .get_config(Some("app"))
+            .await
+            .unwrap_or_default();
+        app.language.starts_with("zh")
+    }
+
     fn assistant_bootstrap_system_reminder(
         kickoff_query: &str,
         expected_reply_language: &str,
@@ -689,135 +702,6 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet
             .await
     }
 
-    /// Pre-analyze images using the configured vision model.
-    ///
-    /// Strategy:
-    /// 1. Vision model configured → analyze images → enhance user message with text descriptions → clear image_contexts
-    /// 2. No vision model → reject with a user-friendly message
-    async fn pre_analyze_images_if_needed(
-        &self,
-        user_input: String,
-        image_contexts: Option<Vec<ImageContextData>>,
-        session_id: &str,
-        image_metadata: Option<serde_json::Value>,
-        workspace: Option<WorkspaceBinding>,
-    ) -> BitFunResult<(String, Option<Vec<ImageContextData>>)> {
-        let images = match &image_contexts {
-            Some(imgs) if !imgs.is_empty() => imgs,
-            _ => return Ok((user_input, image_contexts)),
-        };
-
-        use crate::agentic::image_analysis::{
-            resolve_vision_model_from_global_config, AnalyzeImagesRequest, ImageAnalyzer,
-            MessageEnhancer,
-        };
-        use crate::infrastructure::ai::get_global_ai_client_factory;
-
-        let vision_model = match resolve_vision_model_from_global_config().await {
-            Ok(m) => m,
-            Err(_e) => {
-                let is_chinese = Self::is_chinese_locale().await;
-                let msg = if is_chinese {
-                    "请先在桌面端「设置 → AI 模型」中配置图片理解模型,然后再发送图片。"
-                } else {
-                    "Please configure an Image Understanding Model in Settings → AI Models on the desktop app before sending images."
-                };
-                return Err(BitFunError::service(msg));
-            }
-        };
-
-        let factory = match get_global_ai_client_factory().await {
-            Ok(f) => f,
-            Err(e) => {
-                warn!("Failed to get AI client factory for vision: {}", e);
-                return Ok((user_input, image_contexts));
-            }
-        };
-
-        let vision_client = match factory.get_client_by_id(&vision_model.id).await {
-            Ok(c) => c,
-            Err(e) => {
-                warn!("Failed to create vision AI client: {}", e);
-                return Ok((user_input, image_contexts));
-            }
-        };
-
-        let workspace_path = workspace.map(|binding| binding.root_path);
-        let request_workspace_path = workspace_path
-            .as_ref()
-            .map(|path| path.to_string_lossy().to_string());
-        let analyzer = ImageAnalyzer::new(workspace_path, vision_client);
-        let request = AnalyzeImagesRequest {
-            images: images.clone(),
-            user_message: Some(user_input.clone()),
-            session_id: session_id.to_string(),
-            workspace_path: request_workspace_path,
-        };
-
-        self.emit_event(AgenticEvent::ImageAnalysisStarted {
-            session_id: session_id.to_string(),
-            image_count: images.len(),
-            user_input: user_input.clone(),
-            image_metadata: image_metadata.clone(),
-        })
-        .await;
-
-        let analysis_start = std::time::Instant::now();
-
-        match analyzer.analyze_images(request, &vision_model).await {
-            Ok(results) => {
-                let duration_ms = analysis_start.elapsed().as_millis() as u64;
-
-                self.emit_event(AgenticEvent::ImageAnalysisCompleted {
-                    session_id: session_id.to_string(),
-                    success: true,
-                    duration_ms,
-                })
-                .await;
-
-                info!(
-                    "Vision pre-analysis completed: session={}, images={}, results={}, duration={}ms",
-                    session_id,
-                    images.len(),
-                    results.len(),
-                    duration_ms
-                );
-                let enhanced =
-                    MessageEnhancer::enhance_with_image_analysis(&user_input, &results, &[]);
-                Ok((enhanced, None))
-            }
-            Err(e) => {
-                let duration_ms = analysis_start.elapsed().as_millis() as u64;
-
-                self.emit_event(AgenticEvent::ImageAnalysisCompleted {
-                    session_id: session_id.to_string(),
-                    success: false,
-                    duration_ms,
-                })
-                .await;
-
-                warn!(
-                    "Vision pre-analysis failed, falling back to multimodal: session={}, error={}",
-                    session_id, e
-                );
-                Ok((user_input, image_contexts))
-            }
-        }
-    }
-
-    async fn is_chinese_locale() -> bool {
-        use crate::service::config::get_global_config_service;
-        use crate::service::config::types::AppConfig;
-        let Ok(config_service) = get_global_config_service().await else {
-            return true;
-        };
-        let app: AppConfig = config_service
-            .get_config(Some("app"))
-            .await
-            .unwrap_or_default();
-        app.language.starts_with("zh")
-    }
-
     async fn start_dialog_turn_internal(
         &self,
         session_id: String,
@@ -1052,20 +936,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet
             user_message_metadata = Some(metadata);
         }
 
-        // Auto vision pre-analysis: when images are present, try to use the configured
-        // vision model to pre-analyze them, then enhance the user message with text descriptions.
-        // This is the single authoritative code path for all image handling (desktop, remote, bot).
-        // If no vision model is configured, the request is rejected with a user-friendly message.
         let session_workspace = Self::session_workspace_binding(&session);
-        let (user_input, image_contexts) = self
-            .pre_analyze_images_if_needed(
-                user_input,
-                image_contexts,
-                &session_id,
-                user_message_metadata.clone(),
-                session_workspace.clone(),
-            )
-            .await?;
 
         let wrapped_user_input = self
             .wrap_user_input(
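One behavioral detail of the move: the relocated `is_chinese_locale` now falls back to `false` (English) when the config service is unavailable, whereas the deleted copy defaulted to `true`. A sketch of the intended usage, written as if inside the `ConversationCoordinator` impl (the helper is private; the surrounding function is hypothetical):

```rust
// Hypothetical helper inside impl ConversationCoordinator: pick a user-facing
// message by the configured app language ("zh*" counts as Chinese).
async fn missing_vision_model_message() -> &'static str {
    if Self::is_chinese_locale().await {
        "请先在「设置 → AI 模型」中配置图片理解模型。"
    } else {
        "Please configure an Image Understanding Model in Settings → AI Models."
    }
}
```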
diff --git a/src/crates/core/src/agentic/execution/execution_engine.rs b/src/crates/core/src/agentic/execution/execution_engine.rs
index 0327a751..a0ac2885 100644
--- a/src/crates/core/src/agentic/execution/execution_engine.rs
+++ b/src/crates/core/src/agentic/execution/execution_engine.rs
@@ -16,7 +16,7 @@ use crate::agentic::tools::{get_all_registered_tools, SubagentParentInfo};
 use crate::agentic::WorkspaceBinding;
 use crate::infrastructure::ai::get_global_ai_client_factory;
 use crate::service::config::get_global_config_service;
-use crate::service::config::types::{ModelCapability, ModelCategory};
+use crate::service::config::types::ModelCapability;
 use crate::util::errors::{BitFunError, BitFunResult};
 use crate::util::token_counter::TokenCounter;
 use crate::util::types::Message as AIMessage;
@@ -218,6 +218,7 @@ impl ExecutionEngine {
         provider: &str,
         workspace_path: Option<&Path>,
         current_turn_id: &str,
+        attach_images: bool,
     ) -> BitFunResult<Vec<AIMessage>> {
         let limits = ImageLimits::for_provider(provider);
@@ -227,6 +228,13 @@
         for msg in messages {
             match &msg.content {
                 MessageContent::Multimodal { text, images } => {
+                    if !attach_images {
+                        // Primary model is text-only (or images are disabled). Convert to a text-only
+                        // placeholder so providers that don't support image inputs won't error.
+                        result.push(AIMessage::from(msg));
+                        continue;
+                    }
+
                     let prompt = if text.trim().is_empty() {
                         "(image attached)".to_string()
                     } else {
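The new `attach_images` flag decides whether multimodal payloads survive sanitization at all. A self-contained sketch of the gating idea using simplified stand-in types (the real code operates on `MessageContent`/`AIMessage` and also enforces provider image limits):

```rust
// Simplified stand-ins; the real types carry metadata and provider-specific limits.
enum Content {
    Text(String),
    Multimodal { text: String, images: Vec<Vec<u8>> },
}

fn sanitize(content: Content, attach_images: bool) -> Content {
    match content {
        // Text-only primary model: drop the image bytes, keep only the text.
        Content::Multimodal { text, .. } if !attach_images => Content::Text(text),
        other => other,
    }
}

fn main() {
    let msg = Content::Multimodal {
        text: "see attached".into(),
        images: vec![vec![0u8; 4]],
    };
    assert!(matches!(sanitize(msg, false), Content::Text(_)));
}
```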
@@ -281,6 +289,51 @@
         Ok(result)
     }
 
+    fn render_multimodal_as_text(
+        text: &str,
+        images: &[ImageContextData],
+        can_use_view_image: bool,
+    ) -> String {
+        let mut content = text.to_string();
+
+        if images.is_empty() {
+            return content;
+        }
+
+        content.push_str("\n\n[Attached image(s):\n");
+        for image in images {
+            let name = image
+                .metadata
+                .as_ref()
+                .and_then(|m| m.get("name"))
+                .and_then(|v| v.as_str())
+                .filter(|s| !s.is_empty())
+                .map(str::to_string)
+                .or_else(|| image.image_path.as_ref().filter(|s| !s.is_empty()).cloned())
+                .unwrap_or_else(|| image.id.clone());
+
+            // Keep the raw image payload out of text-only models.
+            // Provide `image_id` so the primary model can choose to call `view_image` when needed.
+            content.push_str(&format!(
+                "- {} ({}, image_id={})\n",
+                name, image.mime_type, image.id
+            ));
+        }
+        content.push_str("]\n");
+
+        if can_use_view_image {
+            content.push_str(
+                "If you need to inspect an image, call the `view_image` tool with `image_id`.\n",
+            );
+        } else {
+            content.push_str(
+                "Note: image inspection is not available for this session.\n",
+            );
+        }
+
+        content
+    }
+
     /// Compress context, will emit compression events (Started, Completed, and Failed)
     pub async fn compress_messages(
         &self,
@@ -648,7 +701,6 @@ impl ExecutionEngine {
             m.capabilities
                 .iter()
                 .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding))
-                || matches!(m.category, ModelCategory::Multimodal)
         });
 
         (resolved_id, supports)
@@ -677,6 +729,30 @@
             "primary_model_supports_image_understanding".to_string(),
             primary_supports_image_understanding.to_string(),
         );
+        execution_context_vars.insert("turn_index".to_string(), context.turn_index.to_string());
+
+        // If the primary model is text-only, do not send image payloads to the provider.
+        // Instead, keep a text-only placeholder (including `image_id`) so the model can decide
+        // whether it wants to call `view_image` explicitly.
+        if !primary_supports_image_understanding {
+            let can_use_view_image = available_tools.iter().any(|t| t == "view_image");
+
+            for msg in messages.iter_mut() {
+                let MessageContent::Multimodal { text, images } = &msg.content else {
+                    continue;
+                };
+
+                let original_text = text.clone();
+                let original_images = images.clone();
+
+                // Replace multimodal messages with text-only versions to avoid provider errors.
+                let next_text =
+                    Self::render_multimodal_as_text(&original_text, &original_images, can_use_view_image);
+
+                msg.content = MessageContent::Text(next_text);
+                msg.metadata.tokens = None;
+            }
+        }
 
         // Loop to execute model rounds
         loop {
@@ -797,6 +873,7 @@
                 .as_ref()
                 .map(|workspace| workspace.root_path()),
             &context.dialog_turn_id,
+            primary_supports_image_understanding,
         )
         .await?;
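To make the placeholder concrete: for a single attached PNG whose metadata `name` is `architecture.png` and whose id is `img_123` (both values hypothetical), `render_multimodal_as_text` appends roughly the following to the prompt when `view_image` is available:

```text
[Attached image(s):
- architecture.png (image/png, image_id=img_123)
]
If you need to inspect an image, call the `view_image` tool with `image_id`.
```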
diff --git a/src/crates/core/src/service/config/types.rs b/src/crates/core/src/service/config/types.rs
index 4aa0fae0..430cfb84 100644
--- a/src/crates/core/src/service/config/types.rs
+++ b/src/crates/core/src/service/config/types.rs
@@ -1193,7 +1193,7 @@ impl Default for AIModelConfig {
             presence_penalty: None,
             enabled: false,
             category: ModelCategory::GeneralChat,
-            capabilities: vec![ModelCapability::TextChat],
+            capabilities: vec![],
             recommended_for: vec![],
             metadata: None,
             enable_thinking_process: false,
@@ -1246,8 +1246,10 @@ impl Default for MinimapConfig {
 }
 
 impl AIModelConfig {
-    /// Infers the model category from the model name and provider (for backward compatibility
-    /// with older configs).
+    /// Legacy helper that infers the model category from the model name and provider.
+    ///
+    /// This is kept for one-off migrations/debugging, but runtime behavior should prefer
+    /// explicitly configured `category`/`capabilities`.
     pub fn infer_category_from_model_name(&self) -> ModelCategory {
         let model_name_lower = self.model_name.to_lowercase();
         let provider_lower = self.provider.to_lowercase();
@@ -1274,6 +1276,7 @@ impl AIModelConfig {
             || model_name_lower.contains("claude-3")
             || model_name_lower.contains("gemini-pro-vision")
             || model_name_lower.contains("gemini-1.5")
+            || model_name_lower.starts_with("kimi")
         {
             return ModelCategory::Multimodal;
         }
@@ -1288,7 +1291,10 @@ impl AIModelConfig {
         ModelCategory::GeneralChat
     }
 
-    /// Infers capability tags from the model category and name.
+    /// Legacy helper that infers capability tags from the model category and name.
+    ///
+    /// This is kept for one-off migrations/debugging, but runtime behavior should prefer
+    /// explicitly configured `category`/`capabilities`.
     pub fn infer_capabilities_from_model(&self) -> Vec<ModelCapability> {
         let mut capabilities = vec![];
         let model_name_lower = self.model_name.to_lowercase();
@@ -1331,16 +1337,31 @@ impl AIModelConfig {
         capabilities
     }
 
-    /// Auto-completes missing category and capability information (used for configuration
-    /// migration).
-    pub fn ensure_category_and_capabilities(&mut self) {
-        if self.category == ModelCategory::GeneralChat && self.capabilities.is_empty() {
-            self.category = self.infer_category_from_model_name();
-            self.capabilities = self.infer_capabilities_from_model();
+    fn default_capabilities_for_category(&self) -> Vec<ModelCapability> {
+        match self.category {
+            ModelCategory::GeneralChat => vec![ModelCapability::TextChat],
+            ModelCategory::Multimodal => {
+                vec![ModelCapability::TextChat, ModelCapability::ImageUnderstanding]
+            }
+            ModelCategory::ImageGeneration => vec![ModelCapability::ImageGeneration],
+            ModelCategory::Embedding => vec![ModelCapability::Embedding],
+            ModelCategory::SearchEnhanced => {
+                vec![ModelCapability::TextChat, ModelCapability::Search]
+            }
+            ModelCategory::CodeSpecialized => {
+                vec![ModelCapability::TextChat, ModelCapability::CodeSpecialized]
+            }
+            ModelCategory::SpeechRecognition => vec![ModelCapability::SpeechRecognition],
         }
+    }
 
+    /// Auto-completes missing capability information without rewriting explicit configuration.
+    ///
+    /// Important: we intentionally do not upgrade `category` or append inferred capabilities
+    /// based on the model name here. Runtime behavior should follow explicit configuration.
+    pub fn ensure_category_and_capabilities(&mut self) {
         if self.capabilities.is_empty() {
-            self.capabilities = self.infer_capabilities_from_model();
+            self.capabilities = self.default_capabilities_for_category();
         }
     }
 }
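Net effect of the `types.rs` changes: a config that sets only `category` still receives sensible capability defaults, but nothing is inferred from the model name anymore, and explicit capability lists are never appended to. A minimal sketch (assumes `ModelCapability` implements `PartialEq`/`Debug` so the asserts compile):

```rust
// Category-only config: capabilities are filled in from the category.
let mut model = AIModelConfig {
    category: ModelCategory::Multimodal,
    ..Default::default() // capabilities now default to vec![]
};
model.ensure_category_and_capabilities();
assert_eq!(
    model.capabilities,
    vec![ModelCapability::TextChat, ModelCapability::ImageUnderstanding]
);

// Explicit capabilities are left untouched, even when they disagree with the category.
let mut explicit = AIModelConfig {
    category: ModelCategory::Multimodal,
    capabilities: vec![ModelCapability::TextChat], // deliberately no ImageUnderstanding
    ..Default::default()
};
explicit.ensure_category_and_capabilities();
assert_eq!(explicit.capabilities, vec![ModelCapability::TextChat]);
```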
diff --git a/src/web-ui/src/flow_chat/components/FlowToolCard.tsx b/src/web-ui/src/flow_chat/components/FlowToolCard.tsx
index 7b19277b..3432b5a6 100644
--- a/src/web-ui/src/flow_chat/components/FlowToolCard.tsx
+++ b/src/web-ui/src/flow_chat/components/FlowToolCard.tsx
@@ -10,6 +10,20 @@ import { createLogger } from '@/shared/utils/logger';
 
 const log = createLogger('FlowToolCard');
 
+/**
+ * When the primary model is multimodal, `view_image` returns `{ mode: "attached_to_primary_model" }`
+ * instead of a textual analysis. In that case the tool card is pure noise and should be hidden.
+ */
+function isViewImageAttachedMode(toolItem: FlowToolItem): boolean {
+  if (toolItem.toolName !== 'view_image') return false;
+  const raw = toolItem.toolResult?.result as Record<string, unknown> | undefined;
+  const mode =
+    raw?.mode ??
+    (raw?.result as Record<string, unknown> | undefined)?.mode ??
+    (raw?.data as Record<string, unknown> | undefined)?.mode;
+  return mode === 'attached_to_primary_model';
+}
+
 interface FlowToolCardProps {
   toolItem: FlowToolItem;
   onConfirm?: (toolId: string, updatedInput?: any) => void;
@@ -31,6 +45,10 @@ export const FlowToolCard: React.FC<FlowToolCardProps> = React.memo(({
   sessionId,
   className = ''
 }) => {
+  if (isViewImageAttachedMode(toolItem)) {
+    return null;
+  }
+
   const config = getToolCardConfig(toolItem.toolName);
   const CardComponent = getToolCardComponent(toolItem.toolName);
 
@@ -81,4 +99,3 @@ export const FlowToolCard: React.FC<FlowToolCardProps> = React.memo(({
     JSON.stringify(prevProps.toolItem.toolResult) === JSON.stringify(nextProps.toolItem.toolResult)
   );
 });
-
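The card probes three envelope layouts because the tool-result wrapper is not uniform across delivery paths. On the backend side, the marker being checked for would be produced roughly like this (a `serde_json` sketch; the exact tool-result envelope type is an assumption):

```rust
use serde_json::json;

// What `view_image` reports when the primary model is multimodal and the image is
// re-attached to the model request instead of being analyzed out-of-band.
let tool_result = json!({ "mode": "attached_to_primary_model" });
assert_eq!(tool_result["mode"], "attached_to_primary_model");

// The card also tolerates wrapped envelopes, i.e. result.result.mode / result.data.mode.
let wrapped = json!({ "result": { "mode": "attached_to_primary_model" } });
assert_eq!(wrapped["result"]["mode"], "attached_to_primary_model");
```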
diff --git a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
index aeaf02f0..074ecc4c 100644
--- a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
+++ b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
@@ -59,16 +59,6 @@ interface UseMessageSenderReturn {
   isSending: boolean;
 }
 
-function formatImageContextLine(ctx: ImageContext): string {
-  const imgName = ctx.imageName || 'Untitled image';
-  const imgSize = ctx.fileSize ? ` (${(ctx.fileSize / 1024).toFixed(1)}KB)` : '';
-  const sourceLine = ctx.isLocal
-    ? `Path: ${ctx.imagePath}`
-    : `Image ID: ${ctx.id}`;
-
-  return `[Image: ${imgName}${imgSize}]\n${sourceLine}`;
-}
-
 export function useMessageSender(props: UseMessageSenderProps): UseMessageSenderReturn {
   const {
     currentSessionId,
@@ -164,7 +154,10 @@
       case 'code-snippet':
         return `[Code Snippet: ${ctx.filePath}:${ctx.startLine}-${ctx.endLine}]`;
       case 'image':
-        return formatImageContextLine(ctx);
+        // Images are sent out-of-band via `imageContexts` so the backend can attach them
+        // (multimodal) or let the model call `view_image` (text-only). Avoid embedding
+        // "Image ID" references into the user prompt, which can cause redundant tool calls.
+        return '';
      case 'terminal-command':
        return `[Command: ${ctx.command}]`;
      case 'mermaid-node':
diff --git a/src/web-ui/src/flow_chat/services/FlowChatManager.ts b/src/web-ui/src/flow_chat/services/FlowChatManager.ts
index df29a0fd..fd3e0d6b 100644
--- a/src/web-ui/src/flow_chat/services/FlowChatManager.ts
+++ b/src/web-ui/src/flow_chat/services/FlowChatManager.ts
@@ -190,7 +190,7 @@ export class FlowChatManager {
     agentType?: string,
     switchToMode?: string,
     options?: {
-      imageContexts?: import('@/infrastructure/api/service-api/ImageAnalysisAPI').ImageContextData[];
+      imageContexts?: import('@/infrastructure/api/service-api/ImageContextTypes').ImageContextData[];
       imageDisplayData?: Array<{ id: string; name: string; dataUrl?: string; imagePath?: string; mimeType?: string }>;
     }
   ): Promise<void> {
diff --git a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts
index 3336a483..57ea3ff1 100644
--- a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts
+++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts
@@ -15,7 +15,7 @@ import { createLogger } from '@/shared/utils/logger';
 import type { FlowChatContext, DialogTurn } from './types';
 import { ensureBackendSession, retryCreateBackendSession } from './SessionModule';
 import { cleanupSessionBuffers } from './TextChunkModule';
-import type { ImageContextData as ImageInputContextData } from '@/infrastructure/api/service-api/ImageAnalysisAPI';
+import type { ImageContextData as ImageInputContextData } from '@/infrastructure/api/service-api/ImageContextTypes';
 
 const log = createLogger('MessageModule');
 
@@ -130,7 +130,9 @@ export async function sendMessage(
       images: options?.imageDisplayData,
     },
     modelRounds: [],
-    status: hasImages ? 'image_analyzing' : 'pending',
+    // Images are handled by the agent/tooling (e.g. `view_image`) or sent directly to multimodal
+    // primary models. We don't run a separate frontend "image pre-analysis" phase here.
+    status: 'pending',
     startTime: Date.now()
   };
diff --git a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts
index 05fd0586..75934bbd 100644
--- a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts
+++ b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts
@@ -2,7 +2,7 @@
 
 import { api } from './ApiClient';
 import { createTauriCommandError } from '../errors/TauriCommandError';
-import type { ImageContextData as ImageInputContextData } from './ImageAnalysisAPI';
+import type { ImageContextData as ImageInputContextData } from './ImageContextTypes';
 
diff --git a/src/web-ui/src/infrastructure/api/service-api/ImageAnalysisAPI.ts b/src/web-ui/src/infrastructure/api/service-api/ImageAnalysisAPI.ts
deleted file mode 100644
index 1a1ef6b6..00000000
--- a/src/web-ui/src/infrastructure/api/service-api/ImageAnalysisAPI.ts
+++ /dev/null
@@ -1,91 +0,0 @@
-
-
-import { api } from './ApiClient';
-import { createTauriCommandError } from '../errors/TauriCommandError';
-import type { ImageContext } from '@/shared/types/context';
-
-
-
-export interface ImageContextData {
-  id: string;
-  image_path?: string;
-  data_url?: string;
-  mime_type: string;
-  metadata?: Record<string, unknown>;
-}
-
-export interface ImageAnalysisResult {
-  image_id: string;
-  summary: string;
-  detailed_description: string;
-  detected_elements: string[];
-  confidence: number;
-  analysis_time_ms: number;
-}
-
-export interface AnalyzeImagesRequest {
-  images: ImageContextData[];
-  user_message?: string;
-  session_id: string;
-  workspace_path?: string;
-}
-
-export interface SendEnhancedMessageRequest {
-  original_message: string;
-  image_analyses: ImageAnalysisResult[];
-  other_contexts: any[];
-  session_id: string;
-  dialog_turn_id: string;
-}
-
-
-
-export class ImageAnalysisAPI {
-
-  async analyzeImages(request: AnalyzeImagesRequest): Promise<ImageAnalysisResult[]> {
-    try {
-      return await api.invoke('analyze_images', {
-        request
-      });
-    } catch (error) {
-      throw createTauriCommandError('analyze_images', error, request);
-    }
-  }
-
-
-  async sendEnhancedMessage(request: SendEnhancedMessageRequest): Promise<void> {
-    try {
-      await api.invoke('send_enhanced_message', {
-        request
-      });
-    } catch (error) {
-      throw createTauriCommandError('send_enhanced_message', error, request);
-    }
-  }
-
-
-  convertContextToData(context: ImageContext): ImageContextData {
-    return {
-      id: context.id,
-      image_path: context.isLocal ? context.imagePath : undefined,
-      data_url: !context.isLocal ? context.dataUrl : undefined,
-      mime_type: context.mimeType,
-      metadata: {
-        name: context.imageName,
-        width: context.width,
-        height: context.height,
-        file_size: context.fileSize,
-        source: context.source,
-      }
-    };
-  }
-
-
-  convertContextsToData(contexts: ImageContext[]): ImageContextData[] {
-    return contexts.map(ctx => this.convertContextToData(ctx));
-  }
-}
-
-
-export const imageAnalysisAPI = new ImageAnalysisAPI();
-
diff --git a/src/web-ui/src/infrastructure/api/service-api/ImageContextTypes.ts b/src/web-ui/src/infrastructure/api/service-api/ImageContextTypes.ts
new file mode 100644
index 00000000..cf338617
--- /dev/null
+++ b/src/web-ui/src/infrastructure/api/service-api/ImageContextTypes.ts
@@ -0,0 +1,14 @@
+// Shared image context payload shape passed from the frontend to the backend.
+//
+// Note: This is intentionally small and does not include the legacy image analysis APIs
+// (`analyze_images`, `send_enhanced_message`). Image handling is done by the backend
+// coordinator + tools (e.g. `view_image`).
+
+export interface ImageContextData {
+  id: string;
+  image_path?: string;
+  data_url?: string;
+  mime_type: string;
+  metadata?: Record<string, unknown>;
+}
+
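For this payload to round-trip, the backend needs a matching snake_case shape. A sketch of the Rust counterpart to `ImageContextData` (the real definition lives in `bitfun_core`; the derives and `Option` handling here are assumptions, though the field names match both sides of this diff):

```rust
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageContextData {
    pub id: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_path: Option<String>, // set for local files
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data_url: Option<String>, // set for pasted/in-memory images
    pub mime_type: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<HashMap<String, serde_json::Value>>, // e.g. {"name": "..."}
}
```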
diff --git a/src/web-ui/src/infrastructure/config/components/DefaultModelConfig.tsx b/src/web-ui/src/infrastructure/config/components/DefaultModelConfig.tsx
index f36d09ec..a02e38e0 100644
--- a/src/web-ui/src/infrastructure/config/components/DefaultModelConfig.tsx
+++ b/src/web-ui/src/infrastructure/config/components/DefaultModelConfig.tsx
@@ -234,11 +234,11 @@ export const DefaultModelConfig: React.FC = () => {
     return enabledModels.filter(m => {
       switch (capability) {
         case 'image_understanding':
-          return m.capabilities?.includes('image_understanding') || m.category === 'multimodal';
+          return m.capabilities?.includes('image_understanding');
         case 'image_generation':
-          return m.capabilities?.includes('image_generation') || m.category === 'image_generation';
+          return m.capabilities?.includes('image_generation');
         case 'speech_recognition':
-          return m.capabilities?.includes('speech_recognition') || m.category === 'speech_recognition';
+          return m.capabilities?.includes('speech_recognition');
         default:
           return true;
       }
@@ -318,8 +318,12 @@ export const DefaultModelConfig: React.FC = () => {
             value={configuredModelId || ''}
             onChange={(value) => handleCapabilityChange(capability, normalizeSelectValue(value))}
             placeholder={t('optional.selectModel')}
-            disabled={availableModels.length === 0}
-            options={availableModels.map(buildModelOption)}
+            // Allow clearing the selection even when there are no compatible models.
+            disabled={availableModels.length === 0 && !configuredModelId}
+            options={[
+              { label: t('optional.notSet'), value: '' },
+              ...availableModels.map(buildModelOption),
+            ]}
             renderOption={renderModelOption}
             renderValue={renderModelValue}
             className="default-model-config__model-select"
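The frontend filter above and the backend check now agree: capability tags are the single source of truth, and a `multimodal` category without an explicit `image_understanding` capability no longer enables image support anywhere. The backend side reduces to:

```rust
// Mirrors the check in execution_engine.rs after this change; `ModelCategory` no
// longer grants image support implicitly.
fn supports_image_understanding(model: &AIModelConfig) -> bool {
    model
        .capabilities
        .iter()
        .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding))
}
```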