diff --git a/codex-rs/core/src/tools/handlers/view_image.rs b/codex-rs/core/src/tools/handlers/view_image.rs index fe62522180c..cfcd5ec2abc 100644 --- a/codex-rs/core/src/tools/handlers/view_image.rs +++ b/codex-rs/core/src/tools/handlers/view_image.rs @@ -1,5 +1,6 @@ use async_trait::async_trait; use codex_protocol::models::FunctionCallOutputBody; +use codex_protocol::openai_models::InputModality; use serde::Deserialize; use tokio::fs; @@ -18,6 +19,9 @@ use codex_protocol::models::local_image_content_items_with_label_number; pub struct ViewImageHandler; +const VIEW_IMAGE_UNSUPPORTED_MESSAGE: &str = + "view_image is not allowed because you do not support image inputs"; + #[derive(Deserialize)] struct ViewImageArgs { path: String, @@ -30,6 +34,17 @@ impl ToolHandler for ViewImageHandler { } async fn handle(&self, invocation: ToolInvocation) -> Result { + if !invocation + .turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Err(FunctionCallError::RespondToModel( + VIEW_IMAGE_UNSUPPORTED_MESSAGE.to_string(), + )); + } + let ToolInvocation { session, turn, diff --git a/codex-rs/core/src/tools/spec.rs b/codex-rs/core/src/tools/spec.rs index d0c5a3c07dd..7254252eb40 100644 --- a/codex-rs/core/src/tools/spec.rs +++ b/codex-rs/core/src/tools/spec.rs @@ -17,7 +17,6 @@ use codex_protocol::dynamic_tools::DynamicToolSpec; use codex_protocol::models::VIEW_IMAGE_TOOL_NAME; use codex_protocol::openai_models::ApplyPatchToolType; use codex_protocol::openai_models::ConfigShellToolType; -use codex_protocol::openai_models::InputModality; use codex_protocol::openai_models::ModelInfo; use serde::Deserialize; use serde::Serialize; @@ -31,7 +30,6 @@ pub(crate) struct ToolsConfig { pub shell_type: ConfigShellToolType, pub apply_patch_tool_type: Option, pub web_search_mode: Option, - pub supports_image_input: bool, pub search_tool: bool, pub collab_tools: bool, pub collaboration_modes_tools: bool, @@ -87,7 +85,6 @@ impl ToolsConfig { shell_type, apply_patch_tool_type, web_search_mode: *web_search_mode, - supports_image_input: model_info.input_modalities.contains(&InputModality::Image), search_tool: include_search_tool, collab_tools: include_collab_tools, collaboration_modes_tools: include_collaboration_modes_tools, @@ -1498,10 +1495,8 @@ pub(crate) fn build_specs( Some(WebSearchMode::Disabled) | None => {} } - if config.supports_image_input { - builder.push_spec_with_parallel_support(create_view_image_tool(), true); - builder.register_handler("view_image", view_image_handler); - } + builder.push_spec_with_parallel_support(create_view_image_tool(), true); + builder.register_handler("view_image", view_image_handler); if config.collab_tools { let collab_handler = Arc::new(CollabHandler); @@ -2076,29 +2071,6 @@ mod tests { ); } - #[test] - fn test_non_multimodal_models_exclude_view_image() { - let config = test_config(); - let mut model_info = ModelsManager::construct_model_info_offline("gpt-5.1", &config); - model_info.input_modalities = vec![InputModality::Text]; - let mut features = Features::with_defaults(); - features.enable(Feature::CollaborationModes); - let tools_config = ToolsConfig::new(&ToolsConfigParams { - model_info: &model_info, - features: &features, - web_search_mode: Some(WebSearchMode::Cached), - }); - let (tools, _) = build_specs(&tools_config, Some(HashMap::new()), &[]).build(); - - assert!( - !tools - .iter() - .map(|t| t.spec.name()) - .any(|name| name == VIEW_IMAGE_TOOL_NAME), - "view_image should be excluded for non-multimodal models" - ); - } - #[test] fn test_gpt_5_1_codex_max_unified_exec_web_search() { let mut features = Features::with_defaults(); diff --git a/codex-rs/core/tests/suite/view_image.rs b/codex-rs/core/tests/suite/view_image.rs index b72f66fce84..cabee944db1 100644 --- a/codex-rs/core/tests/suite/view_image.rs +++ b/codex-rs/core/tests/suite/view_image.rs @@ -2,17 +2,28 @@ use base64::Engine; use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; +use codex_core::CodexAuth; +use codex_core::features::Feature; use codex_core::protocol::AskForApproval; use codex_core::protocol::EventMsg; use codex_core::protocol::Op; use codex_core::protocol::SandboxPolicy; use codex_protocol::config_types::ReasoningSummary; +use codex_protocol::openai_models::ConfigShellToolType; +use codex_protocol::openai_models::InputModality; +use codex_protocol::openai_models::ModelInfo; +use codex_protocol::openai_models::ModelVisibility; +use codex_protocol::openai_models::ModelsResponse; +use codex_protocol::openai_models::ReasoningEffort; +use codex_protocol::openai_models::ReasoningEffortPreset; +use codex_protocol::openai_models::TruncationPolicyConfig; use codex_protocol::user_input::UserInput; use core_test_support::responses; use core_test_support::responses::ev_assistant_message; use core_test_support::responses::ev_completed; use core_test_support::responses::ev_function_call; use core_test_support::responses::ev_response_created; +use core_test_support::responses::mount_models_once; use core_test_support::responses::sse; use core_test_support::responses::start_mock_server; use core_test_support::skip_if_no_network; @@ -26,6 +37,8 @@ use image::Rgba; use image::load_from_memory; use serde_json::Value; use tokio::time::Duration; +use wiremock::BodyPrintLimit; +use wiremock::MockServer; fn find_image_message(body: &Value) -> Option<&Value> { body.get("input") @@ -521,6 +534,120 @@ async fn view_image_tool_errors_when_file_missing() -> anyhow::Result<()> { Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn view_image_tool_returns_unsupported_message_for_text_only_model() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + // Use MockServer directly (not start_mock_server) so the first /models request returns our + // text-only model. start_mock_server mounts empty models first, causing get_model_info to + // fall back to model_info_from_slug with default_input_modalities (Text+Image), which would + // incorrectly allow view_image. + let server = MockServer::builder() + .body_print_limit(BodyPrintLimit::Limited(80_000)) + .start() + .await; + let model_slug = "text-only-view-image-test-model"; + let text_only_model = ModelInfo { + slug: model_slug.to_string(), + display_name: "Text-only view_image test model".to_string(), + description: Some("Remote model for view_image unsupported-path coverage".to_string()), + default_reasoning_level: Some(ReasoningEffort::Medium), + supported_reasoning_levels: vec![ReasoningEffortPreset { + effort: ReasoningEffort::Medium, + description: ReasoningEffort::Medium.to_string(), + }], + shell_type: ConfigShellToolType::ShellCommand, + visibility: ModelVisibility::List, + supported_in_api: true, + input_modalities: vec![InputModality::Text], + priority: 1, + upgrade: None, + base_instructions: "base instructions".to_string(), + model_messages: None, + supports_reasoning_summaries: false, + support_verbosity: false, + default_verbosity: None, + apply_patch_tool_type: None, + truncation_policy: TruncationPolicyConfig::bytes(10_000), + supports_parallel_tool_calls: false, + context_window: Some(272_000), + auto_compact_token_limit: None, + effective_context_window_percent: 95, + experimental_supported_tools: Vec::new(), + }; + mount_models_once( + &server, + ModelsResponse { + models: vec![text_only_model], + }, + ) + .await; + + let TestCodex { codex, cwd, .. } = test_codex() + .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing()) + .with_config(|config| { + config.features.enable(Feature::RemoteModels); + config.model = Some(model_slug.to_string()); + }) + .build(&server) + .await?; + + let rel_path = "assets/example.png"; + let abs_path = cwd.path().join(rel_path); + if let Some(parent) = abs_path.parent() { + std::fs::create_dir_all(parent)?; + } + let image = ImageBuffer::from_pixel(20, 20, Rgba([255u8, 0, 0, 255])); + image.save(&abs_path)?; + + let call_id = "view-image-unsupported-model"; + let arguments = serde_json::json!({ "path": rel_path }).to_string(); + let first_response = sse(vec![ + ev_response_created("resp-1"), + ev_function_call(call_id, "view_image", &arguments), + ev_completed("resp-1"), + ]); + responses::mount_sse_once(&server, first_response).await; + + let second_response = sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]); + let mock = responses::mount_sse_once(&server, second_response).await; + + codex + .submit(Op::UserTurn { + items: vec![UserInput::Text { + text: "please attach the image".into(), + text_elements: Vec::new(), + }], + final_output_json_schema: None, + cwd: cwd.path().to_path_buf(), + approval_policy: AskForApproval::Never, + sandbox_policy: SandboxPolicy::DangerFullAccess, + model: model_slug.to_string(), + effort: None, + summary: ReasoningSummary::Auto, + collaboration_mode: None, + personality: None, + }) + .await?; + + wait_for_event(&codex, |event| matches!(event, EventMsg::TurnComplete(_))).await; + + let output_text = mock + .single_request() + .function_call_output_content_and_success(call_id) + .and_then(|(content, _)| content) + .expect("output text present"); + assert_eq!( + output_text, + "view_image is not allowed because you do not support image inputs" + ); + + Ok(()) +} + #[cfg(not(debug_assertions))] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn replaces_invalid_local_image_after_bad_request() -> anyhow::Result<()> {