diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index cfc94016..365969ac 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -518,6 +518,7 @@ impl Session { include_apply_patch_tool: config.include_apply_patch_tool, include_web_search_request: config.tools_web_search_request, use_streamable_shell_tool: config.use_experimental_streamable_shell_tool, + include_view_image_tool: config.include_view_image_tool, }), user_instructions, base_instructions, @@ -1108,6 +1109,7 @@ async fn submission_loop( include_apply_patch_tool: config.include_apply_patch_tool, include_web_search_request: config.tools_web_search_request, use_streamable_shell_tool: config.use_experimental_streamable_shell_tool, + include_view_image_tool: config.include_view_image_tool, }); let new_turn_context = TurnContext { @@ -1193,6 +1195,7 @@ async fn submission_loop( include_web_search_request: config.tools_web_search_request, use_streamable_shell_tool: config .use_experimental_streamable_shell_tool, + include_view_image_tool: config.include_view_image_tool, }), user_instructions: turn_context.user_instructions.clone(), base_instructions: turn_context.base_instructions.clone(), @@ -2077,6 +2080,36 @@ async fn handle_function_call( ) .await } + "view_image" => { + #[derive(serde::Deserialize)] + struct SeeImageArgs { + path: String, + } + let args = match serde_json::from_str::(&arguments) { + Ok(a) => a, + Err(e) => { + return ResponseInputItem::FunctionCallOutput { + call_id, + output: FunctionCallOutputPayload { + content: format!("failed to parse function arguments: {e}"), + success: Some(false), + }, + }; + } + }; + let abs = turn_context.resolve_path(Some(args.path)); + let output = match sess.inject_input(vec![InputItem::LocalImage { path: abs }]) { + Ok(()) => FunctionCallOutputPayload { + content: "attached local image path".to_string(), + success: Some(true), + }, + Err(_) => FunctionCallOutputPayload { + content: "unable to attach image (no active task)".to_string(), + success: Some(false), + }, + }; + ResponseInputItem::FunctionCallOutput { call_id, output } + } "apply_patch" => { let args = match serde_json::from_str::(&arguments) { Ok(a) => a, diff --git a/codex-rs/core/src/config.rs b/codex-rs/core/src/config.rs index 98a8fde1..9b8f288c 100644 --- a/codex-rs/core/src/config.rs +++ b/codex-rs/core/src/config.rs @@ -178,6 +178,9 @@ pub struct Config { pub preferred_auth_method: AuthMode, pub use_experimental_streamable_shell_tool: bool, + + /// Include the `view_image` tool that lets the agent attach a local image path to context. + pub include_view_image_tool: bool, } impl Config { @@ -497,6 +500,10 @@ pub struct ToolsToml { // Renamed from `web_search_request`; keep alias for backwards compatibility. #[serde(default, alias = "web_search_request")] pub web_search: Option, + + /// Enable the `view_image` tool that lets the agent attach local images. + #[serde(default)] + pub view_image: Option, } impl ConfigToml { @@ -586,6 +593,7 @@ pub struct ConfigOverrides { pub base_instructions: Option, pub include_plan_tool: Option, pub include_apply_patch_tool: Option, + pub include_view_image_tool: Option, pub disable_response_storage: Option, pub show_raw_agent_reasoning: Option, pub tools_web_search_request: Option, @@ -613,6 +621,7 @@ impl Config { base_instructions, include_plan_tool, include_apply_patch_tool, + include_view_image_tool, disable_response_storage, show_raw_agent_reasoning, tools_web_search_request: override_tools_web_search_request, @@ -681,6 +690,10 @@ impl Config { .or(cfg.tools.as_ref().and_then(|t| t.web_search)) .unwrap_or(false); + let include_view_image_tool = include_view_image_tool + .or(cfg.tools.as_ref().and_then(|t| t.view_image)) + .unwrap_or(true); + let model = model .or(config_profile.model) .or(cfg.model) @@ -784,6 +797,7 @@ impl Config { use_experimental_streamable_shell_tool: cfg .experimental_use_exec_command_tool .unwrap_or(false), + include_view_image_tool, }; Ok(config) } @@ -1152,6 +1166,7 @@ disable_response_storage = true responses_originator_header: "codex_cli_rs".to_string(), preferred_auth_method: AuthMode::ChatGPT, use_experimental_streamable_shell_tool: false, + include_view_image_tool: true, }, o3_profile_config ); @@ -1208,6 +1223,7 @@ disable_response_storage = true responses_originator_header: "codex_cli_rs".to_string(), preferred_auth_method: AuthMode::ChatGPT, use_experimental_streamable_shell_tool: false, + include_view_image_tool: true, }; assert_eq!(expected_gpt3_profile_config, gpt3_profile_config); @@ -1279,6 +1295,7 @@ disable_response_storage = true responses_originator_header: "codex_cli_rs".to_string(), preferred_auth_method: AuthMode::ChatGPT, use_experimental_streamable_shell_tool: false, + include_view_image_tool: true, }; assert_eq!(expected_zdr_profile_config, zdr_profile_config); diff --git a/codex-rs/core/src/openai_tools.rs b/codex-rs/core/src/openai_tools.rs index a9fdb4f0..f7418816 100644 --- a/codex-rs/core/src/openai_tools.rs +++ b/codex-rs/core/src/openai_tools.rs @@ -67,6 +67,7 @@ pub(crate) struct ToolsConfig { pub plan_tool: bool, pub apply_patch_tool_type: Option, pub web_search_request: bool, + pub include_view_image_tool: bool, } pub(crate) struct ToolsConfigParams<'a> { @@ -77,6 +78,7 @@ pub(crate) struct ToolsConfigParams<'a> { pub(crate) include_apply_patch_tool: bool, pub(crate) include_web_search_request: bool, pub(crate) use_streamable_shell_tool: bool, + pub(crate) include_view_image_tool: bool, } impl ToolsConfig { @@ -89,6 +91,7 @@ impl ToolsConfig { include_apply_patch_tool, include_web_search_request, use_streamable_shell_tool, + include_view_image_tool, } = params; let mut shell_type = if *use_streamable_shell_tool { ConfigShellToolType::StreamableShell @@ -120,6 +123,7 @@ impl ToolsConfig { plan_tool: *include_plan_tool, apply_patch_tool_type, web_search_request: *include_web_search_request, + include_view_image_tool: *include_view_image_tool, } } } @@ -292,6 +296,30 @@ The shell tool is used to execute shell commands. }, }) } + +fn create_view_image_tool() -> OpenAiTool { + // Support only local filesystem path. + let mut properties = BTreeMap::new(); + properties.insert( + "path".to_string(), + JsonSchema::String { + description: Some("Local filesystem path to an image file".to_string()), + }, + ); + + OpenAiTool::Function(ResponsesApiTool { + name: "view_image".to_string(), + description: + "Attach a local image (by filesystem path) to the conversation context for this turn." + .to_string(), + strict: false, + parameters: JsonSchema::Object { + properties, + required: Some(vec!["path".to_string()]), + additional_properties: Some(false), + }, + }) +} /// TODO(dylan): deprecate once we get rid of json tool #[derive(Serialize, Deserialize)] pub(crate) struct ApplyPatchToolArgs { @@ -541,6 +569,11 @@ pub(crate) fn get_openai_tools( tools.push(OpenAiTool::WebSearch {}); } + // Include the view_image tool so the agent can attach images to context. + if config.include_view_image_tool { + tools.push(create_view_image_tool()); + } + if let Some(mcp_tools) = mcp_tools { // Ensure deterministic ordering to maximize prompt cache hits. // HashMap iteration order is non-deterministic, so sort by fully-qualified tool name. @@ -604,10 +637,14 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: true, use_streamable_shell_tool: false, + include_view_image_tool: true, }); let tools = get_openai_tools(&config, Some(HashMap::new())); - assert_eq_tool_names(&tools, &["local_shell", "update_plan", "web_search"]); + assert_eq_tool_names( + &tools, + &["local_shell", "update_plan", "web_search", "view_image"], + ); } #[test] @@ -621,10 +658,14 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: true, use_streamable_shell_tool: false, + include_view_image_tool: true, }); let tools = get_openai_tools(&config, Some(HashMap::new())); - assert_eq_tool_names(&tools, &["shell", "update_plan", "web_search"]); + assert_eq_tool_names( + &tools, + &["shell", "update_plan", "web_search", "view_image"], + ); } #[test] @@ -638,6 +679,7 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: true, use_streamable_shell_tool: false, + include_view_image_tool: true, }); let tools = get_openai_tools( &config, @@ -679,11 +721,16 @@ mod tests { assert_eq_tool_names( &tools, - &["shell", "web_search", "test_server/do_something_cool"], + &[ + "shell", + "web_search", + "view_image", + "test_server/do_something_cool", + ], ); assert_eq!( - tools[2], + tools[3], OpenAiTool::Function(ResponsesApiTool { name: "test_server/do_something_cool".to_string(), parameters: JsonSchema::Object { @@ -737,6 +784,7 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: false, use_streamable_shell_tool: false, + include_view_image_tool: true, }); // Intentionally construct a map with keys that would sort alphabetically. @@ -794,6 +842,7 @@ mod tests { &tools, &[ "shell", + "view_image", "test_server/cool", "test_server/do", "test_server/something", @@ -812,6 +861,7 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: true, use_streamable_shell_tool: false, + include_view_image_tool: true, }); let tools = get_openai_tools( @@ -837,10 +887,13 @@ mod tests { )])), ); - assert_eq_tool_names(&tools, &["shell", "web_search", "dash/search"]); + assert_eq_tool_names( + &tools, + &["shell", "web_search", "view_image", "dash/search"], + ); assert_eq!( - tools[2], + tools[3], OpenAiTool::Function(ResponsesApiTool { name: "dash/search".to_string(), parameters: JsonSchema::Object { @@ -870,6 +923,7 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: true, use_streamable_shell_tool: false, + include_view_image_tool: true, }); let tools = get_openai_tools( @@ -893,9 +947,12 @@ mod tests { )])), ); - assert_eq_tool_names(&tools, &["shell", "web_search", "dash/paginate"]); + assert_eq_tool_names( + &tools, + &["shell", "web_search", "view_image", "dash/paginate"], + ); assert_eq!( - tools[2], + tools[3], OpenAiTool::Function(ResponsesApiTool { name: "dash/paginate".to_string(), parameters: JsonSchema::Object { @@ -923,6 +980,7 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: true, use_streamable_shell_tool: false, + include_view_image_tool: true, }); let tools = get_openai_tools( @@ -946,9 +1004,9 @@ mod tests { )])), ); - assert_eq_tool_names(&tools, &["shell", "web_search", "dash/tags"]); + assert_eq_tool_names(&tools, &["shell", "web_search", "view_image", "dash/tags"]); assert_eq!( - tools[2], + tools[3], OpenAiTool::Function(ResponsesApiTool { name: "dash/tags".to_string(), parameters: JsonSchema::Object { @@ -979,6 +1037,7 @@ mod tests { include_apply_patch_tool: false, include_web_search_request: true, use_streamable_shell_tool: false, + include_view_image_tool: true, }); let tools = get_openai_tools( @@ -1002,9 +1061,9 @@ mod tests { )])), ); - assert_eq_tool_names(&tools, &["shell", "web_search", "dash/value"]); + assert_eq_tool_names(&tools, &["shell", "web_search", "view_image", "dash/value"]); assert_eq!( - tools[2], + tools[3], OpenAiTool::Function(ResponsesApiTool { name: "dash/value".to_string(), parameters: JsonSchema::Object { diff --git a/codex-rs/core/tests/suite/prompt_caching.rs b/codex-rs/core/tests/suite/prompt_caching.rs index 68605ab4..b165c0bc 100644 --- a/codex-rs/core/tests/suite/prompt_caching.rs +++ b/codex-rs/core/tests/suite/prompt_caching.rs @@ -191,7 +191,7 @@ async fn prompt_tools_are_consistent_across_requests() { let expected_instructions: &str = include_str!("../../prompt.md"); // our internal implementation is responsible for keeping tools in sync // with the OpenAI schema, so we just verify the tool presence here - let expected_tools_names: &[&str] = &["shell", "update_plan", "apply_patch"]; + let expected_tools_names: &[&str] = &["shell", "update_plan", "apply_patch", "view_image"]; let body0 = requests[0].body_json::().unwrap(); assert_eq!( body0["instructions"], diff --git a/codex-rs/exec/src/lib.rs b/codex-rs/exec/src/lib.rs index 3de95291..785272a6 100644 --- a/codex-rs/exec/src/lib.rs +++ b/codex-rs/exec/src/lib.rs @@ -148,6 +148,7 @@ pub async fn run_main(cli: Cli, codex_linux_sandbox_exe: Option) -> any base_instructions: None, include_plan_tool: None, include_apply_patch_tool: None, + include_view_image_tool: None, disable_response_storage: oss.then_some(true), show_raw_agent_reasoning: oss.then_some(true), tools_web_search_request: None, diff --git a/codex-rs/mcp-server/src/codex_message_processor.rs b/codex-rs/mcp-server/src/codex_message_processor.rs index 1623e766..aae463ad 100644 --- a/codex-rs/mcp-server/src/codex_message_processor.rs +++ b/codex-rs/mcp-server/src/codex_message_processor.rs @@ -798,6 +798,7 @@ fn derive_config_from_params( base_instructions, include_plan_tool, include_apply_patch_tool, + include_view_image_tool: None, disable_response_storage: None, show_raw_agent_reasoning: None, tools_web_search_request: None, diff --git a/codex-rs/mcp-server/src/codex_tool_config.rs b/codex-rs/mcp-server/src/codex_tool_config.rs index 69f07ff2..c29cb52c 100644 --- a/codex-rs/mcp-server/src/codex_tool_config.rs +++ b/codex-rs/mcp-server/src/codex_tool_config.rs @@ -161,6 +161,7 @@ impl CodexToolCallParam { base_instructions, include_plan_tool, include_apply_patch_tool: None, + include_view_image_tool: None, disable_response_storage: None, show_raw_agent_reasoning: None, tools_web_search_request: None, diff --git a/codex-rs/tui/src/lib.rs b/codex-rs/tui/src/lib.rs index e4352478..4154160d 100644 --- a/codex-rs/tui/src/lib.rs +++ b/codex-rs/tui/src/lib.rs @@ -128,6 +128,7 @@ pub async fn run_main( base_instructions: None, include_plan_tool: Some(true), include_apply_patch_tool: None, + include_view_image_tool: None, disable_response_storage: cli.oss.then_some(true), show_raw_agent_reasoning: cli.oss.then_some(true), tools_web_search_request: cli.web_search.then_some(true),