Add "View Image" tool (#2723)

Adds a "View Image" tool so Codex can find and see images by itself:

<img width="1772" height="420" alt="Screenshot 2025-08-26 at 10 40 04 AM"
src="https://github.com/user-attachments/assets/7a459c7b-0b86-4125-82d9-05fbb35ade03"
/>
This commit is contained in:
dedrisian-oai
2025-08-27 17:41:23 -07:00
committed by GitHub
parent 3e309805ae
commit 4e9ad23864
8 changed files with 126 additions and 13 deletions

View File

@@ -518,6 +518,7 @@ impl Session {
include_apply_patch_tool: config.include_apply_patch_tool,
include_web_search_request: config.tools_web_search_request,
use_streamable_shell_tool: config.use_experimental_streamable_shell_tool,
include_view_image_tool: config.include_view_image_tool,
}),
user_instructions,
base_instructions,
@@ -1108,6 +1109,7 @@ async fn submission_loop(
include_apply_patch_tool: config.include_apply_patch_tool,
include_web_search_request: config.tools_web_search_request,
use_streamable_shell_tool: config.use_experimental_streamable_shell_tool,
include_view_image_tool: config.include_view_image_tool,
});
let new_turn_context = TurnContext {
@@ -1193,6 +1195,7 @@ async fn submission_loop(
include_web_search_request: config.tools_web_search_request,
use_streamable_shell_tool: config
.use_experimental_streamable_shell_tool,
include_view_image_tool: config.include_view_image_tool,
}),
user_instructions: turn_context.user_instructions.clone(),
base_instructions: turn_context.base_instructions.clone(),
@@ -2077,6 +2080,36 @@ async fn handle_function_call(
)
.await
}
"view_image" => {
// Handles a `view_image` tool call: parse the JSON arguments, resolve the
// requested path through the turn context, and inject the image as a
// `LocalImage` input item on the session.
//
// Arguments payload for the tool; only the `path` field is read.
#[derive(serde::Deserialize)]
struct SeeImageArgs {
path: String,
}
// Malformed arguments are reported back as a failed function-call output
// (carrying the parse error text) instead of aborting the turn.
let args = match serde_json::from_str::<SeeImageArgs>(&arguments) {
Ok(a) => a,
Err(e) => {
return ResponseInputItem::FunctionCallOutput {
call_id,
output: FunctionCallOutputPayload {
content: format!("failed to parse function arguments: {e}"),
success: Some(false),
},
};
}
};
// NOTE(review): presumably resolves a relative path against the turn's
// working directory — confirm against `TurnContext::resolve_path`.
let abs = turn_context.resolve_path(Some(args.path));
// The image itself is not read here; only a `LocalImage` item holding the
// resolved path is handed to the session.
let output = match sess.inject_input(vec![InputItem::LocalImage { path: abs }]) {
Ok(()) => FunctionCallOutputPayload {
content: "attached local image path".to_string(),
success: Some(true),
},
// The error value is discarded. The message assumes the only failure mode
// is a missing active task — TODO confirm against `Session::inject_input`.
Err(_) => FunctionCallOutputPayload {
content: "unable to attach image (no active task)".to_string(),
success: Some(false),
},
};
ResponseInputItem::FunctionCallOutput { call_id, output }
}
"apply_patch" => {
let args = match serde_json::from_str::<ApplyPatchToolArgs>(&arguments) {
Ok(a) => a,

View File

@@ -178,6 +178,9 @@ pub struct Config {
pub preferred_auth_method: AuthMode,
pub use_experimental_streamable_shell_tool: bool,
/// Include the `view_image` tool that lets the agent attach a local image path to context.
pub include_view_image_tool: bool,
}
impl Config {
@@ -497,6 +500,10 @@ pub struct ToolsToml {
// Renamed from `web_search_request`; keep alias for backwards compatibility.
#[serde(default, alias = "web_search_request")]
pub web_search: Option<bool>,
/// Enable the `view_image` tool that lets the agent attach local images.
#[serde(default)]
pub view_image: Option<bool>,
}
impl ConfigToml {
@@ -586,6 +593,7 @@ pub struct ConfigOverrides {
pub base_instructions: Option<String>,
pub include_plan_tool: Option<bool>,
pub include_apply_patch_tool: Option<bool>,
pub include_view_image_tool: Option<bool>,
pub disable_response_storage: Option<bool>,
pub show_raw_agent_reasoning: Option<bool>,
pub tools_web_search_request: Option<bool>,
@@ -613,6 +621,7 @@ impl Config {
base_instructions,
include_plan_tool,
include_apply_patch_tool,
include_view_image_tool,
disable_response_storage,
show_raw_agent_reasoning,
tools_web_search_request: override_tools_web_search_request,
@@ -681,6 +690,10 @@ impl Config {
.or(cfg.tools.as_ref().and_then(|t| t.web_search))
.unwrap_or(false);
let include_view_image_tool = include_view_image_tool
.or(cfg.tools.as_ref().and_then(|t| t.view_image))
.unwrap_or(true);
let model = model
.or(config_profile.model)
.or(cfg.model)
@@ -784,6 +797,7 @@ impl Config {
use_experimental_streamable_shell_tool: cfg
.experimental_use_exec_command_tool
.unwrap_or(false),
include_view_image_tool,
};
Ok(config)
}
@@ -1152,6 +1166,7 @@ disable_response_storage = true
responses_originator_header: "codex_cli_rs".to_string(),
preferred_auth_method: AuthMode::ChatGPT,
use_experimental_streamable_shell_tool: false,
include_view_image_tool: true,
},
o3_profile_config
);
@@ -1208,6 +1223,7 @@ disable_response_storage = true
responses_originator_header: "codex_cli_rs".to_string(),
preferred_auth_method: AuthMode::ChatGPT,
use_experimental_streamable_shell_tool: false,
include_view_image_tool: true,
};
assert_eq!(expected_gpt3_profile_config, gpt3_profile_config);
@@ -1279,6 +1295,7 @@ disable_response_storage = true
responses_originator_header: "codex_cli_rs".to_string(),
preferred_auth_method: AuthMode::ChatGPT,
use_experimental_streamable_shell_tool: false,
include_view_image_tool: true,
};
assert_eq!(expected_zdr_profile_config, zdr_profile_config);

View File

@@ -67,6 +67,7 @@ pub(crate) struct ToolsConfig {
pub plan_tool: bool,
pub apply_patch_tool_type: Option<ApplyPatchToolType>,
pub web_search_request: bool,
pub include_view_image_tool: bool,
}
pub(crate) struct ToolsConfigParams<'a> {
@@ -77,6 +78,7 @@ pub(crate) struct ToolsConfigParams<'a> {
pub(crate) include_apply_patch_tool: bool,
pub(crate) include_web_search_request: bool,
pub(crate) use_streamable_shell_tool: bool,
pub(crate) include_view_image_tool: bool,
}
impl ToolsConfig {
@@ -89,6 +91,7 @@ impl ToolsConfig {
include_apply_patch_tool,
include_web_search_request,
use_streamable_shell_tool,
include_view_image_tool,
} = params;
let mut shell_type = if *use_streamable_shell_tool {
ConfigShellToolType::StreamableShell
@@ -120,6 +123,7 @@ impl ToolsConfig {
plan_tool: *include_plan_tool,
apply_patch_tool_type,
web_search_request: *include_web_search_request,
include_view_image_tool: *include_view_image_tool,
}
}
}
@@ -292,6 +296,30 @@ The shell tool is used to execute shell commands.
},
})
}
/// Builds the `view_image` function-tool definition.
///
/// The tool's parameter schema is an object with a single required string
/// property, `path`, with `additional_properties` set to `false` and
/// `strict` set to `false`.
fn create_view_image_tool() -> OpenAiTool {
    // Only a local filesystem path is accepted as input.
    let path_schema = JsonSchema::String {
        description: Some(String::from("Local filesystem path to an image file")),
    };
    let parameters = JsonSchema::Object {
        properties: BTreeMap::from([(String::from("path"), path_schema)]),
        required: Some(vec![String::from("path")]),
        additional_properties: Some(false),
    };

    OpenAiTool::Function(ResponsesApiTool {
        name: String::from("view_image"),
        description: String::from(
            "Attach a local image (by filesystem path) to the conversation context for this turn.",
        ),
        strict: false,
        parameters,
    })
}
/// TODO(dylan): deprecate once we get rid of json tool
#[derive(Serialize, Deserialize)]
pub(crate) struct ApplyPatchToolArgs {
@@ -541,6 +569,11 @@ pub(crate) fn get_openai_tools(
tools.push(OpenAiTool::WebSearch {});
}
// Include the view_image tool so the agent can attach images to context.
if config.include_view_image_tool {
tools.push(create_view_image_tool());
}
if let Some(mcp_tools) = mcp_tools {
// Ensure deterministic ordering to maximize prompt cache hits.
// HashMap iteration order is non-deterministic, so sort by fully-qualified tool name.
@@ -604,10 +637,14 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: true,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
let tools = get_openai_tools(&config, Some(HashMap::new()));
assert_eq_tool_names(&tools, &["local_shell", "update_plan", "web_search"]);
assert_eq_tool_names(
&tools,
&["local_shell", "update_plan", "web_search", "view_image"],
);
}
#[test]
@@ -621,10 +658,14 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: true,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
let tools = get_openai_tools(&config, Some(HashMap::new()));
assert_eq_tool_names(&tools, &["shell", "update_plan", "web_search"]);
assert_eq_tool_names(
&tools,
&["shell", "update_plan", "web_search", "view_image"],
);
}
#[test]
@@ -638,6 +679,7 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: true,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
let tools = get_openai_tools(
&config,
@@ -679,11 +721,16 @@ mod tests {
assert_eq_tool_names(
&tools,
&["shell", "web_search", "test_server/do_something_cool"],
&[
"shell",
"web_search",
"view_image",
"test_server/do_something_cool",
],
);
assert_eq!(
tools[2],
tools[3],
OpenAiTool::Function(ResponsesApiTool {
name: "test_server/do_something_cool".to_string(),
parameters: JsonSchema::Object {
@@ -737,6 +784,7 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: false,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
// Intentionally construct a map with keys that would sort alphabetically.
@@ -794,6 +842,7 @@ mod tests {
&tools,
&[
"shell",
"view_image",
"test_server/cool",
"test_server/do",
"test_server/something",
@@ -812,6 +861,7 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: true,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
let tools = get_openai_tools(
@@ -837,10 +887,13 @@ mod tests {
)])),
);
assert_eq_tool_names(&tools, &["shell", "web_search", "dash/search"]);
assert_eq_tool_names(
&tools,
&["shell", "web_search", "view_image", "dash/search"],
);
assert_eq!(
tools[2],
tools[3],
OpenAiTool::Function(ResponsesApiTool {
name: "dash/search".to_string(),
parameters: JsonSchema::Object {
@@ -870,6 +923,7 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: true,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
let tools = get_openai_tools(
@@ -893,9 +947,12 @@ mod tests {
)])),
);
assert_eq_tool_names(&tools, &["shell", "web_search", "dash/paginate"]);
assert_eq_tool_names(
&tools,
&["shell", "web_search", "view_image", "dash/paginate"],
);
assert_eq!(
tools[2],
tools[3],
OpenAiTool::Function(ResponsesApiTool {
name: "dash/paginate".to_string(),
parameters: JsonSchema::Object {
@@ -923,6 +980,7 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: true,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
let tools = get_openai_tools(
@@ -946,9 +1004,9 @@ mod tests {
)])),
);
assert_eq_tool_names(&tools, &["shell", "web_search", "dash/tags"]);
assert_eq_tool_names(&tools, &["shell", "web_search", "view_image", "dash/tags"]);
assert_eq!(
tools[2],
tools[3],
OpenAiTool::Function(ResponsesApiTool {
name: "dash/tags".to_string(),
parameters: JsonSchema::Object {
@@ -979,6 +1037,7 @@ mod tests {
include_apply_patch_tool: false,
include_web_search_request: true,
use_streamable_shell_tool: false,
include_view_image_tool: true,
});
let tools = get_openai_tools(
@@ -1002,9 +1061,9 @@ mod tests {
)])),
);
assert_eq_tool_names(&tools, &["shell", "web_search", "dash/value"]);
assert_eq_tool_names(&tools, &["shell", "web_search", "view_image", "dash/value"]);
assert_eq!(
tools[2],
tools[3],
OpenAiTool::Function(ResponsesApiTool {
name: "dash/value".to_string(),
parameters: JsonSchema::Object {

View File

@@ -191,7 +191,7 @@ async fn prompt_tools_are_consistent_across_requests() {
let expected_instructions: &str = include_str!("../../prompt.md");
// our internal implementation is responsible for keeping tools in sync
// with the OpenAI schema, so we just verify the tool presence here
let expected_tools_names: &[&str] = &["shell", "update_plan", "apply_patch"];
let expected_tools_names: &[&str] = &["shell", "update_plan", "apply_patch", "view_image"];
let body0 = requests[0].body_json::<serde_json::Value>().unwrap();
assert_eq!(
body0["instructions"],

View File

@@ -148,6 +148,7 @@ pub async fn run_main(cli: Cli, codex_linux_sandbox_exe: Option<PathBuf>) -> any
base_instructions: None,
include_plan_tool: None,
include_apply_patch_tool: None,
include_view_image_tool: None,
disable_response_storage: oss.then_some(true),
show_raw_agent_reasoning: oss.then_some(true),
tools_web_search_request: None,

View File

@@ -798,6 +798,7 @@ fn derive_config_from_params(
base_instructions,
include_plan_tool,
include_apply_patch_tool,
include_view_image_tool: None,
disable_response_storage: None,
show_raw_agent_reasoning: None,
tools_web_search_request: None,

View File

@@ -161,6 +161,7 @@ impl CodexToolCallParam {
base_instructions,
include_plan_tool,
include_apply_patch_tool: None,
include_view_image_tool: None,
disable_response_storage: None,
show_raw_agent_reasoning: None,
tools_web_search_request: None,

View File

@@ -128,6 +128,7 @@ pub async fn run_main(
base_instructions: None,
include_plan_tool: Some(true),
include_apply_patch_tool: None,
include_view_image_tool: None,
disable_response_storage: cli.oss.then_some(true),
show_raw_agent_reasoning: cli.oss.then_some(true),
tools_web_search_request: cli.web_search.then_some(true),