It's pretty amazing we have gotten here without the ability for the model to see image content from MCP tool calls. This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get adequete credit here but I also want to get this fix in ASAP so I gave him a week to update it and haven't gotten a response so I'm going to take it across the finish line. This test highlights how absured the current situation is. I asked the model to read this image using the Chrome MCP <img width="2378" height="674" alt="image" src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0" /> After this change, it correctly outputs: > Captured the page: image dhows a dark terminal-style UI labeled `OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and working directory `/codex/codex-rs` (and more) Before this change, it said: > Took the full-page screenshot you asked for. It shows a long, horizontally repeating pattern of stylized people in orange, light-blue, and mustard clothing, holding hands in alternating poses against a white background. No text or other graphics-just rows of flat illustration stretching off to the right. Without this change, the Figma, Playwright, Chrome, and other visual MCP servers are pretty much entirely useless. I tested this change with the openai respones api as well as a third party completions api
81 lines
2.6 KiB
Rust
81 lines
2.6 KiB
Rust
use std::time::Instant;
|
|
|
|
use tracing::error;
|
|
|
|
use crate::codex::Session;
|
|
use crate::codex::TurnContext;
|
|
use crate::protocol::EventMsg;
|
|
use crate::protocol::McpInvocation;
|
|
use crate::protocol::McpToolCallBeginEvent;
|
|
use crate::protocol::McpToolCallEndEvent;
|
|
use codex_protocol::models::FunctionCallOutputPayload;
|
|
use codex_protocol::models::ResponseInputItem;
|
|
|
|
/// Handles the specified tool call dispatches the appropriate
|
|
/// `McpToolCallBegin` and `McpToolCallEnd` events to the `Session`.
|
|
pub(crate) async fn handle_mcp_tool_call(
|
|
sess: &Session,
|
|
turn_context: &TurnContext,
|
|
call_id: String,
|
|
server: String,
|
|
tool_name: String,
|
|
arguments: String,
|
|
) -> ResponseInputItem {
|
|
// Parse the `arguments` as JSON. An empty string is OK, but invalid JSON
|
|
// is not.
|
|
let arguments_value = if arguments.trim().is_empty() {
|
|
None
|
|
} else {
|
|
match serde_json::from_str::<serde_json::Value>(&arguments) {
|
|
Ok(value) => Some(value),
|
|
Err(e) => {
|
|
error!("failed to parse tool call arguments: {e}");
|
|
return ResponseInputItem::FunctionCallOutput {
|
|
call_id: call_id.clone(),
|
|
output: FunctionCallOutputPayload {
|
|
content: format!("err: {e}"),
|
|
success: Some(false),
|
|
..Default::default()
|
|
},
|
|
};
|
|
}
|
|
}
|
|
};
|
|
|
|
let invocation = McpInvocation {
|
|
server: server.clone(),
|
|
tool: tool_name.clone(),
|
|
arguments: arguments_value.clone(),
|
|
};
|
|
|
|
let tool_call_begin_event = EventMsg::McpToolCallBegin(McpToolCallBeginEvent {
|
|
call_id: call_id.clone(),
|
|
invocation: invocation.clone(),
|
|
});
|
|
notify_mcp_tool_call_event(sess, turn_context, tool_call_begin_event).await;
|
|
|
|
let start = Instant::now();
|
|
// Perform the tool call.
|
|
let result = sess
|
|
.call_tool(&server, &tool_name, arguments_value.clone())
|
|
.await
|
|
.map_err(|e| format!("tool call error: {e:?}"));
|
|
if let Err(e) = &result {
|
|
tracing::warn!("MCP tool call error: {e:?}");
|
|
}
|
|
let tool_call_end_event = EventMsg::McpToolCallEnd(McpToolCallEndEvent {
|
|
call_id: call_id.clone(),
|
|
invocation,
|
|
duration: start.elapsed(),
|
|
result: result.clone(),
|
|
});
|
|
|
|
notify_mcp_tool_call_event(sess, turn_context, tool_call_end_event.clone()).await;
|
|
|
|
ResponseInputItem::McpToolCallOutput { call_id, result }
|
|
}
|
|
|
|
async fn notify_mcp_tool_call_event(sess: &Session, turn_context: &TurnContext, event: EventMsg) {
|
|
sess.send_event(turn_context, event).await;
|
|
}
|