[MCP] Render MCP tool call result images to the model (#5600)

It's pretty amazing we have gotten here without the ability for the
model to see image content from MCP tool calls.

This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get
adequete credit here but I also want to get this fix in ASAP so I gave
him a week to update it and haven't gotten a response so I'm going to
take it across the finish line.


This test highlights how absured the current situation is. I asked the
model to read this image using the Chrome MCP
<img width="2378" height="674" alt="image"
src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0"
/>

After this change, it correctly outputs:
> Captured the page: image dhows a dark terminal-style UI labeled
`OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and
working directory `/codex/codex-rs`
(and more)  

Before this change, it said:
> Took the full-page screenshot you asked for. It shows a long,
horizontally repeating pattern of stylized people in orange, light-blue,
and mustard clothing, holding hands in alternating poses against a white
background. No text or other graphics-just rows of flat illustration
stretching off to the right.

Without this change, the Figma, Playwright, Chrome, and other visual MCP
servers are pretty much entirely useless.

I tested this change with the openai respones api as well as a third
party completions api
This commit is contained in:
Gabriel Peal
2025-10-27 14:55:57 -07:00
committed by GitHub
parent 67a219ffc2
commit b0bdc04c30
24 changed files with 749 additions and 108 deletions

View File

@@ -14,6 +14,8 @@ use codex_core::features::Feature;
use codex_core::protocol::AskForApproval;
use codex_core::protocol::EventMsg;
use codex_core::protocol::McpInvocation;
use codex_core::protocol::McpToolCallBeginEvent;
use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary;
@@ -25,7 +27,9 @@ use core_test_support::test_codex::test_codex;
use core_test_support::wait_for_event;
use core_test_support::wait_for_event_with_timeout;
use escargot::CargoBuild;
use mcp_types::ContentBlock;
use serde_json::Value;
use serde_json::json;
use serial_test::serial;
use tempfile::tempdir;
use tokio::net::TcpStream;
@@ -35,6 +39,8 @@ use tokio::time::Instant;
use tokio::time::sleep;
use wiremock::matchers::any;
static OPENAI_PNG: &str = "";
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_server_round_trip() -> anyhow::Result<()> {
@@ -175,6 +181,352 @@ async fn stdio_server_round_trip() -> anyhow::Result<()> {
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_image_responses_round_trip() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let call_id = "img-1";
let server_name = "rmcp";
let tool_name = format!("mcp__{server_name}__image");
// First stream: model decides to call the image tool.
mount_sse_once_match(
&server,
any(),
responses::sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, &tool_name, "{}"),
responses::ev_completed("resp-1"),
]),
)
.await;
// Second stream: after tool execution, assistant emits a message and completes.
let final_mock = mount_sse_once_match(
&server,
any(),
responses::sse(vec![
responses::ev_assistant_message("msg-1", "rmcp image tool completed successfully."),
responses::ev_completed("resp-2"),
]),
)
.await;
// Build the stdio rmcp server and pass the image as data URL so it can construct ImageContent.
let rmcp_test_server_bin = CargoBuild::new()
.package("codex-rmcp-client")
.bin("test_stdio_server")
.run()?
.path()
.to_string_lossy()
.into_owned();
let fixture = test_codex()
.with_config(move |config| {
config.features.enable(Feature::RmcpClient);
config.mcp_servers.insert(
server_name.to_string(),
McpServerConfig {
transport: McpServerTransportConfig::Stdio {
command: rmcp_test_server_bin,
args: Vec::new(),
env: Some(HashMap::from([(
"MCP_TEST_IMAGE_DATA_URL".to_string(),
OPENAI_PNG.to_string(),
)])),
env_vars: Vec::new(),
cwd: None,
},
enabled: true,
startup_timeout_sec: Some(Duration::from_secs(10)),
tool_timeout_sec: None,
enabled_tools: None,
disabled_tools: None,
},
);
})
.build(&server)
.await?;
let session_model = fixture.session_configured.model.clone();
fixture
.codex
.submit(Op::UserTurn {
items: vec![UserInput::Text {
text: "call the rmcp image tool".into(),
}],
final_output_json_schema: None,
cwd: fixture.cwd.path().to_path_buf(),
approval_policy: AskForApproval::Never,
sandbox_policy: SandboxPolicy::ReadOnly,
model: session_model,
effort: None,
summary: ReasoningSummary::Auto,
})
.await?;
// Wait for tool begin/end and final completion.
let begin_event = wait_for_event_with_timeout(
&fixture.codex,
|ev| matches!(ev, EventMsg::McpToolCallBegin(_)),
Duration::from_secs(10),
)
.await;
let EventMsg::McpToolCallBegin(begin) = begin_event else {
unreachable!("begin");
};
assert_eq!(
begin,
McpToolCallBeginEvent {
call_id: call_id.to_string(),
invocation: McpInvocation {
server: server_name.to_string(),
tool: "image".to_string(),
arguments: Some(json!({})),
},
},
);
let end_event = wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallEnd(_))
})
.await;
let EventMsg::McpToolCallEnd(end) = end_event else {
unreachable!("end");
};
assert_eq!(end.call_id, call_id);
assert_eq!(
end.invocation,
McpInvocation {
server: server_name.to_string(),
tool: "image".to_string(),
arguments: Some(json!({})),
}
);
let result = end.result.expect("rmcp image tool should return success");
assert_eq!(result.is_error, Some(false));
assert_eq!(result.content.len(), 1);
let base64_only = OPENAI_PNG
.strip_prefix("data:image/png;base64,")
.expect("data url prefix");
match &result.content[0] {
ContentBlock::ImageContent(img) => {
assert_eq!(img.mime_type, "image/png");
assert_eq!(img.r#type, "image");
assert_eq!(img.data, base64_only);
}
other => panic!("expected image content, got {other:?}"),
}
wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
let output_item = final_mock.single_request().function_call_output(call_id);
assert_eq!(
output_item,
json!({
"type": "function_call_output",
"call_id": call_id,
"output": [{
"type": "input_image",
"image_url": OPENAI_PNG
}]
})
);
server.verify().await;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_image_completions_round_trip() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let call_id = "img-cc-1";
let server_name = "rmcp";
let tool_name = format!("mcp__{server_name}__image");
let tool_call = json!({
"choices": [
{
"delta": {
"tool_calls": [
{
"id": call_id,
"type": "function",
"function": {"name": tool_name, "arguments": "{}"}
}
]
},
"finish_reason": "tool_calls"
}
]
});
let sse_tool_call = format!(
"data: {}\n\ndata: [DONE]\n\n",
serde_json::to_string(&tool_call)?
);
let final_assistant = json!({
"choices": [
{
"delta": {"content": "rmcp image tool completed successfully."},
"finish_reason": "stop"
}
]
});
let sse_final = format!(
"data: {}\n\ndata: [DONE]\n\n",
serde_json::to_string(&final_assistant)?
);
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
struct ChatSeqResponder {
num_calls: AtomicUsize,
bodies: Vec<String>,
}
impl wiremock::Respond for ChatSeqResponder {
fn respond(&self, _: &wiremock::Request) -> wiremock::ResponseTemplate {
let idx = self.num_calls.fetch_add(1, Ordering::SeqCst);
match self.bodies.get(idx) {
Some(body) => wiremock::ResponseTemplate::new(200)
.insert_header("content-type", "text/event-stream")
.set_body_string(body.clone()),
None => panic!("no chat completion response for index {idx}"),
}
}
}
let chat_seq = ChatSeqResponder {
num_calls: AtomicUsize::new(0),
bodies: vec![sse_tool_call, sse_final],
};
wiremock::Mock::given(wiremock::matchers::method("POST"))
.and(wiremock::matchers::path("/v1/chat/completions"))
.respond_with(chat_seq)
.expect(2)
.mount(&server)
.await;
let rmcp_test_server_bin = CargoBuild::new()
.package("codex-rmcp-client")
.bin("test_stdio_server")
.run()?
.path()
.to_string_lossy()
.into_owned();
let fixture = test_codex()
.with_config(move |config| {
config.model_provider.wire_api = codex_core::WireApi::Chat;
config.features.enable(Feature::RmcpClient);
config.mcp_servers.insert(
server_name.to_string(),
McpServerConfig {
transport: McpServerTransportConfig::Stdio {
command: rmcp_test_server_bin,
args: Vec::new(),
env: Some(HashMap::from([(
"MCP_TEST_IMAGE_DATA_URL".to_string(),
OPENAI_PNG.to_string(),
)])),
env_vars: Vec::new(),
cwd: None,
},
enabled: true,
startup_timeout_sec: Some(Duration::from_secs(10)),
tool_timeout_sec: None,
enabled_tools: None,
disabled_tools: None,
},
);
})
.build(&server)
.await?;
let session_model = fixture.session_configured.model.clone();
fixture
.codex
.submit(Op::UserTurn {
items: vec![UserInput::Text {
text: "call the rmcp image tool".into(),
}],
final_output_json_schema: None,
cwd: fixture.cwd.path().to_path_buf(),
approval_policy: AskForApproval::Never,
sandbox_policy: SandboxPolicy::ReadOnly,
model: session_model,
effort: None,
summary: ReasoningSummary::Auto,
})
.await?;
let begin_event = wait_for_event_with_timeout(
&fixture.codex,
|ev| matches!(ev, EventMsg::McpToolCallBegin(_)),
Duration::from_secs(10),
)
.await;
let EventMsg::McpToolCallBegin(begin) = begin_event else {
unreachable!("begin");
};
assert_eq!(
begin,
McpToolCallBeginEvent {
call_id: call_id.to_string(),
invocation: McpInvocation {
server: server_name.to_string(),
tool: "image".to_string(),
arguments: Some(json!({})),
},
},
);
let end_event = wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallEnd(_))
})
.await;
let EventMsg::McpToolCallEnd(end) = end_event else {
unreachable!("end");
};
assert!(end.result.as_ref().is_ok(), "tool call should succeed");
wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
// Chat Completions assertion: the second POST should include a tool role message
// with an array `content` containing an item with the expected data URL.
let requests = server.received_requests().await.expect("requests captured");
assert!(requests.len() >= 2, "expected two chat completion calls");
let second = &requests[1];
let body: Value = serde_json::from_slice(&second.body)?;
let messages = body
.get("messages")
.and_then(Value::as_array)
.cloned()
.expect("messages array");
let tool_msg = messages
.iter()
.find(|m| {
m.get("role") == Some(&json!("tool")) && m.get("tool_call_id") == Some(&json!(call_id))
})
.cloned()
.expect("tool message present");
assert_eq!(
tool_msg,
json!({
"role": "tool",
"tool_call_id": call_id,
"content": [{"type": "image_url", "image_url": {"url": OPENAI_PNG}}]
})
);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> {