[MCP] Render MCP tool call result images to the model (#5600)

It's pretty amazing we have gotten here without the ability for the
model to see image content from MCP tool calls.

This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get
adequete credit here but I also want to get this fix in ASAP so I gave
him a week to update it and haven't gotten a response so I'm going to
take it across the finish line.


This test highlights how absured the current situation is. I asked the
model to read this image using the Chrome MCP
<img width="2378" height="674" alt="image"
src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0"
/>

After this change, it correctly outputs:
> Captured the page: image dhows a dark terminal-style UI labeled
`OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and
working directory `/codex/codex-rs`
(and more)  

Before this change, it said:
> Took the full-page screenshot you asked for. It shows a long,
horizontally repeating pattern of stylized people in orange, light-blue,
and mustard clothing, holding hands in alternating poses against a white
background. No text or other graphics-just rows of flat illustration
stretching off to the right.

Without this change, the Figma, Playwright, Chrome, and other visual MCP
servers are pretty much entirely useless.

I tested this change with the openai respones api as well as a third
party completions api
This commit is contained in:
Gabriel Peal
2025-10-27 14:55:57 -07:00
committed by GitHub
parent 67a219ffc2
commit b0bdc04c30
24 changed files with 749 additions and 108 deletions

View File

@@ -17,6 +17,7 @@ use crate::util::backoff;
use bytes::Bytes;
use codex_otel::otel_event_manager::OtelEventManager;
use codex_protocol::models::ContentItem;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::ReasoningItemContent;
use codex_protocol::models::ResponseItem;
use eventsource_stream::Eventsource;
@@ -159,16 +160,26 @@ pub(crate) async fn stream_chat_completions(
for (idx, item) in input.iter().enumerate() {
match item {
ResponseItem::Message { role, content, .. } => {
// Build content either as a plain string (typical for assistant text)
// or as an array of content items when images are present (user/tool multimodal).
let mut text = String::new();
let mut items: Vec<serde_json::Value> = Vec::new();
let mut saw_image = false;
for c in content {
match c {
ContentItem::InputText { text: t }
| ContentItem::OutputText { text: t } => {
text.push_str(t);
items.push(json!({"type":"text","text": t}));
}
ContentItem::InputImage { image_url } => {
saw_image = true;
items.push(json!({"type":"image_url","image_url": {"url": image_url}}));
}
_ => {}
}
}
// Skip exact-duplicate assistant messages.
if role == "assistant" {
if let Some(prev) = &last_assistant_text
@@ -179,7 +190,17 @@ pub(crate) async fn stream_chat_completions(
last_assistant_text = Some(text.clone());
}
let mut msg = json!({"role": role, "content": text});
// For assistant messages, always send a plain string for compatibility.
// For user messages, if an image is present, send an array of content items.
let content_value = if role == "assistant" {
json!(text)
} else if saw_image {
json!(items)
} else {
json!(text)
};
let mut msg = json!({"role": role, "content": content_value});
if role == "assistant"
&& let Some(reasoning) = reasoning_by_anchor_index.get(&idx)
&& let Some(obj) = msg.as_object_mut()
@@ -238,10 +259,29 @@ pub(crate) async fn stream_chat_completions(
messages.push(msg);
}
ResponseItem::FunctionCallOutput { call_id, output } => {
// Prefer structured content items when available (e.g., images)
// otherwise fall back to the legacy plain-string content.
let content_value = if let Some(items) = &output.content_items {
let mapped: Vec<serde_json::Value> = items
.iter()
.map(|it| match it {
FunctionCallOutputContentItem::InputText { text } => {
json!({"type":"text","text": text})
}
FunctionCallOutputContentItem::InputImage { image_url } => {
json!({"type":"image_url","image_url": {"url": image_url}})
}
})
.collect();
json!(mapped)
} else {
json!(output.content)
};
messages.push(json!({
"role": "tool",
"tool_call_id": call_id,
"content": output.content,
"content": content_value,
}));
}
ResponseItem::CustomToolCall {