[MCP] Render MCP tool call result images to the model (#5600)

It's pretty amazing we have gotten here without the ability for the
model to see image content from MCP tool calls.

This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get
adequete credit here but I also want to get this fix in ASAP so I gave
him a week to update it and haven't gotten a response so I'm going to
take it across the finish line.


This test highlights how absured the current situation is. I asked the
model to read this image using the Chrome MCP
<img width="2378" height="674" alt="image"
src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0"
/>

After this change, it correctly outputs:
> Captured the page: image dhows a dark terminal-style UI labeled
`OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and
working directory `/codex/codex-rs`
(and more)  

Before this change, it said:
> Took the full-page screenshot you asked for. It shows a long,
horizontally repeating pattern of stylized people in orange, light-blue,
and mustard clothing, holding hands in alternating poses against a white
background. No text or other graphics-just rows of flat illustration
stretching off to the right.

Without this change, the Figma, Playwright, Chrome, and other visual MCP
servers are pretty much entirely useless.

I tested this change with the openai respones api as well as a third
party completions api
This commit is contained in:
Gabriel Peal
2025-10-27 14:55:57 -07:00
committed by GitHub
parent 67a219ffc2
commit b0bdc04c30
24 changed files with 749 additions and 108 deletions

View File

@@ -17,6 +17,7 @@ use crate::util::backoff;
use bytes::Bytes;
use codex_otel::otel_event_manager::OtelEventManager;
use codex_protocol::models::ContentItem;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::ReasoningItemContent;
use codex_protocol::models::ResponseItem;
use eventsource_stream::Eventsource;
@@ -159,16 +160,26 @@ pub(crate) async fn stream_chat_completions(
for (idx, item) in input.iter().enumerate() {
match item {
ResponseItem::Message { role, content, .. } => {
// Build content either as a plain string (typical for assistant text)
// or as an array of content items when images are present (user/tool multimodal).
let mut text = String::new();
let mut items: Vec<serde_json::Value> = Vec::new();
let mut saw_image = false;
for c in content {
match c {
ContentItem::InputText { text: t }
| ContentItem::OutputText { text: t } => {
text.push_str(t);
items.push(json!({"type":"text","text": t}));
}
ContentItem::InputImage { image_url } => {
saw_image = true;
items.push(json!({"type":"image_url","image_url": {"url": image_url}}));
}
_ => {}
}
}
// Skip exact-duplicate assistant messages.
if role == "assistant" {
if let Some(prev) = &last_assistant_text
@@ -179,7 +190,17 @@ pub(crate) async fn stream_chat_completions(
last_assistant_text = Some(text.clone());
}
let mut msg = json!({"role": role, "content": text});
// For assistant messages, always send a plain string for compatibility.
// For user messages, if an image is present, send an array of content items.
let content_value = if role == "assistant" {
json!(text)
} else if saw_image {
json!(items)
} else {
json!(text)
};
let mut msg = json!({"role": role, "content": content_value});
if role == "assistant"
&& let Some(reasoning) = reasoning_by_anchor_index.get(&idx)
&& let Some(obj) = msg.as_object_mut()
@@ -238,10 +259,29 @@ pub(crate) async fn stream_chat_completions(
messages.push(msg);
}
ResponseItem::FunctionCallOutput { call_id, output } => {
// Prefer structured content items when available (e.g., images)
// otherwise fall back to the legacy plain-string content.
let content_value = if let Some(items) = &output.content_items {
let mapped: Vec<serde_json::Value> = items
.iter()
.map(|it| match it {
FunctionCallOutputContentItem::InputText { text } => {
json!({"type":"text","text": text})
}
FunctionCallOutputContentItem::InputImage { image_url } => {
json!({"type":"image_url","image_url": {"url": image_url}})
}
})
.collect();
json!(mapped)
} else {
json!(output.content)
};
messages.push(json!({
"role": "tool",
"tool_call_id": call_id,
"content": output.content,
"content": content_value,
}));
}
ResponseItem::CustomToolCall {

View File

@@ -2047,7 +2047,7 @@ async fn try_run_turn(
call_id: String::new(),
output: FunctionCallOutputPayload {
content: msg.to_string(),
success: None,
..Default::default()
},
};
add_completed(ProcessedResponseItem {
@@ -2061,7 +2061,7 @@ async fn try_run_turn(
call_id: String::new(),
output: FunctionCallOutputPayload {
content: message,
success: None,
..Default::default()
},
};
add_completed(ProcessedResponseItem {
@@ -2199,41 +2199,6 @@ pub(super) fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -
}
})
}
pub(crate) fn convert_call_tool_result_to_function_call_output_payload(
call_tool_result: &CallToolResult,
) -> FunctionCallOutputPayload {
let CallToolResult {
content,
is_error,
structured_content,
} = call_tool_result;
// In terms of what to send back to the model, we prefer structured_content,
// if available, and fallback to content, otherwise.
let mut is_success = is_error != &Some(true);
let content = if let Some(structured_content) = structured_content
&& structured_content != &serde_json::Value::Null
&& let Ok(serialized_structured_content) = serde_json::to_string(&structured_content)
{
serialized_structured_content
} else {
match serde_json::to_string(&content) {
Ok(serialized_content) => serialized_content,
Err(err) => {
// If we could not serialize either content or structured_content to
// JSON, flag this as an error.
is_success = false;
err.to_string()
}
}
};
FunctionCallOutputPayload {
content,
success: Some(is_success),
}
}
/// Emits an ExitedReviewMode Event with optional ReviewOutput,
/// and records a developer message with the review output.
pub(crate) async fn exit_review_mode(
@@ -2439,7 +2404,7 @@ mod tests {
})),
};
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
let got = FunctionCallOutputPayload::from(&ctr);
let expected = FunctionCallOutputPayload {
content: serde_json::to_string(&json!({
"ok": true,
@@ -2447,6 +2412,7 @@ mod tests {
}))
.unwrap(),
success: Some(true),
..Default::default()
};
assert_eq!(expected, got);
@@ -2479,11 +2445,12 @@ mod tests {
structured_content: Some(serde_json::Value::Null),
};
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
let got = FunctionCallOutputPayload::from(&ctr);
let expected = FunctionCallOutputPayload {
content: serde_json::to_string(&vec![text_block("hello"), text_block("world")])
.unwrap(),
success: Some(true),
..Default::default()
};
assert_eq!(expected, got);
@@ -2497,10 +2464,11 @@ mod tests {
structured_content: Some(json!({ "message": "bad" })),
};
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
let got = FunctionCallOutputPayload::from(&ctr);
let expected = FunctionCallOutputPayload {
content: serde_json::to_string(&json!({ "message": "bad" })).unwrap(),
success: Some(false),
..Default::default()
};
assert_eq!(expected, got);
@@ -2514,10 +2482,11 @@ mod tests {
structured_content: None,
};
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
let got = FunctionCallOutputPayload::from(&ctr);
let expected = FunctionCallOutputPayload {
content: serde_json::to_string(&vec![text_block("alpha")]).unwrap(),
success: Some(true),
..Default::default()
};
assert_eq!(expected, got);

View File

@@ -136,7 +136,7 @@ impl ConversationHistory {
call_id: call_id.clone(),
output: FunctionCallOutputPayload {
content: "aborted".to_string(),
success: None,
..Default::default()
},
},
));
@@ -183,7 +183,7 @@ impl ConversationHistory {
call_id: call_id.clone(),
output: FunctionCallOutputPayload {
content: "aborted".to_string(),
success: None,
..Default::default()
},
},
));
@@ -565,7 +565,7 @@ mod tests {
call_id: "call-1".to_string(),
output: FunctionCallOutputPayload {
content: "ok".to_string(),
success: None,
..Default::default()
},
},
];
@@ -581,7 +581,7 @@ mod tests {
call_id: "call-2".to_string(),
output: FunctionCallOutputPayload {
content: "ok".to_string(),
success: None,
..Default::default()
},
},
ResponseItem::FunctionCall {
@@ -615,7 +615,7 @@ mod tests {
call_id: "call-3".to_string(),
output: FunctionCallOutputPayload {
content: "ok".to_string(),
success: None,
..Default::default()
},
},
];
@@ -848,7 +848,7 @@ mod tests {
call_id: "call-x".to_string(),
output: FunctionCallOutputPayload {
content: "aborted".to_string(),
success: None,
..Default::default()
},
},
]
@@ -925,7 +925,7 @@ mod tests {
call_id: "shell-1".to_string(),
output: FunctionCallOutputPayload {
content: "aborted".to_string(),
success: None,
..Default::default()
},
},
]
@@ -939,7 +939,7 @@ mod tests {
call_id: "orphan-1".to_string(),
output: FunctionCallOutputPayload {
content: "ok".to_string(),
success: None,
..Default::default()
},
}];
let mut h = create_history_with_items(items);
@@ -979,7 +979,7 @@ mod tests {
call_id: "c2".to_string(),
output: FunctionCallOutputPayload {
content: "ok".to_string(),
success: None,
..Default::default()
},
},
// Will get an inserted custom tool output
@@ -1021,7 +1021,7 @@ mod tests {
call_id: "c1".to_string(),
output: FunctionCallOutputPayload {
content: "aborted".to_string(),
success: None,
..Default::default()
},
},
ResponseItem::CustomToolCall {
@@ -1051,7 +1051,7 @@ mod tests {
call_id: "s1".to_string(),
output: FunctionCallOutputPayload {
content: "aborted".to_string(),
success: None,
..Default::default()
},
},
]
@@ -1116,7 +1116,7 @@ mod tests {
call_id: "orphan-1".to_string(),
output: FunctionCallOutputPayload {
content: "ok".to_string(),
success: None,
..Default::default()
},
}];
let mut h = create_history_with_items(items);
@@ -1150,7 +1150,7 @@ mod tests {
call_id: "c2".to_string(),
output: FunctionCallOutputPayload {
content: "ok".to_string(),
success: None,
..Default::default()
},
},
ResponseItem::CustomToolCall {

View File

@@ -35,6 +35,7 @@ pub(crate) async fn handle_mcp_tool_call(
output: FunctionCallOutputPayload {
content: format!("err: {e}"),
success: Some(false),
..Default::default()
},
};
}

View File

@@ -61,14 +61,11 @@ pub(crate) async fn process_items(
) => {
items_to_record_in_conversation_history.push(item);
let output = match result {
Ok(call_tool_result) => {
crate::codex::convert_call_tool_result_to_function_call_output_payload(
call_tool_result,
)
}
Ok(call_tool_result) => FunctionCallOutputPayload::from(call_tool_result),
Err(err) => FunctionCallOutputPayload {
content: err.clone(),
success: Some(false),
..Default::default()
},
};
items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {

View File

@@ -5,6 +5,7 @@ use crate::tools::TELEMETRY_PREVIEW_MAX_LINES;
use crate::tools::TELEMETRY_PREVIEW_TRUNCATION_NOTICE;
use crate::turn_diff_tracker::TurnDiffTracker;
use codex_otel::otel_event_manager::OtelEventManager;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ResponseInputItem;
use codex_protocol::models::ShellToolCallParams;
@@ -65,7 +66,10 @@ impl ToolPayload {
#[derive(Clone)]
pub enum ToolOutput {
Function {
// Plain text representation of the tool output.
content: String,
// Some tool calls such as MCP calls may return structured content that can get parsed into an array of polymorphic content items.
content_items: Option<Vec<FunctionCallOutputContentItem>>,
success: Option<bool>,
},
Mcp {
@@ -90,7 +94,11 @@ impl ToolOutput {
pub fn into_response(self, call_id: &str, payload: &ToolPayload) -> ResponseInputItem {
match self {
ToolOutput::Function { content, success } => {
ToolOutput::Function {
content,
content_items,
success,
} => {
if matches!(payload, ToolPayload::Custom { .. }) {
ResponseInputItem::CustomToolCallOutput {
call_id: call_id.to_string(),
@@ -99,7 +107,11 @@ impl ToolOutput {
} else {
ResponseInputItem::FunctionCallOutput {
call_id: call_id.to_string(),
output: FunctionCallOutputPayload { content, success },
output: FunctionCallOutputPayload {
content,
content_items,
success,
},
}
}
}
@@ -163,6 +175,7 @@ mod tests {
};
let response = ToolOutput::Function {
content: "patched".to_string(),
content_items: None,
success: Some(true),
}
.into_response("call-42", &payload);
@@ -183,6 +196,7 @@ mod tests {
};
let response = ToolOutput::Function {
content: "ok".to_string(),
content_items: None,
success: Some(true),
}
.into_response("fn-1", &payload);
@@ -191,6 +205,7 @@ mod tests {
ResponseInputItem::FunctionCallOutput { call_id, output } => {
assert_eq!(call_id, "fn-1");
assert_eq!(output.content, "ok");
assert!(output.content_items.is_none());
assert_eq!(output.success, Some(true));
}
other => panic!("expected FunctionCallOutput, got {other:?}"),

View File

@@ -82,6 +82,7 @@ impl ToolHandler for ApplyPatchHandler {
let content = item?;
Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
})
}
@@ -126,6 +127,7 @@ impl ToolHandler for ApplyPatchHandler {
let content = emitter.finish(event_ctx, out).await?;
Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
})
}

View File

@@ -90,11 +90,13 @@ impl ToolHandler for GrepFilesHandler {
if search_results.is_empty() {
Ok(ToolOutput::Function {
content: "No matches found.".to_string(),
content_items: None,
success: Some(false),
})
} else {
Ok(ToolOutput::Function {
content: search_results.join("\n"),
content_items: None,
success: Some(true),
})
}

View File

@@ -106,6 +106,7 @@ impl ToolHandler for ListDirHandler {
output.extend(entries);
Ok(ToolOutput::Function {
content: output.join("\n"),
content_items: None,
success: Some(true),
})
}

View File

@@ -56,8 +56,16 @@ impl ToolHandler for McpHandler {
Ok(ToolOutput::Mcp { result })
}
codex_protocol::models::ResponseInputItem::FunctionCallOutput { output, .. } => {
let codex_protocol::models::FunctionCallOutputPayload { content, success } = output;
Ok(ToolOutput::Function { content, success })
let codex_protocol::models::FunctionCallOutputPayload {
content,
content_items,
success,
} = output;
Ok(ToolOutput::Function {
content,
content_items,
success,
})
}
_ => Err(FunctionCallError::RespondToModel(
"mcp handler received unexpected response variant".to_string(),

View File

@@ -297,7 +297,10 @@ async fn handle_list_resources(
match payload_result {
Ok(payload) => match serialize_function_output(payload) {
Ok(output) => {
let ToolOutput::Function { content, success } = &output else {
let ToolOutput::Function {
content, success, ..
} = &output
else {
unreachable!("MCP resource handler should return function output");
};
let duration = start.elapsed();
@@ -403,7 +406,10 @@ async fn handle_list_resource_templates(
match payload_result {
Ok(payload) => match serialize_function_output(payload) {
Ok(output) => {
let ToolOutput::Function { content, success } = &output else {
let ToolOutput::Function {
content, success, ..
} = &output
else {
unreachable!("MCP resource handler should return function output");
};
let duration = start.elapsed();
@@ -489,7 +495,10 @@ async fn handle_read_resource(
match payload_result {
Ok(payload) => match serialize_function_output(payload) {
Ok(output) => {
let ToolOutput::Function { content, success } = &output else {
let ToolOutput::Function {
content, success, ..
} = &output
else {
unreachable!("MCP resource handler should return function output");
};
let duration = start.elapsed();
@@ -618,6 +627,7 @@ where
Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
})
}

View File

@@ -88,6 +88,7 @@ impl ToolHandler for PlanHandler {
Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
})
}

View File

@@ -149,6 +149,7 @@ impl ToolHandler for ReadFileHandler {
};
Ok(ToolOutput::Function {
content: collected.join("\n"),
content_items: None,
success: Some(true),
})
}

View File

@@ -136,6 +136,7 @@ impl ShellHandler {
let content = item?;
return Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
});
}
@@ -179,6 +180,7 @@ impl ShellHandler {
let content = emitter.finish(event_ctx, out).await?;
return Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
});
}
@@ -226,6 +228,7 @@ impl ShellHandler {
let content = emitter.finish(event_ctx, out).await?;
Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
})
}

View File

@@ -95,6 +95,7 @@ impl ToolHandler for TestSyncHandler {
Ok(ToolOutput::Function {
content: "ok".to_string(),
content_items: None,
success: Some(true),
})
}

View File

@@ -171,6 +171,7 @@ impl ToolHandler for UnifiedExecHandler {
Ok(ToolOutput::Function {
content,
content_items: None,
success: Some(true),
})
}

View File

@@ -85,6 +85,7 @@ impl ToolHandler for ViewImageHandler {
Ok(ToolOutput::Function {
content: "attached local image path".to_string(),
content_items: None,
success: Some(true),
})
}

View File

@@ -105,7 +105,7 @@ impl ToolCallRuntime {
call_id: call.call_id.clone(),
output: FunctionCallOutputPayload {
content: "aborted".to_string(),
success: None,
..Default::default()
},
},
}

View File

@@ -181,6 +181,7 @@ impl ToolRouter {
output: codex_protocol::models::FunctionCallOutputPayload {
content: message,
success: Some(false),
..Default::default()
},
}
}