[MCP] Render MCP tool call result images to the model (#5600)
It's pretty amazing we have gotten here without the ability for the model to see image content from MCP tool calls. This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get adequate credit here, but I also want to get this fix in ASAP, so I gave him a week to update it and haven't gotten a response, so I'm going to take it across the finish line. This test highlights how absurd the current situation is. I asked the model to read this image using the Chrome MCP <img width="2378" height="674" alt="image" src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0" /> After this change, it correctly outputs: > Captured the page: image shows a dark terminal-style UI labeled `OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and working directory `/codex/codex-rs` (and more) Before this change, it said: > Took the full-page screenshot you asked for. It shows a long, horizontally repeating pattern of stylized people in orange, light-blue, and mustard clothing, holding hands in alternating poses against a white background. No text or other graphics-just rows of flat illustration stretching off to the right. Without this change, the Figma, Playwright, Chrome, and other visual MCP servers are pretty much entirely useless. I tested this change with the OpenAI Responses API as well as a third-party completions API.
This commit is contained in:
@@ -5,6 +5,7 @@ use crate::tools::TELEMETRY_PREVIEW_MAX_LINES;
|
||||
use crate::tools::TELEMETRY_PREVIEW_TRUNCATION_NOTICE;
|
||||
use crate::turn_diff_tracker::TurnDiffTracker;
|
||||
use codex_otel::otel_event_manager::OtelEventManager;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use codex_protocol::models::ResponseInputItem;
|
||||
use codex_protocol::models::ShellToolCallParams;
|
||||
@@ -65,7 +66,10 @@ impl ToolPayload {
|
||||
#[derive(Clone)]
|
||||
pub enum ToolOutput {
|
||||
Function {
|
||||
// Plain text representation of the tool output.
|
||||
content: String,
|
||||
// Some tool calls such as MCP calls may return structured content that can get parsed into an array of polymorphic content items.
|
||||
content_items: Option<Vec<FunctionCallOutputContentItem>>,
|
||||
success: Option<bool>,
|
||||
},
|
||||
Mcp {
|
||||
@@ -90,7 +94,11 @@ impl ToolOutput {
|
||||
|
||||
pub fn into_response(self, call_id: &str, payload: &ToolPayload) -> ResponseInputItem {
|
||||
match self {
|
||||
ToolOutput::Function { content, success } => {
|
||||
ToolOutput::Function {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
} => {
|
||||
if matches!(payload, ToolPayload::Custom { .. }) {
|
||||
ResponseInputItem::CustomToolCallOutput {
|
||||
call_id: call_id.to_string(),
|
||||
@@ -99,7 +107,11 @@ impl ToolOutput {
|
||||
} else {
|
||||
ResponseInputItem::FunctionCallOutput {
|
||||
call_id: call_id.to_string(),
|
||||
output: FunctionCallOutputPayload { content, success },
|
||||
output: FunctionCallOutputPayload {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -163,6 +175,7 @@ mod tests {
|
||||
};
|
||||
let response = ToolOutput::Function {
|
||||
content: "patched".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
}
|
||||
.into_response("call-42", &payload);
|
||||
@@ -183,6 +196,7 @@ mod tests {
|
||||
};
|
||||
let response = ToolOutput::Function {
|
||||
content: "ok".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
}
|
||||
.into_response("fn-1", &payload);
|
||||
@@ -191,6 +205,7 @@ mod tests {
|
||||
ResponseInputItem::FunctionCallOutput { call_id, output } => {
|
||||
assert_eq!(call_id, "fn-1");
|
||||
assert_eq!(output.content, "ok");
|
||||
assert!(output.content_items.is_none());
|
||||
assert_eq!(output.success, Some(true));
|
||||
}
|
||||
other => panic!("expected FunctionCallOutput, got {other:?}"),
|
||||
|
||||
@@ -82,6 +82,7 @@ impl ToolHandler for ApplyPatchHandler {
|
||||
let content = item?;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
@@ -126,6 +127,7 @@ impl ToolHandler for ApplyPatchHandler {
|
||||
let content = emitter.finish(event_ctx, out).await?;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -90,11 +90,13 @@ impl ToolHandler for GrepFilesHandler {
|
||||
if search_results.is_empty() {
|
||||
Ok(ToolOutput::Function {
|
||||
content: "No matches found.".to_string(),
|
||||
content_items: None,
|
||||
success: Some(false),
|
||||
})
|
||||
} else {
|
||||
Ok(ToolOutput::Function {
|
||||
content: search_results.join("\n"),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -106,6 +106,7 @@ impl ToolHandler for ListDirHandler {
|
||||
output.extend(entries);
|
||||
Ok(ToolOutput::Function {
|
||||
content: output.join("\n"),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -56,8 +56,16 @@ impl ToolHandler for McpHandler {
|
||||
Ok(ToolOutput::Mcp { result })
|
||||
}
|
||||
codex_protocol::models::ResponseInputItem::FunctionCallOutput { output, .. } => {
|
||||
let codex_protocol::models::FunctionCallOutputPayload { content, success } = output;
|
||||
Ok(ToolOutput::Function { content, success })
|
||||
let codex_protocol::models::FunctionCallOutputPayload {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
} = output;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
})
|
||||
}
|
||||
_ => Err(FunctionCallError::RespondToModel(
|
||||
"mcp handler received unexpected response variant".to_string(),
|
||||
|
||||
@@ -297,7 +297,10 @@ async fn handle_list_resources(
|
||||
match payload_result {
|
||||
Ok(payload) => match serialize_function_output(payload) {
|
||||
Ok(output) => {
|
||||
let ToolOutput::Function { content, success } = &output else {
|
||||
let ToolOutput::Function {
|
||||
content, success, ..
|
||||
} = &output
|
||||
else {
|
||||
unreachable!("MCP resource handler should return function output");
|
||||
};
|
||||
let duration = start.elapsed();
|
||||
@@ -403,7 +406,10 @@ async fn handle_list_resource_templates(
|
||||
match payload_result {
|
||||
Ok(payload) => match serialize_function_output(payload) {
|
||||
Ok(output) => {
|
||||
let ToolOutput::Function { content, success } = &output else {
|
||||
let ToolOutput::Function {
|
||||
content, success, ..
|
||||
} = &output
|
||||
else {
|
||||
unreachable!("MCP resource handler should return function output");
|
||||
};
|
||||
let duration = start.elapsed();
|
||||
@@ -489,7 +495,10 @@ async fn handle_read_resource(
|
||||
match payload_result {
|
||||
Ok(payload) => match serialize_function_output(payload) {
|
||||
Ok(output) => {
|
||||
let ToolOutput::Function { content, success } = &output else {
|
||||
let ToolOutput::Function {
|
||||
content, success, ..
|
||||
} = &output
|
||||
else {
|
||||
unreachable!("MCP resource handler should return function output");
|
||||
};
|
||||
let duration = start.elapsed();
|
||||
@@ -618,6 +627,7 @@ where
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -88,6 +88,7 @@ impl ToolHandler for PlanHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -149,6 +149,7 @@ impl ToolHandler for ReadFileHandler {
|
||||
};
|
||||
Ok(ToolOutput::Function {
|
||||
content: collected.join("\n"),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -136,6 +136,7 @@ impl ShellHandler {
|
||||
let content = item?;
|
||||
return Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
});
|
||||
}
|
||||
@@ -179,6 +180,7 @@ impl ShellHandler {
|
||||
let content = emitter.finish(event_ctx, out).await?;
|
||||
return Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
});
|
||||
}
|
||||
@@ -226,6 +228,7 @@ impl ShellHandler {
|
||||
let content = emitter.finish(event_ctx, out).await?;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -95,6 +95,7 @@ impl ToolHandler for TestSyncHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content: "ok".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -171,6 +171,7 @@ impl ToolHandler for UnifiedExecHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -85,6 +85,7 @@ impl ToolHandler for ViewImageHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content: "attached local image path".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ impl ToolCallRuntime {
|
||||
call_id: call.call_id.clone(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -181,6 +181,7 @@ impl ToolRouter {
|
||||
output: codex_protocol::models::FunctionCallOutputPayload {
|
||||
content: message,
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user