[MCP] Render MCP tool call result images to the model (#5600)
It's pretty amazing we have gotten here without the ability for the model to see image content from MCP tool calls. This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get adequete credit here but I also want to get this fix in ASAP so I gave him a week to update it and haven't gotten a response so I'm going to take it across the finish line. This test highlights how absured the current situation is. I asked the model to read this image using the Chrome MCP <img width="2378" height="674" alt="image" src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0" /> After this change, it correctly outputs: > Captured the page: image dhows a dark terminal-style UI labeled `OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and working directory `/codex/codex-rs` (and more) Before this change, it said: > Took the full-page screenshot you asked for. It shows a long, horizontally repeating pattern of stylized people in orange, light-blue, and mustard clothing, holding hands in alternating poses against a white background. No text or other graphics-just rows of flat illustration stretching off to the right. Without this change, the Figma, Playwright, Chrome, and other visual MCP servers are pretty much entirely useless. I tested this change with the openai respones api as well as a third party completions api
This commit is contained in:
@@ -2047,7 +2047,7 @@ async fn try_run_turn(
|
||||
call_id: String::new(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: msg.to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
add_completed(ProcessedResponseItem {
|
||||
@@ -2061,7 +2061,7 @@ async fn try_run_turn(
|
||||
call_id: String::new(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: message,
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
add_completed(ProcessedResponseItem {
|
||||
@@ -2199,41 +2199,6 @@ pub(super) fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -
|
||||
}
|
||||
})
|
||||
}
|
||||
pub(crate) fn convert_call_tool_result_to_function_call_output_payload(
|
||||
call_tool_result: &CallToolResult,
|
||||
) -> FunctionCallOutputPayload {
|
||||
let CallToolResult {
|
||||
content,
|
||||
is_error,
|
||||
structured_content,
|
||||
} = call_tool_result;
|
||||
|
||||
// In terms of what to send back to the model, we prefer structured_content,
|
||||
// if available, and fallback to content, otherwise.
|
||||
let mut is_success = is_error != &Some(true);
|
||||
let content = if let Some(structured_content) = structured_content
|
||||
&& structured_content != &serde_json::Value::Null
|
||||
&& let Ok(serialized_structured_content) = serde_json::to_string(&structured_content)
|
||||
{
|
||||
serialized_structured_content
|
||||
} else {
|
||||
match serde_json::to_string(&content) {
|
||||
Ok(serialized_content) => serialized_content,
|
||||
Err(err) => {
|
||||
// If we could not serialize either content or structured_content to
|
||||
// JSON, flag this as an error.
|
||||
is_success = false;
|
||||
err.to_string()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
FunctionCallOutputPayload {
|
||||
content,
|
||||
success: Some(is_success),
|
||||
}
|
||||
}
|
||||
|
||||
/// Emits an ExitedReviewMode Event with optional ReviewOutput,
|
||||
/// and records a developer message with the review output.
|
||||
pub(crate) async fn exit_review_mode(
|
||||
@@ -2439,7 +2404,7 @@ mod tests {
|
||||
})),
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&json!({
|
||||
"ok": true,
|
||||
@@ -2447,6 +2412,7 @@ mod tests {
|
||||
}))
|
||||
.unwrap(),
|
||||
success: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
@@ -2479,11 +2445,12 @@ mod tests {
|
||||
structured_content: Some(serde_json::Value::Null),
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&vec![text_block("hello"), text_block("world")])
|
||||
.unwrap(),
|
||||
success: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
@@ -2497,10 +2464,11 @@ mod tests {
|
||||
structured_content: Some(json!({ "message": "bad" })),
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&json!({ "message": "bad" })).unwrap(),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
@@ -2514,10 +2482,11 @@ mod tests {
|
||||
structured_content: None,
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&vec![text_block("alpha")]).unwrap(),
|
||||
success: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
|
||||
Reference in New Issue
Block a user