[MCP] Render MCP tool call result images to the model (#5600)

It's pretty amazing we have gotten here without the ability for the model to see image content from MCP tool calls. This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get adequete credit here but I also want to get this fix in ASAP so I gave him a week to update it and haven't gotten a response so I'm going to take it across the finish line. This test highlights how absured the current situation is. I asked the model to read this image using the Chrome MCP <img width="2378" height="674" alt="image" src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0" /> After this change, it correctly outputs: > Captured the page: image dhows a dark terminal-style UI labeled `OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and working directory `/codex/codex-rs` (and more) Before this change, it said: > Took the full-page screenshot you asked for. It shows a long, horizontally repeating pattern of stylized people in orange, light-blue, and mustard clothing, holding hands in alternating poses against a white background. No text or other graphics-just rows of flat illustration stretching off to the right. Without this change, the Figma, Playwright, Chrome, and other visual MCP servers are pretty much entirely useless. I tested this change with the openai respones api as well as a third party completions api
2025-10-27 14:55:57 -07:00
parent 67a219ffc2
commit b0bdc04c30
24 changed files with 749 additions and 108 deletions
--- a/codex-rs/core/src/chat_completions.rs
+++ b/codex-rs/core/src/chat_completions.rs
@@ -17,6 +17,7 @@ use crate::util::backoff;
 use bytes::Bytes;
 use codex_otel::otel_event_manager::OtelEventManager;
 use codex_protocol::models::ContentItem;
+use codex_protocol::models::FunctionCallOutputContentItem;
 use codex_protocol::models::ReasoningItemContent;
 use codex_protocol::models::ResponseItem;
 use eventsource_stream::Eventsource;
@@ -159,16 +160,26 @@ pub(crate) async fn stream_chat_completions(
    for (idx, item) in input.iter().enumerate() {
        match item {
            ResponseItem::Message { role, content, .. } => {
+                // Build content either as a plain string (typical for assistant text)
+                // or as an array of content items when images are present (user/tool multimodal).
                let mut text = String::new();
+                let mut items: Vec<serde_json::Value> = Vec::new();
+                let mut saw_image = false;
+
                for c in content {
                    match c {
                        ContentItem::InputText { text: t }
                        | ContentItem::OutputText { text: t } => {
                            text.push_str(t);
+                            items.push(json!({"type":"text","text": t}));
+                        }
+                        ContentItem::InputImage { image_url } => {
+                            saw_image = true;
+                            items.push(json!({"type":"image_url","image_url": {"url": image_url}}));
                        }
-                        _ => {}
                    }
                }
+
                // Skip exact-duplicate assistant messages.
                if role == "assistant" {
                    if let Some(prev) = &last_assistant_text
@@ -179,7 +190,17 @@ pub(crate) async fn stream_chat_completions(
                    last_assistant_text = Some(text.clone());
                }

-                let mut msg = json!({"role": role, "content": text});
+                // For assistant messages, always send a plain string for compatibility.
+                // For user messages, if an image is present, send an array of content items.
+                let content_value = if role == "assistant" {
+                    json!(text)
+                } else if saw_image {
+                    json!(items)
+                } else {
+                    json!(text)
+                };
+
+                let mut msg = json!({"role": role, "content": content_value});
                if role == "assistant"
                    && let Some(reasoning) = reasoning_by_anchor_index.get(&idx)
                    && let Some(obj) = msg.as_object_mut()
@@ -238,10 +259,29 @@ pub(crate) async fn stream_chat_completions(
                messages.push(msg);
            }
            ResponseItem::FunctionCallOutput { call_id, output } => {
+                // Prefer structured content items when available (e.g., images)
+                // otherwise fall back to the legacy plain-string content.
+                let content_value = if let Some(items) = &output.content_items {
+                    let mapped: Vec<serde_json::Value> = items
+                        .iter()
+                        .map(|it| match it {
+                            FunctionCallOutputContentItem::InputText { text } => {
+                                json!({"type":"text","text": text})
+                            }
+                            FunctionCallOutputContentItem::InputImage { image_url } => {
+                                json!({"type":"image_url","image_url": {"url": image_url}})
+                            }
+                        })
+                        .collect();
+                    json!(mapped)
+                } else {
+                    json!(output.content)
+                };
+
                messages.push(json!({
                    "role": "tool",
                    "tool_call_id": call_id,
-                    "content": output.content,
+                    "content": content_value,
                }));
            }
            ResponseItem::CustomToolCall {
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -2047,7 +2047,7 @@ async fn try_run_turn(
                            call_id: String::new(),
                            output: FunctionCallOutputPayload {
                                content: msg.to_string(),
-                                success: None,
+                                ..Default::default()
                            },
                        };
                        add_completed(ProcessedResponseItem {
@@ -2061,7 +2061,7 @@ async fn try_run_turn(
                            call_id: String::new(),
                            output: FunctionCallOutputPayload {
                                content: message,
-                                success: None,
+                                ..Default::default()
                            },
                        };
                        add_completed(ProcessedResponseItem {
@@ -2199,41 +2199,6 @@ pub(super) fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -
        }
    })
 }
-pub(crate) fn convert_call_tool_result_to_function_call_output_payload(
-    call_tool_result: &CallToolResult,
-) -> FunctionCallOutputPayload {
-    let CallToolResult {
-        content,
-        is_error,
-        structured_content,
-    } = call_tool_result;
-
-    // In terms of what to send back to the model, we prefer structured_content,
-    // if available, and fallback to content, otherwise.
-    let mut is_success = is_error != &Some(true);
-    let content = if let Some(structured_content) = structured_content
-        && structured_content != &serde_json::Value::Null
-        && let Ok(serialized_structured_content) = serde_json::to_string(&structured_content)
-    {
-        serialized_structured_content
-    } else {
-        match serde_json::to_string(&content) {
-            Ok(serialized_content) => serialized_content,
-            Err(err) => {
-                // If we could not serialize either content or structured_content to
-                // JSON, flag this as an error.
-                is_success = false;
-                err.to_string()
-            }
-        }
-    };
-
-    FunctionCallOutputPayload {
-        content,
-        success: Some(is_success),
-    }
-}
-
 /// Emits an ExitedReviewMode Event with optional ReviewOutput,
 /// and records a developer message with the review output.
 pub(crate) async fn exit_review_mode(
@@ -2439,7 +2404,7 @@ mod tests {
            })),
        };

-        let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
+        let got = FunctionCallOutputPayload::from(&ctr);
        let expected = FunctionCallOutputPayload {
            content: serde_json::to_string(&json!({
                "ok": true,
@@ -2447,6 +2412,7 @@ mod tests {
            }))
            .unwrap(),
            success: Some(true),
+            ..Default::default()
        };

        assert_eq!(expected, got);
@@ -2479,11 +2445,12 @@ mod tests {
            structured_content: Some(serde_json::Value::Null),
        };

-        let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
+        let got = FunctionCallOutputPayload::from(&ctr);
        let expected = FunctionCallOutputPayload {
            content: serde_json::to_string(&vec![text_block("hello"), text_block("world")])
                .unwrap(),
            success: Some(true),
+            ..Default::default()
        };

        assert_eq!(expected, got);
@@ -2497,10 +2464,11 @@ mod tests {
            structured_content: Some(json!({ "message": "bad" })),
        };

-        let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
+        let got = FunctionCallOutputPayload::from(&ctr);
        let expected = FunctionCallOutputPayload {
            content: serde_json::to_string(&json!({ "message": "bad" })).unwrap(),
            success: Some(false),
+            ..Default::default()
        };

        assert_eq!(expected, got);
@@ -2514,10 +2482,11 @@ mod tests {
            structured_content: None,
        };

-        let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
+        let got = FunctionCallOutputPayload::from(&ctr);
        let expected = FunctionCallOutputPayload {
            content: serde_json::to_string(&vec![text_block("alpha")]).unwrap(),
            success: Some(true),
+            ..Default::default()
        };

        assert_eq!(expected, got);
--- a/codex-rs/core/src/conversation_history.rs
+++ b/codex-rs/core/src/conversation_history.rs
@@ -136,7 +136,7 @@ impl ConversationHistory {
                                call_id: call_id.clone(),
                                output: FunctionCallOutputPayload {
                                    content: "aborted".to_string(),
-                                    success: None,
+                                    ..Default::default()
                                },
                            },
                        ));
@@ -183,7 +183,7 @@ impl ConversationHistory {
                                    call_id: call_id.clone(),
                                    output: FunctionCallOutputPayload {
                                        content: "aborted".to_string(),
-                                        success: None,
+                                        ..Default::default()
                                    },
                                },
                            ));
@@ -565,7 +565,7 @@ mod tests {
                call_id: "call-1".to_string(),
                output: FunctionCallOutputPayload {
                    content: "ok".to_string(),
-                    success: None,
+                    ..Default::default()
                },
            },
        ];
@@ -581,7 +581,7 @@ mod tests {
                call_id: "call-2".to_string(),
                output: FunctionCallOutputPayload {
                    content: "ok".to_string(),
-                    success: None,
+                    ..Default::default()
                },
            },
            ResponseItem::FunctionCall {
@@ -615,7 +615,7 @@ mod tests {
                call_id: "call-3".to_string(),
                output: FunctionCallOutputPayload {
                    content: "ok".to_string(),
-                    success: None,
+                    ..Default::default()
                },
            },
        ];
@@ -848,7 +848,7 @@ mod tests {
                    call_id: "call-x".to_string(),
                    output: FunctionCallOutputPayload {
                        content: "aborted".to_string(),
-                        success: None,
+                        ..Default::default()
                    },
                },
            ]
@@ -925,7 +925,7 @@ mod tests {
                    call_id: "shell-1".to_string(),
                    output: FunctionCallOutputPayload {
                        content: "aborted".to_string(),
-                        success: None,
+                        ..Default::default()
                    },
                },
            ]
@@ -939,7 +939,7 @@ mod tests {
            call_id: "orphan-1".to_string(),
            output: FunctionCallOutputPayload {
                content: "ok".to_string(),
-                success: None,
+                ..Default::default()
            },
        }];
        let mut h = create_history_with_items(items);
@@ -979,7 +979,7 @@ mod tests {
                call_id: "c2".to_string(),
                output: FunctionCallOutputPayload {
                    content: "ok".to_string(),
-                    success: None,
+                    ..Default::default()
                },
            },
            // Will get an inserted custom tool output
@@ -1021,7 +1021,7 @@ mod tests {
                    call_id: "c1".to_string(),
                    output: FunctionCallOutputPayload {
                        content: "aborted".to_string(),
-                        success: None,
+                        ..Default::default()
                    },
                },
                ResponseItem::CustomToolCall {
@@ -1051,7 +1051,7 @@ mod tests {
                    call_id: "s1".to_string(),
                    output: FunctionCallOutputPayload {
                        content: "aborted".to_string(),
-                        success: None,
+                        ..Default::default()
                    },
                },
            ]
@@ -1116,7 +1116,7 @@ mod tests {
            call_id: "orphan-1".to_string(),
            output: FunctionCallOutputPayload {
                content: "ok".to_string(),
-                success: None,
+                ..Default::default()
            },
        }];
        let mut h = create_history_with_items(items);
@@ -1150,7 +1150,7 @@ mod tests {
                call_id: "c2".to_string(),
                output: FunctionCallOutputPayload {
                    content: "ok".to_string(),
-                    success: None,
+                    ..Default::default()
                },
            },
            ResponseItem::CustomToolCall {
--- a/codex-rs/core/src/mcp_tool_call.rs
+++ b/codex-rs/core/src/mcp_tool_call.rs
@@ -35,6 +35,7 @@ pub(crate) async fn handle_mcp_tool_call(
                    output: FunctionCallOutputPayload {
                        content: format!("err: {e}"),
                        success: Some(false),
+                        ..Default::default()
                    },
                };
            }
--- a/codex-rs/core/src/response_processing.rs
+++ b/codex-rs/core/src/response_processing.rs
@@ -61,14 +61,11 @@ pub(crate) async fn process_items(
            ) => {
                items_to_record_in_conversation_history.push(item);
                let output = match result {
-                    Ok(call_tool_result) => {
-                        crate::codex::convert_call_tool_result_to_function_call_output_payload(
-                            call_tool_result,
-                        )
-                    }
+                    Ok(call_tool_result) => FunctionCallOutputPayload::from(call_tool_result),
                    Err(err) => FunctionCallOutputPayload {
                        content: err.clone(),
                        success: Some(false),
+                        ..Default::default()
                    },
                };
                items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
--- a/codex-rs/core/src/tools/context.rs
+++ b/codex-rs/core/src/tools/context.rs
@@ -5,6 +5,7 @@ use crate::tools::TELEMETRY_PREVIEW_MAX_LINES;
 use crate::tools::TELEMETRY_PREVIEW_TRUNCATION_NOTICE;
 use crate::turn_diff_tracker::TurnDiffTracker;
 use codex_otel::otel_event_manager::OtelEventManager;
+use codex_protocol::models::FunctionCallOutputContentItem;
 use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ResponseInputItem;
 use codex_protocol::models::ShellToolCallParams;
@@ -65,7 +66,10 @@ impl ToolPayload {
 #[derive(Clone)]
 pub enum ToolOutput {
    Function {
+        // Plain text representation of the tool output.
        content: String,
+        // Some tool calls such as MCP calls may return structured content that can get parsed into an array of polymorphic content items.
+        content_items: Option<Vec<FunctionCallOutputContentItem>>,
        success: Option<bool>,
    },
    Mcp {
@@ -90,7 +94,11 @@ impl ToolOutput {

    pub fn into_response(self, call_id: &str, payload: &ToolPayload) -> ResponseInputItem {
        match self {
-            ToolOutput::Function { content, success } => {
+            ToolOutput::Function {
+                content,
+                content_items,
+                success,
+            } => {
                if matches!(payload, ToolPayload::Custom { .. }) {
                    ResponseInputItem::CustomToolCallOutput {
                        call_id: call_id.to_string(),
@@ -99,7 +107,11 @@ impl ToolOutput {
                } else {
                    ResponseInputItem::FunctionCallOutput {
                        call_id: call_id.to_string(),
-                        output: FunctionCallOutputPayload { content, success },
+                        output: FunctionCallOutputPayload {
+                            content,
+                            content_items,
+                            success,
+                        },
                    }
                }
            }
@@ -163,6 +175,7 @@ mod tests {
        };
        let response = ToolOutput::Function {
            content: "patched".to_string(),
+            content_items: None,
            success: Some(true),
        }
        .into_response("call-42", &payload);
@@ -183,6 +196,7 @@ mod tests {
        };
        let response = ToolOutput::Function {
            content: "ok".to_string(),
+            content_items: None,
            success: Some(true),
        }
        .into_response("fn-1", &payload);
@@ -191,6 +205,7 @@ mod tests {
            ResponseInputItem::FunctionCallOutput { call_id, output } => {
                assert_eq!(call_id, "fn-1");
                assert_eq!(output.content, "ok");
+                assert!(output.content_items.is_none());
                assert_eq!(output.success, Some(true));
            }
            other => panic!("expected FunctionCallOutput, got {other:?}"),
--- a/codex-rs/core/src/tools/handlers/apply_patch.rs
+++ b/codex-rs/core/src/tools/handlers/apply_patch.rs
@@ -82,6 +82,7 @@ impl ToolHandler for ApplyPatchHandler {
                        let content = item?;
                        Ok(ToolOutput::Function {
                            content,
+                            content_items: None,
                            success: Some(true),
                        })
                    }
@@ -126,6 +127,7 @@ impl ToolHandler for ApplyPatchHandler {
                        let content = emitter.finish(event_ctx, out).await?;
                        Ok(ToolOutput::Function {
                            content,
+                            content_items: None,
                            success: Some(true),
                        })
                    }
--- a/codex-rs/core/src/tools/handlers/grep_files.rs
+++ b/codex-rs/core/src/tools/handlers/grep_files.rs
@@ -90,11 +90,13 @@ impl ToolHandler for GrepFilesHandler {
        if search_results.is_empty() {
            Ok(ToolOutput::Function {
                content: "No matches found.".to_string(),
+                content_items: None,
                success: Some(false),
            })
        } else {
            Ok(ToolOutput::Function {
                content: search_results.join("\n"),
+                content_items: None,
                success: Some(true),
            })
        }
--- a/codex-rs/core/src/tools/handlers/list_dir.rs
+++ b/codex-rs/core/src/tools/handlers/list_dir.rs
@@ -106,6 +106,7 @@ impl ToolHandler for ListDirHandler {
        output.extend(entries);
        Ok(ToolOutput::Function {
            content: output.join("\n"),
+            content_items: None,
            success: Some(true),
        })
    }
--- a/codex-rs/core/src/tools/handlers/mcp.rs
+++ b/codex-rs/core/src/tools/handlers/mcp.rs
@@ -56,8 +56,16 @@ impl ToolHandler for McpHandler {
                Ok(ToolOutput::Mcp { result })
            }
            codex_protocol::models::ResponseInputItem::FunctionCallOutput { output, .. } => {
-                let codex_protocol::models::FunctionCallOutputPayload { content, success } = output;
-                Ok(ToolOutput::Function { content, success })
+                let codex_protocol::models::FunctionCallOutputPayload {
+                    content,
+                    content_items,
+                    success,
+                } = output;
+                Ok(ToolOutput::Function {
+                    content,
+                    content_items,
+                    success,
+                })
            }
            _ => Err(FunctionCallError::RespondToModel(
                "mcp handler received unexpected response variant".to_string(),
--- a/codex-rs/core/src/tools/handlers/mcp_resource.rs
+++ b/codex-rs/core/src/tools/handlers/mcp_resource.rs
@@ -297,7 +297,10 @@ async fn handle_list_resources(
    match payload_result {
        Ok(payload) => match serialize_function_output(payload) {
            Ok(output) => {
-                let ToolOutput::Function { content, success } = &output else {
+                let ToolOutput::Function {
+                    content, success, ..
+                } = &output
+                else {
                    unreachable!("MCP resource handler should return function output");
                };
                let duration = start.elapsed();
@@ -403,7 +406,10 @@ async fn handle_list_resource_templates(
    match payload_result {
        Ok(payload) => match serialize_function_output(payload) {
            Ok(output) => {
-                let ToolOutput::Function { content, success } = &output else {
+                let ToolOutput::Function {
+                    content, success, ..
+                } = &output
+                else {
                    unreachable!("MCP resource handler should return function output");
                };
                let duration = start.elapsed();
@@ -489,7 +495,10 @@ async fn handle_read_resource(
    match payload_result {
        Ok(payload) => match serialize_function_output(payload) {
            Ok(output) => {
-                let ToolOutput::Function { content, success } = &output else {
+                let ToolOutput::Function {
+                    content, success, ..
+                } = &output
+                else {
                    unreachable!("MCP resource handler should return function output");
                };
                let duration = start.elapsed();
@@ -618,6 +627,7 @@ where

    Ok(ToolOutput::Function {
        content,
+        content_items: None,
        success: Some(true),
    })
 }
--- a/codex-rs/core/src/tools/handlers/plan.rs
+++ b/codex-rs/core/src/tools/handlers/plan.rs
@@ -88,6 +88,7 @@ impl ToolHandler for PlanHandler {

        Ok(ToolOutput::Function {
            content,
+            content_items: None,
            success: Some(true),
        })
    }
--- a/codex-rs/core/src/tools/handlers/read_file.rs
+++ b/codex-rs/core/src/tools/handlers/read_file.rs
@@ -149,6 +149,7 @@ impl ToolHandler for ReadFileHandler {
        };
        Ok(ToolOutput::Function {
            content: collected.join("\n"),
+            content_items: None,
            success: Some(true),
        })
    }
--- a/codex-rs/core/src/tools/handlers/shell.rs
+++ b/codex-rs/core/src/tools/handlers/shell.rs
@@ -136,6 +136,7 @@ impl ShellHandler {
                        let content = item?;
                        return Ok(ToolOutput::Function {
                            content,
+                            content_items: None,
                            success: Some(true),
                        });
                    }
@@ -179,6 +180,7 @@ impl ShellHandler {
                        let content = emitter.finish(event_ctx, out).await?;
                        return Ok(ToolOutput::Function {
                            content,
+                            content_items: None,
                            success: Some(true),
                        });
                    }
@@ -226,6 +228,7 @@ impl ShellHandler {
        let content = emitter.finish(event_ctx, out).await?;
        Ok(ToolOutput::Function {
            content,
+            content_items: None,
            success: Some(true),
        })
    }
--- a/codex-rs/core/src/tools/handlers/test_sync.rs
+++ b/codex-rs/core/src/tools/handlers/test_sync.rs
@@ -95,6 +95,7 @@ impl ToolHandler for TestSyncHandler {

        Ok(ToolOutput::Function {
            content: "ok".to_string(),
+            content_items: None,
            success: Some(true),
        })
    }
--- a/codex-rs/core/src/tools/handlers/unified_exec.rs
+++ b/codex-rs/core/src/tools/handlers/unified_exec.rs
@@ -171,6 +171,7 @@ impl ToolHandler for UnifiedExecHandler {

        Ok(ToolOutput::Function {
            content,
+            content_items: None,
            success: Some(true),
        })
    }
--- a/codex-rs/core/src/tools/handlers/view_image.rs
+++ b/codex-rs/core/src/tools/handlers/view_image.rs
@@ -85,6 +85,7 @@ impl ToolHandler for ViewImageHandler {

        Ok(ToolOutput::Function {
            content: "attached local image path".to_string(),
+            content_items: None,
            success: Some(true),
        })
    }
--- a/codex-rs/core/src/tools/parallel.rs
+++ b/codex-rs/core/src/tools/parallel.rs
@@ -105,7 +105,7 @@ impl ToolCallRuntime {
                call_id: call.call_id.clone(),
                output: FunctionCallOutputPayload {
                    content: "aborted".to_string(),
-                    success: None,
+                    ..Default::default()
                },
            },
        }
--- a/codex-rs/core/src/tools/router.rs
+++ b/codex-rs/core/src/tools/router.rs
@@ -181,6 +181,7 @@ impl ToolRouter {
                output: codex_protocol::models::FunctionCallOutputPayload {
                    content: message,
                    success: Some(false),
+                    ..Default::default()
                },
            }
        }