[MCP] Render MCP tool call result images to the model (#5600)

It's pretty amazing we have gotten here without the ability for the model to see image content from MCP tool calls. This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get adequete credit here but I also want to get this fix in ASAP so I gave him a week to update it and haven't gotten a response so I'm going to take it across the finish line. This test highlights how absured the current situation is. I asked the model to read this image using the Chrome MCP <img width="2378" height="674" alt="image" src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0" /> After this change, it correctly outputs: > Captured the page: image dhows a dark terminal-style UI labeled `OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and working directory `/codex/codex-rs` (and more) Before this change, it said: > Took the full-page screenshot you asked for. It shows a long, horizontally repeating pattern of stylized people in orange, light-blue, and mustard clothing, holding hands in alternating poses against a white background. No text or other graphics-just rows of flat illustration stretching off to the right. Without this change, the Figma, Playwright, Chrome, and other visual MCP servers are pretty much entirely useless. I tested this change with the openai respones api as well as a third party completions api
2025-10-27 14:55:57 -07:00
parent 67a219ffc2
commit b0bdc04c30
24 changed files with 749 additions and 108 deletions
--- a/codex-rs/core/tests/suite/rmcp_client.rs
+++ b/codex-rs/core/tests/suite/rmcp_client.rs
@@ -14,6 +14,8 @@ use codex_core::features::Feature;

 use codex_core::protocol::AskForApproval;
 use codex_core::protocol::EventMsg;
+use codex_core::protocol::McpInvocation;
+use codex_core::protocol::McpToolCallBeginEvent;
 use codex_core::protocol::Op;
 use codex_core::protocol::SandboxPolicy;
 use codex_protocol::config_types::ReasoningSummary;
@@ -25,7 +27,9 @@ use core_test_support::test_codex::test_codex;
 use core_test_support::wait_for_event;
 use core_test_support::wait_for_event_with_timeout;
 use escargot::CargoBuild;
+use mcp_types::ContentBlock;
 use serde_json::Value;
+use serde_json::json;
 use serial_test::serial;
 use tempfile::tempdir;
 use tokio::net::TcpStream;
@@ -35,6 +39,8 @@ use tokio::time::Instant;
 use tokio::time::sleep;
 use wiremock::matchers::any;

+static OPENAI_PNG: &str = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAD0AAAA9CAYAAAAeYmHpAAAE6klEQVR4Aeyau44UVxCGx1fZsmRLlm3Zoe0XcGQ5cUiCCIgJeS9CHgAhMkISQnIuGQgJEkBcxLW+nqnZ6uqqc+nuWRC7q/P3qetf9e+MtOwyX25O4Nep6JPyop++0qev9HrfgZ+F6r2DuB/vHOrt/UIkqdDHYvujOW6fO7h/CNEI+a5jc+pBR8uy0jVFsziYu5HtfSUk+Io34q921hLNctFSX0gwww+S8wce8K1LfCU+cYW4888aov8NxqvQILUPPReLOrm6zyLxa4i+6VZuFbJo8d1MOHZm+7VUtB/aIvhPWc/3SWg49JcwFLlHxuXKjtyloo+YNhuW3VS+WPBuUEMvCFKjEDVgFBQHXrnazpqiSxNZCkQ1kYiozsbm9Oz7l4i2Il7vGccGNWAc3XosDrZe/9P3ZnMmzHNEQw4smf8RQ87XEAMsC7Az0Au+dgXerfH4+sHvEc0SYGic8WBBUGqFH2gN7yDrazy7m2pbRTeRmU3+MjZmr1h6LJgPbGy23SI6GlYT0brQ71IY8Us4PNQCm+zepSbaD2BY9xCaAsD9IIj/IzFmKMSdHHonwdZATbTnYREf6/VZGER98N9yCWIvXQwXDoDdhZJoT8jwLnJXDB9w4Sb3e6nK5ndzlkTLnP3JBu4LKkbrYrU69gCVceV0JvpyuW1xlsUVngzhwMetn/XamtTORF9IO5YnWNiyeF9zCAfqR3fUW+vZZKLtgP+ts8BmQRBREAdRDhH3o8QuRh/YucNFz2BEjxbRN6LGzphfKmvP6v6QhqIQyZ8XNJ0W0X83MR1PEcJBNO2KC2Z1TW/v244scp9FwRViZxIOBF0Lctk7ZVSavdLvRlV1hz/ysUi9sr8CIcB3nvWBwA93ykTz18eAYxQ6N/K2DkPA1lv3iXCwmDUT7YkjIby9siXueIJj9H+pzSqJ9oIuJWTUgSSt4WO7o/9GGg0viR4VinNRUDoIj34xoCd6pxD3aK3zfdbnx5v1J3ZNNEJsE0sBG7N27ReDrJc4sFxz7dI/ZAbOmmiKvHBitQXpAdR6+F7v+/ol/tOouUV01EeMZQF2BoQDn6dP4XNr+j9GZEtEK1/L8pFw7bd3a53tsTa7WD+054jOFmPg1XBKPQgnqFfmFcy32ZRvjmiIIQTYFvyDxQ8nH8WIwwGwlyDjDznnilYyFr6njrlZwsKkBpO59A7OwgdzPEWRm+G+oeb7IfyNuzjEEVLrOVxJsxvxwF8kmCM6I2QYmJunz4u4TrADpfl7mlbRTWQ7VmrBzh3+C9f6Grc3YoGN9dg/SXFthpRsT6vobfXRs2VBlgBHXVMLHjDNbIZv1sZ9+X3hB09cXdH1JKViyG0+W9bWZDa/r2f9zAFR71sTzGpMSWz2iI4YssWjWo3REy1MDGjdwe5e0dFSiAC1JakBvu4/CUS8Eh6dqHdU0Or0ioY3W5ClSqDXAy7/6SRfgw8vt4I+tbvvNtFT2kVDhY5+IGb1rCqYaXNF08vSALsXCPmt0kQNqJT1p5eI1mkIV/BxCY1z85lOzeFbPBQHURkkPTlwTYK9gTVE25l84IbFFN+YJDHjdpn0gq6mrHht0dkcjbM4UL9283O5p77GN+SPW/QwVB4IUYg7Or+Kp7naR6qktP98LNF2UxWo9yObPIT9KYg+hK4i56no4rfnM0qeyFf6AwAAAP//trwR3wAAAAZJREFUAwBZ0sR75itw5gAAAABJRU5ErkJggg==";
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
 #[serial(mcp_test_value)]
 async fn stdio_server_round_trip() -> anyhow::Result<()> {
@@ -175,6 +181,352 @@ async fn stdio_server_round_trip() -> anyhow::Result<()> {
    Ok(())
 }

+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+#[serial(mcp_test_value)]
+async fn stdio_image_responses_round_trip() -> anyhow::Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+
+    let call_id = "img-1";
+    let server_name = "rmcp";
+    let tool_name = format!("mcp__{server_name}__image");
+
+    // First stream: model decides to call the image tool.
+    mount_sse_once_match(
+        &server,
+        any(),
+        responses::sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call(call_id, &tool_name, "{}"),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    // Second stream: after tool execution, assistant emits a message and completes.
+    let final_mock = mount_sse_once_match(
+        &server,
+        any(),
+        responses::sse(vec![
+            responses::ev_assistant_message("msg-1", "rmcp image tool completed successfully."),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    // Build the stdio rmcp server and pass the image as data URL so it can construct ImageContent.
+    let rmcp_test_server_bin = CargoBuild::new()
+        .package("codex-rmcp-client")
+        .bin("test_stdio_server")
+        .run()?
+        .path()
+        .to_string_lossy()
+        .into_owned();
+
+    let fixture = test_codex()
+        .with_config(move |config| {
+            config.features.enable(Feature::RmcpClient);
+            config.mcp_servers.insert(
+                server_name.to_string(),
+                McpServerConfig {
+                    transport: McpServerTransportConfig::Stdio {
+                        command: rmcp_test_server_bin,
+                        args: Vec::new(),
+                        env: Some(HashMap::from([(
+                            "MCP_TEST_IMAGE_DATA_URL".to_string(),
+                            OPENAI_PNG.to_string(),
+                        )])),
+                        env_vars: Vec::new(),
+                        cwd: None,
+                    },
+                    enabled: true,
+                    startup_timeout_sec: Some(Duration::from_secs(10)),
+                    tool_timeout_sec: None,
+                    enabled_tools: None,
+                    disabled_tools: None,
+                },
+            );
+        })
+        .build(&server)
+        .await?;
+    let session_model = fixture.session_configured.model.clone();
+
+    fixture
+        .codex
+        .submit(Op::UserTurn {
+            items: vec![UserInput::Text {
+                text: "call the rmcp image tool".into(),
+            }],
+            final_output_json_schema: None,
+            cwd: fixture.cwd.path().to_path_buf(),
+            approval_policy: AskForApproval::Never,
+            sandbox_policy: SandboxPolicy::ReadOnly,
+            model: session_model,
+            effort: None,
+            summary: ReasoningSummary::Auto,
+        })
+        .await?;
+
+    // Wait for tool begin/end and final completion.
+    let begin_event = wait_for_event_with_timeout(
+        &fixture.codex,
+        |ev| matches!(ev, EventMsg::McpToolCallBegin(_)),
+        Duration::from_secs(10),
+    )
+    .await;
+    let EventMsg::McpToolCallBegin(begin) = begin_event else {
+        unreachable!("begin");
+    };
+    assert_eq!(
+        begin,
+        McpToolCallBeginEvent {
+            call_id: call_id.to_string(),
+            invocation: McpInvocation {
+                server: server_name.to_string(),
+                tool: "image".to_string(),
+                arguments: Some(json!({})),
+            },
+        },
+    );
+
+    let end_event = wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallEnd(_))
+    })
+    .await;
+    let EventMsg::McpToolCallEnd(end) = end_event else {
+        unreachable!("end");
+    };
+    assert_eq!(end.call_id, call_id);
+    assert_eq!(
+        end.invocation,
+        McpInvocation {
+            server: server_name.to_string(),
+            tool: "image".to_string(),
+            arguments: Some(json!({})),
+        }
+    );
+    let result = end.result.expect("rmcp image tool should return success");
+    assert_eq!(result.is_error, Some(false));
+    assert_eq!(result.content.len(), 1);
+    let base64_only = OPENAI_PNG
+        .strip_prefix("data:image/png;base64,")
+        .expect("data url prefix");
+    match &result.content[0] {
+        ContentBlock::ImageContent(img) => {
+            assert_eq!(img.mime_type, "image/png");
+            assert_eq!(img.r#type, "image");
+            assert_eq!(img.data, base64_only);
+        }
+        other => panic!("expected image content, got {other:?}"),
+    }
+
+    wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    let output_item = final_mock.single_request().function_call_output(call_id);
+    assert_eq!(
+        output_item,
+        json!({
+            "type": "function_call_output",
+            "call_id": call_id,
+            "output": [{
+                "type": "input_image",
+                "image_url": OPENAI_PNG
+            }]
+        })
+    );
+    server.verify().await;
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+#[serial(mcp_test_value)]
+async fn stdio_image_completions_round_trip() -> anyhow::Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+
+    let call_id = "img-cc-1";
+    let server_name = "rmcp";
+    let tool_name = format!("mcp__{server_name}__image");
+
+    let tool_call = json!({
+        "choices": [
+            {
+                "delta": {
+                    "tool_calls": [
+                        {
+                            "id": call_id,
+                            "type": "function",
+                            "function": {"name": tool_name, "arguments": "{}"}
+                        }
+                    ]
+                },
+                "finish_reason": "tool_calls"
+            }
+        ]
+    });
+    let sse_tool_call = format!(
+        "data: {}\n\ndata: [DONE]\n\n",
+        serde_json::to_string(&tool_call)?
+    );
+
+    let final_assistant = json!({
+        "choices": [
+            {
+                "delta": {"content": "rmcp image tool completed successfully."},
+                "finish_reason": "stop"
+            }
+        ]
+    });
+    let sse_final = format!(
+        "data: {}\n\ndata: [DONE]\n\n",
+        serde_json::to_string(&final_assistant)?
+    );
+
+    use std::sync::atomic::AtomicUsize;
+    use std::sync::atomic::Ordering;
+    struct ChatSeqResponder {
+        num_calls: AtomicUsize,
+        bodies: Vec<String>,
+    }
+    impl wiremock::Respond for ChatSeqResponder {
+        fn respond(&self, _: &wiremock::Request) -> wiremock::ResponseTemplate {
+            let idx = self.num_calls.fetch_add(1, Ordering::SeqCst);
+            match self.bodies.get(idx) {
+                Some(body) => wiremock::ResponseTemplate::new(200)
+                    .insert_header("content-type", "text/event-stream")
+                    .set_body_string(body.clone()),
+                None => panic!("no chat completion response for index {idx}"),
+            }
+        }
+    }
+
+    let chat_seq = ChatSeqResponder {
+        num_calls: AtomicUsize::new(0),
+        bodies: vec![sse_tool_call, sse_final],
+    };
+    wiremock::Mock::given(wiremock::matchers::method("POST"))
+        .and(wiremock::matchers::path("/v1/chat/completions"))
+        .respond_with(chat_seq)
+        .expect(2)
+        .mount(&server)
+        .await;
+
+    let rmcp_test_server_bin = CargoBuild::new()
+        .package("codex-rmcp-client")
+        .bin("test_stdio_server")
+        .run()?
+        .path()
+        .to_string_lossy()
+        .into_owned();
+
+    let fixture = test_codex()
+        .with_config(move |config| {
+            config.model_provider.wire_api = codex_core::WireApi::Chat;
+            config.features.enable(Feature::RmcpClient);
+            config.mcp_servers.insert(
+                server_name.to_string(),
+                McpServerConfig {
+                    transport: McpServerTransportConfig::Stdio {
+                        command: rmcp_test_server_bin,
+                        args: Vec::new(),
+                        env: Some(HashMap::from([(
+                            "MCP_TEST_IMAGE_DATA_URL".to_string(),
+                            OPENAI_PNG.to_string(),
+                        )])),
+                        env_vars: Vec::new(),
+                        cwd: None,
+                    },
+                    enabled: true,
+                    startup_timeout_sec: Some(Duration::from_secs(10)),
+                    tool_timeout_sec: None,
+                    enabled_tools: None,
+                    disabled_tools: None,
+                },
+            );
+        })
+        .build(&server)
+        .await?;
+    let session_model = fixture.session_configured.model.clone();
+
+    fixture
+        .codex
+        .submit(Op::UserTurn {
+            items: vec![UserInput::Text {
+                text: "call the rmcp image tool".into(),
+            }],
+            final_output_json_schema: None,
+            cwd: fixture.cwd.path().to_path_buf(),
+            approval_policy: AskForApproval::Never,
+            sandbox_policy: SandboxPolicy::ReadOnly,
+            model: session_model,
+            effort: None,
+            summary: ReasoningSummary::Auto,
+        })
+        .await?;
+
+    let begin_event = wait_for_event_with_timeout(
+        &fixture.codex,
+        |ev| matches!(ev, EventMsg::McpToolCallBegin(_)),
+        Duration::from_secs(10),
+    )
+    .await;
+    let EventMsg::McpToolCallBegin(begin) = begin_event else {
+        unreachable!("begin");
+    };
+    assert_eq!(
+        begin,
+        McpToolCallBeginEvent {
+            call_id: call_id.to_string(),
+            invocation: McpInvocation {
+                server: server_name.to_string(),
+                tool: "image".to_string(),
+                arguments: Some(json!({})),
+            },
+        },
+    );
+
+    let end_event = wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallEnd(_))
+    })
+    .await;
+    let EventMsg::McpToolCallEnd(end) = end_event else {
+        unreachable!("end");
+    };
+    assert!(end.result.as_ref().is_ok(), "tool call should succeed");
+
+    wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    // Chat Completions assertion: the second POST should include a tool role message
+    // with an array `content` containing an item with the expected data URL.
+    let requests = server.received_requests().await.expect("requests captured");
+    assert!(requests.len() >= 2, "expected two chat completion calls");
+    let second = &requests[1];
+    let body: Value = serde_json::from_slice(&second.body)?;
+    let messages = body
+        .get("messages")
+        .and_then(Value::as_array)
+        .cloned()
+        .expect("messages array");
+    let tool_msg = messages
+        .iter()
+        .find(|m| {
+            m.get("role") == Some(&json!("tool")) && m.get("tool_call_id") == Some(&json!(call_id))
+        })
+        .cloned()
+        .expect("tool message present");
+    assert_eq!(
+        tool_msg,
+        json!({
+            "role": "tool",
+            "tool_call_id": call_id,
+            "content": [{"type": "image_url", "image_url": {"url": OPENAI_PNG}}]
+        })
+    );
+
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
 #[serial(mcp_test_value)]
 async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> {