fix: introduce ResponseInputItem::McpToolCallOutput variant (#1151)

The output of an MCP server tool call can be one of several types, but to date, we treated all outputs as text by showing the serialized JSON as the "tool output" in Codex: 25a9949c49/codex-rs/mcp-types/src/lib.rs (L96-L101) This PR adds support for the `ImageContent` variant so we can now display an image output from an MCP tool call. In making this change, we introduce a new `ResponseInputItem::McpToolCallOutput` variant so that we can work with the `mcp_types::CallToolResult` directly when the function call is made to an MCP server. Though arguably the more significant change is the introduction of `HistoryCell::CompletedMcpToolCallWithImageOutput`, which is a cell that uses `ratatui_image` to render an image into the terminal. To support this, we introduce `ImageRenderCache`, cache a `ratatui_image::picker::Picker`, and `ensure_image_cache()` to cache the appropriate scaled image data and dimensions based on the current terminal size. To test, I created a minimal `package.json`: ```json { "name": "kitty-mcp", "version": "1.0.0", "type": "module", "description": "MCP that returns image of kitty", "main": "index.js", "dependencies": { "@modelcontextprotocol/sdk": "^1.12.0" } } ``` with the following `index.js` to define the MCP server: ```js #!/usr/bin/env node import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; import { readFile } from "node:fs/promises"; import { join } from "node:path"; const IMAGE_URI = "image://Ada.png"; const server = new McpServer({ name: "Demo", version: "1.0.0", }); server.tool( "get-cat-image", "If you need a cat image, this tool will provide one.", async () => ({ content: [ { type: "image", data: await getAdaPngBase64(), mimeType: "image/png" }, ], }) ); server.resource("Ada the Cat", IMAGE_URI, async (uri) => { const base64Image = await getAdaPngBase64(); return { contents: [ { uri: uri.href, mimeType: "image/png", blob: base64Image, }, ], }; }); async function getAdaPngBase64() { const __dirname = new URL(".", import.meta.url).pathname; // From 9705ce2c59/assets/Ada.png const filePath = join(__dirname, "Ada.png"); const imageData = await readFile(filePath); const base64Image = imageData.toString("base64"); return base64Image; } const transport = new StdioServerTransport(); await server.connect(transport); ``` With the local changes from this PR, I added the following to my `config.toml`: ```toml [mcp_servers.kitty] command = "node" args = ["/Users/mbolin/code/kitty-mcp/index.js"] ``` Running the TUI from source: ``` cargo run --bin codex -- --model o3 'I need a picture of a cat' ``` I get: <img width="732" alt="image" src="https://github.com/user-attachments/assets/bf80b721-9ca0-4d81-aec7-77d6899e2869" /> Now, that said, I have only tested in iTerm and there is definitely some funny business with getting an accurate character-to-pixel ratio (sometimes the `CompletedMcpToolCallWithImageOutput` thinks it needs 10 rows to render instead of 4), so there is still work to be done here.
2025-05-28 19:03:17 -07:00
parent 25a9949c49
commit a768a6a41d
9 changed files with 936 additions and 87 deletions
--- a/codex-rs/core/src/mcp_tool_call.rs
+++ b/codex-rs/core/src/mcp_tool_call.rs
@@ -50,51 +50,18 @@ pub(crate) async fn handle_mcp_tool_call(
    notify_mcp_tool_call_event(sess, sub_id, tool_call_begin_event).await;

    // Perform the tool call.
-    let (tool_call_end_event, tool_call_err) = match sess
+    let result = sess
        .call_tool(&server, &tool_name, arguments_value, timeout)
        .await
-    {
-        Ok(result) => (
-            EventMsg::McpToolCallEnd(McpToolCallEndEvent {
-                call_id,
-                success: !result.is_error.unwrap_or(false),
-                result: Some(result),
-            }),
-            None,
-        ),
-        Err(e) => (
-            EventMsg::McpToolCallEnd(McpToolCallEndEvent {
-                call_id,
-                success: false,
-                result: None,
-            }),
-            Some(e),
-        ),
-    };
+        .map_err(|e| format!("tool call error: {e}"));
+    let tool_call_end_event = EventMsg::McpToolCallEnd(McpToolCallEndEvent {
+        call_id: call_id.clone(),
+        result: result.clone(),
+    });

    notify_mcp_tool_call_event(sess, sub_id, tool_call_end_event.clone()).await;
-    let EventMsg::McpToolCallEnd(McpToolCallEndEvent {
-        call_id,
-        success,
-        result,
-    }) = tool_call_end_event
-    else {
-        unimplemented!("unexpected event type");
-    };

-    ResponseInputItem::FunctionCallOutput {
-        call_id,
-        output: FunctionCallOutputPayload {
-            content: result.map_or_else(
-                || format!("err: {tool_call_err:?}"),
-                |result| {
-                    serde_json::to_string(&result)
-                        .unwrap_or_else(|e| format!("JSON serialization error: {e}"))
-                },
-            ),
-            success: Some(success),
-        },
-    }
+    ResponseInputItem::McpToolCallOutput { call_id, result }
 }

 async fn notify_mcp_tool_call_event(sess: &Session, sub_id: &str, event: EventMsg) {
--- a/codex-rs/core/src/models.rs
+++ b/codex-rs/core/src/models.rs
@@ -1,6 +1,7 @@
 use std::collections::HashMap;

 use base64::Engine;
+use mcp_types::CallToolResult;
 use serde::Deserialize;
 use serde::Serialize;
 use serde::ser::Serializer;
@@ -18,6 +19,10 @@ pub enum ResponseInputItem {
        call_id: String,
        output: FunctionCallOutputPayload,
    },
+    McpToolCallOutput {
+        call_id: String,
+        result: Result<CallToolResult, String>,
+    },
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -77,6 +82,19 @@ impl From<ResponseInputItem> for ResponseItem {
            ResponseInputItem::FunctionCallOutput { call_id, output } => {
                Self::FunctionCallOutput { call_id, output }
            }
+            ResponseInputItem::McpToolCallOutput { call_id, result } => Self::FunctionCallOutput {
+                call_id,
+                output: FunctionCallOutputPayload {
+                    success: Some(result.is_ok()),
+                    content: result.map_or_else(
+                        |tool_call_err| format!("err: {tool_call_err:?}"),
+                        |result| {
+                            serde_json::to_string(&result)
+                                .unwrap_or_else(|e| format!("JSON serialization error: {e}"))
+                        },
+                    ),
+                },
+            },
        }
    }
 }
--- a/codex-rs/core/src/protocol.rs
+++ b/codex-rs/core/src/protocol.rs
@@ -396,10 +396,17 @@ pub struct McpToolCallBeginEvent {
 pub struct McpToolCallEndEvent {
    /// Identifier for the corresponding McpToolCallBegin that finished.
    pub call_id: String,
-    /// Whether the tool call was successful. If `false`, `result` might not be present.
-    pub success: bool,
    /// Result of the tool call. Note this could be an error.
-    pub result: Option<CallToolResult>,
+    pub result: Result<CallToolResult, String>,
+}
+
+impl McpToolCallEndEvent {
+    pub fn is_success(&self) -> bool {
+        match &self.result {
+            Ok(result) => !result.is_error.unwrap_or(false),
+            Err(_) => false,
+        }
+    }
 }

 #[derive(Debug, Clone, Deserialize, Serialize)]