fix: introduce ResponseInputItem::McpToolCallOutput variant (#1151)

The output of an MCP server tool call can be one of several types, but
to date, we treated all outputs as text by showing the serialized JSON
as the "tool output" in Codex:


25a9949c49/codex-rs/mcp-types/src/lib.rs (L96-L101)

This PR adds support for the `ImageContent` variant so we can now
display an image output from an MCP tool call.

In making this change, we introduce a new
`ResponseInputItem::McpToolCallOutput` variant so that we can work with
the `mcp_types::CallToolResult` directly when the function call is made
to an MCP server.

Though arguably the more significant change is the introduction of
`HistoryCell::CompletedMcpToolCallWithImageOutput`, which is a cell that
uses `ratatui_image` to render an image into the terminal. To support
this, we introduce `ImageRenderCache`, cache a
`ratatui_image::picker::Picker`, and `ensure_image_cache()` to cache the
appropriate scaled image data and dimensions based on the current
terminal size.

To test, I created a minimal `package.json`:

```json
{
  "name": "kitty-mcp",
  "version": "1.0.0",
  "type": "module",
  "description": "MCP that returns image of kitty",
  "main": "index.js",
  "dependencies": {
    "@modelcontextprotocol/sdk": "^1.12.0"
  }
}
```

with the following `index.js` to define the MCP server:

```js
#!/usr/bin/env node

import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { readFile } from "node:fs/promises";
import { join } from "node:path";

const IMAGE_URI = "image://Ada.png";

const server = new McpServer({
  name: "Demo",
  version: "1.0.0",
});

server.tool(
  "get-cat-image",
  "If you need a cat image, this tool will provide one.",
  async () => ({
    content: [
      { type: "image", data: await getAdaPngBase64(), mimeType: "image/png" },
    ],
  })
);

server.resource("Ada the Cat", IMAGE_URI, async (uri) => {
  const base64Image = await getAdaPngBase64();
  return {
    contents: [
      {
        uri: uri.href,
        mimeType: "image/png",
        blob: base64Image,
      },
    ],
  };
});

async function getAdaPngBase64() {
  const __dirname = new URL(".", import.meta.url).pathname;
  // From 9705ce2c59/assets/Ada.png
  const filePath = join(__dirname, "Ada.png");
  const imageData = await readFile(filePath);
  const base64Image = imageData.toString("base64");
  return base64Image;
}

const transport = new StdioServerTransport();
await server.connect(transport);
```

With the local changes from this PR, I added the following to my
`config.toml`:

```toml
[mcp_servers.kitty]
command = "node"
args = ["/Users/mbolin/code/kitty-mcp/index.js"]
```

Running the TUI from source:

```
cargo run --bin codex -- --model o3 'I need a picture of a cat'
```

I get:

<img width="732" alt="image"
src="https://github.com/user-attachments/assets/bf80b721-9ca0-4d81-aec7-77d6899e2869"
/>

Now, that said, I have only tested in iTerm and there is definitely some
funny business with getting an accurate character-to-pixel ratio
(sometimes the `CompletedMcpToolCallWithImageOutput` thinks it needs 10
rows to render instead of 4), so there is still work to be done here.
This commit is contained in:
Michael Bolin
2025-05-28 19:03:17 -07:00
committed by GitHub
parent 25a9949c49
commit a768a6a41d
9 changed files with 936 additions and 87 deletions

View File

@@ -50,51 +50,18 @@ pub(crate) async fn handle_mcp_tool_call(
notify_mcp_tool_call_event(sess, sub_id, tool_call_begin_event).await;
// Perform the tool call.
let (tool_call_end_event, tool_call_err) = match sess
let result = sess
.call_tool(&server, &tool_name, arguments_value, timeout)
.await
{
Ok(result) => (
EventMsg::McpToolCallEnd(McpToolCallEndEvent {
call_id,
success: !result.is_error.unwrap_or(false),
result: Some(result),
}),
None,
),
Err(e) => (
EventMsg::McpToolCallEnd(McpToolCallEndEvent {
call_id,
success: false,
result: None,
}),
Some(e),
),
};
.map_err(|e| format!("tool call error: {e}"));
let tool_call_end_event = EventMsg::McpToolCallEnd(McpToolCallEndEvent {
call_id: call_id.clone(),
result: result.clone(),
});
notify_mcp_tool_call_event(sess, sub_id, tool_call_end_event.clone()).await;
let EventMsg::McpToolCallEnd(McpToolCallEndEvent {
call_id,
success,
result,
}) = tool_call_end_event
else {
unimplemented!("unexpected event type");
};
ResponseInputItem::FunctionCallOutput {
call_id,
output: FunctionCallOutputPayload {
content: result.map_or_else(
|| format!("err: {tool_call_err:?}"),
|result| {
serde_json::to_string(&result)
.unwrap_or_else(|e| format!("JSON serialization error: {e}"))
},
),
success: Some(success),
},
}
ResponseInputItem::McpToolCallOutput { call_id, result }
}
async fn notify_mcp_tool_call_event(sess: &Session, sub_id: &str, event: EventMsg) {

View File

@@ -1,6 +1,7 @@
use std::collections::HashMap;
use base64::Engine;
use mcp_types::CallToolResult;
use serde::Deserialize;
use serde::Serialize;
use serde::ser::Serializer;
@@ -18,6 +19,10 @@ pub enum ResponseInputItem {
call_id: String,
output: FunctionCallOutputPayload,
},
McpToolCallOutput {
call_id: String,
result: Result<CallToolResult, String>,
},
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -77,6 +82,19 @@ impl From<ResponseInputItem> for ResponseItem {
ResponseInputItem::FunctionCallOutput { call_id, output } => {
Self::FunctionCallOutput { call_id, output }
}
ResponseInputItem::McpToolCallOutput { call_id, result } => Self::FunctionCallOutput {
call_id,
output: FunctionCallOutputPayload {
success: Some(result.is_ok()),
content: result.map_or_else(
|tool_call_err| format!("err: {tool_call_err:?}"),
|result| {
serde_json::to_string(&result)
.unwrap_or_else(|e| format!("JSON serialization error: {e}"))
},
),
},
},
}
}
}

View File

@@ -396,10 +396,17 @@ pub struct McpToolCallBeginEvent {
pub struct McpToolCallEndEvent {
/// Identifier for the corresponding McpToolCallBegin that finished.
pub call_id: String,
/// Whether the tool call was successful. If `false`, `result` might not be present.
pub success: bool,
/// Result of the tool call. Note this could be an error.
pub result: Option<CallToolResult>,
pub result: Result<CallToolResult, String>,
}
impl McpToolCallEndEvent {
pub fn is_success(&self) -> bool {
match &self.result {
Ok(result) => !result.is_error.unwrap_or(false),
Err(_) => false,
}
}
}
#[derive(Debug, Clone, Deserialize, Serialize)]