[MCP] Render MCP tool call result images to the model (#5600)
It's pretty amazing we have gotten here without the ability for the model to see image content from MCP tool calls. This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get adequete credit here but I also want to get this fix in ASAP so I gave him a week to update it and haven't gotten a response so I'm going to take it across the finish line. This test highlights how absured the current situation is. I asked the model to read this image using the Chrome MCP <img width="2378" height="674" alt="image" src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0" /> After this change, it correctly outputs: > Captured the page: image dhows a dark terminal-style UI labeled `OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and working directory `/codex/codex-rs` (and more) Before this change, it said: > Took the full-page screenshot you asked for. It shows a long, horizontally repeating pattern of stylized people in orange, light-blue, and mustard clothing, holding hands in alternating poses against a white background. No text or other graphics-just rows of flat illustration stretching off to the right. Without this change, the Figma, Playwright, Chrome, and other visual MCP servers are pretty much entirely useless. I tested this change with the openai respones api as well as a third party completions api
This commit is contained in:
@@ -17,6 +17,7 @@ use crate::util::backoff;
|
||||
use bytes::Bytes;
|
||||
use codex_otel::otel_event_manager::OtelEventManager;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::ReasoningItemContent;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use eventsource_stream::Eventsource;
|
||||
@@ -159,16 +160,26 @@ pub(crate) async fn stream_chat_completions(
|
||||
for (idx, item) in input.iter().enumerate() {
|
||||
match item {
|
||||
ResponseItem::Message { role, content, .. } => {
|
||||
// Build content either as a plain string (typical for assistant text)
|
||||
// or as an array of content items when images are present (user/tool multimodal).
|
||||
let mut text = String::new();
|
||||
let mut items: Vec<serde_json::Value> = Vec::new();
|
||||
let mut saw_image = false;
|
||||
|
||||
for c in content {
|
||||
match c {
|
||||
ContentItem::InputText { text: t }
|
||||
| ContentItem::OutputText { text: t } => {
|
||||
text.push_str(t);
|
||||
items.push(json!({"type":"text","text": t}));
|
||||
}
|
||||
ContentItem::InputImage { image_url } => {
|
||||
saw_image = true;
|
||||
items.push(json!({"type":"image_url","image_url": {"url": image_url}}));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip exact-duplicate assistant messages.
|
||||
if role == "assistant" {
|
||||
if let Some(prev) = &last_assistant_text
|
||||
@@ -179,7 +190,17 @@ pub(crate) async fn stream_chat_completions(
|
||||
last_assistant_text = Some(text.clone());
|
||||
}
|
||||
|
||||
let mut msg = json!({"role": role, "content": text});
|
||||
// For assistant messages, always send a plain string for compatibility.
|
||||
// For user messages, if an image is present, send an array of content items.
|
||||
let content_value = if role == "assistant" {
|
||||
json!(text)
|
||||
} else if saw_image {
|
||||
json!(items)
|
||||
} else {
|
||||
json!(text)
|
||||
};
|
||||
|
||||
let mut msg = json!({"role": role, "content": content_value});
|
||||
if role == "assistant"
|
||||
&& let Some(reasoning) = reasoning_by_anchor_index.get(&idx)
|
||||
&& let Some(obj) = msg.as_object_mut()
|
||||
@@ -238,10 +259,29 @@ pub(crate) async fn stream_chat_completions(
|
||||
messages.push(msg);
|
||||
}
|
||||
ResponseItem::FunctionCallOutput { call_id, output } => {
|
||||
// Prefer structured content items when available (e.g., images)
|
||||
// otherwise fall back to the legacy plain-string content.
|
||||
let content_value = if let Some(items) = &output.content_items {
|
||||
let mapped: Vec<serde_json::Value> = items
|
||||
.iter()
|
||||
.map(|it| match it {
|
||||
FunctionCallOutputContentItem::InputText { text } => {
|
||||
json!({"type":"text","text": text})
|
||||
}
|
||||
FunctionCallOutputContentItem::InputImage { image_url } => {
|
||||
json!({"type":"image_url","image_url": {"url": image_url}})
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
json!(mapped)
|
||||
} else {
|
||||
json!(output.content)
|
||||
};
|
||||
|
||||
messages.push(json!({
|
||||
"role": "tool",
|
||||
"tool_call_id": call_id,
|
||||
"content": output.content,
|
||||
"content": content_value,
|
||||
}));
|
||||
}
|
||||
ResponseItem::CustomToolCall {
|
||||
|
||||
@@ -2047,7 +2047,7 @@ async fn try_run_turn(
|
||||
call_id: String::new(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: msg.to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
add_completed(ProcessedResponseItem {
|
||||
@@ -2061,7 +2061,7 @@ async fn try_run_turn(
|
||||
call_id: String::new(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: message,
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
add_completed(ProcessedResponseItem {
|
||||
@@ -2199,41 +2199,6 @@ pub(super) fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -
|
||||
}
|
||||
})
|
||||
}
|
||||
pub(crate) fn convert_call_tool_result_to_function_call_output_payload(
|
||||
call_tool_result: &CallToolResult,
|
||||
) -> FunctionCallOutputPayload {
|
||||
let CallToolResult {
|
||||
content,
|
||||
is_error,
|
||||
structured_content,
|
||||
} = call_tool_result;
|
||||
|
||||
// In terms of what to send back to the model, we prefer structured_content,
|
||||
// if available, and fallback to content, otherwise.
|
||||
let mut is_success = is_error != &Some(true);
|
||||
let content = if let Some(structured_content) = structured_content
|
||||
&& structured_content != &serde_json::Value::Null
|
||||
&& let Ok(serialized_structured_content) = serde_json::to_string(&structured_content)
|
||||
{
|
||||
serialized_structured_content
|
||||
} else {
|
||||
match serde_json::to_string(&content) {
|
||||
Ok(serialized_content) => serialized_content,
|
||||
Err(err) => {
|
||||
// If we could not serialize either content or structured_content to
|
||||
// JSON, flag this as an error.
|
||||
is_success = false;
|
||||
err.to_string()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
FunctionCallOutputPayload {
|
||||
content,
|
||||
success: Some(is_success),
|
||||
}
|
||||
}
|
||||
|
||||
/// Emits an ExitedReviewMode Event with optional ReviewOutput,
|
||||
/// and records a developer message with the review output.
|
||||
pub(crate) async fn exit_review_mode(
|
||||
@@ -2439,7 +2404,7 @@ mod tests {
|
||||
})),
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&json!({
|
||||
"ok": true,
|
||||
@@ -2447,6 +2412,7 @@ mod tests {
|
||||
}))
|
||||
.unwrap(),
|
||||
success: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
@@ -2479,11 +2445,12 @@ mod tests {
|
||||
structured_content: Some(serde_json::Value::Null),
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&vec![text_block("hello"), text_block("world")])
|
||||
.unwrap(),
|
||||
success: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
@@ -2497,10 +2464,11 @@ mod tests {
|
||||
structured_content: Some(json!({ "message": "bad" })),
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&json!({ "message": "bad" })).unwrap(),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
@@ -2514,10 +2482,11 @@ mod tests {
|
||||
structured_content: None,
|
||||
};
|
||||
|
||||
let got = convert_call_tool_result_to_function_call_output_payload(&ctr);
|
||||
let got = FunctionCallOutputPayload::from(&ctr);
|
||||
let expected = FunctionCallOutputPayload {
|
||||
content: serde_json::to_string(&vec![text_block("alpha")]).unwrap(),
|
||||
success: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert_eq!(expected, got);
|
||||
|
||||
@@ -136,7 +136,7 @@ impl ConversationHistory {
|
||||
call_id: call_id.clone(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
));
|
||||
@@ -183,7 +183,7 @@ impl ConversationHistory {
|
||||
call_id: call_id.clone(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
));
|
||||
@@ -565,7 +565,7 @@ mod tests {
|
||||
call_id: "call-1".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
];
|
||||
@@ -581,7 +581,7 @@ mod tests {
|
||||
call_id: "call-2".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
ResponseItem::FunctionCall {
|
||||
@@ -615,7 +615,7 @@ mod tests {
|
||||
call_id: "call-3".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
];
|
||||
@@ -848,7 +848,7 @@ mod tests {
|
||||
call_id: "call-x".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
]
|
||||
@@ -925,7 +925,7 @@ mod tests {
|
||||
call_id: "shell-1".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
]
|
||||
@@ -939,7 +939,7 @@ mod tests {
|
||||
call_id: "orphan-1".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
}];
|
||||
let mut h = create_history_with_items(items);
|
||||
@@ -979,7 +979,7 @@ mod tests {
|
||||
call_id: "c2".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
// Will get an inserted custom tool output
|
||||
@@ -1021,7 +1021,7 @@ mod tests {
|
||||
call_id: "c1".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
ResponseItem::CustomToolCall {
|
||||
@@ -1051,7 +1051,7 @@ mod tests {
|
||||
call_id: "s1".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
]
|
||||
@@ -1116,7 +1116,7 @@ mod tests {
|
||||
call_id: "orphan-1".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
}];
|
||||
let mut h = create_history_with_items(items);
|
||||
@@ -1150,7 +1150,7 @@ mod tests {
|
||||
call_id: "c2".to_string(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
ResponseItem::CustomToolCall {
|
||||
|
||||
@@ -35,6 +35,7 @@ pub(crate) async fn handle_mcp_tool_call(
|
||||
output: FunctionCallOutputPayload {
|
||||
content: format!("err: {e}"),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
@@ -61,14 +61,11 @@ pub(crate) async fn process_items(
|
||||
) => {
|
||||
items_to_record_in_conversation_history.push(item);
|
||||
let output = match result {
|
||||
Ok(call_tool_result) => {
|
||||
crate::codex::convert_call_tool_result_to_function_call_output_payload(
|
||||
call_tool_result,
|
||||
)
|
||||
}
|
||||
Ok(call_tool_result) => FunctionCallOutputPayload::from(call_tool_result),
|
||||
Err(err) => FunctionCallOutputPayload {
|
||||
content: err.clone(),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
|
||||
|
||||
@@ -5,6 +5,7 @@ use crate::tools::TELEMETRY_PREVIEW_MAX_LINES;
|
||||
use crate::tools::TELEMETRY_PREVIEW_TRUNCATION_NOTICE;
|
||||
use crate::turn_diff_tracker::TurnDiffTracker;
|
||||
use codex_otel::otel_event_manager::OtelEventManager;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use codex_protocol::models::ResponseInputItem;
|
||||
use codex_protocol::models::ShellToolCallParams;
|
||||
@@ -65,7 +66,10 @@ impl ToolPayload {
|
||||
#[derive(Clone)]
|
||||
pub enum ToolOutput {
|
||||
Function {
|
||||
// Plain text representation of the tool output.
|
||||
content: String,
|
||||
// Some tool calls such as MCP calls may return structured content that can get parsed into an array of polymorphic content items.
|
||||
content_items: Option<Vec<FunctionCallOutputContentItem>>,
|
||||
success: Option<bool>,
|
||||
},
|
||||
Mcp {
|
||||
@@ -90,7 +94,11 @@ impl ToolOutput {
|
||||
|
||||
pub fn into_response(self, call_id: &str, payload: &ToolPayload) -> ResponseInputItem {
|
||||
match self {
|
||||
ToolOutput::Function { content, success } => {
|
||||
ToolOutput::Function {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
} => {
|
||||
if matches!(payload, ToolPayload::Custom { .. }) {
|
||||
ResponseInputItem::CustomToolCallOutput {
|
||||
call_id: call_id.to_string(),
|
||||
@@ -99,7 +107,11 @@ impl ToolOutput {
|
||||
} else {
|
||||
ResponseInputItem::FunctionCallOutput {
|
||||
call_id: call_id.to_string(),
|
||||
output: FunctionCallOutputPayload { content, success },
|
||||
output: FunctionCallOutputPayload {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -163,6 +175,7 @@ mod tests {
|
||||
};
|
||||
let response = ToolOutput::Function {
|
||||
content: "patched".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
}
|
||||
.into_response("call-42", &payload);
|
||||
@@ -183,6 +196,7 @@ mod tests {
|
||||
};
|
||||
let response = ToolOutput::Function {
|
||||
content: "ok".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
}
|
||||
.into_response("fn-1", &payload);
|
||||
@@ -191,6 +205,7 @@ mod tests {
|
||||
ResponseInputItem::FunctionCallOutput { call_id, output } => {
|
||||
assert_eq!(call_id, "fn-1");
|
||||
assert_eq!(output.content, "ok");
|
||||
assert!(output.content_items.is_none());
|
||||
assert_eq!(output.success, Some(true));
|
||||
}
|
||||
other => panic!("expected FunctionCallOutput, got {other:?}"),
|
||||
|
||||
@@ -82,6 +82,7 @@ impl ToolHandler for ApplyPatchHandler {
|
||||
let content = item?;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
@@ -126,6 +127,7 @@ impl ToolHandler for ApplyPatchHandler {
|
||||
let content = emitter.finish(event_ctx, out).await?;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -90,11 +90,13 @@ impl ToolHandler for GrepFilesHandler {
|
||||
if search_results.is_empty() {
|
||||
Ok(ToolOutput::Function {
|
||||
content: "No matches found.".to_string(),
|
||||
content_items: None,
|
||||
success: Some(false),
|
||||
})
|
||||
} else {
|
||||
Ok(ToolOutput::Function {
|
||||
content: search_results.join("\n"),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -106,6 +106,7 @@ impl ToolHandler for ListDirHandler {
|
||||
output.extend(entries);
|
||||
Ok(ToolOutput::Function {
|
||||
content: output.join("\n"),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -56,8 +56,16 @@ impl ToolHandler for McpHandler {
|
||||
Ok(ToolOutput::Mcp { result })
|
||||
}
|
||||
codex_protocol::models::ResponseInputItem::FunctionCallOutput { output, .. } => {
|
||||
let codex_protocol::models::FunctionCallOutputPayload { content, success } = output;
|
||||
Ok(ToolOutput::Function { content, success })
|
||||
let codex_protocol::models::FunctionCallOutputPayload {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
} = output;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items,
|
||||
success,
|
||||
})
|
||||
}
|
||||
_ => Err(FunctionCallError::RespondToModel(
|
||||
"mcp handler received unexpected response variant".to_string(),
|
||||
|
||||
@@ -297,7 +297,10 @@ async fn handle_list_resources(
|
||||
match payload_result {
|
||||
Ok(payload) => match serialize_function_output(payload) {
|
||||
Ok(output) => {
|
||||
let ToolOutput::Function { content, success } = &output else {
|
||||
let ToolOutput::Function {
|
||||
content, success, ..
|
||||
} = &output
|
||||
else {
|
||||
unreachable!("MCP resource handler should return function output");
|
||||
};
|
||||
let duration = start.elapsed();
|
||||
@@ -403,7 +406,10 @@ async fn handle_list_resource_templates(
|
||||
match payload_result {
|
||||
Ok(payload) => match serialize_function_output(payload) {
|
||||
Ok(output) => {
|
||||
let ToolOutput::Function { content, success } = &output else {
|
||||
let ToolOutput::Function {
|
||||
content, success, ..
|
||||
} = &output
|
||||
else {
|
||||
unreachable!("MCP resource handler should return function output");
|
||||
};
|
||||
let duration = start.elapsed();
|
||||
@@ -489,7 +495,10 @@ async fn handle_read_resource(
|
||||
match payload_result {
|
||||
Ok(payload) => match serialize_function_output(payload) {
|
||||
Ok(output) => {
|
||||
let ToolOutput::Function { content, success } = &output else {
|
||||
let ToolOutput::Function {
|
||||
content, success, ..
|
||||
} = &output
|
||||
else {
|
||||
unreachable!("MCP resource handler should return function output");
|
||||
};
|
||||
let duration = start.elapsed();
|
||||
@@ -618,6 +627,7 @@ where
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -88,6 +88,7 @@ impl ToolHandler for PlanHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -149,6 +149,7 @@ impl ToolHandler for ReadFileHandler {
|
||||
};
|
||||
Ok(ToolOutput::Function {
|
||||
content: collected.join("\n"),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -136,6 +136,7 @@ impl ShellHandler {
|
||||
let content = item?;
|
||||
return Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
});
|
||||
}
|
||||
@@ -179,6 +180,7 @@ impl ShellHandler {
|
||||
let content = emitter.finish(event_ctx, out).await?;
|
||||
return Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
});
|
||||
}
|
||||
@@ -226,6 +228,7 @@ impl ShellHandler {
|
||||
let content = emitter.finish(event_ctx, out).await?;
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -95,6 +95,7 @@ impl ToolHandler for TestSyncHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content: "ok".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -171,6 +171,7 @@ impl ToolHandler for UnifiedExecHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content,
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -85,6 +85,7 @@ impl ToolHandler for ViewImageHandler {
|
||||
|
||||
Ok(ToolOutput::Function {
|
||||
content: "attached local image path".to_string(),
|
||||
content_items: None,
|
||||
success: Some(true),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ impl ToolCallRuntime {
|
||||
call_id: call.call_id.clone(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "aborted".to_string(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -181,6 +181,7 @@ impl ToolRouter {
|
||||
output: codex_protocol::models::FunctionCallOutputPayload {
|
||||
content: message,
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,8 @@ use codex_core::features::Feature;
|
||||
|
||||
use codex_core::protocol::AskForApproval;
|
||||
use codex_core::protocol::EventMsg;
|
||||
use codex_core::protocol::McpInvocation;
|
||||
use codex_core::protocol::McpToolCallBeginEvent;
|
||||
use codex_core::protocol::Op;
|
||||
use codex_core::protocol::SandboxPolicy;
|
||||
use codex_protocol::config_types::ReasoningSummary;
|
||||
@@ -25,7 +27,9 @@ use core_test_support::test_codex::test_codex;
|
||||
use core_test_support::wait_for_event;
|
||||
use core_test_support::wait_for_event_with_timeout;
|
||||
use escargot::CargoBuild;
|
||||
use mcp_types::ContentBlock;
|
||||
use serde_json::Value;
|
||||
use serde_json::json;
|
||||
use serial_test::serial;
|
||||
use tempfile::tempdir;
|
||||
use tokio::net::TcpStream;
|
||||
@@ -35,6 +39,8 @@ use tokio::time::Instant;
|
||||
use tokio::time::sleep;
|
||||
use wiremock::matchers::any;
|
||||
|
||||
static OPENAI_PNG: &str = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAD0AAAA9CAYAAAAeYmHpAAAE6klEQVR4Aeyau44UVxCGx1fZsmRLlm3Zoe0XcGQ5cUiCCIgJeS9CHgAhMkISQnIuGQgJEkBcxLW+nqnZ6uqqc+nuWRC7q/P3qetf9e+MtOwyX25O4Nep6JPyop++0qev9HrfgZ+F6r2DuB/vHOrt/UIkqdDHYvujOW6fO7h/CNEI+a5jc+pBR8uy0jVFsziYu5HtfSUk+Io34q921hLNctFSX0gwww+S8wce8K1LfCU+cYW4888aov8NxqvQILUPPReLOrm6zyLxa4i+6VZuFbJo8d1MOHZm+7VUtB/aIvhPWc/3SWg49JcwFLlHxuXKjtyloo+YNhuW3VS+WPBuUEMvCFKjEDVgFBQHXrnazpqiSxNZCkQ1kYiozsbm9Oz7l4i2Il7vGccGNWAc3XosDrZe/9P3ZnMmzHNEQw4smf8RQ87XEAMsC7Az0Au+dgXerfH4+sHvEc0SYGic8WBBUGqFH2gN7yDrazy7m2pbRTeRmU3+MjZmr1h6LJgPbGy23SI6GlYT0brQ71IY8Us4PNQCm+zepSbaD2BY9xCaAsD9IIj/IzFmKMSdHHonwdZATbTnYREf6/VZGER98N9yCWIvXQwXDoDdhZJoT8jwLnJXDB9w4Sb3e6nK5ndzlkTLnP3JBu4LKkbrYrU69gCVceV0JvpyuW1xlsUVngzhwMetn/XamtTORF9IO5YnWNiyeF9zCAfqR3fUW+vZZKLtgP+ts8BmQRBREAdRDhH3o8QuRh/YucNFz2BEjxbRN6LGzphfKmvP6v6QhqIQyZ8XNJ0W0X83MR1PEcJBNO2KC2Z1TW/v244scp9FwRViZxIOBF0Lctk7ZVSavdLvRlV1hz/ysUi9sr8CIcB3nvWBwA93ykTz18eAYxQ6N/K2DkPA1lv3iXCwmDUT7YkjIby9siXueIJj9H+pzSqJ9oIuJWTUgSSt4WO7o/9GGg0viR4VinNRUDoIj34xoCd6pxD3aK3zfdbnx5v1J3ZNNEJsE0sBG7N27ReDrJc4sFxz7dI/ZAbOmmiKvHBitQXpAdR6+F7v+/ol/tOouUV01EeMZQF2BoQDn6dP4XNr+j9GZEtEK1/L8pFw7bd3a53tsTa7WD+054jOFmPg1XBKPQgnqFfmFcy32ZRvjmiIIQTYFvyDxQ8nH8WIwwGwlyDjDznnilYyFr6njrlZwsKkBpO59A7OwgdzPEWRm+G+oeb7IfyNuzjEEVLrOVxJsxvxwF8kmCM6I2QYmJunz4u4TrADpfl7mlbRTWQ7VmrBzh3+C9f6Grc3YoGN9dg/SXFthpRsT6vobfXRs2VBlgBHXVMLHjDNbIZv1sZ9+X3hB09cXdH1JKViyG0+W9bWZDa/r2f9zAFR71sTzGpMSWz2iI4YssWjWo3REy1MDGjdwe5e0dFSiAC1JakBvu4/CUS8Eh6dqHdU0Or0ioY3W5ClSqDXAy7/6SRfgw8vt4I+tbvvNtFT2kVDhY5+IGb1rCqYaXNF08vSALsXCPmt0kQNqJT1p5eI1mkIV/BxCY1z85lOzeFbPBQHURkkPTlwTYK9gTVE25l84IbFFN+YJDHjdpn0gq6mrHht0dkcjbM4UL9283O5p77GN+SPW/QwVB4IUYg7Or+Kp7naR6qktP98LNF2UxWo9yObPIT9KYg+hK4i56no4rfnM0qeyFf6AwAAAP//trwR3wAAAAZJREFUAwBZ0sR75itw5gAAAABJRU5ErkJggg==";
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
#[serial(mcp_test_value)]
|
||||
async fn stdio_server_round_trip() -> anyhow::Result<()> {
|
||||
@@ -175,6 +181,352 @@ async fn stdio_server_round_trip() -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
#[serial(mcp_test_value)]
|
||||
async fn stdio_image_responses_round_trip() -> anyhow::Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
|
||||
let call_id = "img-1";
|
||||
let server_name = "rmcp";
|
||||
let tool_name = format!("mcp__{server_name}__image");
|
||||
|
||||
// First stream: model decides to call the image tool.
|
||||
mount_sse_once_match(
|
||||
&server,
|
||||
any(),
|
||||
responses::sse(vec![
|
||||
responses::ev_response_created("resp-1"),
|
||||
responses::ev_function_call(call_id, &tool_name, "{}"),
|
||||
responses::ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
// Second stream: after tool execution, assistant emits a message and completes.
|
||||
let final_mock = mount_sse_once_match(
|
||||
&server,
|
||||
any(),
|
||||
responses::sse(vec![
|
||||
responses::ev_assistant_message("msg-1", "rmcp image tool completed successfully."),
|
||||
responses::ev_completed("resp-2"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Build the stdio rmcp server and pass the image as data URL so it can construct ImageContent.
|
||||
let rmcp_test_server_bin = CargoBuild::new()
|
||||
.package("codex-rmcp-client")
|
||||
.bin("test_stdio_server")
|
||||
.run()?
|
||||
.path()
|
||||
.to_string_lossy()
|
||||
.into_owned();
|
||||
|
||||
let fixture = test_codex()
|
||||
.with_config(move |config| {
|
||||
config.features.enable(Feature::RmcpClient);
|
||||
config.mcp_servers.insert(
|
||||
server_name.to_string(),
|
||||
McpServerConfig {
|
||||
transport: McpServerTransportConfig::Stdio {
|
||||
command: rmcp_test_server_bin,
|
||||
args: Vec::new(),
|
||||
env: Some(HashMap::from([(
|
||||
"MCP_TEST_IMAGE_DATA_URL".to_string(),
|
||||
OPENAI_PNG.to_string(),
|
||||
)])),
|
||||
env_vars: Vec::new(),
|
||||
cwd: None,
|
||||
},
|
||||
enabled: true,
|
||||
startup_timeout_sec: Some(Duration::from_secs(10)),
|
||||
tool_timeout_sec: None,
|
||||
enabled_tools: None,
|
||||
disabled_tools: None,
|
||||
},
|
||||
);
|
||||
})
|
||||
.build(&server)
|
||||
.await?;
|
||||
let session_model = fixture.session_configured.model.clone();
|
||||
|
||||
fixture
|
||||
.codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "call the rmcp image tool".into(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: fixture.cwd.path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::ReadOnly,
|
||||
model: session_model,
|
||||
effort: None,
|
||||
summary: ReasoningSummary::Auto,
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Wait for tool begin/end and final completion.
|
||||
let begin_event = wait_for_event_with_timeout(
|
||||
&fixture.codex,
|
||||
|ev| matches!(ev, EventMsg::McpToolCallBegin(_)),
|
||||
Duration::from_secs(10),
|
||||
)
|
||||
.await;
|
||||
let EventMsg::McpToolCallBegin(begin) = begin_event else {
|
||||
unreachable!("begin");
|
||||
};
|
||||
assert_eq!(
|
||||
begin,
|
||||
McpToolCallBeginEvent {
|
||||
call_id: call_id.to_string(),
|
||||
invocation: McpInvocation {
|
||||
server: server_name.to_string(),
|
||||
tool: "image".to_string(),
|
||||
arguments: Some(json!({})),
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
let end_event = wait_for_event(&fixture.codex, |ev| {
|
||||
matches!(ev, EventMsg::McpToolCallEnd(_))
|
||||
})
|
||||
.await;
|
||||
let EventMsg::McpToolCallEnd(end) = end_event else {
|
||||
unreachable!("end");
|
||||
};
|
||||
assert_eq!(end.call_id, call_id);
|
||||
assert_eq!(
|
||||
end.invocation,
|
||||
McpInvocation {
|
||||
server: server_name.to_string(),
|
||||
tool: "image".to_string(),
|
||||
arguments: Some(json!({})),
|
||||
}
|
||||
);
|
||||
let result = end.result.expect("rmcp image tool should return success");
|
||||
assert_eq!(result.is_error, Some(false));
|
||||
assert_eq!(result.content.len(), 1);
|
||||
let base64_only = OPENAI_PNG
|
||||
.strip_prefix("data:image/png;base64,")
|
||||
.expect("data url prefix");
|
||||
match &result.content[0] {
|
||||
ContentBlock::ImageContent(img) => {
|
||||
assert_eq!(img.mime_type, "image/png");
|
||||
assert_eq!(img.r#type, "image");
|
||||
assert_eq!(img.data, base64_only);
|
||||
}
|
||||
other => panic!("expected image content, got {other:?}"),
|
||||
}
|
||||
|
||||
wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
||||
|
||||
let output_item = final_mock.single_request().function_call_output(call_id);
|
||||
assert_eq!(
|
||||
output_item,
|
||||
json!({
|
||||
"type": "function_call_output",
|
||||
"call_id": call_id,
|
||||
"output": [{
|
||||
"type": "input_image",
|
||||
"image_url": OPENAI_PNG
|
||||
}]
|
||||
})
|
||||
);
|
||||
server.verify().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
#[serial(mcp_test_value)]
|
||||
async fn stdio_image_completions_round_trip() -> anyhow::Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
|
||||
let call_id = "img-cc-1";
|
||||
let server_name = "rmcp";
|
||||
let tool_name = format!("mcp__{server_name}__image");
|
||||
|
||||
let tool_call = json!({
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": call_id,
|
||||
"type": "function",
|
||||
"function": {"name": tool_name, "arguments": "{}"}
|
||||
}
|
||||
]
|
||||
},
|
||||
"finish_reason": "tool_calls"
|
||||
}
|
||||
]
|
||||
});
|
||||
let sse_tool_call = format!(
|
||||
"data: {}\n\ndata: [DONE]\n\n",
|
||||
serde_json::to_string(&tool_call)?
|
||||
);
|
||||
|
||||
let final_assistant = json!({
|
||||
"choices": [
|
||||
{
|
||||
"delta": {"content": "rmcp image tool completed successfully."},
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
]
|
||||
});
|
||||
let sse_final = format!(
|
||||
"data: {}\n\ndata: [DONE]\n\n",
|
||||
serde_json::to_string(&final_assistant)?
|
||||
);
|
||||
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
struct ChatSeqResponder {
|
||||
num_calls: AtomicUsize,
|
||||
bodies: Vec<String>,
|
||||
}
|
||||
impl wiremock::Respond for ChatSeqResponder {
|
||||
fn respond(&self, _: &wiremock::Request) -> wiremock::ResponseTemplate {
|
||||
let idx = self.num_calls.fetch_add(1, Ordering::SeqCst);
|
||||
match self.bodies.get(idx) {
|
||||
Some(body) => wiremock::ResponseTemplate::new(200)
|
||||
.insert_header("content-type", "text/event-stream")
|
||||
.set_body_string(body.clone()),
|
||||
None => panic!("no chat completion response for index {idx}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let chat_seq = ChatSeqResponder {
|
||||
num_calls: AtomicUsize::new(0),
|
||||
bodies: vec![sse_tool_call, sse_final],
|
||||
};
|
||||
wiremock::Mock::given(wiremock::matchers::method("POST"))
|
||||
.and(wiremock::matchers::path("/v1/chat/completions"))
|
||||
.respond_with(chat_seq)
|
||||
.expect(2)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let rmcp_test_server_bin = CargoBuild::new()
|
||||
.package("codex-rmcp-client")
|
||||
.bin("test_stdio_server")
|
||||
.run()?
|
||||
.path()
|
||||
.to_string_lossy()
|
||||
.into_owned();
|
||||
|
||||
let fixture = test_codex()
|
||||
.with_config(move |config| {
|
||||
config.model_provider.wire_api = codex_core::WireApi::Chat;
|
||||
config.features.enable(Feature::RmcpClient);
|
||||
config.mcp_servers.insert(
|
||||
server_name.to_string(),
|
||||
McpServerConfig {
|
||||
transport: McpServerTransportConfig::Stdio {
|
||||
command: rmcp_test_server_bin,
|
||||
args: Vec::new(),
|
||||
env: Some(HashMap::from([(
|
||||
"MCP_TEST_IMAGE_DATA_URL".to_string(),
|
||||
OPENAI_PNG.to_string(),
|
||||
)])),
|
||||
env_vars: Vec::new(),
|
||||
cwd: None,
|
||||
},
|
||||
enabled: true,
|
||||
startup_timeout_sec: Some(Duration::from_secs(10)),
|
||||
tool_timeout_sec: None,
|
||||
enabled_tools: None,
|
||||
disabled_tools: None,
|
||||
},
|
||||
);
|
||||
})
|
||||
.build(&server)
|
||||
.await?;
|
||||
let session_model = fixture.session_configured.model.clone();
|
||||
|
||||
fixture
|
||||
.codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "call the rmcp image tool".into(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: fixture.cwd.path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::ReadOnly,
|
||||
model: session_model,
|
||||
effort: None,
|
||||
summary: ReasoningSummary::Auto,
|
||||
})
|
||||
.await?;
|
||||
|
||||
let begin_event = wait_for_event_with_timeout(
|
||||
&fixture.codex,
|
||||
|ev| matches!(ev, EventMsg::McpToolCallBegin(_)),
|
||||
Duration::from_secs(10),
|
||||
)
|
||||
.await;
|
||||
let EventMsg::McpToolCallBegin(begin) = begin_event else {
|
||||
unreachable!("begin");
|
||||
};
|
||||
assert_eq!(
|
||||
begin,
|
||||
McpToolCallBeginEvent {
|
||||
call_id: call_id.to_string(),
|
||||
invocation: McpInvocation {
|
||||
server: server_name.to_string(),
|
||||
tool: "image".to_string(),
|
||||
arguments: Some(json!({})),
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
let end_event = wait_for_event(&fixture.codex, |ev| {
|
||||
matches!(ev, EventMsg::McpToolCallEnd(_))
|
||||
})
|
||||
.await;
|
||||
let EventMsg::McpToolCallEnd(end) = end_event else {
|
||||
unreachable!("end");
|
||||
};
|
||||
assert!(end.result.as_ref().is_ok(), "tool call should succeed");
|
||||
|
||||
wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
||||
|
||||
// Chat Completions assertion: the second POST should include a tool role message
|
||||
// with an array `content` containing an item with the expected data URL.
|
||||
let requests = server.received_requests().await.expect("requests captured");
|
||||
assert!(requests.len() >= 2, "expected two chat completion calls");
|
||||
let second = &requests[1];
|
||||
let body: Value = serde_json::from_slice(&second.body)?;
|
||||
let messages = body
|
||||
.get("messages")
|
||||
.and_then(Value::as_array)
|
||||
.cloned()
|
||||
.expect("messages array");
|
||||
let tool_msg = messages
|
||||
.iter()
|
||||
.find(|m| {
|
||||
m.get("role") == Some(&json!("tool")) && m.get("tool_call_id") == Some(&json!(call_id))
|
||||
})
|
||||
.cloned()
|
||||
.expect("tool message present");
|
||||
assert_eq!(
|
||||
tool_msg,
|
||||
json!({
|
||||
"role": "tool",
|
||||
"tool_call_id": call_id,
|
||||
"content": [{"type": "image_url", "image_url": {"url": OPENAI_PNG}}]
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
#[serial(mcp_test_value)]
|
||||
async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> {
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::collections::HashMap;
|
||||
use base64::Engine;
|
||||
use codex_utils_image::load_and_resize_to_fit;
|
||||
use mcp_types::CallToolResult;
|
||||
use mcp_types::ContentBlock;
|
||||
use serde::Deserialize;
|
||||
use serde::Deserializer;
|
||||
use serde::Serialize;
|
||||
@@ -83,9 +84,8 @@ pub enum ResponseItem {
|
||||
// NOTE: The input schema for `function_call_output` objects that clients send to the
|
||||
// OpenAI /v1/responses endpoint is NOT the same shape as the objects the server returns on the
|
||||
// SSE stream. When *sending* we must wrap the string output inside an object that includes a
|
||||
// required `success` boolean. The upstream TypeScript CLI does this implicitly. To ensure we
|
||||
// serialize exactly the expected shape we introduce a dedicated payload struct and flatten it
|
||||
// here.
|
||||
// required `success` boolean. To ensure we serialize exactly the expected shape we introduce
|
||||
// a dedicated payload struct and flatten it here.
|
||||
FunctionCallOutput {
|
||||
call_id: String,
|
||||
output: FunctionCallOutputPayload,
|
||||
@@ -160,19 +160,17 @@ impl From<ResponseInputItem> for ResponseItem {
|
||||
ResponseInputItem::FunctionCallOutput { call_id, output } => {
|
||||
Self::FunctionCallOutput { call_id, output }
|
||||
}
|
||||
ResponseInputItem::McpToolCallOutput { call_id, result } => Self::FunctionCallOutput {
|
||||
call_id,
|
||||
output: FunctionCallOutputPayload {
|
||||
success: Some(result.is_ok()),
|
||||
content: result.map_or_else(
|
||||
|tool_call_err| format!("err: {tool_call_err:?}"),
|
||||
|result| {
|
||||
serde_json::to_string(&result)
|
||||
.unwrap_or_else(|e| format!("JSON serialization error: {e}"))
|
||||
},
|
||||
),
|
||||
},
|
||||
},
|
||||
ResponseInputItem::McpToolCallOutput { call_id, result } => {
|
||||
let output = match result {
|
||||
Ok(result) => FunctionCallOutputPayload::from(&result),
|
||||
Err(tool_call_err) => FunctionCallOutputPayload {
|
||||
content: format!("err: {tool_call_err:?}"),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
Self::FunctionCallOutput { call_id, output }
|
||||
}
|
||||
ResponseInputItem::CustomToolCallOutput { call_id, output } => {
|
||||
Self::CustomToolCallOutput { call_id, output }
|
||||
}
|
||||
@@ -290,31 +288,53 @@ pub struct ShellToolCallParams {
|
||||
pub justification: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, JsonSchema, TS)]
|
||||
/// Responses API compatible content items that can be returned by a tool call.
|
||||
/// This is a subset of ContentItem with the types we support as function call outputs.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema, TS)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum FunctionCallOutputContentItem {
|
||||
// Do not rename, these are serialized and used directly in the responses API.
|
||||
InputText { text: String },
|
||||
// Do not rename, these are serialized and used directly in the responses API.
|
||||
InputImage { image_url: String },
|
||||
}
|
||||
|
||||
/// The payload we send back to OpenAI when reporting a tool call result.
|
||||
///
|
||||
/// `content` preserves the historical plain-string payload so downstream
|
||||
/// integrations (tests, logging, etc.) can keep treating tool output as
|
||||
/// `String`. When an MCP server returns richer data we additionally populate
|
||||
/// `content_items` with the structured form that the Responses/Chat
|
||||
/// Completions APIs understand.
|
||||
#[derive(Debug, Default, Clone, PartialEq, JsonSchema, TS)]
|
||||
pub struct FunctionCallOutputPayload {
|
||||
pub content: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub content_items: Option<Vec<FunctionCallOutputContentItem>>,
|
||||
// TODO(jif) drop this.
|
||||
pub success: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum FunctionCallOutputPayloadSerde {
|
||||
Text(String),
|
||||
Items(Vec<FunctionCallOutputContentItem>),
|
||||
}
|
||||
|
||||
// The Responses API expects two *different* shapes depending on success vs failure:
|
||||
// • success → output is a plain string (no nested object)
|
||||
// • failure → output is an object { content, success:false }
|
||||
// The upstream TypeScript CLI implements this by special‑casing the serialize path.
|
||||
// We replicate that behavior with a manual Serialize impl.
|
||||
|
||||
impl Serialize for FunctionCallOutputPayload {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
// The upstream TypeScript CLI always serializes `output` as a *plain string* regardless
|
||||
// of whether the function call succeeded or failed. The boolean is purely informational
|
||||
// for local bookkeeping and is NOT sent to the OpenAI endpoint. Sending the nested object
|
||||
// form `{ content, success:false }` triggers the 400 we are still seeing. Mirror the JS CLI
|
||||
// exactly: always emit a bare string.
|
||||
|
||||
serializer.serialize_str(&self.content)
|
||||
if let Some(items) = &self.content_items {
|
||||
items.serialize(serializer)
|
||||
} else {
|
||||
serializer.serialize_str(&self.content)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,14 +343,106 @@ impl<'de> Deserialize<'de> for FunctionCallOutputPayload {
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let s = String::deserialize(deserializer)?;
|
||||
Ok(FunctionCallOutputPayload {
|
||||
content: s,
|
||||
success: None,
|
||||
})
|
||||
match FunctionCallOutputPayloadSerde::deserialize(deserializer)? {
|
||||
FunctionCallOutputPayloadSerde::Text(content) => Ok(FunctionCallOutputPayload {
|
||||
content,
|
||||
..Default::default()
|
||||
}),
|
||||
FunctionCallOutputPayloadSerde::Items(items) => {
|
||||
let content = serde_json::to_string(&items).map_err(serde::de::Error::custom)?;
|
||||
Ok(FunctionCallOutputPayload {
|
||||
content,
|
||||
content_items: Some(items),
|
||||
success: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&CallToolResult> for FunctionCallOutputPayload {
|
||||
fn from(call_tool_result: &CallToolResult) -> Self {
|
||||
let CallToolResult {
|
||||
content,
|
||||
structured_content,
|
||||
is_error,
|
||||
} = call_tool_result;
|
||||
|
||||
let is_success = is_error != &Some(true);
|
||||
|
||||
if let Some(structured_content) = structured_content
|
||||
&& !structured_content.is_null()
|
||||
{
|
||||
match serde_json::to_string(structured_content) {
|
||||
Ok(serialized_structured_content) => {
|
||||
return FunctionCallOutputPayload {
|
||||
content: serialized_structured_content,
|
||||
success: Some(is_success),
|
||||
..Default::default()
|
||||
};
|
||||
}
|
||||
Err(err) => {
|
||||
return FunctionCallOutputPayload {
|
||||
content: err.to_string(),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let serialized_content = match serde_json::to_string(content) {
|
||||
Ok(serialized_content) => serialized_content,
|
||||
Err(err) => {
|
||||
return FunctionCallOutputPayload {
|
||||
content: err.to_string(),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
let content_items = convert_content_blocks_to_items(content);
|
||||
|
||||
FunctionCallOutputPayload {
|
||||
content: serialized_content,
|
||||
content_items,
|
||||
success: Some(is_success),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn convert_content_blocks_to_items(
|
||||
blocks: &[ContentBlock],
|
||||
) -> Option<Vec<FunctionCallOutputContentItem>> {
|
||||
let mut saw_image = false;
|
||||
let mut items = Vec::with_capacity(blocks.len());
|
||||
|
||||
for block in blocks {
|
||||
match block {
|
||||
ContentBlock::TextContent(text) => {
|
||||
items.push(FunctionCallOutputContentItem::InputText {
|
||||
text: text.text.clone(),
|
||||
});
|
||||
}
|
||||
ContentBlock::ImageContent(image) => {
|
||||
saw_image = true;
|
||||
// Just in case the content doesn't include a data URL, add it.
|
||||
let image_url = if image.data.starts_with("data:") {
|
||||
image.data.clone()
|
||||
} else {
|
||||
format!("data:{};base64,{}", image.mime_type, image.data)
|
||||
};
|
||||
items.push(FunctionCallOutputContentItem::InputImage { image_url });
|
||||
}
|
||||
// TODO: render audio, resource, and embedded resource content to the model.
|
||||
_ => return None,
|
||||
}
|
||||
}
|
||||
|
||||
if saw_image { Some(items) } else { None }
|
||||
}
|
||||
|
||||
// Implement Display so callers can treat the payload like a plain string when logging or doing
|
||||
// trivial substring checks in tests (existing tests call `.contains()` on the output). Display
|
||||
// returns the raw `content` field.
|
||||
@@ -354,6 +466,8 @@ impl std::ops::Deref for FunctionCallOutputPayload {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use anyhow::Result;
|
||||
use mcp_types::ImageContent;
|
||||
use mcp_types::TextContent;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
@@ -362,7 +476,7 @@ mod tests {
|
||||
call_id: "call1".into(),
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "ok".into(),
|
||||
success: None,
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
@@ -381,6 +495,7 @@ mod tests {
|
||||
output: FunctionCallOutputPayload {
|
||||
content: "bad".into(),
|
||||
success: Some(false),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
@@ -391,6 +506,81 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn serializes_image_outputs_as_array() -> Result<()> {
|
||||
let call_tool_result = CallToolResult {
|
||||
content: vec![
|
||||
ContentBlock::TextContent(TextContent {
|
||||
annotations: None,
|
||||
text: "caption".into(),
|
||||
r#type: "text".into(),
|
||||
}),
|
||||
ContentBlock::ImageContent(ImageContent {
|
||||
annotations: None,
|
||||
data: "BASE64".into(),
|
||||
mime_type: "image/png".into(),
|
||||
r#type: "image".into(),
|
||||
}),
|
||||
],
|
||||
is_error: None,
|
||||
structured_content: None,
|
||||
};
|
||||
|
||||
let payload = FunctionCallOutputPayload::from(&call_tool_result);
|
||||
assert_eq!(payload.success, Some(true));
|
||||
let items = payload.content_items.clone().expect("content items");
|
||||
assert_eq!(
|
||||
items,
|
||||
vec![
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "caption".into(),
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,BASE64".into(),
|
||||
},
|
||||
]
|
||||
);
|
||||
|
||||
let item = ResponseInputItem::FunctionCallOutput {
|
||||
call_id: "call1".into(),
|
||||
output: payload,
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&item)?;
|
||||
let v: serde_json::Value = serde_json::from_str(&json)?;
|
||||
|
||||
let output = v.get("output").expect("output field");
|
||||
assert!(output.is_array(), "expected array output");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deserializes_array_payload_into_items() -> Result<()> {
|
||||
let json = r#"[
|
||||
{"type": "input_text", "text": "note"},
|
||||
{"type": "input_image", "image_url": "data:image/png;base64,XYZ"}
|
||||
]"#;
|
||||
|
||||
let payload: FunctionCallOutputPayload = serde_json::from_str(json)?;
|
||||
|
||||
assert_eq!(payload.success, None);
|
||||
let expected_items = vec![
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "note".into(),
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "data:image/png;base64,XYZ".into(),
|
||||
},
|
||||
];
|
||||
assert_eq!(payload.content_items, Some(expected_items.clone()));
|
||||
|
||||
let expected_content = serde_json::to_string(&expected_items)?;
|
||||
assert_eq!(payload.content, expected_content);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deserialize_shell_tool_call_params() -> Result<()> {
|
||||
let json = r#"{
|
||||
|
||||
@@ -814,7 +814,7 @@ pub struct AgentReasoningDeltaEvent {
|
||||
pub delta: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS, PartialEq)]
|
||||
pub struct McpInvocation {
|
||||
/// Name of the MCP server as defined in the config.
|
||||
pub server: String,
|
||||
@@ -824,14 +824,14 @@ pub struct McpInvocation {
|
||||
pub arguments: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS, PartialEq)]
|
||||
pub struct McpToolCallBeginEvent {
|
||||
/// Identifier so this can be paired with the McpToolCallEnd event.
|
||||
pub call_id: String,
|
||||
pub invocation: McpInvocation,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS, PartialEq)]
|
||||
pub struct McpToolCallEndEvent {
|
||||
/// Identifier for the corresponding McpToolCallBegin that finished.
|
||||
pub call_id: String,
|
||||
|
||||
@@ -40,7 +40,7 @@ pub fn stdio() -> (tokio::io::Stdin, tokio::io::Stdout) {
|
||||
}
|
||||
impl TestToolServer {
|
||||
fn new() -> Self {
|
||||
let tools = vec![Self::echo_tool()];
|
||||
let tools = vec![Self::echo_tool(), Self::image_tool()];
|
||||
let resources = vec![Self::memo_resource()];
|
||||
let resource_templates = vec![Self::memo_template()];
|
||||
Self {
|
||||
@@ -70,6 +70,22 @@ impl TestToolServer {
|
||||
)
|
||||
}
|
||||
|
||||
fn image_tool() -> Tool {
|
||||
#[expect(clippy::expect_used)]
|
||||
let schema: JsonObject = serde_json::from_value(serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"additionalProperties": false
|
||||
}))
|
||||
.expect("image tool schema should deserialize");
|
||||
|
||||
Tool::new(
|
||||
Cow::Borrowed("image"),
|
||||
Cow::Borrowed("Return a single image content block."),
|
||||
Arc::new(schema),
|
||||
)
|
||||
}
|
||||
|
||||
fn memo_resource() -> Resource {
|
||||
let raw = RawResource {
|
||||
uri: MEMO_URI.to_string(),
|
||||
@@ -214,6 +230,35 @@ impl ServerHandler for TestToolServer {
|
||||
meta: None,
|
||||
})
|
||||
}
|
||||
"image" => {
|
||||
// Read a data URL (e.g. data:image/png;base64,AAA...) from env and convert to
|
||||
// an MCP image content block. Tests set MCP_TEST_IMAGE_DATA_URL.
|
||||
let data_url = std::env::var("MCP_TEST_IMAGE_DATA_URL").map_err(|_| {
|
||||
McpError::invalid_params(
|
||||
"missing MCP_TEST_IMAGE_DATA_URL env var for image tool",
|
||||
None,
|
||||
)
|
||||
})?;
|
||||
|
||||
fn parse_data_url(url: &str) -> Option<(String, String)> {
|
||||
let rest = url.strip_prefix("data:")?;
|
||||
let (mime_and_opts, data) = rest.split_once(',')?;
|
||||
let (mime, _opts) =
|
||||
mime_and_opts.split_once(';').unwrap_or((mime_and_opts, ""));
|
||||
Some((mime.to_string(), data.to_string()))
|
||||
}
|
||||
|
||||
let (mime_type, data_b64) = parse_data_url(&data_url).ok_or_else(|| {
|
||||
McpError::invalid_params(
|
||||
format!("invalid data URL for image tool: {data_url}"),
|
||||
None,
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(CallToolResult::success(vec![rmcp::model::Content::image(
|
||||
data_b64, mime_type,
|
||||
)]))
|
||||
}
|
||||
other => Err(McpError::invalid_params(
|
||||
format!("unknown tool: {other}"),
|
||||
None,
|
||||
|
||||
@@ -465,7 +465,7 @@ struct CompletedMcpToolCallWithImageOutput {
|
||||
}
|
||||
impl HistoryCell for CompletedMcpToolCallWithImageOutput {
|
||||
fn display_lines(&self, _width: u16) -> Vec<Line<'static>> {
|
||||
vec!["tool result (image output omitted)".into()]
|
||||
vec!["tool result (image output)".into()]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user