It's pretty amazing we have gotten this far without the ability for the model to see image content from MCP tool calls. This PR builds off of 4391 and fixes #4819. I would like @KKcorps to get adequate credit here, but I also want to get this fix in ASAP. I gave him a week to update it and haven't gotten a response, so I'm going to take it across the finish line. This test highlights how absurd the current situation is. I asked the model to read this image using the Chrome MCP: <img width="2378" height="674" alt="image" src="https://github.com/user-attachments/assets/9ef52608-72a2-4423-9f5e-7ae36b2b56e0" /> After this change, it correctly outputs: > Captured the page: image shows a dark terminal-style UI labeled `OpenAI Codex (v0.0.0)` with prompt `model: gpt-5-codex medium` and working directory `/codex/codex-rs` (and more) Before this change, it said: > Took the full-page screenshot you asked for. It shows a long, horizontally repeating pattern of stylized people in orange, light-blue, and mustard clothing, holding hands in alternating poses against a white background. No text or other graphics-just rows of flat illustration stretching off to the right. Without this change, the Figma, Playwright, Chrome, and other visual MCP servers are pretty much entirely useless. I tested this change with the OpenAI Responses API as well as a third-party completions API.
112 lines
4.6 KiB
Rust
112 lines
4.6 KiB
Rust
use crate::codex::Session;
|
|
use crate::codex::TurnContext;
|
|
use crate::conversation_history::ConversationHistory;
|
|
use codex_protocol::models::FunctionCallOutputPayload;
|
|
use codex_protocol::models::ResponseInputItem;
|
|
use codex_protocol::models::ResponseItem;
|
|
use tracing::warn;
|
|
|
|
/// Process streamed `ResponseItem`s from the model into the pair of:
|
|
/// - items we should record in conversation history; and
|
|
/// - `ResponseInputItem`s to send back to the model on the next turn.
|
|
pub(crate) async fn process_items(
|
|
processed_items: Vec<crate::codex::ProcessedResponseItem>,
|
|
is_review_mode: bool,
|
|
review_thread_history: &mut ConversationHistory,
|
|
sess: &Session,
|
|
turn_context: &TurnContext,
|
|
) -> (Vec<ResponseInputItem>, Vec<ResponseItem>) {
|
|
let mut items_to_record_in_conversation_history = Vec::<ResponseItem>::new();
|
|
let mut responses = Vec::<ResponseInputItem>::new();
|
|
for processed_response_item in processed_items {
|
|
let crate::codex::ProcessedResponseItem { item, response } = processed_response_item;
|
|
match (&item, &response) {
|
|
(ResponseItem::Message { role, .. }, None) if role == "assistant" => {
|
|
// If the model returned a message, we need to record it.
|
|
items_to_record_in_conversation_history.push(item);
|
|
}
|
|
(
|
|
ResponseItem::LocalShellCall { .. },
|
|
Some(ResponseInputItem::FunctionCallOutput { call_id, output }),
|
|
) => {
|
|
items_to_record_in_conversation_history.push(item);
|
|
items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
|
|
call_id: call_id.clone(),
|
|
output: output.clone(),
|
|
});
|
|
}
|
|
(
|
|
ResponseItem::FunctionCall { .. },
|
|
Some(ResponseInputItem::FunctionCallOutput { call_id, output }),
|
|
) => {
|
|
items_to_record_in_conversation_history.push(item);
|
|
items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
|
|
call_id: call_id.clone(),
|
|
output: output.clone(),
|
|
});
|
|
}
|
|
(
|
|
ResponseItem::CustomToolCall { .. },
|
|
Some(ResponseInputItem::CustomToolCallOutput { call_id, output }),
|
|
) => {
|
|
items_to_record_in_conversation_history.push(item);
|
|
items_to_record_in_conversation_history.push(ResponseItem::CustomToolCallOutput {
|
|
call_id: call_id.clone(),
|
|
output: output.clone(),
|
|
});
|
|
}
|
|
(
|
|
ResponseItem::FunctionCall { .. },
|
|
Some(ResponseInputItem::McpToolCallOutput { call_id, result }),
|
|
) => {
|
|
items_to_record_in_conversation_history.push(item);
|
|
let output = match result {
|
|
Ok(call_tool_result) => FunctionCallOutputPayload::from(call_tool_result),
|
|
Err(err) => FunctionCallOutputPayload {
|
|
content: err.clone(),
|
|
success: Some(false),
|
|
..Default::default()
|
|
},
|
|
};
|
|
items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
|
|
call_id: call_id.clone(),
|
|
output,
|
|
});
|
|
}
|
|
(
|
|
ResponseItem::Reasoning {
|
|
id,
|
|
summary,
|
|
content,
|
|
encrypted_content,
|
|
},
|
|
None,
|
|
) => {
|
|
items_to_record_in_conversation_history.push(ResponseItem::Reasoning {
|
|
id: id.clone(),
|
|
summary: summary.clone(),
|
|
content: content.clone(),
|
|
encrypted_content: encrypted_content.clone(),
|
|
});
|
|
}
|
|
_ => {
|
|
warn!("Unexpected response item: {item:?} with response: {response:?}");
|
|
}
|
|
};
|
|
if let Some(response) = response {
|
|
responses.push(response);
|
|
}
|
|
}
|
|
|
|
// Only attempt to take the lock if there is something to record.
|
|
if !items_to_record_in_conversation_history.is_empty() {
|
|
if is_review_mode {
|
|
review_thread_history.record_items(items_to_record_in_conversation_history.iter());
|
|
} else {
|
|
sess.record_conversation_items(turn_context, &items_to_record_in_conversation_history)
|
|
.await;
|
|
}
|
|
}
|
|
(responses, items_to_record_in_conversation_history)
|
|
}
|