fix: make McpConnectionManager tolerant of MCPs that fail to start (#854)

I added a typo in my `config.toml` such that the `command` for one of my
`mcp_servers` did not exist and I verified that the error was surfaced
in the TUI (and that I was still able to use Codex).


![image](https://github.com/user-attachments/assets/f13cc08c-f4c6-40ec-9ab4-a9d75e03152f)
This commit is contained in:
Michael Bolin
2025-05-08 23:45:54 -07:00
committed by GitHub
parent b940adae8e
commit 27198bfe11
2 changed files with 56 additions and 20 deletions

View File

@@ -561,15 +561,35 @@ async fn submission_loop(
let writable_roots = Mutex::new(get_writable_roots(&cwd)); let writable_roots = Mutex::new(get_writable_roots(&cwd));
let mcp_connection_manager = // Error messages to dispatch after SessionConfigured is sent.
let mut mcp_connection_errors = Vec::<Event>::new();
let (mcp_connection_manager, failed_clients) =
match McpConnectionManager::new(config.mcp_servers.clone()).await { match McpConnectionManager::new(config.mcp_servers.clone()).await {
Ok(mgr) => mgr, Ok((mgr, failures)) => (mgr, failures),
Err(e) => { Err(e) => {
error!("Failed to create MCP connection manager: {e:#}"); let message = format!("Failed to create MCP connection manager: {e:#}");
McpConnectionManager::default() error!("{message}");
mcp_connection_errors.push(Event {
id: sub.id.clone(),
msg: EventMsg::Error { message },
});
(McpConnectionManager::default(), Default::default())
} }
}; };
// Surface individual client start-up failures to the user.
if !failed_clients.is_empty() {
for (server_name, err) in failed_clients {
let message =
format!("MCP client for `{server_name}` failed to start: {err:#}");
error!("{message}");
mcp_connection_errors.push(Event {
id: sub.id.clone(),
msg: EventMsg::Error { message },
});
}
}
// Attempt to create a RolloutRecorder *before* moving the // Attempt to create a RolloutRecorder *before* moving the
// `instructions` value into the Session struct. // `instructions` value into the Session struct.
let rollout_recorder = match RolloutRecorder::new(instructions.clone()).await { let rollout_recorder = match RolloutRecorder::new(instructions.clone()).await {
@@ -596,12 +616,15 @@ async fn submission_loop(
})); }));
// ack // ack
let event = Event { let events = std::iter::once(Event {
id: sub.id, id: sub.id.clone(),
msg: EventMsg::SessionConfigured { model }, msg: EventMsg::SessionConfigured { model },
}; })
if tx_event.send(event).await.is_err() { .chain(mcp_connection_errors.into_iter());
return; for event in events {
if let Err(e) = tx_event.send(event).await {
error!("failed to send event: {e:?}");
}
} }
} }
Op::UserInput { items } => { Op::UserInput { items } => {

View File

@@ -29,6 +29,10 @@ const MCP_TOOL_NAME_DELIMITER: &str = "__OAI_CODEX_MCP__";
/// Timeout for the `tools/list` request. /// Timeout for the `tools/list` request.
const LIST_TOOLS_TIMEOUT: Duration = Duration::from_secs(10); const LIST_TOOLS_TIMEOUT: Duration = Duration::from_secs(10);
/// Map that holds a startup error for every MCP server that could **not** be
/// spawned successfully.
pub type ClientStartErrors = HashMap<String, anyhow::Error>;
fn fully_qualified_tool_name(server: &str, tool: &str) -> String { fn fully_qualified_tool_name(server: &str, tool: &str) -> String {
format!("{server}{MCP_TOOL_NAME_DELIMITER}{tool}") format!("{server}{MCP_TOOL_NAME_DELIMITER}{tool}")
} }
@@ -60,40 +64,49 @@ impl McpConnectionManager {
/// * `mcp_servers` Map loaded from the user configuration where *keys* /// * `mcp_servers` Map loaded from the user configuration where *keys*
/// are human-readable server identifiers and *values* are the spawn /// are human-readable server identifiers and *values* are the spawn
/// instructions. /// instructions.
pub async fn new(mcp_servers: HashMap<String, McpServerConfig>) -> Result<Self> { ///
/// Servers that fail to start are reported in `ClientStartErrors`: the
/// user should be informed about these errors.
pub async fn new(
mcp_servers: HashMap<String, McpServerConfig>,
) -> Result<(Self, ClientStartErrors)> {
// Early exit if no servers are configured. // Early exit if no servers are configured.
if mcp_servers.is_empty() { if mcp_servers.is_empty() {
return Ok(Self::default()); return Ok((Self::default(), ClientStartErrors::default()));
} }
// Spin up all servers concurrently. // Launch all configured servers concurrently.
let mut join_set = JoinSet::new(); let mut join_set = JoinSet::new();
// Spawn tasks to launch each server.
for (server_name, cfg) in mcp_servers { for (server_name, cfg) in mcp_servers {
// TODO: Verify server name: require `^[a-zA-Z0-9_-]+$`? // TODO: Verify server name: require `^[a-zA-Z0-9_-]+$`?
join_set.spawn(async move { join_set.spawn(async move {
let McpServerConfig { command, args, env } = cfg; let McpServerConfig { command, args, env } = cfg;
let client_res = McpClient::new_stdio_client(command, args, env).await; let client_res = McpClient::new_stdio_client(command, args, env).await;
(server_name, client_res) (server_name, client_res)
}); });
} }
let mut clients: HashMap<String, std::sync::Arc<McpClient>> = let mut clients: HashMap<String, std::sync::Arc<McpClient>> =
HashMap::with_capacity(join_set.len()); HashMap::with_capacity(join_set.len());
let mut errors = ClientStartErrors::new();
while let Some(res) = join_set.join_next().await { while let Some(res) = join_set.join_next().await {
let (server_name, client_res) = res?; let (server_name, client_res) = res?; // JoinError propagation
let client = client_res match client_res {
.with_context(|| format!("failed to spawn MCP server `{server_name}`"))?; Ok(client) => {
clients.insert(server_name, std::sync::Arc::new(client));
clients.insert(server_name, std::sync::Arc::new(client)); }
Err(e) => {
errors.insert(server_name, e.into());
}
}
} }
let tools = list_all_tools(&clients).await?; let tools = list_all_tools(&clients).await?;
Ok(Self { clients, tools }) Ok((Self { clients, tools }, errors))
} }
/// Returns a single map that contains **all** tools. Each key is the /// Returns a single map that contains **all** tools. Each key is the