chore: try to make it easier to debug the flakiness of test_shell_command_approval_triggers_elicitation (#2848)

`test_shell_command_approval_triggers_elicitation()` is one of a number
of integration tests that we have observed to be flaky on GitHub CI, so
this PR tries to reduce the flakiness _and_ to provide us with more
information when it flakes. Specifically:

- Changed the command that we use to trigger the elicitation from `git
init` to `python3 -c 'import pathlib; pathlib.Path(r"{}").touch()'`
because running `git` seems more likely to invite variance.
- Increased the timeout to wait for the task response from 10s to 20s.
- Added more logging.
This commit is contained in:
Michael Bolin
2025-08-28 12:33:33 -07:00
committed by GitHub
parent 74d2741729
commit 1e9e703b96
2 changed files with 33 additions and 20 deletions

View File

@@ -283,6 +283,7 @@ impl McpProcess {
}
async fn send_jsonrpc_message(&mut self, message: JSONRPCMessage) -> anyhow::Result<()> {
eprintln!("writing message to stdin: {message:?}");
let payload = serde_json::to_string(&message)?;
self.stdin.write_all(payload.as_bytes()).await?;
self.stdin.write_all(b"\n").await?;
@@ -294,13 +295,15 @@ impl McpProcess {
let mut line = String::new();
self.stdout.read_line(&mut line).await?;
let message = serde_json::from_str::<JSONRPCMessage>(&line)?;
eprintln!("read message from stdout: {message:?}");
Ok(message)
}
pub async fn read_stream_until_request_message(&mut self) -> anyhow::Result<JSONRPCRequest> {
eprintln!("in read_stream_until_request_message()");
loop {
let message = self.read_jsonrpc_message().await?;
eprint!("message: {message:?}");
match message {
JSONRPCMessage::Notification(_) => {
@@ -323,10 +326,10 @@ impl McpProcess {
&mut self,
request_id: RequestId,
) -> anyhow::Result<JSONRPCResponse> {
eprintln!("in read_stream_until_response_message({request_id:?})");
loop {
let message = self.read_jsonrpc_message().await?;
eprint!("message: {message:?}");
match message {
JSONRPCMessage::Notification(_) => {
eprintln!("notification: {message:?}");
@@ -352,8 +355,6 @@ impl McpProcess {
) -> anyhow::Result<mcp_types::JSONRPCError> {
loop {
let message = self.read_jsonrpc_message().await?;
eprint!("message: {message:?}");
match message {
JSONRPCMessage::Notification(_) => {
eprintln!("notification: {message:?}");
@@ -377,10 +378,10 @@ impl McpProcess {
&mut self,
method: &str,
) -> anyhow::Result<JSONRPCNotification> {
eprintln!("in read_stream_until_notification_message({method})");
loop {
let message = self.read_jsonrpc_message().await?;
eprint!("message: {message:?}");
match message {
JSONRPCMessage::Notification(notification) => {
if notification.method == method {
@@ -405,10 +406,10 @@ impl McpProcess {
pub async fn read_stream_until_legacy_task_complete_notification(
&mut self,
) -> anyhow::Result<JSONRPCNotification> {
eprintln!("in read_stream_until_legacy_task_complete_notification()");
loop {
let message = self.read_jsonrpc_message().await?;
eprint!("message: {message:?}");
match message {
JSONRPCMessage::Notification(notification) => {
let is_match = if notification.method == "codex/event" {
@@ -427,6 +428,8 @@ impl McpProcess {
if is_match {
return Ok(notification);
} else {
eprintln!("ignoring notification: {notification:?}");
}
}
JSONRPCMessage::Request(_) => {

View File

@@ -30,7 +30,8 @@ use mcp_test_support::create_final_assistant_message_sse_response;
use mcp_test_support::create_mock_chat_completions_server;
use mcp_test_support::create_shell_sse_response;
const DEFAULT_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);
// Allow ample time on slower CI or under load to avoid flakes.
const DEFAULT_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
/// Test that a shell command that is not on the "trusted" list triggers an
/// elicitation request to the MCP and that sending the approval runs the
@@ -52,9 +53,22 @@ async fn test_shell_command_approval_triggers_elicitation() {
}
async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
// We use `git init` because it will not be on the "trusted" list.
let shell_command = vec!["git".to_string(), "init".to_string()];
// Use a simple, untrusted command that creates a file so we can
// observe a side-effect.
//
// Crossplatform approach: run a tiny Python snippet to touch the file
// using `python3 -c ...` on all platforms.
let workdir_for_shell_function_call = TempDir::new()?;
let created_filename = "created_by_shell_tool.txt";
let created_file = workdir_for_shell_function_call
.path()
.join(created_filename);
let shell_command = vec![
"python3".to_string(),
"-c".to_string(),
format!("import pathlib; pathlib.Path('{created_filename}').touch()"),
];
let McpHandle {
process: mut mcp_process,
@@ -67,7 +81,7 @@ async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
Some(5_000),
"call1234",
)?,
create_final_assistant_message_sse_response("Enjoy your new git repo!")?,
create_final_assistant_message_sse_response("File created!")?,
])
.await?;
@@ -122,8 +136,7 @@ async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
.expect("task_complete_notification timeout")
.expect("task_complete_notification resp");
// Verify the original `codex` tool call completes and that `git init` ran
// successfully.
// Verify the original `codex` tool call completes and that the file was created.
let codex_response = timeout(
DEFAULT_READ_TIMEOUT,
mcp_process.read_stream_until_response_message(RequestId::Integer(codex_request_id)),
@@ -136,7 +149,7 @@ async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
result: json!({
"content": [
{
"text": "Enjoy your new git repo!",
"text": "File created!",
"type": "text"
}
]
@@ -145,10 +158,7 @@ async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
codex_response
);
assert!(
workdir_for_shell_function_call.path().join(".git").is_dir(),
".git folder should have been created"
);
assert!(created_file.is_file(), "created file should exist");
Ok(())
}