fix: try to fix flakiness in test_shell_command_approval_triggers_elicitation (#2344)

I still see flakiness in
`test_shell_command_approval_triggers_elicitation()` on occasion where
`MockServer` claims it has not received all of its expected requests.

I recently introduced a similar type of test in #2264,
`test_codex_jsonrpc_conversation_flow()`, which I have not seen flake
(yet!), so this PR pulls over two things I did in that test:

- increased `worker_threads` from `2` to `4`
- added an assertion to make sure the `task_complete` notification is
received

Honestly, I'm still not sure why `MockServer` claims it sometimes does
not receive all its expected requests given that we assert that the
final `JSONRPCResponse` is read on the stream, but let's give this a
shot.

Assuming this fixes things, my hypothesis is that the increase in
`worker_threads` helps because perhaps there are async tasks in
`MockServer` that do not reliably complete fully when there are not
enough threads available? If that is correct, it seems like the test
would still be flaky, though perhaps with lower frequency?
This commit is contained in:
Michael Bolin
2025-08-15 09:17:20 -07:00
committed by GitHub
parent 6730592433
commit 265fd89e31
2 changed files with 53 additions and 1 deletions

View File

@@ -35,7 +35,7 @@ const DEFAULT_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs
/// Test that a shell command that is not on the "trusted" list triggers an
/// elicitation request to the MCP and that sending the approval runs the
/// command, as expected.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn test_shell_command_approval_triggers_elicitation() {
if env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
println!(
@@ -114,6 +114,16 @@ async fn shell_command_approval_triggers_elicitation() -> anyhow::Result<()> {
)
.await?;
// Verify task_complete notification arrives before the tool call completes.
#[expect(clippy::expect_used)]
let _task_complete = timeout(
DEFAULT_READ_TIMEOUT,
mcp_process.read_stream_until_legacy_task_complete_notification(),
)
.await
.expect("task_complete_notification timeout")
.expect("task_complete_notification resp");
// Verify the original `codex` tool call completes and that `git init` ran
// successfully.
let codex_response = timeout(

View File

@@ -474,4 +474,46 @@ impl McpProcess {
}))
.await
}
/// Reads messages from the stream until a legacy TaskComplete event is observed:
/// a notification with method "codex/event" whose `params.msg.type` is
/// "task_complete".
///
/// Notifications that do not match are skipped and reading continues.
///
/// # Errors
///
/// Returns an error if reading the next JSON-RPC message fails, or if a
/// request, error, or response message arrives before the matching
/// notification.
pub async fn read_stream_until_legacy_task_complete_notification(
    &mut self,
) -> anyhow::Result<JSONRPCNotification> {
    loop {
        let message = self.read_jsonrpc_message().await?;
        // Use `eprintln!` so each traced message lands on its own stderr line
        // instead of running into the next one.
        eprintln!("message: {message:?}");
        match message {
            JSONRPCMessage::Notification(notification) => {
                // A match requires both the legacy event method and a
                // `params.msg.type` string equal to "task_complete"; missing
                // params or missing/non-string fields count as "no match".
                let is_task_complete = notification.method == "codex/event"
                    && notification
                        .params
                        .as_ref()
                        .and_then(|params| params.get("msg"))
                        .and_then(|msg| msg.get("type"))
                        .and_then(|ty| ty.as_str())
                        == Some("task_complete");
                if is_task_complete {
                    return Ok(notification);
                }
            }
            // Only notifications are expected while waiting for task completion;
            // any other message kind is reported as an error.
            JSONRPCMessage::Request(_) => {
                anyhow::bail!("unexpected JSONRPCMessage::Request: {message:?}");
            }
            JSONRPCMessage::Error(_) => {
                anyhow::bail!("unexpected JSONRPCMessage::Error: {message:?}");
            }
            JSONRPCMessage::Response(_) => {
                anyhow::bail!("unexpected JSONRPCMessage::Response: {message:?}");
            }
        }
    }
}
}