From a2fdfce02a870773b3b037954b0fad6222020389 Mon Sep 17 00:00:00 2001 From: Luca King Date: Fri, 7 Nov 2025 19:54:35 -0600 Subject: [PATCH] Kill shell tool process groups on timeout (#5258) ## Summary - launch shell tool processes in their own process group so Codex owns the full tree - on timeout or ctrl-c, send SIGKILL to the process group before terminating the tracked child - document that the default shell/unified_exec timeout remains 1000 ms ## Original Bug Long-lived shell tool commands hang indefinitely because the timeout handler only terminated the direct child process; any grandchildren it spawned kept running and held the PTY open, preventing Codex from regaining control. ## Repro Original Bug Install next.js and run `next dev` (which is a long-running shell process with children). On openai:main, it will cause the agent to permanently get stuck here until human intervention. On this branch, this command will be terminated successfully after timeout_ms which will unblock the agent. This is a critical fix for unmonitored / lightly monitored agents that don't have immediate human observation to unblock them. --------- Co-authored-by: Michael Bolin Co-authored-by: Michael Bolin --- codex-rs/core/src/exec.rs | 81 ++++++++++++++++++++++++++++++++++++++ codex-rs/core/src/spawn.rs | 32 +++++++++------ 2 files changed, 101 insertions(+), 12 deletions(-) diff --git a/codex-rs/core/src/exec.rs b/codex-rs/core/src/exec.rs index 31a61177..012fde95 100644 --- a/codex-rs/core/src/exec.rs +++ b/codex-rs/core/src/exec.rs @@ -518,6 +518,7 @@ async fn consume_truncated_output( } Err(_) => { // timeout + kill_child_process_group(&mut child)?; child.start_kill()?; // Debatable whether `child.wait().await` should be called here. (synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + TIMEOUT_CODE), true) @@ -525,6 +526,7 @@ async fn consume_truncated_output( } } _ = tokio::signal::ctrl_c() => { + kill_child_process_group(&mut child)?; child.start_kill()?; (synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + SIGKILL_CODE), false) } @@ -621,6 +623,38 @@ fn synthetic_exit_status(code: i32) -> ExitStatus { std::process::ExitStatus::from_raw(code as u32) } +#[cfg(unix)] +fn kill_child_process_group(child: &mut Child) -> io::Result<()> { + use std::io::ErrorKind; + + if let Some(pid) = child.id() { + let pid = pid as libc::pid_t; + let pgid = unsafe { libc::getpgid(pid) }; + if pgid == -1 { + let err = std::io::Error::last_os_error(); + if err.kind() != ErrorKind::NotFound { + return Err(err); + } + return Ok(()); + } + + let result = unsafe { libc::killpg(pgid, libc::SIGKILL) }; + if result == -1 { + let err = std::io::Error::last_os_error(); + if err.kind() != ErrorKind::NotFound { + return Err(err); + } + } + } + + Ok(()) +} + +#[cfg(not(unix))] +fn kill_child_process_group(_: &mut Child) -> io::Result<()> { + Ok(()) +} + #[cfg(test)] mod tests { use super::*; @@ -693,4 +727,51 @@ mod tests { let output = make_exec_output(exit_code, "", "", ""); assert!(is_likely_sandbox_denied(SandboxType::LinuxSeccomp, &output)); } + + #[cfg(unix)] + #[tokio::test] + async fn kill_child_process_group_kills_grandchildren_on_timeout() -> Result<()> { + let command = vec![ + "/bin/bash".to_string(), + "-c".to_string(), + "sleep 60 & echo $!; sleep 60".to_string(), + ]; + let env: HashMap = std::env::vars().collect(); + let params = ExecParams { + command, + cwd: std::env::current_dir()?, + timeout_ms: Some(500), + env, + with_escalated_permissions: None, + justification: None, + arg0: None, + }; + + let output = exec(params, SandboxType::None, &SandboxPolicy::ReadOnly, None).await?; + assert!(output.timed_out); + + let stdout = output.stdout.from_utf8_lossy().text; + let pid_line = stdout.lines().next().unwrap_or("").trim(); + let pid: i32 = pid_line.parse().map_err(|error| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse pid from stdout '{pid_line}': {error}"), + ) + })?; + + let mut killed = false; + for _ in 0..20 { + // Use kill(pid, 0) to check if the process is alive. + if unsafe { libc::kill(pid, 0) } == -1 + && let Some(libc::ESRCH) = std::io::Error::last_os_error().raw_os_error() + { + killed = true; + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + + assert!(killed, "grandchild process with pid {pid} is still alive"); + Ok(()) + } } diff --git a/codex-rs/core/src/spawn.rs b/codex-rs/core/src/spawn.rs index c3a91266..d738f122 100644 --- a/codex-rs/core/src/spawn.rs +++ b/codex-rs/core/src/spawn.rs @@ -64,24 +64,32 @@ pub(crate) async fn spawn_child_async( // any child processes that were spawned as part of a `"shell"` tool call // to also be terminated. - // This relies on prctl(2), so it only works on Linux. - #[cfg(target_os = "linux")] + #[cfg(unix)] unsafe { + #[cfg(target_os = "linux")] let parent_pid = libc::getpid(); cmd.pre_exec(move || { - // This prctl call effectively requests, "deliver SIGTERM when my - // current parent dies." - if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) == -1 { + if libc::setpgid(0, 0) == -1 { return Err(std::io::Error::last_os_error()); } - // Though if there was a race condition and this pre_exec() block is - // run _after_ the parent (i.e., the Codex process) has already - // exited, then parent will be the closest configured "subreaper" - // ancestor process, or PID 1 (init). If the Codex process has exited - // already, so should the child process. - if libc::getppid() != parent_pid { - libc::raise(libc::SIGTERM); + // This relies on prctl(2), so it only works on Linux. + #[cfg(target_os = "linux")] + { + // This prctl call effectively requests, "deliver SIGTERM when my + // current parent dies." + if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) == -1 { + return Err(std::io::Error::last_os_error()); + } + + // Though if there was a race condition and this pre_exec() block is + // run _after_ the parent (i.e., the Codex process) has already + // exited, then parent will be the closest configured "subreaper" + // ancestor process, or PID 1 (init). If the Codex process has exited + // already, so should the child process. + if libc::getppid() != parent_pid { + libc::raise(libc::SIGTERM); + } } Ok(()) });