Kill shell tool process groups on timeout (#5258)

## Summary
- launch shell tool processes in their own process group so Codex owns
the full tree
- on timeout or ctrl-c, send SIGKILL to the process group before
terminating the tracked child
- document that the default shell/unified_exec timeout remains 1000 ms

## Original Bug
Long-lived shell tool commands hang indefinitely because the timeout
handler only terminated the direct child process; any grandchildren it
spawned kept running and held the PTY open, preventing Codex from
regaining control.

## Repro Original Bug
Install next.js and run `next dev` (which is a long-running shell
process with children). On openai:main, it will cause the agent to
permanently get stuck here until human intervention. On this branch,
this command will be terminated successfully after timeout_ms which will
unblock the agent. This is a critical fix for unmonitored / lightly
monitored agents that don't have immediate human observation to unblock
them.

---------

Co-authored-by: Michael Bolin <mbolin@openai.com>
Co-authored-by: Michael Bolin <bolinfest@gmail.com>
This commit is contained in:
Luca King
2025-11-07 19:54:35 -06:00
committed by GitHub
parent 91b16b8682
commit a2fdfce02a
2 changed files with 101 additions and 12 deletions

View File

@@ -518,6 +518,7 @@ async fn consume_truncated_output(
}
Err(_) => {
// timeout
kill_child_process_group(&mut child)?;
child.start_kill()?;
// Debatable whether `child.wait().await` should be called here.
(synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + TIMEOUT_CODE), true)
@@ -525,6 +526,7 @@ async fn consume_truncated_output(
}
}
_ = tokio::signal::ctrl_c() => {
kill_child_process_group(&mut child)?;
child.start_kill()?;
(synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + SIGKILL_CODE), false)
}
@@ -621,6 +623,38 @@ fn synthetic_exit_status(code: i32) -> ExitStatus {
std::process::ExitStatus::from_raw(code as u32)
}
#[cfg(unix)]
fn kill_child_process_group(child: &mut Child) -> io::Result<()> {
use std::io::ErrorKind;
if let Some(pid) = child.id() {
let pid = pid as libc::pid_t;
let pgid = unsafe { libc::getpgid(pid) };
if pgid == -1 {
let err = std::io::Error::last_os_error();
if err.kind() != ErrorKind::NotFound {
return Err(err);
}
return Ok(());
}
let result = unsafe { libc::killpg(pgid, libc::SIGKILL) };
if result == -1 {
let err = std::io::Error::last_os_error();
if err.kind() != ErrorKind::NotFound {
return Err(err);
}
}
}
Ok(())
}
#[cfg(not(unix))]
fn kill_child_process_group(_: &mut Child) -> io::Result<()> {
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
@@ -693,4 +727,51 @@ mod tests {
let output = make_exec_output(exit_code, "", "", "");
assert!(is_likely_sandbox_denied(SandboxType::LinuxSeccomp, &output));
}
#[cfg(unix)]
#[tokio::test]
async fn kill_child_process_group_kills_grandchildren_on_timeout() -> Result<()> {
let command = vec![
"/bin/bash".to_string(),
"-c".to_string(),
"sleep 60 & echo $!; sleep 60".to_string(),
];
let env: HashMap<String, String> = std::env::vars().collect();
let params = ExecParams {
command,
cwd: std::env::current_dir()?,
timeout_ms: Some(500),
env,
with_escalated_permissions: None,
justification: None,
arg0: None,
};
let output = exec(params, SandboxType::None, &SandboxPolicy::ReadOnly, None).await?;
assert!(output.timed_out);
let stdout = output.stdout.from_utf8_lossy().text;
let pid_line = stdout.lines().next().unwrap_or("").trim();
let pid: i32 = pid_line.parse().map_err(|error| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Failed to parse pid from stdout '{pid_line}': {error}"),
)
})?;
let mut killed = false;
for _ in 0..20 {
// Use kill(pid, 0) to check if the process is alive.
if unsafe { libc::kill(pid, 0) } == -1
&& let Some(libc::ESRCH) = std::io::Error::last_os_error().raw_os_error()
{
killed = true;
break;
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
assert!(killed, "grandchild process with pid {pid} is still alive");
Ok(())
}
}

View File

@@ -64,24 +64,32 @@ pub(crate) async fn spawn_child_async(
// any child processes that were spawned as part of a `"shell"` tool call
// to also be terminated.
// This relies on prctl(2), so it only works on Linux.
#[cfg(target_os = "linux")]
#[cfg(unix)]
unsafe {
#[cfg(target_os = "linux")]
let parent_pid = libc::getpid();
cmd.pre_exec(move || {
// This prctl call effectively requests, "deliver SIGTERM when my
// current parent dies."
if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) == -1 {
if libc::setpgid(0, 0) == -1 {
return Err(std::io::Error::last_os_error());
}
// Though if there was a race condition and this pre_exec() block is
// run _after_ the parent (i.e., the Codex process) has already
// exited, then parent will be the closest configured "subreaper"
// ancestor process, or PID 1 (init). If the Codex process has exited
// already, so should the child process.
if libc::getppid() != parent_pid {
libc::raise(libc::SIGTERM);
// This relies on prctl(2), so it only works on Linux.
#[cfg(target_os = "linux")]
{
// This prctl call effectively requests, "deliver SIGTERM when my
// current parent dies."
if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) == -1 {
return Err(std::io::Error::last_os_error());
}
// Though if there was a race condition and this pre_exec() block is
// run _after_ the parent (i.e., the Codex process) has already
// exited, then parent will be the closest configured "subreaper"
// ancestor process, or PID 1 (init). If the Codex process has exited
// already, so should the child process.
if libc::getppid() != parent_pid {
libc::raise(libc::SIGTERM);
}
}
Ok(())
});