Kill shell tool process groups on timeout (#5258)
## Summary - launch shell tool processes in their own process group so Codex owns the full tree - on timeout or ctrl-c, send SIGKILL to the process group before terminating the tracked child - document that the default shell/unified_exec timeout remains 1000 ms ## Original Bug Long-lived shell tool commands hang indefinitely because the timeout handler only terminated the direct child process; any grandchildren it spawned kept running and held the PTY open, preventing Codex from regaining control. ## Repro Original Bug Install next.js and run `next dev` (which is a long-running shell process with children). On openai:main, it will cause the agent to permanently get stuck here until human intervention. On this branch, this command will be terminated successfully after timeout_ms which will unblock the agent. This is a critical fix for unmonitored / lightly monitored agents that don't have immediate human observation to unblock them. --------- Co-authored-by: Michael Bolin <mbolin@openai.com> Co-authored-by: Michael Bolin <bolinfest@gmail.com>
This commit is contained in:
@@ -64,24 +64,32 @@ pub(crate) async fn spawn_child_async(
|
||||
// any child processes that were spawned as part of a `"shell"` tool call
|
||||
// to also be terminated.
|
||||
|
||||
// This relies on prctl(2), so it only works on Linux.
|
||||
#[cfg(target_os = "linux")]
|
||||
#[cfg(unix)]
|
||||
unsafe {
|
||||
#[cfg(target_os = "linux")]
|
||||
let parent_pid = libc::getpid();
|
||||
cmd.pre_exec(move || {
|
||||
// This prctl call effectively requests, "deliver SIGTERM when my
|
||||
// current parent dies."
|
||||
if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) == -1 {
|
||||
if libc::setpgid(0, 0) == -1 {
|
||||
return Err(std::io::Error::last_os_error());
|
||||
}
|
||||
|
||||
// Though if there was a race condition and this pre_exec() block is
|
||||
// run _after_ the parent (i.e., the Codex process) has already
|
||||
// exited, then parent will be the closest configured "subreaper"
|
||||
// ancestor process, or PID 1 (init). If the Codex process has exited
|
||||
// already, so should the child process.
|
||||
if libc::getppid() != parent_pid {
|
||||
libc::raise(libc::SIGTERM);
|
||||
// This relies on prctl(2), so it only works on Linux.
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
// This prctl call effectively requests, "deliver SIGTERM when my
|
||||
// current parent dies."
|
||||
if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) == -1 {
|
||||
return Err(std::io::Error::last_os_error());
|
||||
}
|
||||
|
||||
// Though if there was a race condition and this pre_exec() block is
|
||||
// run _after_ the parent (i.e., the Codex process) has already
|
||||
// exited, then parent will be the closest configured "subreaper"
|
||||
// ancestor process, or PID 1 (init). If the Codex process has exited
|
||||
// already, so should the child process.
|
||||
if libc::getppid() != parent_pid {
|
||||
libc::raise(libc::SIGTERM);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user