fix(raw-exec-process-group): improve test reliability (#434)
This PR improves the reliability of `raw-exec-process-group.test`, addressing [#415](https://github.com/openai/codex/issues/415) Before: The test would fail sporadically in CI because it checked for process termination immediately after abort, without accounting for the time it takes for processes to fully terminate. Now: We've added a robust `ensureProcessGone` helper that: - Polls the process status with a 500ms timeout - Retries every 50ms if the process is still alive - Provides clear error messages if termination takes too long We now wait for the child process to fully exit after sending abort signals, instead of assuming instant death, fixing flakiness caused by asynchronous process termination. Changes: - Added `ensureProcessGone` helper function with retry logic - Improved error handling and timeout management See [this bash demo](https://gist.github.com/jdocherty/a84dbca2fbf7b47e5f95c87a07034ae8) for a minimal reproduction of why process death is asynchronous and why the test needs to retry after aborting.
This commit is contained in:
@@ -6,7 +6,6 @@ import { exec as rawExec } from "../src/utils/agent/sandbox/raw-exec.js";
|
|||||||
// the direct child. The original logic only sent `SIGTERM` to the immediate
|
// the direct child. The original logic only sent `SIGTERM` to the immediate
|
||||||
// child which meant that grandchildren (for instance when running through a
|
// child which meant that grandchildren (for instance when running through a
|
||||||
// `bash -c` wrapper) were left running and turned into "zombie" processes.
|
// `bash -c` wrapper) were left running and turned into "zombie" processes.
|
||||||
|
|
||||||
// Strategy:
|
// Strategy:
|
||||||
// 1. Start a Bash shell that spawns a long‑running `sleep`, prints the PID
|
// 1. Start a Bash shell that spawns a long‑running `sleep`, prints the PID
|
||||||
// of that `sleep`, and then waits forever. This guarantees we can later
|
// of that `sleep`, and then waits forever. This guarantees we can later
|
||||||
@@ -15,7 +14,6 @@ import { exec as rawExec } from "../src/utils/agent/sandbox/raw-exec.js";
|
|||||||
// 3. After `rawExec()` resolves we probe the previously printed PID with
|
// 3. After `rawExec()` resolves we probe the previously printed PID with
|
||||||
// `process.kill(pid, 0)`. If the call throws `ESRCH` the process no
|
// `process.kill(pid, 0)`. If the call throws `ESRCH` the process no
|
||||||
// longer exists – the desired outcome. Otherwise the test fails.
|
// longer exists – the desired outcome. Otherwise the test fails.
|
||||||
|
|
||||||
// The negative‑PID process‑group trick employed by the fixed implementation is
|
// The negative‑PID process‑group trick employed by the fixed implementation is
|
||||||
// POSIX‑only. On Windows we skip the test.
|
// POSIX‑only. On Windows we skip the test.
|
||||||
|
|
||||||
@@ -26,49 +24,59 @@ describe("rawExec – abort kills entire process group", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const abortController = new AbortController();
|
const abortController = new AbortController();
|
||||||
|
|
||||||
// Bash script: spawn `sleep 30` in background, print its PID, then wait.
|
// Bash script: spawn `sleep 30` in background, print its PID, then wait.
|
||||||
const script = "sleep 30 & pid=$!; echo $pid; wait $pid";
|
const script = "sleep 30 & pid=$!; echo $pid; wait $pid";
|
||||||
const cmd = ["bash", "-c", script];
|
const cmd = ["bash", "-c", script];
|
||||||
|
|
||||||
// Kick off the command.
|
// Start a bash shell that:
|
||||||
const execPromise = rawExec(cmd, {}, [], abortController.signal);
|
// - spawns a background `sleep 30`
|
||||||
|
// - prints the PID of the `sleep`
|
||||||
|
// - waits for `sleep` to exit
|
||||||
|
const { stdout, exitCode } = await (async () => {
|
||||||
|
const p = rawExec(cmd, {}, [], abortController.signal);
|
||||||
|
|
||||||
// Give Bash a tiny bit of time to start and print the PID.
|
// Give Bash a tiny bit of time to start and print the PID.
|
||||||
await new Promise((r) => setTimeout(r, 100));
|
await new Promise((r) => setTimeout(r, 100));
|
||||||
|
|
||||||
// Cancel the task – this should kill *both* bash and the inner sleep.
|
// Cancel the task – this should kill *both* bash and the inner sleep.
|
||||||
abortController.abort();
|
abortController.abort();
|
||||||
|
|
||||||
const { exitCode, stdout } = await execPromise;
|
// Wait for rawExec to resolve after aborting
|
||||||
|
return p;
|
||||||
|
})();
|
||||||
|
|
||||||
// We expect a non‑zero exit code because the process was killed.
|
// We expect a non‑zero exit code because the process was killed.
|
||||||
expect(exitCode).not.toBe(0);
|
expect(exitCode).not.toBe(0);
|
||||||
|
|
||||||
// Attempt to extract the grand‑child PID from stdout.
|
// Extract the PID of the sleep process that bash printed
|
||||||
const pidMatch = /^(\d+)/.exec(stdout.trim());
|
const pid = Number(stdout.trim().match(/^\d+/)?.[0]);
|
||||||
|
if (pid) {
|
||||||
if (pidMatch) {
|
// Confirm that the sleep process is no longer alive
|
||||||
const sleepPid = Number(pidMatch[1]);
|
await ensureProcessGone(pid);
|
||||||
|
|
||||||
// Verify that the sleep process is no longer alive.
|
|
||||||
let alive = true;
|
|
||||||
try {
|
|
||||||
process.kill(sleepPid, 0);
|
|
||||||
} catch (error: any) {
|
|
||||||
// Check if error is ESRCH (No such process)
|
|
||||||
if (error.code === "ESRCH") {
|
|
||||||
alive = false; // Process is dead, as expected.
|
|
||||||
} else {
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
expect(alive).toBe(false);
|
|
||||||
} else {
|
|
||||||
// If PID was not printed, it implies bash was killed very early.
|
|
||||||
// The test passes implicitly in this scenario as the abort mechanism
|
|
||||||
// successfully stopped the command execution quickly.
|
|
||||||
expect(true).toBe(true);
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Waits until a process no longer exists, or throws after timeout.
|
||||||
|
* @param pid - The process ID to check
|
||||||
|
* @throws {Error} If the process is still alive after 500ms
|
||||||
|
*/
|
||||||
|
async function ensureProcessGone(pid: number) {
|
||||||
|
const timeout = 500;
|
||||||
|
const deadline = Date.now() + timeout;
|
||||||
|
while (Date.now() < deadline) {
|
||||||
|
try {
|
||||||
|
process.kill(pid, 0); // check if process still exists
|
||||||
|
await new Promise((r) => setTimeout(r, 50)); // wait and retry
|
||||||
|
} catch (e: any) {
|
||||||
|
if (e.code === "ESRCH") {
|
||||||
|
return; // process is gone — success
|
||||||
|
}
|
||||||
|
throw e; // unexpected error — rethrow
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new Error(
|
||||||
|
`Process with PID ${pid} failed to terminate within ${timeout}ms`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user