Include command output when sending timeout to model (#3576)

Being able to see the output helps the model decide how to handle the
timeout.
This commit is contained in:
pakrym-oai
2025-09-14 14:38:26 -07:00
committed by GitHub
parent 7e1543f5d8
commit 863d9c237e
5 changed files with 147 additions and 50 deletions

View File

@@ -845,6 +845,7 @@ impl Session {
aggregated_output, aggregated_output,
duration, duration,
exit_code, exit_code,
timed_out: _,
} = output; } = output;
// Send full stdout/stderr to clients; do not truncate. // Send full stdout/stderr to clients; do not truncate.
let stdout = stdout.text.clone(); let stdout = stdout.text.clone();
@@ -920,6 +921,7 @@ impl Session {
let output_stderr; let output_stderr;
let borrowed: &ExecToolCallOutput = match &result { let borrowed: &ExecToolCallOutput = match &result {
Ok(output) => output, Ok(output) => output,
Err(CodexErr::Sandbox(SandboxErr::Timeout { output })) => output,
Err(e) => { Err(e) => {
output_stderr = ExecToolCallOutput { output_stderr = ExecToolCallOutput {
exit_code: -1, exit_code: -1,
@@ -927,6 +929,7 @@ impl Session {
stderr: StreamOutput::new(get_error_message_ui(e)), stderr: StreamOutput::new(get_error_message_ui(e)),
aggregated_output: StreamOutput::new(get_error_message_ui(e)), aggregated_output: StreamOutput::new(get_error_message_ui(e)),
duration: Duration::default(), duration: Duration::default(),
timed_out: false,
}; };
&output_stderr &output_stderr
} }
@@ -2917,15 +2920,12 @@ async fn handle_sandbox_error(
let sub_id = exec_command_context.sub_id.clone(); let sub_id = exec_command_context.sub_id.clone();
let cwd = exec_command_context.cwd.clone(); let cwd = exec_command_context.cwd.clone();
// if the command timed out, we can simply return this failure to the model if let SandboxErr::Timeout { output } = &error {
if matches!(error, SandboxErr::Timeout) { let content = format_exec_output(output);
return ResponseInputItem::FunctionCallOutput { return ResponseInputItem::FunctionCallOutput {
call_id, call_id,
output: FunctionCallOutputPayload { output: FunctionCallOutputPayload {
content: format!( content,
"command timed out after {} milliseconds",
params.timeout_duration().as_millis()
),
success: Some(false), success: Some(false),
}, },
}; };
@@ -3050,7 +3050,17 @@ fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
// Head+tail truncation for the model: show the beginning and end with an elision. // Head+tail truncation for the model: show the beginning and end with an elision.
// Clients still receive full streams; only this formatted summary is capped. // Clients still receive full streams; only this formatted summary is capped.
let s = aggregated_output.text.as_str(); let mut s = &aggregated_output.text;
let prefixed_str: String;
if exec_output.timed_out {
prefixed_str = format!(
"command timed out after {} milliseconds\n",
exec_output.duration.as_millis()
) + s;
s = &prefixed_str;
}
let total_lines = s.lines().count(); let total_lines = s.lines().count();
if s.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES { if s.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES {
return s.to_string(); return s.to_string();
@@ -3093,6 +3103,7 @@ fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
// Build final string respecting byte budgets // Build final string respecting byte budgets
let head_part = take_bytes_at_char_boundary(&head_lines_text, head_budget); let head_part = take_bytes_at_char_boundary(&head_lines_text, head_budget);
let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(s.len())); let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(s.len()));
result.push_str(head_part); result.push_str(head_part);
result.push_str(&marker); result.push_str(&marker);
@@ -3342,6 +3353,7 @@ mod tests {
stderr: StreamOutput::new(String::new()), stderr: StreamOutput::new(String::new()),
aggregated_output: StreamOutput::new(full), aggregated_output: StreamOutput::new(full),
duration: StdDuration::from_secs(1), duration: StdDuration::from_secs(1),
timed_out: false,
}; };
let out = format_exec_output_str(&exec); let out = format_exec_output_str(&exec);
@@ -3384,6 +3396,7 @@ mod tests {
stderr: StreamOutput::new(String::new()), stderr: StreamOutput::new(String::new()),
aggregated_output: StreamOutput::new(full.clone()), aggregated_output: StreamOutput::new(full.clone()),
duration: StdDuration::from_secs(1), duration: StdDuration::from_secs(1),
timed_out: false,
}; };
let out = format_exec_output_str(&exec); let out = format_exec_output_str(&exec);
@@ -3406,6 +3419,25 @@ mod tests {
); );
} }
#[test]
fn includes_timed_out_message() {
let exec = ExecToolCallOutput {
exit_code: 0,
stdout: StreamOutput::new(String::new()),
stderr: StreamOutput::new(String::new()),
aggregated_output: StreamOutput::new("Command output".to_string()),
duration: StdDuration::from_secs(1),
timed_out: true,
};
let out = format_exec_output_str(&exec);
assert_eq!(
out,
"command timed out after 1000 milliseconds\nCommand output"
);
}
#[test] #[test]
fn falls_back_to_content_when_structured_is_null() { fn falls_back_to_content_when_structured_is_null() {
let ctr = CallToolResult { let ctr = CallToolResult {

View File

@@ -1,3 +1,4 @@
use crate::exec::ExecToolCallOutput;
use crate::token_data::KnownPlan; use crate::token_data::KnownPlan;
use crate::token_data::PlanType; use crate::token_data::PlanType;
use codex_protocol::mcp_protocol::ConversationId; use codex_protocol::mcp_protocol::ConversationId;
@@ -13,8 +14,11 @@ pub type Result<T> = std::result::Result<T, CodexErr>;
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum SandboxErr { pub enum SandboxErr {
/// Error from sandbox execution /// Error from sandbox execution
#[error("sandbox denied exec error, exit code: {0}, stdout: {1}, stderr: {2}")] #[error(
Denied(i32, String, String), "sandbox denied exec error, exit code: {}, stdout: {}, stderr: {}",
.output.exit_code, .output.stdout.text, .output.stderr.text
)]
Denied { output: Box<ExecToolCallOutput> },
/// Error from linux seccomp filter setup /// Error from linux seccomp filter setup
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
@@ -28,7 +32,7 @@ pub enum SandboxErr {
/// Command timed out /// Command timed out
#[error("command timed out")] #[error("command timed out")]
Timeout, Timeout { output: Box<ExecToolCallOutput> },
/// Command was killed by a signal /// Command was killed by a signal
#[error("command was killed by a signal")] #[error("command was killed by a signal")]
@@ -245,9 +249,12 @@ impl CodexErr {
pub fn get_error_message_ui(e: &CodexErr) -> String { pub fn get_error_message_ui(e: &CodexErr) -> String {
match e { match e {
CodexErr::Sandbox(SandboxErr::Denied(_, _, stderr)) => stderr.to_string(), CodexErr::Sandbox(SandboxErr::Denied { output }) => output.stderr.text.clone(),
// Timeouts are not sandbox errors from a UX perspective; present them plainly // Timeouts are not sandbox errors from a UX perspective; present them plainly
CodexErr::Sandbox(SandboxErr::Timeout) => "error: command timed out".to_string(), CodexErr::Sandbox(SandboxErr::Timeout { output }) => format!(
"error: command timed out after {} ms",
output.duration.as_millis()
),
_ => e.to_string(), _ => e.to_string(),
} }
} }

View File

@@ -34,6 +34,7 @@ const DEFAULT_TIMEOUT_MS: u64 = 10_000;
const SIGKILL_CODE: i32 = 9; const SIGKILL_CODE: i32 = 9;
const TIMEOUT_CODE: i32 = 64; const TIMEOUT_CODE: i32 = 64;
const EXIT_CODE_SIGNAL_BASE: i32 = 128; // conventional shell: 128 + signal const EXIT_CODE_SIGNAL_BASE: i32 = 128; // conventional shell: 128 + signal
const EXEC_TIMEOUT_EXIT_CODE: i32 = 124; // conventional timeout exit code
// I/O buffer sizing // I/O buffer sizing
const READ_CHUNK_SIZE: usize = 8192; // bytes per read const READ_CHUNK_SIZE: usize = 8192; // bytes per read
@@ -86,11 +87,12 @@ pub async fn process_exec_tool_call(
) -> Result<ExecToolCallOutput> { ) -> Result<ExecToolCallOutput> {
let start = Instant::now(); let start = Instant::now();
let timeout_duration = params.timeout_duration();
let raw_output_result: std::result::Result<RawExecToolCallOutput, CodexErr> = match sandbox_type let raw_output_result: std::result::Result<RawExecToolCallOutput, CodexErr> = match sandbox_type
{ {
SandboxType::None => exec(params, sandbox_policy, stdout_stream.clone()).await, SandboxType::None => exec(params, sandbox_policy, stdout_stream.clone()).await,
SandboxType::MacosSeatbelt => { SandboxType::MacosSeatbelt => {
let timeout = params.timeout_duration();
let ExecParams { let ExecParams {
command, cwd, env, .. command, cwd, env, ..
} = params; } = params;
@@ -102,10 +104,9 @@ pub async fn process_exec_tool_call(
env, env,
) )
.await?; .await?;
consume_truncated_output(child, timeout, stdout_stream.clone()).await consume_truncated_output(child, timeout_duration, stdout_stream.clone()).await
} }
SandboxType::LinuxSeccomp => { SandboxType::LinuxSeccomp => {
let timeout = params.timeout_duration();
let ExecParams { let ExecParams {
command, cwd, env, .. command, cwd, env, ..
} = params; } = params;
@@ -123,41 +124,56 @@ pub async fn process_exec_tool_call(
) )
.await?; .await?;
consume_truncated_output(child, timeout, stdout_stream).await consume_truncated_output(child, timeout_duration, stdout_stream).await
} }
}; };
let duration = start.elapsed(); let duration = start.elapsed();
match raw_output_result { match raw_output_result {
Ok(raw_output) => { Ok(raw_output) => {
let stdout = raw_output.stdout.from_utf8_lossy(); #[allow(unused_mut)]
let stderr = raw_output.stderr.from_utf8_lossy(); let mut timed_out = raw_output.timed_out;
#[cfg(target_family = "unix")] #[cfg(target_family = "unix")]
match raw_output.exit_status.signal() { {
Some(TIMEOUT_CODE) => return Err(CodexErr::Sandbox(SandboxErr::Timeout)), if let Some(signal) = raw_output.exit_status.signal() {
Some(signal) => { if signal == TIMEOUT_CODE {
return Err(CodexErr::Sandbox(SandboxErr::Signal(signal))); timed_out = true;
} else {
return Err(CodexErr::Sandbox(SandboxErr::Signal(signal)));
}
} }
None => {}
} }
let exit_code = raw_output.exit_status.code().unwrap_or(-1); let mut exit_code = raw_output.exit_status.code().unwrap_or(-1);
if timed_out {
if exit_code != 0 && is_likely_sandbox_denied(sandbox_type, exit_code) { exit_code = EXEC_TIMEOUT_EXIT_CODE;
return Err(CodexErr::Sandbox(SandboxErr::Denied(
exit_code,
stdout.text,
stderr.text,
)));
} }
Ok(ExecToolCallOutput { let stdout = raw_output.stdout.from_utf8_lossy();
let stderr = raw_output.stderr.from_utf8_lossy();
let aggregated_output = raw_output.aggregated_output.from_utf8_lossy();
let exec_output = ExecToolCallOutput {
exit_code, exit_code,
stdout, stdout,
stderr, stderr,
aggregated_output: raw_output.aggregated_output.from_utf8_lossy(), aggregated_output,
duration, duration,
}) timed_out,
};
if timed_out {
return Err(CodexErr::Sandbox(SandboxErr::Timeout {
output: Box::new(exec_output),
}));
}
if exit_code != 0 && is_likely_sandbox_denied(sandbox_type, exit_code) {
return Err(CodexErr::Sandbox(SandboxErr::Denied {
output: Box::new(exec_output),
}));
}
Ok(exec_output)
} }
Err(err) => { Err(err) => {
tracing::error!("exec error: {err}"); tracing::error!("exec error: {err}");
@@ -197,6 +213,7 @@ struct RawExecToolCallOutput {
pub stdout: StreamOutput<Vec<u8>>, pub stdout: StreamOutput<Vec<u8>>,
pub stderr: StreamOutput<Vec<u8>>, pub stderr: StreamOutput<Vec<u8>>,
pub aggregated_output: StreamOutput<Vec<u8>>, pub aggregated_output: StreamOutput<Vec<u8>>,
pub timed_out: bool,
} }
impl StreamOutput<String> { impl StreamOutput<String> {
@@ -229,6 +246,7 @@ pub struct ExecToolCallOutput {
pub stderr: StreamOutput<String>, pub stderr: StreamOutput<String>,
pub aggregated_output: StreamOutput<String>, pub aggregated_output: StreamOutput<String>,
pub duration: Duration, pub duration: Duration,
pub timed_out: bool,
} }
async fn exec( async fn exec(
@@ -298,22 +316,24 @@ async fn consume_truncated_output(
Some(agg_tx.clone()), Some(agg_tx.clone()),
)); ));
let exit_status = tokio::select! { let (exit_status, timed_out) = tokio::select! {
result = tokio::time::timeout(timeout, child.wait()) => { result = tokio::time::timeout(timeout, child.wait()) => {
match result { match result {
Ok(Ok(exit_status)) => exit_status, Ok(status_result) => {
Ok(e) => e?, let exit_status = status_result?;
(exit_status, false)
}
Err(_) => { Err(_) => {
// timeout // timeout
child.start_kill()?; child.start_kill()?;
// Debatable whether `child.wait().await` should be called here. // Debatable whether `child.wait().await` should be called here.
synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + TIMEOUT_CODE) (synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + TIMEOUT_CODE), true)
} }
} }
} }
_ = tokio::signal::ctrl_c() => { _ = tokio::signal::ctrl_c() => {
child.start_kill()?; child.start_kill()?;
synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + SIGKILL_CODE) (synthetic_exit_status(EXIT_CODE_SIGNAL_BASE + SIGKILL_CODE), false)
} }
}; };
@@ -336,6 +356,7 @@ async fn consume_truncated_output(
stdout, stdout,
stderr, stderr,
aggregated_output, aggregated_output,
timed_out,
}) })
} }

View File

@@ -2,8 +2,11 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::time::Duration;
use async_channel::Receiver; use async_channel::Receiver;
use codex_core::error::CodexErr;
use codex_core::error::SandboxErr;
use codex_core::exec::ExecParams; use codex_core::exec::ExecParams;
use codex_core::exec::SandboxType; use codex_core::exec::SandboxType;
use codex_core::exec::StdoutStream; use codex_core::exec::StdoutStream;
@@ -170,3 +173,36 @@ async fn test_aggregated_output_interleaves_in_order() {
assert_eq!(result.aggregated_output.text, "O1\nE1\nO2\nE2\n"); assert_eq!(result.aggregated_output.text, "O1\nE1\nO2\nE2\n");
assert_eq!(result.aggregated_output.truncated_after_lines, None); assert_eq!(result.aggregated_output.truncated_after_lines, None);
} }
#[tokio::test]
async fn test_exec_timeout_returns_partial_output() {
let cmd = vec![
"/bin/sh".to_string(),
"-c".to_string(),
"printf 'before\\n'; sleep 2; printf 'after\\n'".to_string(),
];
let params = ExecParams {
command: cmd,
cwd: std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")),
timeout_ms: Some(200),
env: HashMap::new(),
with_escalated_permissions: None,
justification: None,
};
let policy = SandboxPolicy::new_read_only_policy();
let result = process_exec_tool_call(params, SandboxType::None, &policy, &None, None).await;
let Err(CodexErr::Sandbox(SandboxErr::Timeout { output })) = result else {
panic!("expected timeout error");
};
assert_eq!(output.exit_code, 124);
assert_eq!(output.stdout.text, "before\n");
assert!(output.stderr.text.is_empty());
assert_eq!(output.aggregated_output.text, "before\n");
assert!(output.duration >= Duration::from_millis(200));
assert!(output.timed_out);
}

View File

@@ -121,7 +121,7 @@ async fn test_writable_root() {
} }
#[tokio::test] #[tokio::test]
#[should_panic(expected = "Sandbox(Timeout)")] #[should_panic(expected = "Sandbox(Timeout")]
async fn test_timeout() { async fn test_timeout() {
run_cmd(&["sleep", "2"], &[], 50).await; run_cmd(&["sleep", "2"], &[], 50).await;
} }
@@ -156,26 +156,27 @@ async fn assert_network_blocked(cmd: &[&str]) {
) )
.await; .await;
let (exit_code, stdout, stderr) = match result { let output = match result {
Ok(output) => (output.exit_code, output.stdout.text, output.stderr.text), Ok(output) => output,
Err(CodexErr::Sandbox(SandboxErr::Denied(exit_code, stdout, stderr))) => { Err(CodexErr::Sandbox(SandboxErr::Denied { output })) => *output,
(exit_code, stdout, stderr)
}
_ => { _ => {
panic!("expected sandbox denied error, got: {result:?}"); panic!("expected sandbox denied error, got: {result:?}");
} }
}; };
dbg!(&stderr); dbg!(&output.stderr.text);
dbg!(&stdout); dbg!(&output.stdout.text);
dbg!(&exit_code); dbg!(&output.exit_code);
// A completely missing binary exits with 127. Anything else should also // A completely missing binary exits with 127. Anything else should also
// be nonzero (EPERM from seccomp will usually bubble up as 1, 2, 13…) // be nonzero (EPERM from seccomp will usually bubble up as 1, 2, 13…)
// If—*and only if*—the command exits 0 we consider the sandbox breached. // If—*and only if*—the command exits 0 we consider the sandbox breached.
if exit_code == 0 { if output.exit_code == 0 {
panic!("Network sandbox FAILED - {cmd:?} exited 0\nstdout:\n{stdout}\nstderr:\n{stderr}",); panic!(
"Network sandbox FAILED - {cmd:?} exited 0\nstdout:\n{}\nstderr:\n{}",
output.stdout.text, output.stderr.text
);
} }
} }