fix: exec commands that blows up context window. (#4706)
We truncate the output of exec commands to not blow the context window. However, some cases we weren't doing that. This caused reports of people with 76% context window left facing `input exceeded context window` which is weird.
This commit is contained in:
@@ -145,6 +145,7 @@ pub(crate) async fn handle_container_exec_with_params(
|
||||
)
|
||||
.await;
|
||||
|
||||
// always make sure to truncate the output if its length isn't controlled.
|
||||
match output_result {
|
||||
Ok(output) => {
|
||||
let ExecToolCallOutput { exit_code, .. } = &output;
|
||||
@@ -155,13 +156,16 @@ pub(crate) async fn handle_container_exec_with_params(
|
||||
Err(FunctionCallError::RespondToModel(content))
|
||||
}
|
||||
}
|
||||
Err(ExecError::Function(err)) => Err(err),
|
||||
Err(ExecError::Function(err)) => Err(truncate_function_error(err)),
|
||||
Err(ExecError::Codex(CodexErr::Sandbox(SandboxErr::Timeout { output }))) => Err(
|
||||
FunctionCallError::RespondToModel(format_exec_output_apply_patch(&output)),
|
||||
),
|
||||
Err(ExecError::Codex(err)) => Err(FunctionCallError::RespondToModel(format!(
|
||||
"execution error: {err:?}"
|
||||
))),
|
||||
Err(ExecError::Codex(err)) => {
|
||||
let message = format!("execution error: {err:?}");
|
||||
Err(FunctionCallError::RespondToModel(
|
||||
truncate_formatted_exec_output(&message),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,26 +210,28 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
|
||||
aggregated_output, ..
|
||||
} = exec_output;
|
||||
|
||||
// Head+tail truncation for the model: show the beginning and end with an elision.
|
||||
// Clients still receive full streams; only this formatted summary is capped.
|
||||
|
||||
let mut s = &aggregated_output.text;
|
||||
let prefixed_str: String;
|
||||
let content = aggregated_output.text.as_str();
|
||||
|
||||
if exec_output.timed_out {
|
||||
prefixed_str = format!(
|
||||
"command timed out after {} milliseconds\n",
|
||||
let prefixed = format!(
|
||||
"command timed out after {} milliseconds\n{content}",
|
||||
exec_output.duration.as_millis()
|
||||
) + s;
|
||||
s = &prefixed_str;
|
||||
);
|
||||
return truncate_formatted_exec_output(&prefixed);
|
||||
}
|
||||
|
||||
let total_lines = s.lines().count();
|
||||
if s.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES {
|
||||
return s.to_string();
|
||||
truncate_formatted_exec_output(content)
|
||||
}
|
||||
|
||||
fn truncate_formatted_exec_output(content: &str) -> String {
|
||||
// Head+tail truncation for the model: show the beginning and end with an elision.
|
||||
// Clients still receive full streams; only this formatted summary is capped.
|
||||
let total_lines = content.lines().count();
|
||||
if content.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES {
|
||||
return content.to_string();
|
||||
}
|
||||
|
||||
let segments: Vec<&str> = s.split_inclusive('\n').collect();
|
||||
let segments: Vec<&str> = content.split_inclusive('\n').collect();
|
||||
let head_take = MODEL_FORMAT_HEAD_LINES.min(segments.len());
|
||||
let tail_take = MODEL_FORMAT_TAIL_LINES.min(segments.len().saturating_sub(head_take));
|
||||
let omitted = segments.len().saturating_sub(head_take + tail_take);
|
||||
@@ -236,9 +242,9 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
|
||||
.map(|segment| segment.len())
|
||||
.sum();
|
||||
let tail_slice_start: usize = if tail_take == 0 {
|
||||
s.len()
|
||||
content.len()
|
||||
} else {
|
||||
s.len()
|
||||
content.len()
|
||||
- segments
|
||||
.iter()
|
||||
.rev()
|
||||
@@ -260,9 +266,9 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
|
||||
head_budget = MODEL_FORMAT_MAX_BYTES.saturating_sub(marker.len());
|
||||
}
|
||||
|
||||
let head_slice = &s[..head_slice_end];
|
||||
let head_slice = &content[..head_slice_end];
|
||||
let head_part = take_bytes_at_char_boundary(head_slice, head_budget);
|
||||
let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(s.len()));
|
||||
let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(content.len()));
|
||||
|
||||
result.push_str(head_part);
|
||||
result.push_str(&marker);
|
||||
@@ -272,9 +278,71 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
|
||||
return result;
|
||||
}
|
||||
|
||||
let tail_slice = &s[tail_slice_start..];
|
||||
let tail_slice = &content[tail_slice_start..];
|
||||
let tail_part = take_last_bytes_at_char_boundary(tail_slice, remaining);
|
||||
result.push_str(tail_part);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn truncate_function_error(err: FunctionCallError) -> FunctionCallError {
|
||||
match err {
|
||||
FunctionCallError::RespondToModel(msg) => {
|
||||
FunctionCallError::RespondToModel(truncate_formatted_exec_output(&msg))
|
||||
}
|
||||
FunctionCallError::Fatal(msg) => {
|
||||
FunctionCallError::Fatal(truncate_formatted_exec_output(&msg))
|
||||
}
|
||||
other => other,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn truncate_formatted_exec_output_truncates_large_error() {
|
||||
let line = "very long execution error line that should trigger truncation\n";
|
||||
let large_error = line.repeat(2_500); // way beyond both byte and line limits
|
||||
|
||||
let truncated = truncate_formatted_exec_output(&large_error);
|
||||
|
||||
assert!(truncated.len() <= MODEL_FORMAT_MAX_BYTES);
|
||||
assert!(truncated.starts_with(line));
|
||||
assert!(truncated.contains("[... omitted"));
|
||||
assert_ne!(truncated, large_error);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_function_error_trims_respond_to_model() {
|
||||
let line = "respond-to-model error that should be truncated\n";
|
||||
let huge = line.repeat(3_000);
|
||||
|
||||
let err = truncate_function_error(FunctionCallError::RespondToModel(huge));
|
||||
match err {
|
||||
FunctionCallError::RespondToModel(message) => {
|
||||
assert!(message.len() <= MODEL_FORMAT_MAX_BYTES);
|
||||
assert!(message.contains("[... omitted"));
|
||||
assert!(message.starts_with(line));
|
||||
}
|
||||
other => panic!("unexpected error variant: {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_function_error_trims_fatal() {
|
||||
let line = "fatal error output that should be truncated\n";
|
||||
let huge = line.repeat(3_000);
|
||||
|
||||
let err = truncate_function_error(FunctionCallError::Fatal(huge));
|
||||
match err {
|
||||
FunctionCallError::Fatal(message) => {
|
||||
assert!(message.len() <= MODEL_FORMAT_MAX_BYTES);
|
||||
assert!(message.contains("[... omitted"));
|
||||
assert!(message.starts_with(line));
|
||||
}
|
||||
other => panic!("unexpected error variant: {other:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -458,3 +458,150 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shell_sandbox_denied_truncates_error_output() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let mut builder = test_codex();
|
||||
let test = builder.build(&server).await?;
|
||||
|
||||
let call_id = "shell-denied";
|
||||
let long_line = "this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz";
|
||||
let script = format!(
|
||||
"for i in $(seq 1 500); do >&2 echo '{long_line}'; done; cat <<'EOF' > denied.txt\ncontent\nEOF",
|
||||
);
|
||||
let args = json!({
|
||||
"command": ["/bin/sh", "-c", script],
|
||||
"timeout_ms": 1_000,
|
||||
});
|
||||
|
||||
let responses = vec![
|
||||
sse(vec![
|
||||
json!({"type": "response.created", "response": {"id": "resp-1"}}),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse(vec![
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
];
|
||||
mount_sse_sequence(&server, responses).await;
|
||||
|
||||
submit_turn(
|
||||
&test,
|
||||
"attempt to write in read-only sandbox",
|
||||
AskForApproval::Never,
|
||||
SandboxPolicy::ReadOnly,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let requests = server.received_requests().await.expect("recorded requests");
|
||||
let bodies = request_bodies(&requests)?;
|
||||
let function_outputs = collect_output_items(&bodies, "function_call_output");
|
||||
let denied_item = function_outputs
|
||||
.iter()
|
||||
.find(|item| item.get("call_id").and_then(Value::as_str) == Some(call_id))
|
||||
.expect("denied output present");
|
||||
|
||||
let output = denied_item
|
||||
.get("output")
|
||||
.and_then(Value::as_str)
|
||||
.expect("denied output string");
|
||||
|
||||
assert!(
|
||||
output.starts_with("failed in sandbox: "),
|
||||
"expected sandbox error prefix, got {output:?}"
|
||||
);
|
||||
assert!(
|
||||
output.contains("[... omitted"),
|
||||
"expected truncated marker, got {output:?}"
|
||||
);
|
||||
assert!(
|
||||
output.contains(long_line),
|
||||
"expected truncated stderr sample, got {output:?}"
|
||||
);
|
||||
// Linux distributions may surface sandbox write failures as different errno messages
|
||||
// depending on the underlying mechanism (e.g., EPERM, EACCES, or EROFS). Accept a
|
||||
// small set of common variants to keep this cross-platform.
|
||||
let denial_markers = [
|
||||
"Operation not permitted", // EPERM
|
||||
"Permission denied", // EACCES
|
||||
"Read-only file system", // EROFS
|
||||
];
|
||||
assert!(
|
||||
denial_markers.iter().any(|m| output.contains(m)),
|
||||
"expected sandbox denial message, got {output:?}"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let mut builder = test_codex().with_config(|cfg| {
|
||||
cfg.sandbox_policy = SandboxPolicy::DangerFullAccess;
|
||||
});
|
||||
let test = builder.build(&server).await?;
|
||||
|
||||
let call_id = "shell-spawn-failure";
|
||||
let bogus_component = "missing-bin-".repeat(700);
|
||||
let bogus_exe = test
|
||||
.cwd
|
||||
.path()
|
||||
.join(bogus_component)
|
||||
.to_string_lossy()
|
||||
.into_owned();
|
||||
|
||||
let args = json!({
|
||||
"command": [bogus_exe],
|
||||
"timeout_ms": 1_000,
|
||||
});
|
||||
|
||||
let responses = vec![
|
||||
sse(vec![
|
||||
json!({"type": "response.created", "response": {"id": "resp-1"}}),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse(vec![
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
];
|
||||
mount_sse_sequence(&server, responses).await;
|
||||
|
||||
submit_turn(
|
||||
&test,
|
||||
"spawn a missing binary",
|
||||
AskForApproval::Never,
|
||||
SandboxPolicy::DangerFullAccess,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let requests = server.received_requests().await.expect("recorded requests");
|
||||
let bodies = request_bodies(&requests)?;
|
||||
let function_outputs = collect_output_items(&bodies, "function_call_output");
|
||||
let failure_item = function_outputs
|
||||
.iter()
|
||||
.find(|item| item.get("call_id").and_then(Value::as_str) == Some(call_id))
|
||||
.expect("spawn failure output present");
|
||||
|
||||
let output = failure_item
|
||||
.get("output")
|
||||
.and_then(Value::as_str)
|
||||
.expect("spawn failure output string");
|
||||
|
||||
assert!(
|
||||
output.starts_with("execution error:"),
|
||||
"expected execution error prefix, got {output:?}"
|
||||
);
|
||||
assert!(output.len() <= 10 * 1024);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user