fix: exec commands that blows up context window. (#4706)

We truncate the output of exec commands to not blow the context window. However, some cases we weren't doing that. This caused reports of people with 76% context window left facing `input exceeded context window` which is weird.
2025-10-04 11:49:56 -07:00
parent c5465aed60
commit d7acd146fb
2 changed files with 237 additions and 22 deletions
--- a/codex-rs/core/src/tools/mod.rs
+++ b/codex-rs/core/src/tools/mod.rs
@@ -145,6 +145,7 @@ pub(crate) async fn handle_container_exec_with_params(
        )
        .await;

+    // always make sure to truncate the output if its length isn't controlled.
    match output_result {
        Ok(output) => {
            let ExecToolCallOutput { exit_code, .. } = &output;
@@ -155,13 +156,16 @@ pub(crate) async fn handle_container_exec_with_params(
                Err(FunctionCallError::RespondToModel(content))
            }
        }
-        Err(ExecError::Function(err)) => Err(err),
+        Err(ExecError::Function(err)) => Err(truncate_function_error(err)),
        Err(ExecError::Codex(CodexErr::Sandbox(SandboxErr::Timeout { output }))) => Err(
            FunctionCallError::RespondToModel(format_exec_output_apply_patch(&output)),
        ),
-        Err(ExecError::Codex(err)) => Err(FunctionCallError::RespondToModel(format!(
-            "execution error: {err:?}"
-        ))),
+        Err(ExecError::Codex(err)) => {
+            let message = format!("execution error: {err:?}");
+            Err(FunctionCallError::RespondToModel(
+                truncate_formatted_exec_output(&message),
+            ))
+        }
    }
 }

@@ -206,26 +210,28 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
        aggregated_output, ..
    } = exec_output;

-    // Head+tail truncation for the model: show the beginning and end with an elision.
-    // Clients still receive full streams; only this formatted summary is capped.
-
-    let mut s = &aggregated_output.text;
-    let prefixed_str: String;
+    let content = aggregated_output.text.as_str();

    if exec_output.timed_out {
-        prefixed_str = format!(
-            "command timed out after {} milliseconds\n",
+        let prefixed = format!(
+            "command timed out after {} milliseconds\n{content}",
            exec_output.duration.as_millis()
-        ) + s;
-        s = &prefixed_str;
+        );
+        return truncate_formatted_exec_output(&prefixed);
    }

-    let total_lines = s.lines().count();
-    if s.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES {
-        return s.to_string();
+    truncate_formatted_exec_output(content)
+}
+
+fn truncate_formatted_exec_output(content: &str) -> String {
+    // Head+tail truncation for the model: show the beginning and end with an elision.
+    // Clients still receive full streams; only this formatted summary is capped.
+    let total_lines = content.lines().count();
+    if content.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES {
+        return content.to_string();
    }

-    let segments: Vec<&str> = s.split_inclusive('\n').collect();
+    let segments: Vec<&str> = content.split_inclusive('\n').collect();
    let head_take = MODEL_FORMAT_HEAD_LINES.min(segments.len());
    let tail_take = MODEL_FORMAT_TAIL_LINES.min(segments.len().saturating_sub(head_take));
    let omitted = segments.len().saturating_sub(head_take + tail_take);
@@ -236,9 +242,9 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
        .map(|segment| segment.len())
        .sum();
    let tail_slice_start: usize = if tail_take == 0 {
-        s.len()
+        content.len()
    } else {
-        s.len()
+        content.len()
            - segments
                .iter()
                .rev()
@@ -260,9 +266,9 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
        head_budget = MODEL_FORMAT_MAX_BYTES.saturating_sub(marker.len());
    }

-    let head_slice = &s[..head_slice_end];
+    let head_slice = &content[..head_slice_end];
    let head_part = take_bytes_at_char_boundary(head_slice, head_budget);
-    let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(s.len()));
+    let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(content.len()));

    result.push_str(head_part);
    result.push_str(&marker);
@@ -272,9 +278,71 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
        return result;
    }

-    let tail_slice = &s[tail_slice_start..];
+    let tail_slice = &content[tail_slice_start..];
    let tail_part = take_last_bytes_at_char_boundary(tail_slice, remaining);
    result.push_str(tail_part);

    result
 }
+
+fn truncate_function_error(err: FunctionCallError) -> FunctionCallError {
+    match err {
+        FunctionCallError::RespondToModel(msg) => {
+            FunctionCallError::RespondToModel(truncate_formatted_exec_output(&msg))
+        }
+        FunctionCallError::Fatal(msg) => {
+            FunctionCallError::Fatal(truncate_formatted_exec_output(&msg))
+        }
+        other => other,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn truncate_formatted_exec_output_truncates_large_error() {
+        let line = "very long execution error line that should trigger truncation\n";
+        let large_error = line.repeat(2_500); // way beyond both byte and line limits
+
+        let truncated = truncate_formatted_exec_output(&large_error);
+
+        assert!(truncated.len() <= MODEL_FORMAT_MAX_BYTES);
+        assert!(truncated.starts_with(line));
+        assert!(truncated.contains("[... omitted"));
+        assert_ne!(truncated, large_error);
+    }
+
+    #[test]
+    fn truncate_function_error_trims_respond_to_model() {
+        let line = "respond-to-model error that should be truncated\n";
+        let huge = line.repeat(3_000);
+
+        let err = truncate_function_error(FunctionCallError::RespondToModel(huge));
+        match err {
+            FunctionCallError::RespondToModel(message) => {
+                assert!(message.len() <= MODEL_FORMAT_MAX_BYTES);
+                assert!(message.contains("[... omitted"));
+                assert!(message.starts_with(line));
+            }
+            other => panic!("unexpected error variant: {other:?}"),
+        }
+    }
+
+    #[test]
+    fn truncate_function_error_trims_fatal() {
+        let line = "fatal error output that should be truncated\n";
+        let huge = line.repeat(3_000);
+
+        let err = truncate_function_error(FunctionCallError::Fatal(huge));
+        match err {
+            FunctionCallError::Fatal(message) => {
+                assert!(message.len() <= MODEL_FORMAT_MAX_BYTES);
+                assert!(message.contains("[... omitted"));
+                assert!(message.starts_with(line));
+            }
+            other => panic!("unexpected error variant: {other:?}"),
+        }
+    }
+}
--- a/codex-rs/core/tests/suite/tools.rs
+++ b/codex-rs/core/tests/suite/tools.rs
@@ -458,3 +458,150 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {

    Ok(())
 }
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn shell_sandbox_denied_truncates_error_output() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+    let mut builder = test_codex();
+    let test = builder.build(&server).await?;
+
+    let call_id = "shell-denied";
+    let long_line = "this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz";
+    let script = format!(
+        "for i in $(seq 1 500); do >&2 echo '{long_line}'; done; cat <<'EOF' > denied.txt\ncontent\nEOF",
+    );
+    let args = json!({
+        "command": ["/bin/sh", "-c", script],
+        "timeout_ms": 1_000,
+    });
+
+    let responses = vec![
+        sse(vec![
+            json!({"type": "response.created", "response": {"id": "resp-1"}}),
+            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_completed("resp-1"),
+        ]),
+        sse(vec![
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    ];
+    mount_sse_sequence(&server, responses).await;
+
+    submit_turn(
+        &test,
+        "attempt to write in read-only sandbox",
+        AskForApproval::Never,
+        SandboxPolicy::ReadOnly,
+    )
+    .await?;
+
+    let requests = server.received_requests().await.expect("recorded requests");
+    let bodies = request_bodies(&requests)?;
+    let function_outputs = collect_output_items(&bodies, "function_call_output");
+    let denied_item = function_outputs
+        .iter()
+        .find(|item| item.get("call_id").and_then(Value::as_str) == Some(call_id))
+        .expect("denied output present");
+
+    let output = denied_item
+        .get("output")
+        .and_then(Value::as_str)
+        .expect("denied output string");
+
+    assert!(
+        output.starts_with("failed in sandbox: "),
+        "expected sandbox error prefix, got {output:?}"
+    );
+    assert!(
+        output.contains("[... omitted"),
+        "expected truncated marker, got {output:?}"
+    );
+    assert!(
+        output.contains(long_line),
+        "expected truncated stderr sample, got {output:?}"
+    );
+    // Linux distributions may surface sandbox write failures as different errno messages
+    // depending on the underlying mechanism (e.g., EPERM, EACCES, or EROFS). Accept a
+    // small set of common variants to keep this cross-platform.
+    let denial_markers = [
+        "Operation not permitted", // EPERM
+        "Permission denied",       // EACCES
+        "Read-only file system",   // EROFS
+    ];
+    assert!(
+        denial_markers.iter().any(|m| output.contains(m)),
+        "expected sandbox denial message, got {output:?}"
+    );
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+    let mut builder = test_codex().with_config(|cfg| {
+        cfg.sandbox_policy = SandboxPolicy::DangerFullAccess;
+    });
+    let test = builder.build(&server).await?;
+
+    let call_id = "shell-spawn-failure";
+    let bogus_component = "missing-bin-".repeat(700);
+    let bogus_exe = test
+        .cwd
+        .path()
+        .join(bogus_component)
+        .to_string_lossy()
+        .into_owned();
+
+    let args = json!({
+        "command": [bogus_exe],
+        "timeout_ms": 1_000,
+    });
+
+    let responses = vec![
+        sse(vec![
+            json!({"type": "response.created", "response": {"id": "resp-1"}}),
+            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_completed("resp-1"),
+        ]),
+        sse(vec![
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    ];
+    mount_sse_sequence(&server, responses).await;
+
+    submit_turn(
+        &test,
+        "spawn a missing binary",
+        AskForApproval::Never,
+        SandboxPolicy::DangerFullAccess,
+    )
+    .await?;
+
+    let requests = server.received_requests().await.expect("recorded requests");
+    let bodies = request_bodies(&requests)?;
+    let function_outputs = collect_output_items(&bodies, "function_call_output");
+    let failure_item = function_outputs
+        .iter()
+        .find(|item| item.get("call_id").and_then(Value::as_str) == Some(call_id))
+        .expect("spawn failure output present");
+
+    let output = failure_item
+        .get("output")
+        .and_then(Value::as_str)
+        .expect("spawn failure output string");
+
+    assert!(
+        output.starts_with("execution error:"),
+        "expected execution error prefix, got {output:?}"
+    );
+    assert!(output.len() <= 10 * 1024);
+
+    Ok(())
+}