bug: sandbox denied error logs (#4874)
Check on STDOUT / STDERR or aggregated output for some logs when sanbox is denied
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
use crate::exec::ExecToolCallOutput;
|
||||
use crate::token_data::KnownPlan;
|
||||
use crate::token_data::PlanType;
|
||||
use crate::truncate::truncate_middle;
|
||||
use codex_protocol::ConversationId;
|
||||
use codex_protocol::protocol::RateLimitSnapshot;
|
||||
use reqwest::StatusCode;
|
||||
@@ -12,6 +13,9 @@ use tokio::task::JoinError;
|
||||
|
||||
pub type Result<T> = std::result::Result<T, CodexErr>;
|
||||
|
||||
/// Limit UI error messages to a reasonable size while keeping useful context.
|
||||
const ERROR_MESSAGE_UI_MAX_BYTES: usize = 2 * 1024; // 4 KiB
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SandboxErr {
|
||||
/// Error from sandbox execution
|
||||
@@ -304,21 +308,44 @@ impl CodexErr {
|
||||
}
|
||||
|
||||
pub fn get_error_message_ui(e: &CodexErr) -> String {
|
||||
match e {
|
||||
CodexErr::Sandbox(SandboxErr::Denied { output }) => output.stderr.text.clone(),
|
||||
let message = match e {
|
||||
CodexErr::Sandbox(SandboxErr::Denied { output }) => {
|
||||
let aggregated = output.aggregated_output.text.trim();
|
||||
if !aggregated.is_empty() {
|
||||
output.aggregated_output.text.clone()
|
||||
} else {
|
||||
let stderr = output.stderr.text.trim();
|
||||
let stdout = output.stdout.text.trim();
|
||||
match (stderr.is_empty(), stdout.is_empty()) {
|
||||
(false, false) => format!("{stderr}\n{stdout}"),
|
||||
(false, true) => output.stderr.text.clone(),
|
||||
(true, false) => output.stdout.text.clone(),
|
||||
(true, true) => format!(
|
||||
"command failed inside sandbox with exit code {}",
|
||||
output.exit_code
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
// Timeouts are not sandbox errors from a UX perspective; present them plainly
|
||||
CodexErr::Sandbox(SandboxErr::Timeout { output }) => format!(
|
||||
"error: command timed out after {} ms",
|
||||
output.duration.as_millis()
|
||||
),
|
||||
CodexErr::Sandbox(SandboxErr::Timeout { output }) => {
|
||||
format!(
|
||||
"error: command timed out after {} ms",
|
||||
output.duration.as_millis()
|
||||
)
|
||||
}
|
||||
_ => e.to_string(),
|
||||
}
|
||||
};
|
||||
|
||||
truncate_middle(&message, ERROR_MESSAGE_UI_MAX_BYTES).0
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::exec::StreamOutput;
|
||||
use codex_protocol::protocol::RateLimitWindow;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
fn rate_limit_snapshot() -> RateLimitSnapshot {
|
||||
RateLimitSnapshot {
|
||||
@@ -348,6 +375,73 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_denied_uses_aggregated_output_when_stderr_empty() {
|
||||
let output = ExecToolCallOutput {
|
||||
exit_code: 77,
|
||||
stdout: StreamOutput::new(String::new()),
|
||||
stderr: StreamOutput::new(String::new()),
|
||||
aggregated_output: StreamOutput::new("aggregate detail".to_string()),
|
||||
duration: Duration::from_millis(10),
|
||||
timed_out: false,
|
||||
};
|
||||
let err = CodexErr::Sandbox(SandboxErr::Denied {
|
||||
output: Box::new(output),
|
||||
});
|
||||
assert_eq!(get_error_message_ui(&err), "aggregate detail");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_denied_reports_both_streams_when_available() {
|
||||
let output = ExecToolCallOutput {
|
||||
exit_code: 9,
|
||||
stdout: StreamOutput::new("stdout detail".to_string()),
|
||||
stderr: StreamOutput::new("stderr detail".to_string()),
|
||||
aggregated_output: StreamOutput::new(String::new()),
|
||||
duration: Duration::from_millis(10),
|
||||
timed_out: false,
|
||||
};
|
||||
let err = CodexErr::Sandbox(SandboxErr::Denied {
|
||||
output: Box::new(output),
|
||||
});
|
||||
assert_eq!(get_error_message_ui(&err), "stderr detail\nstdout detail");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_denied_reports_stdout_when_no_stderr() {
|
||||
let output = ExecToolCallOutput {
|
||||
exit_code: 11,
|
||||
stdout: StreamOutput::new("stdout only".to_string()),
|
||||
stderr: StreamOutput::new(String::new()),
|
||||
aggregated_output: StreamOutput::new(String::new()),
|
||||
duration: Duration::from_millis(8),
|
||||
timed_out: false,
|
||||
};
|
||||
let err = CodexErr::Sandbox(SandboxErr::Denied {
|
||||
output: Box::new(output),
|
||||
});
|
||||
assert_eq!(get_error_message_ui(&err), "stdout only");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_denied_reports_exit_code_when_no_output_available() {
|
||||
let output = ExecToolCallOutput {
|
||||
exit_code: 13,
|
||||
stdout: StreamOutput::new(String::new()),
|
||||
stderr: StreamOutput::new(String::new()),
|
||||
aggregated_output: StreamOutput::new(String::new()),
|
||||
duration: Duration::from_millis(5),
|
||||
timed_out: false,
|
||||
};
|
||||
let err = CodexErr::Sandbox(SandboxErr::Denied {
|
||||
output: Box::new(output),
|
||||
});
|
||||
assert_eq!(
|
||||
get_error_message_ui(&err),
|
||||
"command failed inside sandbox with exit code 13"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn usage_limit_reached_error_formats_free_plan() {
|
||||
let err = UsageLimitReachedError {
|
||||
|
||||
@@ -177,7 +177,7 @@ pub async fn process_exec_tool_call(
|
||||
}));
|
||||
}
|
||||
|
||||
if exit_code != 0 && is_likely_sandbox_denied(sandbox_type, exit_code) {
|
||||
if is_likely_sandbox_denied(sandbox_type, &exec_output) {
|
||||
return Err(CodexErr::Sandbox(SandboxErr::Denied {
|
||||
output: Box::new(exec_output),
|
||||
}));
|
||||
@@ -195,21 +195,57 @@ pub async fn process_exec_tool_call(
|
||||
/// We don't have a fully deterministic way to tell if our command failed
|
||||
/// because of the sandbox - a command in the user's zshrc file might hit an
|
||||
/// error, but the command itself might fail or succeed for other reasons.
|
||||
/// For now, we conservatively check for 'command not found' (exit code 127),
|
||||
/// and can add additional cases as necessary.
|
||||
fn is_likely_sandbox_denied(sandbox_type: SandboxType, exit_code: i32) -> bool {
|
||||
if sandbox_type == SandboxType::None {
|
||||
/// For now, we conservatively check for well known command failure exit codes and
|
||||
/// also look for common sandbox denial keywords in the command output.
|
||||
fn is_likely_sandbox_denied(sandbox_type: SandboxType, exec_output: &ExecToolCallOutput) -> bool {
|
||||
if sandbox_type == SandboxType::None || exec_output.exit_code == 0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Quick rejects: well-known non-sandbox shell exit codes
|
||||
// 127: command not found, 2: misuse of shell builtins
|
||||
if exit_code == 127 {
|
||||
// 2: misuse of shell builtins
|
||||
// 126: permission denied
|
||||
// 127: command not found
|
||||
const QUICK_REJECT_EXIT_CODES: [i32; 3] = [2, 126, 127];
|
||||
if QUICK_REJECT_EXIT_CODES.contains(&exec_output.exit_code) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// For all other cases, we assume the sandbox is the cause
|
||||
true
|
||||
const SANDBOX_DENIED_KEYWORDS: [&str; 6] = [
|
||||
"operation not permitted",
|
||||
"permission denied",
|
||||
"read-only file system",
|
||||
"seccomp",
|
||||
"sandbox",
|
||||
"landlock",
|
||||
];
|
||||
|
||||
if [
|
||||
&exec_output.stderr.text,
|
||||
&exec_output.stdout.text,
|
||||
&exec_output.aggregated_output.text,
|
||||
]
|
||||
.into_iter()
|
||||
.any(|section| {
|
||||
let lower = section.to_lowercase();
|
||||
SANDBOX_DENIED_KEYWORDS
|
||||
.iter()
|
||||
.any(|needle| lower.contains(needle))
|
||||
}) {
|
||||
return true;
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
{
|
||||
const SIGSYS_CODE: i32 = libc::SIGSYS;
|
||||
if sandbox_type == SandboxType::LinuxSeccomp
|
||||
&& exec_output.exit_code == EXIT_CODE_SIGNAL_BASE + SIGSYS_CODE
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -436,3 +472,77 @@ fn synthetic_exit_status(code: i32) -> ExitStatus {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
std::process::ExitStatus::from_raw(code.try_into().unwrap())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::time::Duration;
|
||||
|
||||
fn make_exec_output(
|
||||
exit_code: i32,
|
||||
stdout: &str,
|
||||
stderr: &str,
|
||||
aggregated: &str,
|
||||
) -> ExecToolCallOutput {
|
||||
ExecToolCallOutput {
|
||||
exit_code,
|
||||
stdout: StreamOutput::new(stdout.to_string()),
|
||||
stderr: StreamOutput::new(stderr.to_string()),
|
||||
aggregated_output: StreamOutput::new(aggregated.to_string()),
|
||||
duration: Duration::from_millis(1),
|
||||
timed_out: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_detection_requires_keywords() {
|
||||
let output = make_exec_output(1, "", "", "");
|
||||
assert!(!is_likely_sandbox_denied(
|
||||
SandboxType::LinuxSeccomp,
|
||||
&output
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_detection_identifies_keyword_in_stderr() {
|
||||
let output = make_exec_output(1, "", "Operation not permitted", "");
|
||||
assert!(is_likely_sandbox_denied(SandboxType::LinuxSeccomp, &output));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_detection_respects_quick_reject_exit_codes() {
|
||||
let output = make_exec_output(127, "", "command not found", "");
|
||||
assert!(!is_likely_sandbox_denied(
|
||||
SandboxType::LinuxSeccomp,
|
||||
&output
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_detection_ignores_non_sandbox_mode() {
|
||||
let output = make_exec_output(1, "", "Operation not permitted", "");
|
||||
assert!(!is_likely_sandbox_denied(SandboxType::None, &output));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_detection_uses_aggregated_output() {
|
||||
let output = make_exec_output(
|
||||
101,
|
||||
"",
|
||||
"",
|
||||
"cargo failed: Read-only file system when writing target",
|
||||
);
|
||||
assert!(is_likely_sandbox_denied(
|
||||
SandboxType::MacosSeatbelt,
|
||||
&output
|
||||
));
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
#[test]
|
||||
fn sandbox_detection_flags_sigsys_exit_code() {
|
||||
let exit_code = EXIT_CODE_SIGNAL_BASE + libc::SIGSYS;
|
||||
let output = make_exec_output(exit_code, "", "", "");
|
||||
assert!(is_likely_sandbox_denied(SandboxType::LinuxSeccomp, &output));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -380,6 +380,23 @@ mod tests {
|
||||
assert_eq!(message, "failed in sandbox: sandbox stderr");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sandbox_failure_message_falls_back_to_aggregated_output() {
|
||||
let output = ExecToolCallOutput {
|
||||
exit_code: 101,
|
||||
stdout: StreamOutput::new(String::new()),
|
||||
stderr: StreamOutput::new(String::new()),
|
||||
aggregated_output: StreamOutput::new("aggregate text".to_string()),
|
||||
duration: Duration::from_millis(10),
|
||||
timed_out: false,
|
||||
};
|
||||
let err = SandboxErr::Denied {
|
||||
output: Box::new(output),
|
||||
};
|
||||
let message = sandbox_failure_message(err);
|
||||
assert_eq!(message, "failed in sandbox: aggregate text");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_function_error_synthesizes_payload() {
|
||||
let err = FunctionCallError::RespondToModel("boom".to_string());
|
||||
|
||||
@@ -403,79 +403,6 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shell_sandbox_denied_truncates_error_output() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let mut builder = test_codex();
|
||||
let test = builder.build(&server).await?;
|
||||
|
||||
let call_id = "shell-denied";
|
||||
let long_line = "this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz";
|
||||
let script = format!(
|
||||
"for i in $(seq 1 500); do >&2 echo '{long_line}'; done; cat <<'EOF' > denied.txt\ncontent\nEOF",
|
||||
);
|
||||
let args = json!({
|
||||
"command": ["/bin/sh", "-c", script],
|
||||
"timeout_ms": 1_000,
|
||||
});
|
||||
|
||||
mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
let second_mock = mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
|
||||
submit_turn(
|
||||
&test,
|
||||
"attempt to write in read-only sandbox",
|
||||
AskForApproval::Never,
|
||||
SandboxPolicy::ReadOnly,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let denied_item = second_mock.single_request().function_call_output(call_id);
|
||||
|
||||
let output = denied_item
|
||||
.get("output")
|
||||
.and_then(Value::as_str)
|
||||
.expect("denied output string");
|
||||
|
||||
let sandbox_pattern = r#"(?s)^Exit code: -?\d+
|
||||
Wall time: [0-9]+(?:\.[0-9]+)? seconds
|
||||
Total output lines: \d+
|
||||
Output:
|
||||
|
||||
failed in sandbox: .*?(?:Operation not permitted|Permission denied|Read-only file system).*?
|
||||
\[\.{3} omitted \d+ of \d+ lines \.{3}\]
|
||||
.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
|
||||
\n?$"#;
|
||||
let sandbox_regex = Regex::new(sandbox_pattern)?;
|
||||
if !sandbox_regex.is_match(output) {
|
||||
let fallback_pattern = r#"(?s)^Total output lines: \d+
|
||||
|
||||
failed in sandbox: this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz
|
||||
.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
|
||||
.*(?:Operation not permitted|Permission denied|Read-only file system).*$"#;
|
||||
assert_regex_match(fallback_pattern, output);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
Reference in New Issue
Block a user