codex-rs/core/tests/suite/exec.rs

#![cfg(target_os = "macos")]

use std::collections::HashMap;

use codex_core::exec::ExecParams;
use codex_core::exec::ExecToolCallOutput;
use codex_core::exec::SandboxType;
use codex_core::exec::process_exec_tool_call;
use codex_core::protocol::SandboxPolicy;
use codex_core::spawn::CODEX_SANDBOX_ENV_VAR;
use tempfile::TempDir;

use codex_core::error::Result;

use codex_core::get_platform_sandbox;

/// Reports whether the sandbox tests in this file should be skipped.
///
/// Returns `true` (after printing a note to stderr) when the environment variable
/// named by `CODEX_SANDBOX_ENV_VAR` is set to exactly `seatbelt`. Returns `false`
/// in every other case, including when the variable is unset.
fn skip_test() -> bool {
    let is_seatbelt = matches!(
        std::env::var(CODEX_SANDBOX_ENV_VAR).as_deref(),
        Ok("seatbelt")
    );

    if is_seatbelt {
        eprintln!("{CODEX_SANDBOX_ENV_VAR} is set to 'seatbelt', skipping test.");
    }

    is_seatbelt
}

/// Runs `cmd` through `process_exec_tool_call` under the platform sandbox.
///
/// The command runs with `tmp` as its working directory, an empty environment
/// map, a 1000 ms timeout and a read-only sandbox policy. The result of
/// `process_exec_tool_call` is returned unchanged.
///
/// # Panics
///
/// Panics if no platform sandbox is available or if it is not
/// `SandboxType::MacosSeatbelt`.
#[expect(clippy::expect_used)]
async fn run_test_cmd(tmp: TempDir, cmd: Vec<&str>) -> Result<ExecToolCallOutput> {
    let sandbox_type = get_platform_sandbox().expect("should be able to get sandbox type");
    assert_eq!(sandbox_type, SandboxType::MacosSeatbelt);

    let policy = SandboxPolicy::new_read_only_policy();

    let params = ExecParams {
        command: cmd.into_iter().map(String::from).collect(),
        cwd: tmp.path().to_path_buf(),
        timeout_ms: Some(1000),
        env: HashMap::new(),
        with_escalated_permissions: None,
        justification: None,
    };

    process_exec_tool_call(params, sandbox_type, &policy, &None, None).await
}

/// A plain `echo hello` run through the sandbox succeeds: stdout is exactly
/// `hello\n`, stderr is empty and stdout carries no truncation marker.
#[tokio::test]
async fn exit_code_0_succeeds() {
    if skip_test() {
        return;
    }

    let workdir = TempDir::new().expect("should be able to create temp dir");

    let result = run_test_cmd(workdir, vec!["echo", "hello"])
        .await
        .unwrap();

    assert_eq!(result.stdout.text, "hello\n");
    assert_eq!(result.stderr.text, "");
    assert_eq!(result.stdout.truncated_after_lines, None);
}

/// Runs `seq 300` and checks that all 300 lines come back on stdout.
///
/// Despite the test's name, the assertions expect the complete output and
/// `truncated_after_lines` to be `None`.
#[tokio::test]
async fn truncates_output_lines() {
    if skip_test() {
        return;
    }

    let workdir = TempDir::new().expect("should be able to create temp dir");
    let result = run_test_cmd(workdir, vec!["seq", "300"]).await.unwrap();

    // `seq 300` prints the numbers 1 through 300, one per newline-terminated line.
    let every_line: String = (1..=300).map(|n| format!("{n}\n")).collect();

    assert_eq!(result.stdout.text, every_line);
    assert_eq!(result.stdout.truncated_after_lines, None);
}

/// Runs a shell pipeline that prints 15 wide lines and checks that at least
/// 15000 bytes of stdout are returned.
///
/// Despite the test's name, the assertions expect `truncated_after_lines` to be
/// `None`.
#[tokio::test]
async fn truncates_output_bytes() {
    if skip_test() {
        return;
    }

    let workdir = TempDir::new().expect("should be able to create temp dir");

    // awk left-justifies each number in a 1000-character field and appends a
    // newline, so every one of the 15 lines is at least 1000 bytes long.
    let script = "seq 15 | awk '{printf \"%-1000s\\n\", $0}'";

    let result = run_test_cmd(workdir, vec!["bash", "-lc", script])
        .await
        .unwrap();

    assert!(result.stdout.text.len() >= 15000);
    assert_eq!(result.stdout.truncated_after_lines, None);
}

/// Asking bash to run a command that does not exist (reported by the shell as
/// exit code 127) must not be treated as a sandbox error: the call is expected
/// to return `Ok`.
#[tokio::test]
async fn exit_command_not_found_is_ok() {
    if skip_test() {
        return;
    }

    let workdir = TempDir::new().expect("should be able to create temp dir");
    let missing_cmd = vec!["/bin/bash", "-c", "nonexistent_command_12345"];

    // Only the `Ok`-ness of the result matters here; the output itself is unused.
    let _output = run_test_cmd(workdir, missing_cmd).await.unwrap();
}

/// Writing a file fails and should be considered a sandbox error.
///
/// `run_test_cmd` executes the command under a read-only sandbox policy, so
/// `touch`-ing a new file inside the temp dir is expected to make the call
/// return `Err`.
#[tokio::test]
async fn write_file_fails_as_sandbox_error() {
    if skip_test() {
        return;
    }

    let tmp = TempDir::new().expect("should be able to create temp dir");
    let path = tmp.path().join("test.txt");
    let cmd = vec![
        // This must be the real `touch` binary. With a nonexistent executable
        // `touch` never runs, so the assertion below would pass without the
        // sandbox ever having to deny a write.
        "/usr/bin/touch",
        path.to_str().expect("should be able to get path"),
    ];

    assert!(run_test_cmd(tmp, cmd).await.is_err());
}
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`#![cfg(target_os = "macos")]`

			`use std::collections::HashMap;`

			`use codex_core::exec::ExecParams;`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`use codex_core::exec::ExecToolCallOutput;`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`use codex_core::exec::SandboxType;`
			`use codex_core::exec::process_exec_tool_call;`
			`use codex_core::protocol::SandboxPolicy;`
			`use codex_core::spawn::CODEX_SANDBOX_ENV_VAR;`
			`use tempfile::TempDir;`

Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`use codex_core::error::Result;`

[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`use codex_core::get_platform_sandbox;`

Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`fn skip_test() -> bool {`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`if std::env::var(CODEX_SANDBOX_ENV_VAR) == Ok("seatbelt".to_string()) {`
			`eprintln!("{CODEX_SANDBOX_ENV_VAR} is set to 'seatbelt', skipping test.");`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`return true;`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`}`

Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`false`
			`}`

Added `allow-expect-in-tests` / `allow-unwrap-in-tests` (#2328) This PR: * Added the clippy.toml to configure allowable expect / unwrap usage in tests * Removed as many expect/allow lines as possible from tests * moved a bunch of allows to expects where possible Note: in integration tests, non `#[test]` helper functions are not covered by this so we had to leave a few lingering `expect(expect_used` checks around 2025-08-14 17:59:01 -07:00			`#[expect(clippy::expect_used)]`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`async fn run_test_cmd(tmp: TempDir, cmd: Vec<&str>) -> Result<ExecToolCallOutput> {`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`let sandbox_type = get_platform_sandbox().expect("should be able to get sandbox type");`
			`assert_eq!(sandbox_type, SandboxType::MacosSeatbelt);`

			`let params = ExecParams {`
			`command: cmd.iter().map(\|s\| s.to_string()).collect(),`
			`cwd: tmp.path().to_path_buf(),`
			`timeout_ms: Some(1000),`
			`env: HashMap::new(),`
[approval_policy] Add OnRequest approval_policy (#1865) ## Summary A split-up PR of #1763 , stacked on top of a tools refactor #1858 to make the change clearer. From the previous summary: > Let's try something new: tell the model about the sandbox, and let it decide when it will need to break the sandbox. Some local testing suggests that it works pretty well with zero iteration on the prompt! ## Testing - [x] Added unit tests - [x] Tested locally and it appears to work smoothly! 2025-08-05 20:44:20 -07:00			`with_escalated_permissions: None,`
			`justification: None,`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`};`

			`let policy = SandboxPolicy::new_read_only_policy();`

chore: introduce ConversationManager as a clearinghouse for all conversations (#2240) This PR does two things because after I got deep into the first one I started pulling on the thread to the second: - Makes `ConversationManager` the place where all in-memory conversations are created and stored. Previously, `MessageProcessor` in the `codex-mcp-server` crate was doing this via its `session_map`, but this is something that should be done in `codex-core`. - It unwinds the `ctrl_c: tokio::sync::Notify` that was threaded throughout our code. I think this made sense at one time, but now that we handle Ctrl-C within the TUI and have a proper `Op::Interrupt` event, I don't think this was quite right, so I removed it. For `codex exec` and `codex proto`, we now use `tokio::signal::ctrl_c()` directly, but we no longer make `Notify` a field of `Codex` or `CodexConversation`. Changes of note: - Adds the files `conversation_manager.rs` and `codex_conversation.rs` to `codex-core`. - `Codex` and `CodexSpawnOk` are no longer exported from `codex-core`: other crates must use `CodexConversation` instead (which is created via `ConversationManager`). - `core/src/codex_wrapper.rs` has been deleted in favor of `ConversationManager`. - `ConversationManager::new_conversation()` returns `NewConversation`, which is in line with the `new_conversation` tool we want to add to the MCP server. Note `NewConversation` includes `SessionConfiguredEvent`, so we eliminate checks in cases like `codex-rs/core/tests/client.rs` to verify `SessionConfiguredEvent` is the first event because that is now internal to `ConversationManager`. - Quite a bit of code was deleted from `codex-rs/mcp-server/src/message_processor.rs` since it no longer has to manage multiple conversations itself: it goes through `ConversationManager` instead. 
- `core/tests/live_agent.rs` has been deleted because I had to update a bunch of tests and all the tests in here were ignored, and I don't think anyone ever ran them, so this was just technical debt, at this point. - Removed `notify_on_sigint()` from `util.rs` (and in a follow-up, I hope to refactor the blandly-named `util.rs` into more descriptive files). - In general, I started replacing local variables named `codex` as `conversation`, where appropriate, though admittedly I didn't do it through all the integration tests because that would have added a lot of noise to this PR. --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/openai/codex/pull/2240). * #2264 * #2263 * __->__ #2240 2025-08-13 13:38:18 -07:00			`process_exec_tool_call(params, sandbox_type, &policy, &None, None).await`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`}`

			`/// Command succeeds with exit code 0 normally`
			`#[tokio::test]`
			`async fn exit_code_0_succeeds() {`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`if skip_test() {`
			`return;`
			`}`

[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`let tmp = TempDir::new().expect("should be able to create temp dir");`
			`let cmd = vec!["echo", "hello"];`

Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`let output = run_test_cmd(tmp, cmd).await.unwrap();`
			`assert_eq!(output.stdout.text, "hello\n");`
			`assert_eq!(output.stderr.text, "");`
			`assert_eq!(output.stdout.truncated_after_lines, None);`
			`}`

			`/// Command succeeds with exit code 0 normally`
			`#[tokio::test]`
			`async fn truncates_output_lines() {`
			`if skip_test() {`
			`return;`
			`}`

			`let tmp = TempDir::new().expect("should be able to create temp dir");`
			`let cmd = vec!["seq", "300"];`

			`let output = run_test_cmd(tmp, cmd).await.unwrap();`

send-aggregated output (#2364) We want to send an aggregated output of stderr and stdout so we don't have to aggregate it stderr+stdout as we lose order sometimes. --------- Co-authored-by: Gabriel Peal <gpeal@users.noreply.github.com> 2025-08-23 09:54:31 -07:00			`let expected_output = (1..=300)`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`.map(\|i\| format!("{i}\n"))`
			`.collect::<Vec<_>>()`
			`.join("");`
			`assert_eq!(output.stdout.text, expected_output);`
send-aggregated output (#2364) We want to send an aggregated output of stderr and stdout so we don't have to aggregate it stderr+stdout as we lose order sometimes. --------- Co-authored-by: Gabriel Peal <gpeal@users.noreply.github.com> 2025-08-23 09:54:31 -07:00			`assert_eq!(output.stdout.truncated_after_lines, None);`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`}`

			`/// Command succeeds with exit code 0 normally`
			`#[tokio::test]`
			`async fn truncates_output_bytes() {`
			`if skip_test() {`
			`return;`
			`}`

			`let tmp = TempDir::new().expect("should be able to create temp dir");`
			`// each line is 1000 bytes`
			`let cmd = vec!["bash", "-lc", "seq 15 \| awk '{printf \"%-1000s\\n\", $0}'"];`

			`let output = run_test_cmd(tmp, cmd).await.unwrap();`

send-aggregated output (#2364) We want to send an aggregated output of stderr and stdout so we don't have to aggregate it stderr+stdout as we lose order sometimes. --------- Co-authored-by: Gabriel Peal <gpeal@users.noreply.github.com> 2025-08-23 09:54:31 -07:00			`assert!(output.stdout.text.len() >= 15000);`
			`assert_eq!(output.stdout.truncated_after_lines, None);`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`}`

			`/// Command not found returns exit code 127, this is not considered a sandbox error`
			`#[tokio::test]`
			`async fn exit_command_not_found_is_ok() {`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`if skip_test() {`
			`return;`
			`}`

[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`let tmp = TempDir::new().expect("should be able to create temp dir");`
			`let cmd = vec!["/bin/bash", "-c", "nonexistent_command_12345"];`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`run_test_cmd(tmp, cmd).await.unwrap();`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`}`

			`/// Writing a file fails and should be considered a sandbox error`
			`#[tokio::test]`
			`async fn write_file_fails_as_sandbox_error() {`
Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`if skip_test() {`
			`return;`
			`}`

[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`let tmp = TempDir::new().expect("should be able to create temp dir");`
			`let path = tmp.path().join("test.txt");`
			`let cmd = vec![`
			`"/user/bin/touch",`
			`path.to_str().expect("should be able to get path"),`
			`];`

Include output truncation message in tool call results (#2183) To avoid model being confused about incomplete output. 2025-08-11 11:52:05 -07:00			`assert!(run_test_cmd(tmp, cmd).await.is_err());`
[sandbox] Filter out certain non-sandbox errors (#1804) ## Summary Users frequently complain about re-approving commands that have failed for non-sandbox reasons. We can't diagnose with complete accuracy which errors happened because of a sandbox failure, but we can start to eliminate some common simple cases. This PR captures the most common case I've seen, which is a `command not found` error. ## Testing - [x] Added unit tests - [x] Ran a few cases locally 2025-08-03 13:05:48 -07:00			`}`