test: faster test execution in codex-core (#2633)

This dramatically improves the time to run `cargo test -p codex-core` (~25x speedup).

Before:
```
cargo test -p codex-core 35.96s user 68.63s system 19% cpu 8:49.80 total
```
After:
```
cargo test -p codex-core 5.51s user 8.16s system 63% cpu 21.407 total
```
Both runs were measured "hot", i.e. on a second run with no filesystem changes, to exclude compile times.

The approach is inspired by [Delete Cargo Integration Tests](https://matklad.github.io/2021/02/27/delete-cargo-integration-tests.html): we move all test cases in tests/ into a single suite so that they build into a single binary, because every test binary executed carries significant overhead and because test execution is only parallelized within a single binary.
2025-08-24 11:10:53 -07:00
parent c6a52d611c
commit 32bbbbad61
56 changed files with 78 additions and 3 deletions
--- a/codex-rs/exec/tests/suite/apply_patch.rs
+++ b/codex-rs/exec/tests/suite/apply_patch.rs
@@ -0,0 +1,339 @@
+#![allow(clippy::expect_used, clippy::unwrap_used)]
+
+use anyhow::Context;
+use assert_cmd::prelude::*;
+use codex_core::CODEX_APPLY_PATCH_ARG1;
+use std::fs;
+use std::process::Command;
+use tempfile::tempdir;
+
+/// While we may add an `apply-patch` subcommand to the `codex` CLI multitool
+/// at some point, we must ensure that the smaller `codex-exec` CLI can still
+/// emulate the `apply_patch` CLI.
+///
+/// The test writes `source.txt` into a temporary directory, runs `codex-exec`
+/// with `CODEX_APPLY_PATCH_ARG1` followed by a patch that replaces the file's
+/// only line, and then checks stdout, stderr and the resulting file contents.
+#[test]
+fn test_standalone_exec_cli_can_use_apply_patch() -> anyhow::Result<()> {
+    let tmp = tempdir()?;
+    let relative_path = "source.txt";
+    let absolute_path = tmp.path().join(relative_path);
+    fs::write(&absolute_path, "original content\n")?;
+
+    // The patch is a raw string, so each hunk line carries its own marker at
+    // column 0: `-` for the removed line and `+` for the line that replaces it.
+    Command::cargo_bin("codex-exec")
+        .context("should find binary for codex-exec")?
+        .arg(CODEX_APPLY_PATCH_ARG1)
+        .arg(
+            r#"*** Begin Patch
+*** Update File: source.txt
+@@
+-original content
++modified by apply_patch
+*** End Patch"#,
+        )
+        // The patch names the file relatively, so run from the temp directory.
+        .current_dir(tmp.path())
+        .assert()
+        .success()
+        .stdout("Success. Updated the following files:\nM source.txt\n")
+        .stderr(predicates::str::is_empty());
+    assert_eq!(
+        fs::read_to_string(absolute_path)?,
+        "modified by apply_patch\n"
+    );
+    Ok(())
+}
+
+/// End-to-end check that `codex-exec` carries out `apply_patch` function calls
+/// returned by the model.
+///
+/// A wiremock server stands in for the model. Three mocks answer
+/// `POST /v1/responses`: the first returns an `apply_patch` call that adds
+/// `test.md`, the second returns one that updates it, and the third returns a
+/// bare `response.completed`. The first two are capped at one response each
+/// with `up_to_n_times(1)`, so successive requests fall through to the next
+/// mock. The test passes when `codex-exec` exits successfully and `test.md`
+/// ends up containing the updated text.
+#[cfg(not(target_os = "windows"))]
+#[tokio::test]
+async fn test_apply_patch_tool() -> anyhow::Result<()> {
+    use core_test_support::load_sse_fixture_with_id_from_str;
+    use tempfile::TempDir;
+    use wiremock::Mock;
+    use wiremock::MockServer;
+    use wiremock::ResponseTemplate;
+    use wiremock::matchers::method;
+    use wiremock::matchers::path;
+
+    // Fixture: function call that creates test.md, followed by completion.
+    const SSE_TOOL_CALL_ADD: &str = r#"[
+  {
+    "type": "response.output_item.done",
+    "item": {
+      "type": "function_call",
+      "name": "apply_patch",
+      "arguments": "{\n  \"input\": \"*** Begin Patch\\n*** Add File: test.md\\n+Hello world\\n*** End Patch\"\n}",
+      "call_id": "__ID__"
+    }
+  },
+  {
+    "type": "response.completed",
+    "response": {
+      "id": "__ID__",
+      "usage": {
+        "input_tokens": 0,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": 0
+      },
+      "output": []
+    }
+  }
+]"#;
+
+    // Fixture: function call that rewrites the line created above.
+    const SSE_TOOL_CALL_UPDATE: &str = r#"[
+  {
+    "type": "response.output_item.done",
+    "item": {
+      "type": "function_call",
+      "name": "apply_patch",
+      "arguments": "{\n  \"input\": \"*** Begin Patch\\n*** Update File: test.md\\n@@\\n-Hello world\\n+Final text\\n*** End Patch\"\n}",
+      "call_id": "__ID__"
+    }
+  },
+  {
+    "type": "response.completed",
+    "response": {
+      "id": "__ID__",
+      "usage": {
+        "input_tokens": 0,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": 0
+      },
+      "output": []
+    }
+  }
+]"#;
+
+    // Fixture: completion only, with no further tool calls.
+    const SSE_TOOL_CALL_COMPLETED: &str = r#"[
+  {
+    "type": "response.completed",
+    "response": {
+      "id": "__ID__",
+      "usage": {
+        "input_tokens": 0,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": 0
+      },
+      "output": []
+    }
+  }
+]"#;
+
+    // Start a mock model server
+    let server = MockServer::start().await;
+
+    // First response: model calls apply_patch to create test.md
+    let first = ResponseTemplate::new(200)
+        .insert_header("content-type", "text/event-stream")
+        .set_body_raw(
+            load_sse_fixture_with_id_from_str(SSE_TOOL_CALL_ADD, "call1"),
+            "text/event-stream",
+        );
+
+    // Match on the endpoint path like the mocks below, so a stray POST to any
+    // other path cannot consume this single-use response.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(first)
+        .up_to_n_times(1)
+        .mount(&server)
+        .await;
+
+    // Second response: model calls apply_patch to update test.md
+    let second = ResponseTemplate::new(200)
+        .insert_header("content-type", "text/event-stream")
+        .set_body_raw(
+            load_sse_fixture_with_id_from_str(SSE_TOOL_CALL_UPDATE, "call2"),
+            "text/event-stream",
+        );
+
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(second)
+        .up_to_n_times(1)
+        .mount(&server)
+        .await;
+
+    // Final response: the model completes without requesting another tool
+    // call. `expect(1)` records that exactly one such request is anticipated.
+    let final_completed = ResponseTemplate::new(200)
+        .insert_header("content-type", "text/event-stream")
+        .set_body_raw(
+            load_sse_fixture_with_id_from_str(SSE_TOOL_CALL_COMPLETED, "resp3"),
+            "text/event-stream",
+        );
+
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(final_completed)
+        .expect(1)
+        .mount(&server)
+        .await;
+
+    // The temp directory is both the working directory and CODEX_HOME, and
+    // OPENAI_BASE_URL points the binary at the mock server.
+    let tmp_cwd = TempDir::new()?;
+    Command::cargo_bin("codex-exec")
+        .context("should find binary for codex-exec")?
+        .current_dir(tmp_cwd.path())
+        .env("CODEX_HOME", tmp_cwd.path())
+        .env("OPENAI_API_KEY", "dummy")
+        .env("OPENAI_BASE_URL", format!("{}/v1", server.uri()))
+        .arg("--skip-git-repo-check")
+        .arg("-s")
+        .arg("workspace-write")
+        .arg("foo")
+        .assert()
+        .success();
+
+    // Verify final file contents
+    let final_path = tmp_cwd.path().join("test.md");
+    let contents = std::fs::read_to_string(&final_path)
+        .with_context(|| format!("failed reading {}", final_path.display()))?;
+    assert_eq!(contents, "Final text\n");
+    Ok(())
+}
+
+/// End-to-end check that `codex-exec` carries out `apply_patch` requests that
+/// arrive as `custom_tool_call` items, whose patch text sits directly in the
+/// `input` field.
+///
+/// A wiremock server stands in for the model. Three mocks answer
+/// `POST /v1/responses`: the first returns a call that adds `test.md`, the
+/// second returns one that updates it, and the third returns a bare
+/// `response.completed`. The first two are capped at one response each with
+/// `up_to_n_times(1)`, so successive requests fall through to the next mock.
+/// The test passes when `codex-exec` exits successfully and `test.md` ends up
+/// containing the updated text.
+#[cfg(not(target_os = "windows"))]
+#[tokio::test]
+async fn test_apply_patch_freeform_tool() -> anyhow::Result<()> {
+    use core_test_support::load_sse_fixture_with_id_from_str;
+    use tempfile::TempDir;
+    use wiremock::Mock;
+    use wiremock::MockServer;
+    use wiremock::ResponseTemplate;
+    use wiremock::matchers::method;
+    use wiremock::matchers::path;
+
+    // Fixture: custom tool call that creates test.md, followed by completion.
+    const SSE_TOOL_CALL_ADD: &str = r#"[
+  {
+    "type": "response.output_item.done",
+    "item": {
+      "type": "custom_tool_call",
+      "name": "apply_patch",
+      "input": "*** Begin Patch\n*** Add File: test.md\n+Hello world\n*** End Patch",
+      "call_id": "__ID__"
+    }
+  },
+  {
+    "type": "response.completed",
+    "response": {
+      "id": "__ID__",
+      "usage": {
+        "input_tokens": 0,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": 0
+      },
+      "output": []
+    }
+  }
+]"#;
+
+    // Fixture: custom tool call that rewrites the line created above.
+    const SSE_TOOL_CALL_UPDATE: &str = r#"[
+  {
+    "type": "response.output_item.done",
+    "item": {
+      "type": "custom_tool_call",
+      "name": "apply_patch",
+      "input": "*** Begin Patch\n*** Update File: test.md\n@@\n-Hello world\n+Final text\n*** End Patch",
+      "call_id": "__ID__"
+    }
+  },
+  {
+    "type": "response.completed",
+    "response": {
+      "id": "__ID__",
+      "usage": {
+        "input_tokens": 0,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": 0
+      },
+      "output": []
+    }
+  }
+]"#;
+
+    // Fixture: completion only, with no further tool calls.
+    const SSE_TOOL_CALL_COMPLETED: &str = r#"[
+  {
+    "type": "response.completed",
+    "response": {
+      "id": "__ID__",
+      "usage": {
+        "input_tokens": 0,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": 0
+      },
+      "output": []
+    }
+  }
+]"#;
+
+    // Start a mock model server
+    let server = MockServer::start().await;
+
+    // First response: model calls apply_patch to create test.md
+    let first = ResponseTemplate::new(200)
+        .insert_header("content-type", "text/event-stream")
+        .set_body_raw(
+            load_sse_fixture_with_id_from_str(SSE_TOOL_CALL_ADD, "call1"),
+            "text/event-stream",
+        );
+
+    // Match on the endpoint path like the second mock does, so a stray POST to
+    // any other path cannot consume this single-use response.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(first)
+        .up_to_n_times(1)
+        .mount(&server)
+        .await;
+
+    // Second response: model calls apply_patch to update test.md
+    let second = ResponseTemplate::new(200)
+        .insert_header("content-type", "text/event-stream")
+        .set_body_raw(
+            load_sse_fixture_with_id_from_str(SSE_TOOL_CALL_UPDATE, "call2"),
+            "text/event-stream",
+        );
+
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(second)
+        .up_to_n_times(1)
+        .mount(&server)
+        .await;
+
+    // Final response: the model completes without requesting another tool
+    // call. `expect(1)` records that exactly one such request is anticipated.
+    let final_completed = ResponseTemplate::new(200)
+        .insert_header("content-type", "text/event-stream")
+        .set_body_raw(
+            load_sse_fixture_with_id_from_str(SSE_TOOL_CALL_COMPLETED, "resp3"),
+            "text/event-stream",
+        );
+
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(final_completed)
+        .expect(1)
+        .mount(&server)
+        .await;
+
+    // The temp directory is both the working directory and CODEX_HOME, and
+    // OPENAI_BASE_URL points the binary at the mock server.
+    let tmp_cwd = TempDir::new()?;
+    Command::cargo_bin("codex-exec")
+        .context("should find binary for codex-exec")?
+        .current_dir(tmp_cwd.path())
+        .env("CODEX_HOME", tmp_cwd.path())
+        .env("OPENAI_API_KEY", "dummy")
+        .env("OPENAI_BASE_URL", format!("{}/v1", server.uri()))
+        .arg("--skip-git-repo-check")
+        .arg("-s")
+        .arg("workspace-write")
+        .arg("foo")
+        .assert()
+        .success();
+
+    // Verify final file contents
+    let final_path = tmp_cwd.path().join("test.md");
+    let contents = std::fs::read_to_string(&final_path)
+        .with_context(|| format!("failed reading {}", final_path.display()))?;
+    assert_eq!(contents, "Final text\n");
+    Ok(())
+}
--- a/codex-rs/exec/tests/suite/mod.rs
+++ b/codex-rs/exec/tests/suite/mod.rs
@@ -0,0 +1,3 @@
+// Aggregates all former standalone integration tests as modules, so they are
+// built together instead of as one test binary per file.
+mod apply_patch;
+mod sandbox;
--- a/codex-rs/exec/tests/suite/sandbox.rs
+++ b/codex-rs/exec/tests/suite/sandbox.rs
@@ -0,0 +1,219 @@
+#![cfg(unix)]
+use codex_core::protocol::SandboxPolicy;
+use codex_core::spawn::StdioPolicy;
+use std::collections::HashMap;
+use std::future::Future;
+use std::io;
+use std::path::PathBuf;
+use std::process::ExitStatus;
+use tokio::process::Child;
+
+/// macOS flavor of the sandbox launcher used by the tests in this module.
+///
+/// Forwards every argument unchanged to
+/// `codex_core::seatbelt::spawn_command_under_seatbelt` and returns the child
+/// process it spawns.
+#[cfg(target_os = "macos")]
+async fn spawn_command_under_sandbox(
+    command: Vec<String>,
+    sandbox_policy: &SandboxPolicy,
+    cwd: PathBuf,
+    stdio_policy: StdioPolicy,
+    env: HashMap<String, String>,
+) -> io::Result<Child> {
+    codex_core::seatbelt::spawn_command_under_seatbelt(
+        command,
+        sandbox_policy,
+        cwd,
+        stdio_policy,
+        env,
+    )
+    .await
+}
+
+/// Linux flavor of the sandbox launcher used by the tests in this module.
+///
+/// Resolves the cargo-built `codex-exec` binary and passes it, together with
+/// the caller's arguments, to
+/// `codex_core::landlock::spawn_command_under_linux_sandbox`.
+#[cfg(target_os = "linux")]
+async fn spawn_command_under_sandbox(
+    command: Vec<String>,
+    sandbox_policy: &SandboxPolicy,
+    cwd: PathBuf,
+    stdio_policy: StdioPolicy,
+    env: HashMap<String, String>,
+) -> io::Result<Child> {
+    // `codex-exec` is handed to the launcher as the sandbox executable.
+    let sandbox_exe = assert_cmd::cargo::cargo_bin("codex-exec");
+    let spawned = codex_core::landlock::spawn_command_under_linux_sandbox(
+        sandbox_exe,
+        command,
+        sandbox_policy,
+        cwd,
+        stdio_policy,
+        env,
+    );
+    spawned.await
+}
+
+/// Runs a small Python program under a `WorkspaceWrite` sandbox (network
+/// access disabled) and requires it to exit successfully. The program creates
+/// a `multiprocessing.Lock` and acquires it from a child process.
+#[tokio::test]
+async fn python_multiprocessing_lock_works_under_sandbox() {
+    // From https://man7.org/linux/man-pages/man7/sem_overview.7.html
+    //
+    // > On Linux, named semaphores are created in a virtual filesystem,
+    // > normally mounted under /dev/shm.
+    //
+    // Hence /dev/shm is made writable on Linux.
+    #[cfg(target_os = "linux")]
+    let writable_roots = vec![PathBuf::from("/dev/shm")];
+
+    // No extra writable roots are configured on macOS.
+    #[cfg(target_os = "macos")]
+    let writable_roots = Vec::<PathBuf>::new();
+
+    let policy = SandboxPolicy::WorkspaceWrite {
+        writable_roots,
+        network_access: false,
+        exclude_tmpdir_env_var: false,
+        exclude_slash_tmp: false,
+    };
+
+    let script = r#"import multiprocessing
+from multiprocessing import Lock, Process
+
+def f(lock):
+    with lock:
+        print("Lock acquired in child process")
+
+if __name__ == '__main__':
+    lock = Lock()
+    p = Process(target=f, args=(lock,))
+    p.start()
+    p.join()
+"#;
+
+    // Equivalent to `python3 -c <script>`.
+    let argv: Vec<String> = ["python3", "-c", script]
+        .into_iter()
+        .map(String::from)
+        .collect();
+    let cwd = std::env::current_dir().expect("should be able to get current dir");
+
+    let mut python = spawn_command_under_sandbox(
+        argv,
+        &policy,
+        cwd,
+        StdioPolicy::Inherit,
+        HashMap::new(),
+    )
+    .await
+    .expect("should be able to spawn python under sandbox");
+
+    let status = python.wait().await.expect("should wait for child process");
+    assert!(status.success(), "python exited with {status:?}");
+}
+
+/// Exercises AF_UNIX socket pairs through raw libc calls and panics on the
+/// first failure.
+///
+/// A datagram pair is created, a short message is written to one end and read
+/// back with `recvfrom()` on the other; the same round trip is then repeated
+/// on a stream pair using `recv()`. In both cases the received bytes must
+/// equal the bytes sent. All four descriptors are closed at the end.
+fn unix_sock_body() {
+    // SAFETY: every pointer passed to libc points into a live local array
+    // (`fds`, `sfds`, `msg`, `buf`, `b2`) and is paired with that array's own
+    // length; the null address arguments to `recvfrom` are explicitly optional.
+    // Descriptors are only used after the `socketpair` call that produced them
+    // has been asserted to succeed.
+    unsafe {
+        let mut fds = [0i32; 2];
+        let r = libc::socketpair(libc::AF_UNIX, libc::SOCK_DGRAM, 0, fds.as_mut_ptr());
+        assert_eq!(
+            r,
+            0,
+            "socketpair(AF_UNIX, SOCK_DGRAM) failed: {}",
+            io::Error::last_os_error()
+        );
+
+        let msg = b"hello_unix";
+        // write() from one end (generic write is allowed)
+        let sent = libc::write(fds[0], msg.as_ptr() as *const libc::c_void, msg.len());
+        assert!(sent >= 0, "write() failed: {}", io::Error::last_os_error());
+
+        // recvfrom() on the other end. We don't need the address for socketpair,
+        // so we pass null pointers for src address.
+        let mut buf = [0u8; 64];
+        let recvd = libc::recvfrom(
+            fds[1],
+            buf.as_mut_ptr() as *mut libc::c_void,
+            buf.len(),
+            0,
+            std::ptr::null_mut(),
+            std::ptr::null_mut(),
+        );
+        assert!(
+            recvd >= 0,
+            "recvfrom() failed: {}",
+            io::Error::last_os_error()
+        );
+
+        // `recvd` was just asserted non-negative, so the cast cannot wrap.
+        let recvd_slice = &buf[..(recvd as usize)];
+        assert_eq!(
+            recvd_slice,
+            &msg[..],
+            "payload mismatch: sent {} bytes, got {} bytes",
+            msg.len(),
+            recvd
+        );
+
+        // Also exercise AF_UNIX stream socketpair quickly to ensure AF_UNIX in general works.
+        let mut sfds = [0i32; 2];
+        let sr = libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, sfds.as_mut_ptr());
+        assert_eq!(
+            sr,
+            0,
+            "socketpair(AF_UNIX, SOCK_STREAM) failed: {}",
+            io::Error::last_os_error()
+        );
+        let snt2 = libc::write(sfds[0], msg.as_ptr() as *const libc::c_void, msg.len());
+        assert!(
+            snt2 >= 0,
+            "write(stream) failed: {}",
+            io::Error::last_os_error()
+        );
+        let mut b2 = [0u8; 64];
+        let rcv2 = libc::recv(sfds[1], b2.as_mut_ptr() as *mut libc::c_void, b2.len(), 0);
+        assert!(
+            rcv2 >= 0,
+            "recv(stream) failed: {}",
+            io::Error::last_os_error()
+        );
+
+        // Check the stream payload as well; a successful return code alone
+        // does not show that the data arrived intact.
+        let rcv2_slice = &b2[..(rcv2 as usize)];
+        assert_eq!(
+            rcv2_slice,
+            &msg[..],
+            "stream payload mismatch: sent {} bytes, got {} bytes",
+            msg.len(),
+            rcv2
+        );
+
+        // Clean up
+        let _ = libc::close(sfds[0]);
+        let _ = libc::close(sfds[1]);
+        let _ = libc::close(fds[0]);
+        let _ = libc::close(fds[1]);
+    }
+}
+
+/// Re-runs this test inside a `SandboxPolicy::ReadOnly` sandbox and requires
+/// the sandboxed run of `unix_sock_body` to succeed.
+///
+/// In the parent process `run_code_under_sandbox` returns the child's exit
+/// status, which is asserted to be successful. In the re-executed child it
+/// runs the body directly and returns `None`.
+#[tokio::test]
+async fn allow_unix_socketpair_recvfrom() {
+    // The selector is passed to the test binary together with `--exact`, so it
+    // has to be this test's full name: its module path inside the crate
+    // (without the crate name) followed by the function name. Deriving it from
+    // `module_path!()` keeps it right whether or not this module is nested.
+    // NOTE(review): the crate root is not visible from this file - confirm the
+    // resulting name against `cargo test -- --list`.
+    let module_in_crate = module_path!()
+        .split_once("::")
+        .map_or("", |(_, rest)| rest);
+    let test_selector = if module_in_crate.is_empty() {
+        "allow_unix_socketpair_recvfrom".to_string()
+    } else {
+        format!("{module_in_crate}::allow_unix_socketpair_recvfrom")
+    };
+
+    let child_status = run_code_under_sandbox(
+        &test_selector,
+        &SandboxPolicy::ReadOnly,
+        || async { unix_sock_body() },
+    )
+    .await
+    .expect("should be able to reexec");
+
+    // Only the parent sees a status; a failing sandboxed child must fail the
+    // test instead of being silently ignored.
+    if let Some(status) = child_status {
+        assert!(status.success(), "sandboxed child exited with {status:?}");
+    }
+}
+
+/// Environment variable that marks the re-executed, sandboxed child process.
+/// The parent sets it on the child; its presence selects the child branch of
+/// `run_code_under_sandbox`.
+const IN_SANDBOX_ENV_VAR: &str = "IN_SANDBOX";
+
+/// Runs `child_body` inside a sandboxed re-execution of the current test
+/// binary.
+///
+/// When `IN_SANDBOX_ENV_VAR` is not set, the function acts as the parent: it
+/// re-launches the current executable through `spawn_command_under_sandbox`
+/// with `--exact <test_selector>` and `IN_SANDBOX_ENV_VAR=1`, waits for it,
+/// and returns `Ok(Some(status))`. `--nocapture` is forwarded if the parent
+/// received it, in which case the child inherits stdio. Because of `--exact`,
+/// `test_selector` must be the full name of the calling test.
+///
+/// When `IN_SANDBOX_ENV_VAR` is set, the function acts as the child: it awaits
+/// `child_body` and returns `Ok(None)`.
+///
+/// # Errors
+///
+/// Returns the underlying `io::Error` if the current executable or working
+/// directory cannot be determined, or if spawning or waiting on the child
+/// fails.
+pub async fn run_code_under_sandbox<F, Fut>(
+    test_selector: &str,
+    policy: &SandboxPolicy,
+    child_body: F,
+) -> io::Result<Option<ExitStatus>>
+where
+    F: FnOnce() -> Fut + Send + 'static,
+    Fut: Future<Output = ()> + Send + 'static,
+{
+    // Child branch: already inside the sandbox, so just run the provided body.
+    if std::env::var(IN_SANDBOX_ENV_VAR).is_ok() {
+        child_body().await;
+        return Ok(None);
+    }
+
+    // Parent branch: re-run this same test binary, restricted to one test.
+    let exe = std::env::current_exe()?;
+    let mut cmds = vec![exe.to_string_lossy().into_owned(), "--exact".into()];
+    let mut stdio_policy = StdioPolicy::RedirectForShellTool;
+    // Allow for us to pass forward --nocapture / use the right stdio policy.
+    if std::env::args().any(|a| a == "--nocapture") {
+        cmds.push("--nocapture".into());
+        stdio_policy = StdioPolicy::Inherit;
+    }
+    cmds.push(test_selector.into());
+
+    let mut child = spawn_command_under_sandbox(
+        cmds,
+        policy,
+        std::env::current_dir()?,
+        stdio_policy,
+        // Use the same constant that is checked above, so the two can't drift.
+        HashMap::from([(IN_SANDBOX_ENV_VAR.into(), "1".into())]),
+    )
+    .await?;
+
+    let status = child.wait().await?;
+    Ok(Some(status))
+}