codex-rs/core/tests/live_agent.rs

//! Live integration tests that exercise the full [`Agent`] stack **against the real
//! OpenAI `/v1/responses` API**.  These tests complement the lightweight mock‑based
//! unit tests by verifying that the agent can drive an end‑to‑end conversation,
//! stream incremental events, execute function‑call tool invocations and safely
//! chain multiple turns inside a single session – the exact scenarios that have
//! historically been brittle.
//!
//! The live tests are **ignored by default** so CI remains deterministic and free
//! of external dependencies.  Developers can opt‑in locally with e.g.
//!
//! ```bash
//! OPENAI_API_KEY=sk‑... cargo test --test live_agent -- --ignored --nocapture
//! ```
//!
//! Make sure your key has access to the experimental *Responses* API and that
//! any billable usage is acceptable.

use std::time::Duration;

use codex_core::Codex;
use codex_core::config::Config;
use codex_core::protocol::EventMsg;
use codex_core::protocol::InputItem;
use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_core::protocol::Submission;
use tokio::sync::Notify;
use tokio::time::timeout;

/// Reports whether a usable OpenAI API key is present in the environment.
///
/// Returns `true` only when `OPENAI_API_KEY` is set, is valid Unicode and
/// contains at least one non-whitespace character.  An empty or blank value
/// (e.g. `OPENAI_API_KEY=`) is treated as "not available" so that the live
/// tests are skipped instead of failing later with an authentication error.
fn api_key_available() -> bool {
    matches!(std::env::var("OPENAI_API_KEY"), Ok(key) if !key.trim().is_empty())
}

/// Helper that spawns a fresh Agent and sends the mandatory *ConfigureSession*
/// submission.  The caller receives the constructed [`Agent`] plus the unique
/// submission id used for the initialization message.
async fn spawn_codex() -> Codex {
    assert!(
        api_key_available(),
        "OPENAI_API_KEY must be set for live tests"
    );

    // Environment tweaks to keep the tests snappy and inexpensive while still
    // exercising retry/robustness logic.
    //
    // NOTE: Starting with the 2024 edition `std::env::set_var` is `unsafe`
    // because changing the process environment races with any other threads
    // that might be performing environment look-ups at the same time.
    // Restrict the unsafety to this tiny block that happens at the very
    // beginning of the test, before we spawn any background tasks that could
    // observe the environment.
    unsafe {
        std::env::set_var("OPENAI_REQUEST_MAX_RETRIES", "2");
        std::env::set_var("OPENAI_STREAM_MAX_RETRIES", "2");
    }

    let agent = Codex::spawn(std::sync::Arc::new(Notify::new())).unwrap();

    let config = Config::load_default_config_for_test();
    agent
        .submit(Submission {
            id: "init".into(),
            op: Op::ConfigureSession {
                model: config.model,
                instructions: None,
                approval_policy: config.approval_policy,
                sandbox_policy: SandboxPolicy::new_read_only_policy(),
                disable_response_storage: false,
                notify: None,
                cwd: std::env::current_dir().unwrap(),
            },
        })
        .await
        .expect("failed to submit init");

    // Drain the SessionInitialized event so subsequent helper loops don't have
    // to special‑case it.
    loop {
        let ev = timeout(Duration::from_secs(30), agent.next_event())
            .await
            .expect("timeout waiting for init event")
            .expect("agent channel closed");
        if matches!(ev.msg, EventMsg::SessionConfigured { .. }) {
            break;
        }
    }

    agent
}

/// Drives two consecutive user turns through one live session.
///
/// * Turn 1 must deliver at least one *AgentMessage* event **before** the
///   `TaskComplete` event arrives.
/// * Turn 2, submitted on the same agent, must produce an *AgentMessage*
///   containing the requested phrase; this is the regression guard against a
///   stale `previous_response_id` tripping up a follow-up task.
///
/// Any `Error` event, a closed event channel, or a 60 second gap between
/// events fails the test.
#[ignore]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn live_streaming_and_prev_id_reset() {
    if !api_key_available() {
        eprintln!("skipping live_streaming_and_prev_id_reset – OPENAI_API_KEY not set");
        return;
    }

    let codex = spawn_codex().await;

    // ---------- First turn ----------
    let first_turn = Submission {
        id: "task1".into(),
        op: Op::UserInput {
            items: vec![InputItem::Text {
                text: "Say the words 'stream test'".into(),
            }],
        },
    };
    codex.submit(first_turn).await.unwrap();

    // Becomes true as soon as any AgentMessage shows up ahead of completion.
    let mut streamed_any_message = false;
    loop {
        let pending = timeout(Duration::from_secs(60), codex.next_event()).await;
        let event = pending
            .expect("timeout waiting for task1 events")
            .expect("agent closed");

        match event.msg {
            EventMsg::TaskComplete => break,
            EventMsg::Error { message } => panic!("agent reported error in task1: {message}"),
            EventMsg::AgentMessage { .. } => {
                streamed_any_message = true;
            }
            _ => {}
        }
    }

    assert!(
        streamed_any_message,
        "Agent did not stream any AgentMessage before TaskComplete"
    );

    // ---------- Second turn (same session) ----------
    let second_turn = Submission {
        id: "task2".into(),
        op: Op::UserInput {
            items: vec![InputItem::Text {
                text: "Respond with exactly: second turn succeeded".into(),
            }],
        },
    };
    codex.submit(second_turn).await.unwrap();

    // Becomes true once an AgentMessage carrying the expected phrase is seen.
    let mut phrase_seen = false;
    loop {
        let pending = timeout(Duration::from_secs(60), codex.next_event()).await;
        let event = pending
            .expect("timeout waiting for task2 events")
            .expect("agent closed");

        match &event.msg {
            EventMsg::TaskComplete => break,
            EventMsg::Error { message } => panic!("agent reported error in task2: {message}"),
            EventMsg::AgentMessage { message } => {
                if message.contains("second turn succeeded") {
                    phrase_seen = true;
                }
            }
            _ => {}
        }
    }

    assert!(phrase_seen, "second task did not receive expected answer");
}

/// Round-trips a *function call → shell execution* by asking the model to run
/// a harmless `echo` command.
///
/// The test passes only when both of the following were observed before
/// `TaskComplete`:
///   1. an `ExecCommandBegin` event whose command is exactly `echo <MARKER>`;
///   2. an `ExecCommandEnd` event with exit code 0 whose stdout contains the
///      marker.
///
/// Any `Error` event, a closed event channel, or a 60 second gap between
/// events fails the test.
#[ignore]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn live_shell_function_call() {
    if !api_key_available() {
        eprintln!("skipping live_shell_function_call – OPENAI_API_KEY not set");
        return;
    }

    let codex = spawn_codex().await;

    // Token that must appear both in the issued command and in its stdout.
    const MARKER: &str = "codex_live_echo_ok";

    let request = Submission {
        id: "task_fn".into(),
        op: Op::UserInput {
            items: vec![InputItem::Text {
                text: format!(
                    "Use the shell function to run the command `echo {MARKER}` and no other commands."
                ),
            }],
        },
    };
    codex.submit(request).await.unwrap();

    let mut begin_observed = false;
    let mut end_observed = false;

    loop {
        let pending = timeout(Duration::from_secs(60), codex.next_event()).await;
        let event = pending
            .expect("timeout waiting for function‑call events")
            .expect("agent closed");

        match event.msg {
            EventMsg::TaskComplete => break,
            EventMsg::Error { message } => panic!("agent error during shell test: {message}"),
            EventMsg::ExecCommandBegin { command, .. } => {
                assert_eq!(command, vec!["echo", MARKER]);
                begin_observed = true;
            }
            EventMsg::ExecCommandEnd {
                stdout, exit_code, ..
            } => {
                assert_eq!(exit_code, 0, "echo returned non‑zero exit code");
                assert!(stdout.contains(MARKER));
                end_observed = true;
            }
            _ => {}
        }
    }

    assert!(begin_observed, "ExecCommandBegin event missing");
    assert!(
        end_observed,
        "ExecCommandEnd with expected output missing"
    );
}
-												feat: initial import of Rust implementation of Codex CLI in codex-rs/ (#629)

As stated in `codex-rs/README.md`:

Today, Codex CLI is written in TypeScript and requires Node.js 22+ to
run it. For a number of users, this runtime requirement inhibits
adoption: they would be better served by a standalone executable. As
maintainers, we want Codex to run efficiently in a wide range of
environments with minimal overhead. We also want to take advantage of
operating system-specific APIs to provide better sandboxing, where
possible.

To that end, we are moving forward with a Rust implementation of Codex
CLI contained in this folder, which has the following benefits:

- The CLI compiles to small, standalone, platform-specific binaries.
- Can make direct, native calls to
[seccomp](https://man7.org/linux/man-pages/man2/seccomp.2.html) and
[landlock](https://man7.org/linux/man-pages/man7/landlock.7.html) in
order to support sandboxing on Linux.
- No runtime garbage collection, resulting in lower memory consumption
and better, more predictable performance.

Currently, the Rust implementation is materially behind the TypeScript
implementation in functionality, so continue to use the TypeScript
implementation for the time being. We will publish native executables via
GitHub Releases as soon as we feel the Rust version is usable.
											
										
										
											2025-04-24 13:31:40 -07:00
+								//! Live integration tests that exercise the full [`Agent`] stack **against the real
 								//! OpenAI `/v1/responses` API**.  These tests complement the lightweight mock‑based
 								//! unit tests by verifying that the agent can drive an end‑to‑end conversation,
 								//! stream incremental events, execute function‑call tool invocations and safely
 								//! chain multiple turns inside a single session – the exact scenarios that have
 								//! historically been brittle.
 								//!
 								//! The live tests are **ignored by default** so CI remains deterministic and free
 								//! of external dependencies.  Developers can opt‑in locally with e.g.
 								//!
 								//! ```bash
 								//! OPENAI_API_KEY=sk‑... cargo test --test live_agent -- --ignored --nocapture
 								//! ```
 								//!
 								//! Make sure your key has access to the experimental *Responses* API and that
 								//! any billable usage is acceptable.
 								use std::time::Duration;
-												Update cargo to 2024 edition (#842)

Some effects of this change:
- New formatting changes across many files. No functionality changes
should occur from that.
- Calls to `set_var` are considered unsafe; since this only happens in
tests, we wrap them in `unsafe` blocks.
											
										
										
											2025-05-07 08:37:48 -07:00
+								use codex_core::Codex;
-												feat: load defaults into Config and introduce ConfigOverrides (#677)

This changes how instantiating `Config` works and also adds
`approval_policy` and `sandbox_policy` as fields. The idea is:

* All fields of `Config` have appropriate default values.
* `Config` is initially loaded from `~/.codex/config.toml`, so values in
`config.toml` will override those defaults.
* Clients must instantiate `Config` via
`Config::load_with_overrides(ConfigOverrides)` where `ConfigOverrides`
has optional overrides that are expected to be settable based on CLI
flags.

The `Config` should be defined early in the program and then passed
down. Now functions like `init_codex()` take fewer individual parameters
because they can just take a `Config`.

Also, `Config::load()` used to fail silently if `~/.codex/config.toml`
had a parse error and fell back to the default config. This seemed
really bad because it wasn't clear why the values in my `config.toml`
weren't getting picked up. I changed things so that
`load_with_overrides()` returns `Result<Config>` and verified that the
various CLIs print a reasonable error if `config.toml` is malformed.

Finally, I also updated the TUI to show which **sandbox** value is being
used, as we do for other key values like **model** and **approval**.
This was also a reminder that the various values of `--sandbox` are
honored on Linux but not macOS today, so I added some TODOs about fixing
that.
											
										
										
											2025-04-27 21:47:50 -07:00
+								use codex_core::config::Config;
-												feat: initial import of Rust implementation of Codex CLI in codex-rs/ (#629)

As stated in `codex-rs/README.md`:

Today, Codex CLI is written in TypeScript and requires Node.js 22+ to
run it. For a number of users, this runtime requirement inhibits
adoption: they would be better served by a standalone executable. As
maintainers, we want Codex to run efficiently in a wide range of
environments with minimal overhead. We also want to take advantage of
operating system-specific APIs to provide better sandboxing, where
possible.

To that end, we are moving forward with a Rust implementation of Codex
CLI contained in this folder, which has the following benefits:

- The CLI compiles to small, standalone, platform-specific binaries.
- Can make direct, native calls to
[seccomp](https://man7.org/linux/man-pages/man2/seccomp.2.html) and
[landlock](https://man7.org/linux/man-pages/man7/landlock.7.html) in
order to support sandboxing on Linux.
- No runtime garbage collection, resulting in lower memory consumption
and better, more predictable performance.

Currently, the Rust implementation is materially behind the TypeScript
implementation in functionality, so continue to use the TypeScript
implementation for the time being. We will publish native executables via
GitHub Releases as soon as we feel the Rust version is usable.
											
										
										
											2025-04-24 13:31:40 -07:00
+								use codex_core::protocol::EventMsg;
 								use codex_core::protocol::InputItem;
 								use codex_core::protocol::Op;
 								use codex_core::protocol::SandboxPolicy;
 								use codex_core::protocol::Submission;
 								use tokio::sync::Notify;
 								use tokio::time::timeout;
 								fn api_key_available() -> bool {
 								    std::env::var("OPENAI_API_KEY").is_ok()
 								}
 								/// Helper that spawns a fresh Agent and sends the mandatory *ConfigureSession*
 								/// submission.  The caller receives the constructed [`Agent`] plus the unique
 								/// submission id used for the initialization message.
 								async fn spawn_codex() -> Codex {
 								    assert!(
 								        api_key_available(),
 								        "OPENAI_API_KEY must be set for live tests"
 								    );
 								    // Environment tweaks to keep the tests snappy and inexpensive while still
 								    // exercising retry/robustness logic.
-												Update cargo to 2024 edition (#842)

Some effects of this change:
- New formatting changes across many files. No functionality changes
should occur from that.
- Calls to `set_var` are considered unsafe; since this only happens in
tests, we wrap them in `unsafe` blocks.
											
										
										
											2025-05-07 08:37:48 -07:00
+								    //
 								    // NOTE: Starting with the 2024 edition `std::env::set_var` is `unsafe`
 								    // because changing the process environment races with any other threads
 								    // that might be performing environment look-ups at the same time.
 								    // Restrict the unsafety to this tiny block that happens at the very
 								    // beginning of the test, before we spawn any background tasks that could
 								    // observe the environment.
 								    unsafe {
 								        std::env::set_var("OPENAI_REQUEST_MAX_RETRIES", "2");
 								        std::env::set_var("OPENAI_STREAM_MAX_RETRIES", "2");
 								    }
-												feat: initial import of Rust implementation of Codex CLI in codex-rs/ (#629)

As stated in `codex-rs/README.md`:

Today, Codex CLI is written in TypeScript and requires Node.js 22+ to
run it. For a number of users, this runtime requirement inhibits
adoption: they would be better served by a standalone executable. As
maintainers, we want Codex to run efficiently in a wide range of
environments with minimal overhead. We also want to take advantage of
operating system-specific APIs to provide better sandboxing, where
possible.

To that end, we are moving forward with a Rust implementation of Codex
CLI contained in this folder, which has the following benefits:

- The CLI compiles to small, standalone, platform-specific binaries.
- Can make direct, native calls to
[seccomp](https://man7.org/linux/man-pages/man2/seccomp.2.html) and
[landlock](https://man7.org/linux/man-pages/man7/landlock.7.html) in
order to support sandboxing on Linux.
- No runtime garbage collection, resulting in lower memory consumption
and better, more predictable performance.

Currently, the Rust implementation is materially behind the TypeScript
implementation in functionality, so continue to use the TypeScript
implementation for the time being. We will publish native executables via
GitHub Releases as soon as we feel the Rust version is usable.
											
										
										
											2025-04-24 13:31:40 -07:00
 								    let agent = Codex::spawn(std::sync::Arc::new(Notify::new())).unwrap();
-												feat: load defaults into Config and introduce ConfigOverrides (#677)

This changes how instantiating `Config` works and also adds
`approval_policy` and `sandbox_policy` as fields. The idea is:

* All fields of `Config` have appropriate default values.
* `Config` is initially loaded from `~/.codex/config.toml`, so values in
`config.toml` will override those defaults.
* Clients must instantiate `Config` via
`Config::load_with_overrides(ConfigOverrides)` where `ConfigOverrides`
has optional overrides that are expected to be settable based on CLI
flags.

The `Config` should be defined early in the program and then passed
down. Now functions like `init_codex()` take fewer individual parameters
because they can just take a `Config`.

Also, `Config::load()` used to fail silently if `~/.codex/config.toml`
had a parse error and fell back to the default config. This seemed
really bad because it wasn't clear why the values in my `config.toml`
weren't getting picked up. I changed things so that
`load_with_overrides()` returns `Result<Config>` and verified that the
various CLIs print a reasonable error if `config.toml` is malformed.

Finally, I also updated the TUI to show which **sandbox** value is being
used, as we do for other key values like **model** and **approval**.
This was also a reminder that the various values of `--sandbox` are
honored on Linux but not macOS today, so I added some TODOs about fixing
that.
											
										
										
											2025-04-27 21:47:50 -07:00
+								    let config = Config::load_default_config_for_test();
-												feat: initial import of Rust implementation of Codex CLI in codex-rs/ (#629)

As stated in `codex-rs/README.md`:

Today, Codex CLI is written in TypeScript and requires Node.js 22+ to
run it. For a number of users, this runtime requirement inhibits
adoption: they would be better served by a standalone executable. As
maintainers, we want Codex to run efficiently in a wide range of
environments with minimal overhead. We also want to take advantage of
operating system-specific APIs to provide better sandboxing, where
possible.

To that end, we are moving forward with a Rust implementation of Codex
CLI contained in this folder, which has the following benefits:

- The CLI compiles to small, standalone, platform-specific binaries.
- Can make direct, native calls to
[seccomp](https://man7.org/linux/man-pages/man2/seccomp.2.html) and
[landlock](https://man7.org/linux/man-pages/man7/landlock.7.html) in
order to support sandboxing on Linux.
- No runtime garbage collection, resulting in lower memory consumption
and better, more predictable performance.

Currently, the Rust implementation is materially behind the TypeScript
implementation in functionality, so continue to use the TypeScript
implementation for the time being. We will publish native executables via
GitHub Releases as soon as we feel the Rust version is usable.
											
										
										
											2025-04-24 13:31:40 -07:00
+								    agent
 								        .submit(Submission {
 								            id: "init".into(),
 								            op: Op::ConfigureSession {
-												feat: load defaults into Config and introduce ConfigOverrides (#677)

This changes how instantiating `Config` works and also adds
`approval_policy` and `sandbox_policy` as fields. The idea is:

* All fields of `Config` have appropriate default values.
* `Config` is initially loaded from `~/.codex/config.toml`, so values in
`config.toml` will override those defaults.
* Clients must instantiate `Config` via
`Config::load_with_overrides(ConfigOverrides)` where `ConfigOverrides`
has optional overrides that are expected to be settable based on CLI
flags.

The `Config` should be defined early in the program and then passed
down. Now functions like `init_codex()` take fewer individual parameters
because they can just take a `Config`.

Also, `Config::load()` used to fail silently if `~/.codex/config.toml`
had a parse error and fell back to the default config. This seemed
really bad because it wasn't clear why the values in my `config.toml`
weren't getting picked up. I changed things so that
`load_with_overrides()` returns `Result<Config>` and verified that the
various CLIs print a reasonable error if `config.toml` is malformed.

Finally, I also updated the TUI to show which **sandbox** value is being
used, as we do for other key values like **model** and **approval**.
This was also a reminder that the various values of `--sandbox` are
honored on Linux but not macOS today, so I added some TODOs about fixing
that.
											
										
										
											2025-04-27 21:47:50 -07:00
+								                model: config.model,
-												feat: initial import of Rust implementation of Codex CLI in codex-rs/ (#629)

As stated in `codex-rs/README.md`:

Today, Codex CLI is written in TypeScript and requires Node.js 22+ to
run it. For a number of users, this runtime requirement inhibits
adoption: they would be better served by a standalone executable. As
maintainers, we want Codex to run efficiently in a wide range of
environments with minimal overhead. We also want to take advantage of
operating system-specific APIs to provide better sandboxing, where
possible.

To that end, we are moving forward with a Rust implementation of Codex
CLI contained in this folder, which has the following benefits:

- The CLI compiles to small, standalone, platform-specific binaries.
- Can make direct, native calls to
[seccomp](https://man7.org/linux/man-pages/man2/seccomp.2.html) and
[landlock](https://man7.org/linux/man-pages/man7/landlock.7.html) in
order to support sandboxing on Linux.
- No runtime garbage collection, resulting in lower memory consumption
and better, more predictable performance.

Currently, the Rust implementation is materially behind the TypeScript
implementation in functionality, so continue to use the TypeScript
implementation for the time being. We will publish native executables via
GitHub Releases as soon as we feel the Rust version is usable.
											
										
										
											2025-04-24 13:31:40 -07:00
+								                instructions: None,
-												feat: load defaults into Config and introduce ConfigOverrides (#677)

This changes how instantiating `Config` works and also adds
`approval_policy` and `sandbox_policy` as fields. The idea is:

* All fields of `Config` have appropriate default values.
* `Config` is initially loaded from `~/.codex/config.toml`, so values in
`config.toml` will override those defaults.
* Clients must instantiate `Config` via
`Config::load_with_overrides(ConfigOverrides)` where `ConfigOverrides`
has optional overrides that are expected to be settable based on CLI
flags.

The `Config` should be defined early in the program and then passed
down. Now functions like `init_codex()` take fewer individual parameters
because they can just take a `Config`.

Also, `Config::load()` used to fail silently if `~/.codex/config.toml`
had a parse error and fell back to the default config. This seemed
really bad because it wasn't clear why the values in my `config.toml`
weren't getting picked up. I changed things so that
`load_with_overrides()` returns `Result<Config>` and verified that the
various CLIs print a reasonable error if `config.toml` is malformed.

Finally, I also updated the TUI to show which **sandbox** value is being
used, as we do for other key values like **model** and **approval**.
This was also a reminder that the various values of `--sandbox` are
honored on Linux but not macOS today, so I added some TODOs about fixing
that.
											
										
										
											2025-04-27 21:47:50 -07:00
+								                approval_policy: config.approval_policy,
-												fix: overhaul SandboxPolicy and config loading in Rust (#732)

Previous to this PR, `SandboxPolicy` was a bit difficult to work with:


https://github.com/openai/codex/blob/237f8a11e11fdcc793a09e787e48215676d9b95b/codex-rs/core/src/protocol.rs#L98-L108

Specifically:

* It was an `enum` and therefore options were mutually exclusive as
opposed to additive.
* It defined things in terms of what the agent _could not_ do as opposed
to what they _could_ do. This made things hard to support because we
would prefer to build up a sandbox config by starting with something
extremely restrictive and only granting permissions for things the user
has explicitly allowed.

This PR changes things substantially by redefining the policy in terms
of two concepts:

* A `SandboxPermission` enum that defines permissions that can be
granted to the agent/sandbox.
* A `SandboxPolicy` that internally stores a `Vec<SandboxPermission>`,
but externally exposes a simpler API that can be used to configure
Seatbelt/Landlock.

Previous to this PR, we supported a `--sandbox` flag that effectively
mapped to an enum value in `SandboxPolicy`. Though now that
`SandboxPolicy` is a wrapper around `Vec<SandboxPermission>`, the single
`--sandbox` flag no longer makes sense. While I could have turned it
into a flag that the user can specify multiple times, I think the
current values to use with such a flag are long and potentially messy,
so for the moment, I have dropped support for `--sandbox` altogether and
we can bring it back once we have figured out the naming thing.

Since `--sandbox` is gone, users now have to specify `--full-auto` to
get a sandbox that allows writes in `cwd`. Admittedly, there is no clean
way to specify the equivalent of `--full-auto` in your `config.toml`
right now, so we will have to revisit that, as well.

Because `Config` presents a `SandboxPolicy` field and `SandboxPolicy`
changed considerably, I had to overhaul how config loading works, as
well. There are now two distinct concepts, `ConfigToml` and `Config`:

* `ConfigToml` is the deserialization of `~/.codex/config.toml`. As one
might expect, every field is `Optional` and it is `#[derive(Deserialize,
Default)]`. Consistent use of `Optional` makes it clear what the user
has specified explicitly.
* `Config` is the "normalized config" and is produced by merging
`ConfigToml` with `ConfigOverrides`. Where `ConfigToml` contains a raw
`Option<Vec<SandboxPermission>>`, `Config` presents only the final
`SandboxPolicy`.

The changes to `core/src/exec.rs` and `core/src/linux.rs` merit extra
special attention to ensure we are faithfully mapping the
`SandboxPolicy` to the Seatbelt and Landlock configs, respectively.

Also, take note that `core/src/seatbelt_readonly_policy.sbpl` has been
renamed to `codex-rs/core/src/seatbelt_base_policy.sbpl` and that
`(allow file-read*)` has been removed from the `.sbpl` file as now this
is added to the policy in `core/src/exec.rs` when
`sandbox_policy.has_full_disk_read_access()` is `true`.
											
										
										
											2025-04-29 15:01:16 -07:00
+								                sandbox_policy: SandboxPolicy::new_read_only_policy(),
-												feat: add ZDR support to Rust implementation (#642)

This adds support for the `--disable-response-storage` flag across our
multiple Rust CLIs to support customers who have opted into Zero-Data
Retention (ZDR). The analogous changes to the TypeScript CLI were:

* https://github.com/openai/codex/pull/481
* https://github.com/openai/codex/pull/543

For a client using ZDR, `previous_response_id` will never be available,
so the `input` field of an API request must include the full transcript
of the conversation thus far. As such, this PR changes the type of
`Prompt.input` from `Vec<ResponseInputItem>` to `Vec<ResponseItem>`.

Practically speaking, `ResponseItem` was effectively a "superset" of
`ResponseInputItem` already. The main difference for us is that
`ResponseItem` includes the `FunctionCall` variant that we have to
include as part of the conversation history in the ZDR case.

Another key change in this PR is modifying `try_run_turn()` so that it
returns the `Vec<ResponseItem>` for the turn in addition to the
`Vec<ResponseInputItem>` produced by `try_run_turn()`. This is because
the caller of `run_turn()` needs to record the `Vec<ResponseItem>` when
ZDR is enabled.

To that end, this PR introduces `ZdrTranscript` (and adds
`zdr_transcript: Option<ZdrTranscript>` to `struct State` in `codex.rs`)
to take responsibility for maintaining the conversation transcript in
the ZDR case.
											
										
										
											2025-04-25 12:08:18 -07:00
+								                disable_response_storage: false,
-												feat: configurable notifications in the Rust CLI  (#793)

With this change, you can specify a program that will be executed to get
notified about events generated by Codex. The notification info will be
packaged as a JSON object. The supported notification types are defined
by the `UserNotification` enum introduced in this PR. Initially, it
contains only one variant, `AgentTurnComplete`:

```rust
pub(crate) enum UserNotification {
    #[serde(rename_all = "kebab-case")]
    AgentTurnComplete {
        turn_id: String,

        /// Messages that the user sent to the agent to initiate the turn.
        input_messages: Vec<String>,

        /// The last message sent by the assistant in the turn.
        last_assistant_message: Option<String>,
    },
}
```

This is intended to support the common case when a "turn" ends, which
often means it is now your chance to give Codex further instructions.

For example, I have the following in my `~/.codex/config.toml`:

```toml
notify = ["python3", "/Users/mbolin/.codex/notify.py"]
```

I created my own custom notifier script that calls out to
[terminal-notifier](https://github.com/julienXX/terminal-notifier) to
show a desktop push notification on macOS. Contents of `notify.py`:

```python
#!/usr/bin/env python3

import json
import subprocess
import sys


def main() -> int:
    if len(sys.argv) != 2:
        print("Usage: notify.py <NOTIFICATION_JSON>")
        return 1

    try:
        notification = json.loads(sys.argv[1])
    except json.JSONDecodeError:
        return 1

    match notification_type := notification.get("type"):
        case "agent-turn-complete":
            assistant_message = notification.get("last-assistant-message")
            if assistant_message:
                title = f"Codex: {assistant_message}"
            else:
                title = "Codex: Turn Complete!"
            input_messages = notification.get("input_messages", [])
            message = " ".join(input_messages)
            title += message
        case _:
            print(f"not sending a push notification for: {notification_type}")
            return 0

    subprocess.check_output(
        [
            "terminal-notifier",
            "-title",
            title,
            "-message",
            message,
            "-group",
            "codex",
            "-ignoreDnD",
            "-activate",
            "com.googlecode.iterm2",
        ]
    )

    return 0


if __name__ == "__main__":
    sys.exit(main())
```

For reference, here are related PRs that tried to add this functionality
to the TypeScript version of the Codex CLI:

* https://github.com/openai/codex/pull/160
* https://github.com/openai/codex/pull/498
											
										
										
											2025-05-02 19:48:13 -07:00
+								                notify: None,
-												feat: make cwd a required field of Config so we stop assuming std::env::current_dir() in a session (#800)

In order to expose Codex via an MCP server, I realized that we should be
taking `cwd` as a parameter rather than assuming
`std::env::current_dir()` as the `cwd`. Specifically, the user may want
to start a session in a directory other than the one where the MCP
server has been started.

This PR makes `cwd: PathBuf` a required field of `Session` and threads
it all the way through, though I think there is still an issue with not
honoring `workdir` for `apply_patch`, which is something we also had to
fix in the TypeScript version: https://github.com/openai/codex/pull/556.

This also adds `-C`/`--cd` to change the cwd via the command line.

To test, I ran:

```
cargo run --bin codex -- exec -C /tmp 'show the output of ls'
```

and verified it showed the contents of my `/tmp` folder instead of
`$PWD`.
											
										
										
											2025-05-04 10:57:12 -07:00
+								                cwd: std::env::current_dir().unwrap(),
-												feat: initial import of Rust implementation of Codex CLI in codex-rs/ (#629)

As stated in `codex-rs/README.md`:

Today, Codex CLI is written in TypeScript and requires Node.js 22+ to
run it. For a number of users, this runtime requirement inhibits
adoption: they would be better served by a standalone executable. As
maintainers, we want Codex to run efficiently in a wide range of
environments with minimal overhead. We also want to take advantage of
operating system-specific APIs to provide better sandboxing, where
possible.

To that end, we are moving forward with a Rust implementation of Codex
CLI contained in this folder, which has the following benefits:

- The CLI compiles to small, standalone, platform-specific binaries.
- Can make direct, native calls to
[seccomp](https://man7.org/linux/man-pages/man2/seccomp.2.html) and
[landlock](https://man7.org/linux/man-pages/man7/landlock.7.html) in
order to support sandboxing on Linux.
- No runtime garbage collection, resulting in lower memory consumption
and better, more predictable performance.

Currently, the Rust implementation is materially behind the TypeScript
implementation in functionality, so continue to use the TypeScript
implementation for the time being. We will publish native executables via
GitHub Releases as soon as we feel the Rust version is usable.
											
										
										
											2025-04-24 13:31:40 -07:00
+								            },
 								        })
 								        .await
 								        .expect("failed to submit init");
 								    // Drain the SessionInitialized event so subsequent helper loops don't have
 								    // to special‑case it.
 								    loop {
 								        let ev = timeout(Duration::from_secs(30), agent.next_event())
 								            .await
 								            .expect("timeout waiting for init event")
 								            .expect("agent channel closed");
 								        if matches!(ev.msg, EventMsg::SessionConfigured { .. }) {
 								            break;
 								        }
 								    }
 								    agent
 								}
 								/// Live check of two consecutive turns inside one session.
 								///
 								/// The first turn must yield at least one `AgentMessage` event **before**
 								/// `TaskComplete`.  The second turn must then reply with the requested phrase,
 								/// showing it is not tripped up by a stale `previous_response_id`.
 								#[ignore]
 								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn live_streaming_and_prev_id_reset() {
 								    if !api_key_available() {
 								        eprintln!("skipping live_streaming_and_prev_id_reset – OPENAI_API_KEY not set");
 								        return;
 								    }
 								    let codex = spawn_codex().await;
 								    // ---------- Task 1 ----------
 								    let first_turn = Submission {
 								        id: "task1".into(),
 								        op: Op::UserInput {
 								            items: vec![InputItem::Text {
 								                text: "Say the words 'stream test'".into(),
 								            }],
 								        },
 								    };
 								    codex.submit(first_turn).await.unwrap();
 								    // Set once any AgentMessage is observed ahead of TaskComplete.
 								    let mut streamed_before_done = false;
 								    loop {
 								        let polled = timeout(Duration::from_secs(60), codex.next_event()).await;
 								        let event = polled
 								            .expect("timeout waiting for task1 events")
 								            .expect("agent closed");
 								        match event.msg {
 								            EventMsg::TaskComplete => break,
 								            EventMsg::Error { message } => panic!("agent reported error in task1: {message}"),
 								            EventMsg::AgentMessage { .. } => streamed_before_done = true,
 								            _ => {}
 								        }
 								    }
 								    assert!(
 								        streamed_before_done,
 								        "Agent did not stream any AgentMessage before TaskComplete"
 								    );
 								    // ---------- Task 2 (same session) ----------
 								    let second_turn = Submission {
 								        id: "task2".into(),
 								        op: Op::UserInput {
 								            items: vec![InputItem::Text {
 								                text: "Respond with exactly: second turn succeeded".into(),
 								            }],
 								        },
 								    };
 								    codex.submit(second_turn).await.unwrap();
 								    // Set once an AgentMessage containing the requested phrase arrives.
 								    let mut answered_as_asked = false;
 								    loop {
 								        let polled = timeout(Duration::from_secs(60), codex.next_event()).await;
 								        let event = polled
 								            .expect("timeout waiting for task2 events")
 								            .expect("agent closed");
 								        match &event.msg {
 								            EventMsg::TaskComplete => break,
 								            EventMsg::Error { message } => panic!("agent reported error in task2: {message}"),
 								            EventMsg::AgentMessage { message } => {
 								                if message.contains("second turn succeeded") {
 								                    answered_as_asked = true;
 								                }
 								            }
 								            _ => {}
 								        }
 								    }
 								    assert!(answered_as_asked, "second task did not receive expected answer");
 								}
 								/// Live round-trip through a shell function call: the model is asked to run a
 								/// harmless `echo` and the test checks that
 								///   1. an `ExecCommandBegin` event carries exactly that command, and
 								///   2. the matching `ExecCommandEnd` reports exit code 0 with the marker in
 								///      its stdout.
 								#[ignore]
 								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn live_shell_function_call() {
 								    const MARKER: &str = "codex_live_echo_ok";
 								    if !api_key_available() {
 								        eprintln!("skipping live_shell_function_call – OPENAI_API_KEY not set");
 								        return;
 								    }
 								    let codex = spawn_codex().await;
 								    let prompt = InputItem::Text {
 								        text: format!(
 								            "Use the shell function to run the command `echo {MARKER}` and no other commands."
 								        ),
 								    };
 								    let request = Submission {
 								        id: "task_fn".into(),
 								        op: Op::UserInput {
 								            items: vec![prompt],
 								        },
 								    };
 								    codex.submit(request).await.unwrap();
 								    // Track both halves of the exec round-trip independently.
 								    let mut begin_seen = false;
 								    let mut end_with_output_seen = false;
 								    loop {
 								        let polled = timeout(Duration::from_secs(60), codex.next_event()).await;
 								        let event = polled
 								            .expect("timeout waiting for function‑call events")
 								            .expect("agent closed");
 								        match event.msg {
 								            EventMsg::TaskComplete => break,
 								            EventMsg::Error { message } => panic!("agent error during shell test: {message}"),
 								            EventMsg::ExecCommandBegin { command, .. } => {
 								                assert_eq!(command, vec!["echo", MARKER]);
 								                begin_seen = true;
 								            }
 								            EventMsg::ExecCommandEnd {
 								                stdout, exit_code, ..
 								            } => {
 								                assert_eq!(exit_code, 0, "echo returned non‑zero exit code");
 								                assert!(stdout.contains(MARKER));
 								                end_with_output_seen = true;
 								            }
 								            _ => {}
 								        }
 								    }
 								    assert!(begin_seen, "ExecCommandBegin event missing");
 								    assert!(
 								        end_with_output_seen,
 								        "ExecCommandEnd with expected output missing"
 								    );
 								}