2025-09-02 18:36:19 -07:00
|
|
|
|
use codex_core::CodexAuth;
|
chore: introduce ConversationManager as a clearinghouse for all conversations (#2240)
This PR does two things because after I got deep into the first one I
started pulling on the thread to the second:
- Makes `ConversationManager` the place where all in-memory
conversations are created and stored. Previously, `MessageProcessor` in
the `codex-mcp-server` crate was doing this via its `session_map`, but
this is something that should be done in `codex-core`.
- It unwinds the `ctrl_c: tokio::sync::Notify` that was threaded
throughout our code. I think this made sense at one time, but now that
we handle Ctrl-C within the TUI and have a proper `Op::Interrupt` event,
I don't think this was quite right, so I removed it. For `codex exec`
and `codex proto`, we now use `tokio::signal::ctrl_c()` directly, but we
no longer make `Notify` a field of `Codex` or `CodexConversation`.
Changes of note:
- Adds the files `conversation_manager.rs` and `codex_conversation.rs`
to `codex-core`.
- `Codex` and `CodexSpawnOk` are no longer exported from `codex-core`:
other crates must use `CodexConversation` instead (which is created via
`ConversationManager`).
- `core/src/codex_wrapper.rs` has been deleted in favor of
`ConversationManager`.
- `ConversationManager::new_conversation()` returns `NewConversation`,
which is in line with the `new_conversation` tool we want to add to the
MCP server. Note `NewConversation` includes `SessionConfiguredEvent`, so
we eliminate checks in cases like `codex-rs/core/tests/client.rs` to
verify `SessionConfiguredEvent` is the first event because that is now
internal to `ConversationManager`.
- Quite a bit of code was deleted from
`codex-rs/mcp-server/src/message_processor.rs` since it no longer has to
manage multiple conversations itself: it goes through
`ConversationManager` instead.
- `core/tests/live_agent.rs` has been deleted because I had to update a
bunch of tests and all the tests in here were ignored, and I don't think
anyone ever ran them, so this was just technical debt, at this point.
- Removed `notify_on_sigint()` from `util.rs` (and in a follow-up, I
hope to refactor the blandly-named `util.rs` into more descriptive
files).
- In general, I started replacing local variables named `codex` as
`conversation`, where appropriate, though admittedly I didn't do it
through all the integration tests because that would have added a lot of
noise to this PR.
---
[//]: # (BEGIN SAPLING FOOTER)
Stack created with [Sapling](https://sapling-scm.com). Best reviewed
with [ReviewStack](https://reviewstack.dev/openai/codex/pull/2240).
* #2264
* #2263
* __->__ #2240
2025-08-13 13:38:18 -07:00
|
|
|
|
use codex_core::ConversationManager;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
use codex_core::ModelProviderInfo;
|
2025-09-11 11:08:51 -07:00
|
|
|
|
use codex_core::NewConversation;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
use codex_core::built_in_model_providers;
|
2025-09-12 13:07:10 -07:00
|
|
|
|
use codex_core::protocol::ErrorEvent;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
use codex_core::protocol::EventMsg;
|
|
|
|
|
|
use codex_core::protocol::InputItem;
|
|
|
|
|
|
use codex_core::protocol::Op;
|
2025-09-11 11:08:51 -07:00
|
|
|
|
use codex_core::protocol::RolloutItem;
|
|
|
|
|
|
use codex_core::protocol::RolloutLine;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
use core_test_support::load_default_config_for_test;
|
2025-09-25 13:11:14 -07:00
|
|
|
|
use core_test_support::skip_if_no_network;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
use core_test_support::wait_for_event;
|
|
|
|
|
|
use tempfile::TempDir;
|
|
|
|
|
|
use wiremock::Mock;
|
2025-09-12 13:07:10 -07:00
|
|
|
|
use wiremock::Request;
|
|
|
|
|
|
use wiremock::Respond;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
use wiremock::ResponseTemplate;
|
|
|
|
|
|
use wiremock::matchers::method;
|
|
|
|
|
|
use wiremock::matchers::path;
|
|
|
|
|
|
|
2025-09-23 17:59:17 +01:00
|
|
|
|
use codex_core::codex::compact::SUMMARIZATION_PROMPT;
|
2025-09-22 07:50:41 -07:00
|
|
|
|
use core_test_support::responses::ev_assistant_message;
|
|
|
|
|
|
use core_test_support::responses::ev_completed;
|
|
|
|
|
|
use core_test_support::responses::ev_completed_with_tokens;
|
|
|
|
|
|
use core_test_support::responses::ev_function_call;
|
OpenTelemetry events (#2103)
### Title
## otel
Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.
```toml
[otel]
environment = "staging" # defaults to "dev"
exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false # defaults to false; redact prompt text unless explicitly enabled
```
Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.
### Event catalog
Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.
With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):
- `codex.api_request`
- `cf_ray` (optional)
- `attempt`
- `duration_ms`
- `http.response.status_code` (optional)
- `error.message` (failures)
- `codex.sse_event`
- `event.kind`
- `duration_ms`
- `error.message` (failures)
- `input_token_count` (completion only)
- `output_token_count` (completion only)
- `cached_token_count` (completion only, optional)
- `reasoning_token_count` (completion only, optional)
- `tool_token_count` (completion only)
- `codex.user_prompt`
- `prompt_length`
- `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
- `tool_name`
- `call_id`
- `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
- `source` (`config` or `user`)
- `codex.tool_result`
- `tool_name`
- `call_id`
- `arguments`
- `duration_ms` (execution time for the tool)
- `success` (`"true"` or `"false"`)
- `output`
### Choosing an exporter
Set `otel.exporter` to control where events go:
- `none` – leaves instrumentation active but skips exporting. This is
the
default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
endpoint, protocol, and headers your collector expects:
```toml
[otel]
exporter = { otlp-http = {
endpoint = "https://otel.example.com/v1/logs",
protocol = "binary",
headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
}}
```
- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
metadata headers:
```toml
[otel]
exporter = { otlp-grpc = {
endpoint = "https://otel.example.com:4317",
headers = { "x-otlp-meta" = "abc123" }
}}
```
If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.
If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.
---------
Co-authored-by: Anton Panasenko <apanasenko@openai.com>
2025-09-29 19:30:55 +01:00
|
|
|
|
use core_test_support::responses::mount_sse_once_match;
|
2025-09-22 07:50:41 -07:00
|
|
|
|
use core_test_support::responses::sse;
|
|
|
|
|
|
use core_test_support::responses::sse_response;
|
|
|
|
|
|
use core_test_support::responses::start_mock_server;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
use pretty_assertions::assert_eq;
|
2025-09-12 13:07:10 -07:00
|
|
|
|
use std::sync::Arc;
|
|
|
|
|
|
use std::sync::Mutex;
|
|
|
|
|
|
use std::sync::atomic::AtomicUsize;
|
|
|
|
|
|
use std::sync::atomic::Ordering;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
// --- Test helpers -----------------------------------------------------------
|
|
|
|
|
|
|
2025-09-14 09:23:31 -04:00
|
|
|
|
pub(super) const FIRST_REPLY: &str = "FIRST_REPLY";
|
|
|
|
|
|
pub(super) const SUMMARY_TEXT: &str = "SUMMARY_ONLY_CONTEXT";
|
2025-07-31 21:34:32 -07:00
|
|
|
|
const THIRD_USER_MSG: &str = "next turn";
|
2025-09-12 13:07:10 -07:00
|
|
|
|
const AUTO_SUMMARY_TEXT: &str = "AUTO_SUMMARY";
|
|
|
|
|
|
const FIRST_AUTO_MSG: &str = "token limit start";
|
|
|
|
|
|
const SECOND_AUTO_MSG: &str = "token limit push";
|
|
|
|
|
|
const STILL_TOO_BIG_REPLY: &str = "STILL_TOO_BIG";
|
|
|
|
|
|
const MULTI_AUTO_MSG: &str = "multi auto";
|
|
|
|
|
|
const SECOND_LARGE_REPLY: &str = "SECOND_LARGE_REPLY";
|
|
|
|
|
|
const FIRST_AUTO_SUMMARY: &str = "FIRST_AUTO_SUMMARY";
|
|
|
|
|
|
const SECOND_AUTO_SUMMARY: &str = "SECOND_AUTO_SUMMARY";
|
|
|
|
|
|
const FINAL_REPLY: &str = "FINAL_REPLY";
|
|
|
|
|
|
const DUMMY_FUNCTION_NAME: &str = "unsupported_tool";
|
|
|
|
|
|
const DUMMY_CALL_ID: &str = "call-multi-auto";
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
|
|
|
|
|
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
|
|
|
|
|
async fn summarize_context_three_requests_and_instructions() {
|
2025-09-25 13:11:14 -07:00
|
|
|
|
skip_if_no_network!();
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
|
|
|
|
|
// Set up a mock server that we can inspect after the run.
|
2025-09-12 13:07:10 -07:00
|
|
|
|
let server = start_mock_server().await;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
|
|
|
|
|
// SSE 1: assistant replies normally so it is recorded in history.
|
|
|
|
|
|
let sse1 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m1", FIRST_REPLY),
|
|
|
|
|
|
ev_completed("r1"),
|
|
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
// SSE 2: summarizer returns a summary message.
|
|
|
|
|
|
let sse2 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m2", SUMMARY_TEXT),
|
|
|
|
|
|
ev_completed("r2"),
|
|
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
// SSE 3: minimal completed; we only need to capture the request body.
|
|
|
|
|
|
let sse3 = sse(vec![ev_completed("r3")]);
|
|
|
|
|
|
|
|
|
|
|
|
// Mount three expectations, one per request, matched by body content.
|
|
|
|
|
|
let first_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains("\"text\":\"hello world\"")
|
2025-09-23 17:59:17 +01:00
|
|
|
|
&& !body.contains("You have exceeded the maximum number of tokens")
|
2025-07-31 21:34:32 -07:00
|
|
|
|
};
|
OpenTelemetry events (#2103)
### Title
## otel
Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.
```toml
[otel]
environment = "staging" # defaults to "dev"
exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false # defaults to false; redact prompt text unless explicitly enabled
```
Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.
### Event catalog
Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.
With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):
- `codex.api_request`
- `cf_ray` (optional)
- `attempt`
- `duration_ms`
- `http.response.status_code` (optional)
- `error.message` (failures)
- `codex.sse_event`
- `event.kind`
- `duration_ms`
- `error.message` (failures)
- `input_token_count` (completion only)
- `output_token_count` (completion only)
- `cached_token_count` (completion only, optional)
- `reasoning_token_count` (completion only, optional)
- `tool_token_count` (completion only)
- `codex.user_prompt`
- `prompt_length`
- `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
- `tool_name`
- `call_id`
- `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
- `source` (`config` or `user`)
- `codex.tool_result`
- `tool_name`
- `call_id`
- `arguments`
- `duration_ms` (execution time for the tool)
- `success` (`"true"` or `"false"`)
- `output`
### Choosing an exporter
Set `otel.exporter` to control where events go:
- `none` – leaves instrumentation active but skips exporting. This is
the
default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
endpoint, protocol, and headers your collector expects:
```toml
[otel]
exporter = { otlp-http = {
endpoint = "https://otel.example.com/v1/logs",
protocol = "binary",
headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
}}
```
- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
metadata headers:
```toml
[otel]
exporter = { otlp-grpc = {
endpoint = "https://otel.example.com:4317",
headers = { "x-otlp-meta" = "abc123" }
}}
```
If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.
If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.
---------
Co-authored-by: Anton Panasenko <apanasenko@openai.com>
2025-09-29 19:30:55 +01:00
|
|
|
|
mount_sse_once_match(&server, first_matcher, sse1).await;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
|
|
|
|
|
let second_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
2025-09-23 17:59:17 +01:00
|
|
|
|
body.contains("You have exceeded the maximum number of tokens")
|
2025-07-31 21:34:32 -07:00
|
|
|
|
};
|
OpenTelemetry events (#2103)
### Title
## otel
Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.
```toml
[otel]
environment = "staging" # defaults to "dev"
exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false # defaults to false; redact prompt text unless explicitly enabled
```
Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.
### Event catalog
Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.
With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):
- `codex.api_request`
- `cf_ray` (optional)
- `attempt`
- `duration_ms`
- `http.response.status_code` (optional)
- `error.message` (failures)
- `codex.sse_event`
- `event.kind`
- `duration_ms`
- `error.message` (failures)
- `input_token_count` (completion only)
- `output_token_count` (completion only)
- `cached_token_count` (completion only, optional)
- `reasoning_token_count` (completion only, optional)
- `tool_token_count` (completion only)
- `codex.user_prompt`
- `prompt_length`
- `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
- `tool_name`
- `call_id`
- `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
- `source` (`config` or `user`)
- `codex.tool_result`
- `tool_name`
- `call_id`
- `arguments`
- `duration_ms` (execution time for the tool)
- `success` (`"true"` or `"false"`)
- `output`
### Choosing an exporter
Set `otel.exporter` to control where events go:
- `none` – leaves instrumentation active but skips exporting. This is
the
default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
endpoint, protocol, and headers your collector expects:
```toml
[otel]
exporter = { otlp-http = {
endpoint = "https://otel.example.com/v1/logs",
protocol = "binary",
headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
}}
```
- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
metadata headers:
```toml
[otel]
exporter = { otlp-grpc = {
endpoint = "https://otel.example.com:4317",
headers = { "x-otlp-meta" = "abc123" }
}}
```
If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.
If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.
---------
Co-authored-by: Anton Panasenko <apanasenko@openai.com>
2025-09-29 19:30:55 +01:00
|
|
|
|
mount_sse_once_match(&server, second_matcher, sse2).await;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
|
|
|
|
|
let third_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains(&format!("\"text\":\"{THIRD_USER_MSG}\""))
|
|
|
|
|
|
};
|
OpenTelemetry events (#2103)
### Title
## otel
Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.
```toml
[otel]
environment = "staging" # defaults to "dev"
exporter = "none" # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false # defaults to false; redact prompt text unless explicitly enabled
```
Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.
### Event catalog
Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.
With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):
- `codex.api_request`
- `cf_ray` (optional)
- `attempt`
- `duration_ms`
- `http.response.status_code` (optional)
- `error.message` (failures)
- `codex.sse_event`
- `event.kind`
- `duration_ms`
- `error.message` (failures)
- `input_token_count` (completion only)
- `output_token_count` (completion only)
- `cached_token_count` (completion only, optional)
- `reasoning_token_count` (completion only, optional)
- `tool_token_count` (completion only)
- `codex.user_prompt`
- `prompt_length`
- `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
- `tool_name`
- `call_id`
- `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
- `source` (`config` or `user`)
- `codex.tool_result`
- `tool_name`
- `call_id`
- `arguments`
- `duration_ms` (execution time for the tool)
- `success` (`"true"` or `"false"`)
- `output`
### Choosing an exporter
Set `otel.exporter` to control where events go:
- `none` – leaves instrumentation active but skips exporting. This is
the
default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
endpoint, protocol, and headers your collector expects:
```toml
[otel]
exporter = { otlp-http = {
endpoint = "https://otel.example.com/v1/logs",
protocol = "binary",
headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
}}
```
- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
metadata headers:
```toml
[otel]
exporter = { otlp-grpc = {
endpoint = "https://otel.example.com:4317",
headers = { "x-otlp-meta" = "abc123" }
}}
```
If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.
If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.
---------
Co-authored-by: Anton Panasenko <apanasenko@openai.com>
2025-09-29 19:30:55 +01:00
|
|
|
|
mount_sse_once_match(&server, third_matcher, sse3).await;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
|
|
|
|
|
// Build config pointing to the mock server and spawn Codex.
|
|
|
|
|
|
let model_provider = ModelProviderInfo {
|
|
|
|
|
|
base_url: Some(format!("{}/v1", server.uri())),
|
|
|
|
|
|
..built_in_model_providers()["openai"].clone()
|
|
|
|
|
|
};
|
|
|
|
|
|
let home = TempDir::new().unwrap();
|
|
|
|
|
|
let mut config = load_default_config_for_test(&home);
|
|
|
|
|
|
config.model_provider = model_provider;
|
2025-09-12 13:07:10 -07:00
|
|
|
|
config.model_auto_compact_token_limit = Some(200_000);
|
2025-08-22 13:10:11 -07:00
|
|
|
|
let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
|
2025-09-11 11:08:51 -07:00
|
|
|
|
let NewConversation {
|
|
|
|
|
|
conversation: codex,
|
|
|
|
|
|
session_configured,
|
|
|
|
|
|
..
|
|
|
|
|
|
} = conversation_manager.new_conversation(config).await.unwrap();
|
|
|
|
|
|
let rollout_path = session_configured.rollout_path;
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
|
|
|
|
|
// 1) Normal user input – should hit server once.
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: "hello world".into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
2025-09-23 17:59:17 +01:00
|
|
|
|
// 2) Summarize – second hit should include the summarization prompt.
|
2025-07-31 21:34:32 -07:00
|
|
|
|
codex.submit(Op::Compact).await.unwrap();
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
|
|
|
|
|
// 3) Next user input – third hit; history should include only the summary.
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: THIRD_USER_MSG.into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
|
|
|
|
|
// Inspect the three captured requests.
|
|
|
|
|
|
let requests = server.received_requests().await.unwrap();
|
|
|
|
|
|
assert_eq!(requests.len(), 3, "expected exactly three requests");
|
|
|
|
|
|
|
|
|
|
|
|
let req1 = &requests[0];
|
|
|
|
|
|
let req2 = &requests[1];
|
|
|
|
|
|
let req3 = &requests[2];
|
|
|
|
|
|
|
|
|
|
|
|
let body1 = req1.body_json::<serde_json::Value>().unwrap();
|
|
|
|
|
|
let body2 = req2.body_json::<serde_json::Value>().unwrap();
|
|
|
|
|
|
let body3 = req3.body_json::<serde_json::Value>().unwrap();
|
|
|
|
|
|
|
2025-09-23 17:59:17 +01:00
|
|
|
|
// Manual compact should keep the baseline developer instructions.
|
2025-07-31 21:34:32 -07:00
|
|
|
|
let instr1 = body1.get("instructions").and_then(|v| v.as_str()).unwrap();
|
|
|
|
|
|
let instr2 = body2.get("instructions").and_then(|v| v.as_str()).unwrap();
|
2025-09-23 17:59:17 +01:00
|
|
|
|
assert_eq!(
|
2025-07-31 21:34:32 -07:00
|
|
|
|
instr1, instr2,
|
2025-09-23 17:59:17 +01:00
|
|
|
|
"manual compact should keep the standard developer instructions"
|
2025-07-31 21:34:32 -07:00
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
// The summarization request should include the injected user input marker.
|
|
|
|
|
|
let input2 = body2.get("input").and_then(|v| v.as_array()).unwrap();
|
|
|
|
|
|
// The last item is the user message created from the injected input.
|
|
|
|
|
|
let last2 = input2.last().unwrap();
|
|
|
|
|
|
assert_eq!(last2.get("type").unwrap().as_str().unwrap(), "message");
|
|
|
|
|
|
assert_eq!(last2.get("role").unwrap().as_str().unwrap(), "user");
|
|
|
|
|
|
let text2 = last2["content"][0]["text"].as_str().unwrap();
|
2025-09-23 17:59:17 +01:00
|
|
|
|
assert_eq!(
|
|
|
|
|
|
text2, SUMMARIZATION_PROMPT,
|
2025-09-12 13:07:10 -07:00
|
|
|
|
"expected summarize trigger, got `{text2}`"
|
|
|
|
|
|
);
|
2025-07-31 21:34:32 -07:00
|
|
|
|
|
2025-09-12 13:07:10 -07:00
|
|
|
|
// Third request must contain the refreshed instructions, bridge summary message and new user msg.
|
2025-07-31 21:34:32 -07:00
|
|
|
|
let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap();
|
2025-09-23 17:59:17 +01:00
|
|
|
|
|
2025-07-31 21:34:32 -07:00
|
|
|
|
assert!(
|
2025-09-12 13:07:10 -07:00
|
|
|
|
input3.len() >= 3,
|
|
|
|
|
|
"expected refreshed context and new user message in third request"
|
2025-07-31 21:34:32 -07:00
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
// Collect all (role, text) message tuples.
|
|
|
|
|
|
let mut messages: Vec<(String, String)> = Vec::new();
|
|
|
|
|
|
for item in input3 {
|
|
|
|
|
|
if item["type"].as_str() == Some("message") {
|
|
|
|
|
|
let role = item["role"].as_str().unwrap_or_default().to_string();
|
|
|
|
|
|
let text = item["content"][0]["text"]
|
|
|
|
|
|
.as_str()
|
|
|
|
|
|
.unwrap_or_default()
|
|
|
|
|
|
.to_string();
|
|
|
|
|
|
messages.push((role, text));
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-09-12 13:07:10 -07:00
|
|
|
|
// No previous assistant messages should remain and the new user message is present.
|
2025-07-31 21:34:32 -07:00
|
|
|
|
let assistant_count = messages.iter().filter(|(r, _)| r == "assistant").count();
|
2025-09-12 13:07:10 -07:00
|
|
|
|
assert_eq!(assistant_count, 0, "assistant history should be cleared");
|
2025-07-31 21:34:32 -07:00
|
|
|
|
assert!(
|
|
|
|
|
|
messages
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
.any(|(r, t)| r == "user" && t == THIRD_USER_MSG),
|
|
|
|
|
|
"third request should include the new user message"
|
|
|
|
|
|
);
|
2025-09-12 13:07:10 -07:00
|
|
|
|
let Some((_, bridge_text)) = messages.iter().find(|(role, text)| {
|
|
|
|
|
|
role == "user"
|
|
|
|
|
|
&& (text.contains("Here were the user messages")
|
|
|
|
|
|
|| text.contains("Here are all the user messages"))
|
|
|
|
|
|
&& text.contains(SUMMARY_TEXT)
|
|
|
|
|
|
}) else {
|
|
|
|
|
|
panic!("expected a bridge message containing the summary");
|
|
|
|
|
|
};
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
bridge_text.contains("hello world"),
|
|
|
|
|
|
"bridge should capture earlier user messages"
|
|
|
|
|
|
);
|
2025-07-31 21:34:32 -07:00
|
|
|
|
assert!(
|
2025-09-23 17:59:17 +01:00
|
|
|
|
!bridge_text.contains(SUMMARIZATION_PROMPT),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
"bridge text should not echo the summarize trigger"
|
2025-07-31 21:34:32 -07:00
|
|
|
|
);
|
|
|
|
|
|
assert!(
|
2025-09-12 13:07:10 -07:00
|
|
|
|
!messages
|
|
|
|
|
|
.iter()
|
2025-09-23 17:59:17 +01:00
|
|
|
|
.any(|(_, text)| text.contains(SUMMARIZATION_PROMPT)),
|
2025-07-31 21:34:32 -07:00
|
|
|
|
"third request should not include the summarize trigger"
|
|
|
|
|
|
);
|
2025-09-11 11:08:51 -07:00
|
|
|
|
|
|
|
|
|
|
// Shut down Codex to flush rollout entries before inspecting the file.
|
|
|
|
|
|
codex.submit(Op::Shutdown).await.unwrap();
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;
|
|
|
|
|
|
|
|
|
|
|
|
// Verify rollout contains APITurn entries for each API call and a Compacted entry.
|
2025-09-12 13:07:10 -07:00
|
|
|
|
println!("rollout path: {}", rollout_path.display());
|
2025-09-11 11:08:51 -07:00
|
|
|
|
let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
|
|
|
|
|
|
panic!(
|
|
|
|
|
|
"failed to read rollout file {}: {e}",
|
|
|
|
|
|
rollout_path.display()
|
|
|
|
|
|
)
|
|
|
|
|
|
});
|
|
|
|
|
|
let mut api_turn_count = 0usize;
|
|
|
|
|
|
let mut saw_compacted_summary = false;
|
|
|
|
|
|
for line in text.lines() {
|
|
|
|
|
|
let trimmed = line.trim();
|
|
|
|
|
|
if trimmed.is_empty() {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
let Ok(entry): Result<RolloutLine, _> = serde_json::from_str(trimmed) else {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
};
|
|
|
|
|
|
match entry.item {
|
|
|
|
|
|
RolloutItem::TurnContext(_) => {
|
|
|
|
|
|
api_turn_count += 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
RolloutItem::Compacted(ci) => {
|
|
|
|
|
|
if ci.message == SUMMARY_TEXT {
|
|
|
|
|
|
saw_compacted_summary = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
_ => {}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
api_turn_count == 3,
|
|
|
|
|
|
"expected three APITurn entries in rollout"
|
|
|
|
|
|
);
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
saw_compacted_summary,
|
|
|
|
|
|
"expected a Compacted entry containing the summarizer output"
|
|
|
|
|
|
);
|
2025-07-31 21:34:32 -07:00
|
|
|
|
}
|
2025-09-12 13:07:10 -07:00
|
|
|
|
|
2025-09-14 16:20:25 -07:00
|
|
|
|
// Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts.
|
|
|
|
|
|
#[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
|
|
|
|
|
|
#[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
|
2025-09-12 13:07:10 -07:00
|
|
|
|
async fn auto_compact_runs_after_token_limit_hit() {
|
2025-09-25 13:11:14 -07:00
|
|
|
|
skip_if_no_network!();
|
2025-09-12 13:07:10 -07:00
|
|
|
|
|
|
|
|
|
|
let server = start_mock_server().await;
|
|
|
|
|
|
|
|
|
|
|
|
let sse1 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m1", FIRST_REPLY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r1", 70_000),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let sse2 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m2", "SECOND_REPLY"),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r2", 330_000),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let sse3 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r3", 200),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let first_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains(FIRST_AUTO_MSG)
|
|
|
|
|
|
&& !body.contains(SECOND_AUTO_MSG)
|
|
|
|
|
|
&& !body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(first_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse1))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let second_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains(SECOND_AUTO_MSG)
|
|
|
|
|
|
&& body.contains(FIRST_AUTO_MSG)
|
|
|
|
|
|
&& !body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(second_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse2))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let third_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(third_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse3))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let model_provider = ModelProviderInfo {
|
|
|
|
|
|
base_url: Some(format!("{}/v1", server.uri())),
|
|
|
|
|
|
..built_in_model_providers()["openai"].clone()
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
let home = TempDir::new().unwrap();
|
|
|
|
|
|
let mut config = load_default_config_for_test(&home);
|
|
|
|
|
|
config.model_provider = model_provider;
|
|
|
|
|
|
config.model_auto_compact_token_limit = Some(200_000);
|
|
|
|
|
|
let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
|
|
|
|
|
|
let codex = conversation_manager
|
|
|
|
|
|
.new_conversation(config)
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap()
|
|
|
|
|
|
.conversation;
|
|
|
|
|
|
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: FIRST_AUTO_MSG.into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
2025-09-14 16:20:25 -07:00
|
|
|
|
|
2025-09-12 13:07:10 -07:00
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: SECOND_AUTO_MSG.into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
2025-09-14 16:20:25 -07:00
|
|
|
|
|
2025-09-12 13:07:10 -07:00
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
// wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
|
|
|
|
|
let requests = server.received_requests().await.unwrap();
|
2025-09-14 16:20:25 -07:00
|
|
|
|
assert!(
|
|
|
|
|
|
requests.len() >= 3,
|
|
|
|
|
|
"auto compact should add at least a third request, got {}",
|
|
|
|
|
|
requests.len()
|
|
|
|
|
|
);
|
|
|
|
|
|
let is_auto_compact = |req: &wiremock::Request| {
|
|
|
|
|
|
std::str::from_utf8(&req.body)
|
|
|
|
|
|
.unwrap_or("")
|
|
|
|
|
|
.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
let auto_compact_count = requests.iter().filter(|req| is_auto_compact(req)).count();
|
|
|
|
|
|
assert_eq!(
|
|
|
|
|
|
auto_compact_count, 1,
|
|
|
|
|
|
"expected exactly one auto compact request"
|
|
|
|
|
|
);
|
|
|
|
|
|
let auto_compact_index = requests
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
.enumerate()
|
|
|
|
|
|
.find_map(|(idx, req)| is_auto_compact(req).then_some(idx))
|
|
|
|
|
|
.expect("auto compact request missing");
|
|
|
|
|
|
assert_eq!(
|
|
|
|
|
|
auto_compact_index, 2,
|
|
|
|
|
|
"auto compact should add a third request"
|
|
|
|
|
|
);
|
2025-09-12 13:07:10 -07:00
|
|
|
|
|
2025-09-23 17:59:17 +01:00
|
|
|
|
let body_first = requests[0].body_json::<serde_json::Value>().unwrap();
|
2025-09-14 16:20:25 -07:00
|
|
|
|
let body3 = requests[auto_compact_index]
|
|
|
|
|
|
.body_json::<serde_json::Value>()
|
|
|
|
|
|
.unwrap();
|
2025-09-12 13:07:10 -07:00
|
|
|
|
let instructions = body3
|
|
|
|
|
|
.get("instructions")
|
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
|
.unwrap_or_default();
|
2025-09-23 17:59:17 +01:00
|
|
|
|
let baseline_instructions = body_first
|
|
|
|
|
|
.get("instructions")
|
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
|
.unwrap_or_default()
|
|
|
|
|
|
.to_string();
|
|
|
|
|
|
assert_eq!(
|
|
|
|
|
|
instructions, baseline_instructions,
|
|
|
|
|
|
"auto compact should keep the standard developer instructions",
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap();
|
|
|
|
|
|
let last3 = input3
|
|
|
|
|
|
.last()
|
|
|
|
|
|
.expect("auto compact request should append a user message");
|
|
|
|
|
|
assert_eq!(last3.get("type").and_then(|v| v.as_str()), Some("message"));
|
|
|
|
|
|
assert_eq!(last3.get("role").and_then(|v| v.as_str()), Some("user"));
|
|
|
|
|
|
let last_text = last3
|
|
|
|
|
|
.get("content")
|
|
|
|
|
|
.and_then(|v| v.as_array())
|
|
|
|
|
|
.and_then(|items| items.first())
|
|
|
|
|
|
.and_then(|item| item.get("text"))
|
|
|
|
|
|
.and_then(|text| text.as_str())
|
|
|
|
|
|
.unwrap_or_default();
|
|
|
|
|
|
assert_eq!(
|
|
|
|
|
|
last_text, SUMMARIZATION_PROMPT,
|
|
|
|
|
|
"auto compact should send the summarization prompt as a user message",
|
2025-09-12 13:07:10 -07:00
|
|
|
|
);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
|
|
|
|
|
async fn auto_compact_persists_rollout_entries() {
|
2025-09-25 13:11:14 -07:00
|
|
|
|
skip_if_no_network!();
|
2025-09-12 13:07:10 -07:00
|
|
|
|
|
|
|
|
|
|
let server = start_mock_server().await;
|
|
|
|
|
|
|
|
|
|
|
|
let sse1 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m1", FIRST_REPLY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r1", 70_000),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let sse2 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m2", "SECOND_REPLY"),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r2", 330_000),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let sse3 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r3", 200),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let first_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains(FIRST_AUTO_MSG)
|
|
|
|
|
|
&& !body.contains(SECOND_AUTO_MSG)
|
|
|
|
|
|
&& !body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(first_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse1))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let second_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains(SECOND_AUTO_MSG)
|
|
|
|
|
|
&& body.contains(FIRST_AUTO_MSG)
|
|
|
|
|
|
&& !body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(second_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse2))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let third_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(third_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse3))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let model_provider = ModelProviderInfo {
|
|
|
|
|
|
base_url: Some(format!("{}/v1", server.uri())),
|
|
|
|
|
|
..built_in_model_providers()["openai"].clone()
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
let home = TempDir::new().unwrap();
|
|
|
|
|
|
let mut config = load_default_config_for_test(&home);
|
|
|
|
|
|
config.model_provider = model_provider;
|
|
|
|
|
|
let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
|
|
|
|
|
|
let NewConversation {
|
|
|
|
|
|
conversation: codex,
|
|
|
|
|
|
session_configured,
|
|
|
|
|
|
..
|
|
|
|
|
|
} = conversation_manager.new_conversation(config).await.unwrap();
|
|
|
|
|
|
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: FIRST_AUTO_MSG.into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: SECOND_AUTO_MSG.into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
|
|
|
|
|
codex.submit(Op::Shutdown).await.unwrap();
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;
|
|
|
|
|
|
|
|
|
|
|
|
let rollout_path = session_configured.rollout_path;
|
|
|
|
|
|
let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
|
|
|
|
|
|
panic!(
|
|
|
|
|
|
"failed to read rollout file {}: {e}",
|
|
|
|
|
|
rollout_path.display()
|
|
|
|
|
|
)
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
let mut turn_context_count = 0usize;
|
|
|
|
|
|
for line in text.lines() {
|
|
|
|
|
|
let trimmed = line.trim();
|
|
|
|
|
|
if trimmed.is_empty() {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
let Ok(entry): Result<RolloutLine, _> = serde_json::from_str(trimmed) else {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
};
|
|
|
|
|
|
match entry.item {
|
|
|
|
|
|
RolloutItem::TurnContext(_) => {
|
|
|
|
|
|
turn_context_count += 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
RolloutItem::Compacted(_) => {}
|
|
|
|
|
|
_ => {}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
turn_context_count >= 2,
|
|
|
|
|
|
"expected at least two turn context entries, got {turn_context_count}"
|
|
|
|
|
|
);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
|
|
|
|
|
async fn auto_compact_stops_after_failed_attempt() {
|
2025-09-25 13:11:14 -07:00
|
|
|
|
skip_if_no_network!();
|
2025-09-12 13:07:10 -07:00
|
|
|
|
|
|
|
|
|
|
let server = start_mock_server().await;
|
|
|
|
|
|
|
|
|
|
|
|
let sse1 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m1", FIRST_REPLY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r1", 500),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let sse2 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m2", SUMMARY_TEXT),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r2", 50),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let sse3 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m3", STILL_TOO_BIG_REPLY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r3", 500),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
let first_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains(FIRST_AUTO_MSG)
|
|
|
|
|
|
&& !body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(first_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse1.clone()))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let second_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(second_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse2.clone()))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let third_matcher = |req: &wiremock::Request| {
|
|
|
|
|
|
let body = std::str::from_utf8(&req.body).unwrap_or("");
|
|
|
|
|
|
!body.contains("You have exceeded the maximum number of tokens")
|
|
|
|
|
|
&& body.contains(SUMMARY_TEXT)
|
|
|
|
|
|
};
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.and(third_matcher)
|
2025-09-22 07:50:41 -07:00
|
|
|
|
.respond_with(sse_response(sse3.clone()))
|
2025-09-12 13:07:10 -07:00
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let model_provider = ModelProviderInfo {
|
|
|
|
|
|
base_url: Some(format!("{}/v1", server.uri())),
|
|
|
|
|
|
..built_in_model_providers()["openai"].clone()
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
let home = TempDir::new().unwrap();
|
|
|
|
|
|
let mut config = load_default_config_for_test(&home);
|
|
|
|
|
|
config.model_provider = model_provider;
|
|
|
|
|
|
config.model_auto_compact_token_limit = Some(200);
|
|
|
|
|
|
let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
|
|
|
|
|
|
let codex = conversation_manager
|
|
|
|
|
|
.new_conversation(config)
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap()
|
|
|
|
|
|
.conversation;
|
|
|
|
|
|
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: FIRST_AUTO_MSG.into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
|
|
let error_event = wait_for_event(&codex, |ev| matches!(ev, EventMsg::Error(_))).await;
|
|
|
|
|
|
let EventMsg::Error(ErrorEvent { message }) = error_event else {
|
|
|
|
|
|
panic!("expected error event");
|
|
|
|
|
|
};
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
message.contains("limit"),
|
|
|
|
|
|
"error message should include limit information: {message}"
|
|
|
|
|
|
);
|
|
|
|
|
|
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
|
|
|
|
|
|
|
|
|
|
|
let requests = server.received_requests().await.unwrap();
|
|
|
|
|
|
assert_eq!(
|
|
|
|
|
|
requests.len(),
|
|
|
|
|
|
3,
|
|
|
|
|
|
"auto compact should attempt at most one summarization before erroring"
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
let last_body = requests[2].body_json::<serde_json::Value>().unwrap();
|
2025-09-23 17:59:17 +01:00
|
|
|
|
let input = last_body
|
|
|
|
|
|
.get("input")
|
|
|
|
|
|
.and_then(|v| v.as_array())
|
|
|
|
|
|
.unwrap_or_else(|| panic!("unexpected request format: {last_body}"));
|
|
|
|
|
|
let contains_prompt = input.iter().any(|item| {
|
|
|
|
|
|
item.get("type").and_then(|v| v.as_str()) == Some("message")
|
|
|
|
|
|
&& item.get("role").and_then(|v| v.as_str()) == Some("user")
|
|
|
|
|
|
&& item
|
|
|
|
|
|
.get("content")
|
|
|
|
|
|
.and_then(|v| v.as_array())
|
|
|
|
|
|
.and_then(|items| items.first())
|
|
|
|
|
|
.and_then(|entry| entry.get("text"))
|
|
|
|
|
|
.and_then(|text| text.as_str())
|
|
|
|
|
|
.map(|text| text == SUMMARIZATION_PROMPT)
|
|
|
|
|
|
.unwrap_or(false)
|
|
|
|
|
|
});
|
2025-09-12 13:07:10 -07:00
|
|
|
|
assert!(
|
2025-09-23 17:59:17 +01:00
|
|
|
|
!contains_prompt,
|
|
|
|
|
|
"third request should be the follow-up turn, not another summarization",
|
2025-09-12 13:07:10 -07:00
|
|
|
|
);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
|
|
|
|
|
async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_events() {
|
2025-09-25 13:11:14 -07:00
|
|
|
|
skip_if_no_network!();
|
2025-09-12 13:07:10 -07:00
|
|
|
|
|
|
|
|
|
|
let server = start_mock_server().await;
|
|
|
|
|
|
|
|
|
|
|
|
let sse1 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m1", FIRST_REPLY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r1", 500),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
let sse2 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m2", FIRST_AUTO_SUMMARY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r2", 50),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
let sse3 = sse(vec![
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
|
|
|
|
|
|
ev_completed_with_tokens("r3", 150),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
let sse4 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m4", SECOND_LARGE_REPLY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r4", 450),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
let sse5 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m5", SECOND_AUTO_SUMMARY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r5", 60),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
let sse6 = sse(vec![
|
|
|
|
|
|
ev_assistant_message("m6", FINAL_REPLY),
|
2025-09-22 07:50:41 -07:00
|
|
|
|
ev_completed_with_tokens("r6", 120),
|
2025-09-12 13:07:10 -07:00
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
|
|
#[derive(Clone)]
|
|
|
|
|
|
struct SeqResponder {
|
|
|
|
|
|
bodies: Arc<Vec<String>>,
|
|
|
|
|
|
calls: Arc<AtomicUsize>,
|
|
|
|
|
|
requests: Arc<Mutex<Vec<Vec<u8>>>>,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
impl SeqResponder {
|
|
|
|
|
|
fn new(bodies: Vec<String>) -> Self {
|
|
|
|
|
|
Self {
|
|
|
|
|
|
bodies: Arc::new(bodies),
|
|
|
|
|
|
calls: Arc::new(AtomicUsize::new(0)),
|
|
|
|
|
|
requests: Arc::new(Mutex::new(Vec::new())),
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fn recorded_requests(&self) -> Vec<Vec<u8>> {
|
|
|
|
|
|
self.requests.lock().unwrap().clone()
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
impl Respond for SeqResponder {
|
|
|
|
|
|
fn respond(&self, req: &Request) -> ResponseTemplate {
|
|
|
|
|
|
let idx = self.calls.fetch_add(1, Ordering::SeqCst);
|
|
|
|
|
|
self.requests.lock().unwrap().push(req.body.clone());
|
|
|
|
|
|
let body = self
|
|
|
|
|
|
.bodies
|
|
|
|
|
|
.get(idx)
|
|
|
|
|
|
.unwrap_or_else(|| panic!("unexpected request index {idx}"))
|
|
|
|
|
|
.clone();
|
|
|
|
|
|
ResponseTemplate::new(200)
|
|
|
|
|
|
.insert_header("content-type", "text/event-stream")
|
|
|
|
|
|
.set_body_raw(body, "text/event-stream")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
let responder = SeqResponder::new(vec![sse1, sse2, sse3, sse4, sse5, sse6]);
|
|
|
|
|
|
Mock::given(method("POST"))
|
|
|
|
|
|
.and(path("/v1/responses"))
|
|
|
|
|
|
.respond_with(responder.clone())
|
|
|
|
|
|
.expect(6)
|
|
|
|
|
|
.mount(&server)
|
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
|
|
let model_provider = ModelProviderInfo {
|
|
|
|
|
|
base_url: Some(format!("{}/v1", server.uri())),
|
|
|
|
|
|
..built_in_model_providers()["openai"].clone()
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
let home = TempDir::new().unwrap();
|
|
|
|
|
|
let mut config = load_default_config_for_test(&home);
|
|
|
|
|
|
config.model_provider = model_provider;
|
|
|
|
|
|
config.model_auto_compact_token_limit = Some(200);
|
|
|
|
|
|
let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
|
|
|
|
|
|
let codex = conversation_manager
|
|
|
|
|
|
.new_conversation(config)
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap()
|
|
|
|
|
|
.conversation;
|
|
|
|
|
|
|
|
|
|
|
|
codex
|
|
|
|
|
|
.submit(Op::UserInput {
|
|
|
|
|
|
items: vec![InputItem::Text {
|
|
|
|
|
|
text: MULTI_AUTO_MSG.into(),
|
|
|
|
|
|
}],
|
|
|
|
|
|
})
|
|
|
|
|
|
.await
|
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
2025-09-18 16:34:16 +01:00
|
|
|
|
let mut auto_compact_lifecycle_events = Vec::new();
|
2025-09-12 13:07:10 -07:00
|
|
|
|
loop {
|
|
|
|
|
|
let event = codex.next_event().await.unwrap();
|
2025-09-18 16:34:16 +01:00
|
|
|
|
if event.id.starts_with("auto-compact-")
|
|
|
|
|
|
&& matches!(
|
|
|
|
|
|
event.msg,
|
|
|
|
|
|
EventMsg::TaskStarted(_) | EventMsg::TaskComplete(_)
|
|
|
|
|
|
)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto_compact_lifecycle_events.push(event);
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
2025-09-12 13:07:10 -07:00
|
|
|
|
if let EventMsg::TaskComplete(_) = &event.msg
|
|
|
|
|
|
&& !event.id.starts_with("auto-compact-")
|
|
|
|
|
|
{
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-09-18 16:34:16 +01:00
|
|
|
|
assert!(
|
|
|
|
|
|
auto_compact_lifecycle_events.is_empty(),
|
|
|
|
|
|
"auto compact should not emit task lifecycle events"
|
|
|
|
|
|
);
|
|
|
|
|
|
|
2025-09-12 13:07:10 -07:00
|
|
|
|
let request_bodies: Vec<String> = responder
|
|
|
|
|
|
.recorded_requests()
|
|
|
|
|
|
.into_iter()
|
|
|
|
|
|
.map(|body| String::from_utf8(body).unwrap_or_default())
|
|
|
|
|
|
.collect();
|
|
|
|
|
|
assert_eq!(
|
|
|
|
|
|
request_bodies.len(),
|
|
|
|
|
|
6,
|
|
|
|
|
|
"expected six requests including two auto compactions"
|
|
|
|
|
|
);
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
request_bodies[0].contains(MULTI_AUTO_MSG),
|
|
|
|
|
|
"first request should contain the user input"
|
|
|
|
|
|
);
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
request_bodies[1].contains("You have exceeded the maximum number of tokens"),
|
2025-09-23 17:59:17 +01:00
|
|
|
|
"first auto compact request should include the summarization prompt"
|
2025-09-12 13:07:10 -07:00
|
|
|
|
);
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
request_bodies[3].contains(&format!("unsupported call: {DUMMY_FUNCTION_NAME}")),
|
|
|
|
|
|
"function call output should be sent before the second auto compact"
|
|
|
|
|
|
);
|
|
|
|
|
|
assert!(
|
|
|
|
|
|
request_bodies[4].contains("You have exceeded the maximum number of tokens"),
|
2025-09-23 17:59:17 +01:00
|
|
|
|
"second auto compact request should include the summarization prompt"
|
2025-09-12 13:07:10 -07:00
|
|
|
|
);
|
|
|
|
|
|
}
|