codex-rs/core/tests/suite/compact.rs

use codex_core::CodexAuth;
use codex_core::ConversationManager;
use codex_core::ModelProviderInfo;
use codex_core::NewConversation;
use codex_core::built_in_model_providers;
use codex_core::protocol::ErrorEvent;
use codex_core::protocol::EventMsg;
use codex_core::protocol::Op;
use codex_core::protocol::RolloutItem;
use codex_core::protocol::RolloutLine;
use codex_protocol::user_input::UserInput;
use core_test_support::load_default_config_for_test;
use core_test_support::skip_if_no_network;
use core_test_support::wait_for_event;
use tempfile::TempDir;

use codex_core::codex::compact::SUMMARIZATION_PROMPT;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_completed_with_tokens;
use core_test_support::responses::ev_function_call;
use core_test_support::responses::mount_sse_once;
use core_test_support::responses::mount_sse_once_match;
use core_test_support::responses::mount_sse_sequence;
use core_test_support::responses::sse;
use core_test_support::responses::sse_failed;
use core_test_support::responses::start_mock_server;
use pretty_assertions::assert_eq;
// --- Test helpers -----------------------------------------------------------

/// Assistant text the mock server sends back for an ordinary first turn.
pub(super) const FIRST_REPLY: &str = "FIRST_REPLY";
/// Assistant text the mock server sends back when playing the summarizer.
pub(super) const SUMMARY_TEXT: &str = "SUMMARY_ONLY_CONTEXT";
/// User text submitted as the turn that follows a manual compaction.
const THIRD_USER_MSG: &str = "next turn";
/// Summary text the mock server returns for an auto-compaction request.
const AUTO_SUMMARY_TEXT: &str = "AUTO_SUMMARY";
/// First user message in the auto-compaction token-limit scenarios.
const FIRST_AUTO_MSG: &str = "token limit start";
/// Second user message in the auto-compaction token-limit scenarios.
const SECOND_AUTO_MSG: &str = "token limit push";
/// Assistant reply for the turn that is still over the limit after compacting.
const STILL_TOO_BIG_REPLY: &str = "STILL_TOO_BIG";
/// User message that starts the multi-compaction scenario.
const MULTI_AUTO_MSG: &str = "multi auto";
/// Second over-limit assistant reply in the multi-compaction scenario.
const SECOND_LARGE_REPLY: &str = "SECOND_LARGE_REPLY";
/// Summary returned for the first auto-compaction of the multi-compaction scenario.
const FIRST_AUTO_SUMMARY: &str = "FIRST_AUTO_SUMMARY";
/// Summary returned for the second auto-compaction of the multi-compaction scenario.
const SECOND_AUTO_SUMMARY: &str = "SECOND_AUTO_SUMMARY";
/// Final assistant reply once the conversation fits again.
const FINAL_REPLY: &str = "FINAL_REPLY";
/// Error text passed to `sse_failed` together with the `context_length_exceeded` code.
const CONTEXT_LIMIT_MESSAGE: &str =
    "Your input exceeds the context window of this model. Please adjust your input and try again.";
/// Function name emitted by the mocked function-call events; the tests expect
/// it to be reported back as an unsupported call.
const DUMMY_FUNCTION_NAME: &str = "unsupported_tool";
/// Call id paired with `DUMMY_FUNCTION_NAME` in mocked function-call events.
const DUMMY_CALL_ID: &str = "call-multi-auto";
/// User message for the function-call-over-limit scenario.
const FUNCTION_CALL_LIMIT_MSG: &str = "function call limit push";

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn summarize_context_three_requests_and_instructions() {
    skip_if_no_network!();

    // Set up a mock server that we can inspect after the run.
    let server = start_mock_server().await;

    // SSE 1: assistant replies normally so it is recorded in history.
    let sse1 = sse(vec![
        ev_assistant_message("m1", FIRST_REPLY),
        ev_completed("r1"),
    ]);

    // SSE 2: summarizer returns a summary message.
    let sse2 = sse(vec![
        ev_assistant_message("m2", SUMMARY_TEXT),
        ev_completed("r2"),
    ]);

    // SSE 3: minimal completed; we only need to capture the request body.
    let sse3 = sse(vec![ev_completed("r3")]);

    // Mount three expectations, one per request, matched by body content.
    let first_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains("\"text\":\"hello world\"")
            && !body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, first_matcher, sse1).await;

    let second_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, second_matcher, sse2).await;

    let third_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains(&format!("\"text\":\"{THIRD_USER_MSG}\""))
    };
    mount_sse_once_match(&server, third_matcher, sse3).await;

    // Build config pointing to the mock server and spawn Codex.
    let model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };
    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
    config.model_auto_compact_token_limit = Some(200_000);
    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
    let NewConversation {
        conversation: codex,
        session_configured,
        ..
    } = conversation_manager.new_conversation(config).await.unwrap();
    let rollout_path = session_configured.rollout_path;

    // 1) Normal user input – should hit server once.
    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: "hello world".into(),
            }],
        })
        .await
        .unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    // 2) Summarize – second hit should include the summarization prompt.
    codex.submit(Op::Compact).await.unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    // 3) Next user input – third hit; history should include only the summary.
    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: THIRD_USER_MSG.into(),
            }],
        })
        .await
        .unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    // Inspect the three captured requests.
    let requests = server.received_requests().await.unwrap();
    assert_eq!(requests.len(), 3, "expected exactly three requests");

    let req1 = &requests[0];
    let req2 = &requests[1];
    let req3 = &requests[2];

    let body1 = req1.body_json::<serde_json::Value>().unwrap();
    let body2 = req2.body_json::<serde_json::Value>().unwrap();
    let body3 = req3.body_json::<serde_json::Value>().unwrap();

    // Manual compact should keep the baseline developer instructions.
    let instr1 = body1.get("instructions").and_then(|v| v.as_str()).unwrap();
    let instr2 = body2.get("instructions").and_then(|v| v.as_str()).unwrap();
    assert_eq!(
        instr1, instr2,
        "manual compact should keep the standard developer instructions"
    );

    // The summarization request should include the injected user input marker.
    let input2 = body2.get("input").and_then(|v| v.as_array()).unwrap();
    // The last item is the user message created from the injected input.
    let last2 = input2.last().unwrap();
    assert_eq!(last2.get("type").unwrap().as_str().unwrap(), "message");
    assert_eq!(last2.get("role").unwrap().as_str().unwrap(), "user");
    let text2 = last2["content"][0]["text"].as_str().unwrap();
    assert_eq!(
        text2, SUMMARIZATION_PROMPT,
        "expected summarize trigger, got `{text2}`"
    );

    // Third request must contain the refreshed instructions, bridge summary message and new user msg.
    let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap();

    assert!(
        input3.len() >= 3,
        "expected refreshed context and new user message in third request"
    );

    // Collect all (role, text) message tuples.
    let mut messages: Vec<(String, String)> = Vec::new();
    for item in input3 {
        if item["type"].as_str() == Some("message") {
            let role = item["role"].as_str().unwrap_or_default().to_string();
            let text = item["content"][0]["text"]
                .as_str()
                .unwrap_or_default()
                .to_string();
            messages.push((role, text));
        }
    }

    // No previous assistant messages should remain and the new user message is present.
    let assistant_count = messages.iter().filter(|(r, _)| r == "assistant").count();
    assert_eq!(assistant_count, 0, "assistant history should be cleared");
    assert!(
        messages
            .iter()
            .any(|(r, t)| r == "user" && t == THIRD_USER_MSG),
        "third request should include the new user message"
    );
    let Some((_, bridge_text)) = messages.iter().find(|(role, text)| {
        role == "user"
            && (text.contains("Here were the user messages")
                || text.contains("Here are all the user messages"))
            && text.contains(SUMMARY_TEXT)
    }) else {
        panic!("expected a bridge message containing the summary");
    };
    assert!(
        bridge_text.contains("hello world"),
        "bridge should capture earlier user messages"
    );
    assert!(
        !bridge_text.contains(SUMMARIZATION_PROMPT),
        "bridge text should not echo the summarize trigger"
    );
    assert!(
        !messages
            .iter()
            .any(|(_, text)| text.contains(SUMMARIZATION_PROMPT)),
        "third request should not include the summarize trigger"
    );

    // Shut down Codex to flush rollout entries before inspecting the file.
    codex.submit(Op::Shutdown).await.unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;

    // Verify rollout contains APITurn entries for each API call and a Compacted entry.
    println!("rollout path: {}", rollout_path.display());
    let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
        panic!(
            "failed to read rollout file {}: {e}",
            rollout_path.display()
        )
    });
    let mut api_turn_count = 0usize;
    let mut saw_compacted_summary = false;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let Ok(entry): Result<RolloutLine, _> = serde_json::from_str(trimmed) else {
            continue;
        };
        match entry.item {
            RolloutItem::TurnContext(_) => {
                api_turn_count += 1;
            }
            RolloutItem::Compacted(ci) => {
                if ci.message == SUMMARY_TEXT {
                    saw_compacted_summary = true;
                }
            }
            _ => {}
        }
    }

    assert!(
        api_turn_count == 3,
        "expected three APITurn entries in rollout"
    );
    assert!(
        saw_compacted_summary,
        "expected a Compacted entry containing the summarizer output"
    );
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn manual_compact_uses_custom_prompt() {
    skip_if_no_network!();

    let server = start_mock_server().await;
    let sse_stream = sse(vec![ev_completed("r1")]);
    mount_sse_once(&server, sse_stream).await;

    let custom_prompt = "Use this compact prompt instead";

    let model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };
    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
    config.compact_prompt = Some(custom_prompt.to_string());

    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
    let codex = conversation_manager
        .new_conversation(config)
        .await
        .expect("create conversation")
        .conversation;

    codex.submit(Op::Compact).await.expect("trigger compact");
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    let requests = server.received_requests().await.expect("collect requests");
    let body = requests
        .iter()
        .find_map(|req| req.body_json::<serde_json::Value>().ok())
        .expect("summary request body");

    let input = body
        .get("input")
        .and_then(|v| v.as_array())
        .expect("input array");
    let mut found_custom_prompt = false;
    let mut found_default_prompt = false;

    for item in input {
        if item["type"].as_str() != Some("message") {
            continue;
        }
        let text = item["content"][0]["text"].as_str().unwrap_or_default();
        if text == custom_prompt {
            found_custom_prompt = true;
        }
        if text == SUMMARIZATION_PROMPT {
            found_default_prompt = true;
        }
    }

    assert!(found_custom_prompt, "custom prompt should be injected");
    assert!(!found_default_prompt, "default prompt should be replaced");
}

// Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts.
#[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
#[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
async fn auto_compact_runs_after_token_limit_hit() {
    skip_if_no_network!();

    let server = start_mock_server().await;

    let sse1 = sse(vec![
        ev_assistant_message("m1", FIRST_REPLY),
        ev_completed_with_tokens("r1", 70_000),
    ]);

    let sse2 = sse(vec![
        ev_assistant_message("m2", "SECOND_REPLY"),
        ev_completed_with_tokens("r2", 330_000),
    ]);

    let sse3 = sse(vec![
        ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
        ev_completed_with_tokens("r3", 200),
    ]);

    let first_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains(FIRST_AUTO_MSG)
            && !body.contains(SECOND_AUTO_MSG)
            && !body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, first_matcher, sse1).await;

    let second_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains(SECOND_AUTO_MSG)
            && body.contains(FIRST_AUTO_MSG)
            && !body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, second_matcher, sse2).await;

    let third_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, third_matcher, sse3).await;

    let model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };

    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
    config.model_auto_compact_token_limit = Some(200_000);
    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
    let codex = conversation_manager
        .new_conversation(config)
        .await
        .unwrap()
        .conversation;

    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: FIRST_AUTO_MSG.into(),
            }],
        })
        .await
        .unwrap();

    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: SECOND_AUTO_MSG.into(),
            }],
        })
        .await
        .unwrap();

    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
    // wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    let requests = server.received_requests().await.unwrap();
    assert!(
        requests.len() >= 3,
        "auto compact should add at least a third request, got {}",
        requests.len()
    );
    let is_auto_compact = |req: &wiremock::Request| {
        std::str::from_utf8(&req.body)
            .unwrap_or("")
            .contains("You have exceeded the maximum number of tokens")
    };
    let auto_compact_count = requests.iter().filter(|req| is_auto_compact(req)).count();
    assert_eq!(
        auto_compact_count, 1,
        "expected exactly one auto compact request"
    );
    let auto_compact_index = requests
        .iter()
        .enumerate()
        .find_map(|(idx, req)| is_auto_compact(req).then_some(idx))
        .expect("auto compact request missing");
    assert_eq!(
        auto_compact_index, 2,
        "auto compact should add a third request"
    );

    let body_first = requests[0].body_json::<serde_json::Value>().unwrap();
    let body3 = requests[auto_compact_index]
        .body_json::<serde_json::Value>()
        .unwrap();
    let instructions = body3
        .get("instructions")
        .and_then(|v| v.as_str())
        .unwrap_or_default();
    let baseline_instructions = body_first
        .get("instructions")
        .and_then(|v| v.as_str())
        .unwrap_or_default()
        .to_string();
    assert_eq!(
        instructions, baseline_instructions,
        "auto compact should keep the standard developer instructions",
    );

    let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap();
    let last3 = input3
        .last()
        .expect("auto compact request should append a user message");
    assert_eq!(last3.get("type").and_then(|v| v.as_str()), Some("message"));
    assert_eq!(last3.get("role").and_then(|v| v.as_str()), Some("user"));
    let last_text = last3
        .get("content")
        .and_then(|v| v.as_array())
        .and_then(|items| items.first())
        .and_then(|item| item.get("text"))
        .and_then(|text| text.as_str())
        .unwrap_or_default();
    assert_eq!(
        last_text, SUMMARIZATION_PROMPT,
        "auto compact should send the summarization prompt as a user message",
    );
}

/// Runs two user turns where the second mocked response reports 330_000 tokens
/// against a 200_000-token auto-compact limit, shuts the conversation down,
/// and checks that the rollout file contains at least two `TurnContext`
/// entries.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn auto_compact_persists_rollout_entries() {
    skip_if_no_network!();

    let server = start_mock_server().await;

    let sse1 = sse(vec![
        ev_assistant_message("m1", FIRST_REPLY),
        ev_completed_with_tokens("r1", 70_000),
    ]);

    let sse2 = sse(vec![
        ev_assistant_message("m2", "SECOND_REPLY"),
        ev_completed_with_tokens("r2", 330_000),
    ]);

    let sse3 = sse(vec![
        ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
        ev_completed_with_tokens("r3", 200),
    ]);

    let first_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains(FIRST_AUTO_MSG)
            && !body.contains(SECOND_AUTO_MSG)
            && !body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, first_matcher, sse1).await;

    let second_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains(SECOND_AUTO_MSG)
            && body.contains(FIRST_AUTO_MSG)
            && !body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, second_matcher, sse2).await;

    let third_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, third_matcher, sse3).await;

    let model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };

    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
    // Pin the limit explicitly (same value as the sibling auto-compact test)
    // so the 330_000-token response is over the limit regardless of the
    // default configuration.
    config.model_auto_compact_token_limit = Some(200_000);
    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
    let NewConversation {
        conversation: codex,
        session_configured,
        ..
    } = conversation_manager.new_conversation(config).await.unwrap();

    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: FIRST_AUTO_MSG.into(),
            }],
        })
        .await
        .unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: SECOND_AUTO_MSG.into(),
            }],
        })
        .await
        .unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    // Shut down before reading the rollout file so it is inspected only after
    // the conversation has finished.
    codex.submit(Op::Shutdown).await.unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;

    let rollout_path = session_configured.rollout_path;
    let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
        panic!(
            "failed to read rollout file {}: {e}",
            rollout_path.display()
        )
    });

    // Count TurnContext entries; blank and unparseable lines are skipped.
    let mut turn_context_count = 0usize;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let Ok(entry): Result<RolloutLine, _> = serde_json::from_str(trimmed) else {
            continue;
        };
        if matches!(entry.item, RolloutItem::TurnContext(_)) {
            turn_context_count += 1;
        }
    }

    assert!(
        turn_context_count >= 2,
        "expected at least two turn context entries, got {turn_context_count}"
    );
}

/// With a 200-token auto-compact limit, the first mocked reply reports 500
/// tokens, the summary reports 50, and the follow-up reports 500 again.
///
/// The test expects an `Error` event whose message mentions "limit", followed
/// by `TaskComplete`, exactly three requests in total, and a third request
/// that does not carry the summarization prompt as a user message.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn auto_compact_stops_after_failed_attempt() {
    skip_if_no_network!();

    let server = start_mock_server().await;

    let sse1 = sse(vec![
        ev_assistant_message("m1", FIRST_REPLY),
        ev_completed_with_tokens("r1", 500),
    ]);

    let sse2 = sse(vec![
        ev_assistant_message("m2", SUMMARY_TEXT),
        ev_completed_with_tokens("r2", 50),
    ]);

    let sse3 = sse(vec![
        ev_assistant_message("m3", STILL_TOO_BIG_REPLY),
        ev_completed_with_tokens("r3", 500),
    ]);

    // Each payload is used exactly once, so it is moved into its mock rather
    // than cloned.
    let first_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains(FIRST_AUTO_MSG)
            && !body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, first_matcher, sse1).await;

    let second_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        body.contains("You have exceeded the maximum number of tokens")
    };
    mount_sse_once_match(&server, second_matcher, sse2).await;

    // The follow-up turn carries the summary but not the summarization prompt.
    let third_matcher = |req: &wiremock::Request| {
        let body = std::str::from_utf8(&req.body).unwrap_or("");
        !body.contains("You have exceeded the maximum number of tokens")
            && body.contains(SUMMARY_TEXT)
    };
    mount_sse_once_match(&server, third_matcher, sse3).await;

    let model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };

    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
    config.model_auto_compact_token_limit = Some(200);
    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
    let codex = conversation_manager
        .new_conversation(config)
        .await
        .unwrap()
        .conversation;

    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: FIRST_AUTO_MSG.into(),
            }],
        })
        .await
        .unwrap();

    let error_event = wait_for_event(&codex, |ev| matches!(ev, EventMsg::Error(_))).await;
    let EventMsg::Error(ErrorEvent { message }) = error_event else {
        panic!("expected error event");
    };
    assert!(
        message.contains("limit"),
        "error message should include limit information: {message}"
    );
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    let requests = server.received_requests().await.unwrap();
    assert_eq!(
        requests.len(),
        3,
        "auto compact should attempt at most one summarization before erroring"
    );

    let last_body = requests[2].body_json::<serde_json::Value>().unwrap();
    let input = last_body
        .get("input")
        .and_then(|v| v.as_array())
        .unwrap_or_else(|| panic!("unexpected request format: {last_body}"));
    // True when any user message's first text fragment is the summarization prompt.
    let contains_prompt = input.iter().any(|item| {
        item.get("type").and_then(|v| v.as_str()) == Some("message")
            && item.get("role").and_then(|v| v.as_str()) == Some("user")
            && item
                .get("content")
                .and_then(|v| v.as_array())
                .and_then(|items| items.first())
                .and_then(|entry| entry.get("text"))
                .and_then(|text| text.as_str())
                == Some(SUMMARIZATION_PROMPT)
    });
    assert!(
        !contains_prompt,
        "third request should be the follow-up turn, not another summarization",
    );
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn manual_compact_retries_after_context_window_error() {
    skip_if_no_network!();

    let server = start_mock_server().await;

    let user_turn = sse(vec![
        ev_assistant_message("m1", FIRST_REPLY),
        ev_completed("r1"),
    ]);
    let compact_failed = sse_failed(
        "resp-fail",
        "context_length_exceeded",
        CONTEXT_LIMIT_MESSAGE,
    );
    let compact_succeeds = sse(vec![
        ev_assistant_message("m2", SUMMARY_TEXT),
        ev_completed("r2"),
    ]);

    let request_log = mount_sse_sequence(
        &server,
        vec![
            user_turn.clone(),
            compact_failed.clone(),
            compact_succeeds.clone(),
        ],
    )
    .await;

    let model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };

    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
    config.model_auto_compact_token_limit = Some(200_000);
    let codex = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"))
        .new_conversation(config)
        .await
        .unwrap()
        .conversation;

    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: "first turn".into(),
            }],
        })
        .await
        .unwrap();
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    codex.submit(Op::Compact).await.unwrap();

    let EventMsg::BackgroundEvent(event) =
        wait_for_event(&codex, |ev| matches!(ev, EventMsg::BackgroundEvent(_))).await
    else {
        panic!("expected background event after compact retry");
    };
    assert!(
        event.message.contains("Trimmed 1 older conversation item"),
        "background event should mention trimmed item count: {}",
        event.message
    );
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;

    let requests = request_log.requests();
    assert_eq!(
        requests.len(),
        3,
        "expected user turn and two compact attempts"
    );

    let compact_attempt = requests[1].body_json();
    let retry_attempt = requests[2].body_json();

    let compact_input = compact_attempt["input"]
        .as_array()
        .unwrap_or_else(|| panic!("compact attempt missing input array: {compact_attempt}"));
    let retry_input = retry_attempt["input"]
        .as_array()
        .unwrap_or_else(|| panic!("retry attempt missing input array: {retry_attempt}"));
    assert_eq!(
        compact_input
            .last()
            .and_then(|item| item.get("content"))
            .and_then(|v| v.as_array())
            .and_then(|items| items.first())
            .and_then(|entry| entry.get("text"))
            .and_then(|text| text.as_str()),
        Some(SUMMARIZATION_PROMPT),
        "compact attempt should include summarization prompt"
    );
    assert_eq!(
        retry_input
            .last()
            .and_then(|item| item.get("content"))
            .and_then(|v| v.as_array())
            .and_then(|items| items.first())
            .and_then(|entry| entry.get("text"))
            .and_then(|text| text.as_str()),
        Some(SUMMARIZATION_PROMPT),
        "retry attempt should include summarization prompt"
    );
    assert_eq!(
        retry_input.len(),
        compact_input.len().saturating_sub(1),
        "retry should drop exactly one history item (before {} vs after {})",
        compact_input.len(),
        retry_input.len()
    );
    if let (Some(first_before), Some(first_after)) = (compact_input.first(), retry_input.first()) {
        assert_ne!(
            first_before, first_after,
            "retry should drop the oldest conversation item"
        );
    } else {
        panic!("expected non-empty compact inputs");
    }
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_events() {
    skip_if_no_network!();

    let server = start_mock_server().await;

    let sse1 = sse(vec![
        ev_assistant_message("m1", FIRST_REPLY),
        ev_completed_with_tokens("r1", 500),
    ]);
    let sse2 = sse(vec![
        ev_assistant_message("m2", FIRST_AUTO_SUMMARY),
        ev_completed_with_tokens("r2", 50),
    ]);
    let sse3 = sse(vec![
        ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
        ev_completed_with_tokens("r3", 150),
    ]);
    let sse4 = sse(vec![
        ev_assistant_message("m4", SECOND_LARGE_REPLY),
        ev_completed_with_tokens("r4", 450),
    ]);
    let sse5 = sse(vec![
        ev_assistant_message("m5", SECOND_AUTO_SUMMARY),
        ev_completed_with_tokens("r5", 60),
    ]);
    let sse6 = sse(vec![
        ev_assistant_message("m6", FINAL_REPLY),
        ev_completed_with_tokens("r6", 120),
    ]);

    mount_sse_sequence(&server, vec![sse1, sse2, sse3, sse4, sse5, sse6]).await;

    let model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };

    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
    config.model_auto_compact_token_limit = Some(200);
    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
    let codex = conversation_manager
        .new_conversation(config)
        .await
        .unwrap()
        .conversation;

    codex
        .submit(Op::UserInput {
            items: vec![UserInput::Text {
                text: MULTI_AUTO_MSG.into(),
            }],
        })
        .await
        .unwrap();

    let mut auto_compact_lifecycle_events = Vec::new();
    loop {
        let event = codex.next_event().await.unwrap();
        if event.id.starts_with("auto-compact-")
            && matches!(
                event.msg,
                EventMsg::TaskStarted(_) | EventMsg::TaskComplete(_)
            )
        {
            auto_compact_lifecycle_events.push(event);
            continue;
        }
        if let EventMsg::TaskComplete(_) = &event.msg
            && !event.id.starts_with("auto-compact-")
        {
            break;
        }
    }

    assert!(
        auto_compact_lifecycle_events.is_empty(),
        "auto compact should not emit task lifecycle events"
    );

    let request_bodies: Vec<String> = server
        .received_requests()
        .await
        .unwrap()
        .into_iter()
        .map(|request| String::from_utf8(request.body).unwrap_or_default())
        .collect();
    assert_eq!(
        request_bodies.len(),
        6,
        "expected six requests including two auto compactions"
    );
    assert!(
        request_bodies[0].contains(MULTI_AUTO_MSG),
        "first request should contain the user input"
    );
    assert!(
        request_bodies[1].contains("You have exceeded the maximum number of tokens"),
        "first auto compact request should include the summarization prompt"
    );
    assert!(
        request_bodies[3].contains(&format!("unsupported call: {DUMMY_FUNCTION_NAME}")),
        "function call output should be sent before the second auto compact"
    );
    assert!(
        request_bodies[4].contains("You have exceeded the maximum number of tokens"),
        "second auto compact request should include the summarization prompt"
    );
}

/// Checks that an auto-compact request is issued after the follow-up to a function call
/// reports token usage above 95% of the configured context window.
///
/// Four SSE responses are mounted in order: the function call itself, the follow-up to the
/// function call output (which reports the over-limit usage), the auto-compact summary, and a
/// final post-compaction turn. The first three captured requests are then inspected.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn auto_compact_triggers_after_function_call_over_95_percent_usage() {
    skip_if_no_network!();

    let server = start_mock_server().await;

    // Token budget: the auto-compact limit sits at 90% of the window, while the follow-up
    // turn reports usage just past 95% of it.
    let context_window = 100;
    let limit = context_window * 90 / 100;
    let over_limit_tokens = context_window * 95 / 100 + 1;

    // Each response is mounted in the same order as before; only the mocks that are asserted
    // on later are kept around.
    let first_turn_mock = mount_sse_once(
        &server,
        sse(vec![
            ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
            ev_completed_with_tokens("r1", 50),
        ]),
    )
    .await;
    let follow_up_mock = mount_sse_once(
        &server,
        sse(vec![
            ev_assistant_message("m2", FINAL_REPLY),
            ev_completed_with_tokens("r2", over_limit_tokens),
        ]),
    )
    .await;
    let auto_compact_mock = mount_sse_once(
        &server,
        sse(vec![
            ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
            ev_completed_with_tokens("r3", 10),
        ]),
    )
    .await;
    // The post-compaction request is never inspected, so its mock handle is discarded.
    mount_sse_once(&server, sse(vec![ev_completed_with_tokens("r4", 10)])).await;

    // Point the default test config at the mock server and apply the token budget.
    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = ModelProviderInfo {
        base_url: Some(format!("{}/v1", server.uri())),
        ..built_in_model_providers()["openai"].clone()
    };
    config.model_context_window = Some(context_window);
    config.model_auto_compact_token_limit = Some(limit);

    let conversation = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"))
        .new_conversation(config)
        .await
        .unwrap()
        .conversation;

    let user_turn = Op::UserInput {
        items: vec![UserInput::Text {
            text: FUNCTION_CALL_LIMIT_MSG.into(),
        }],
    };
    conversation.submit(user_turn).await.unwrap();

    wait_for_event(&conversation, |event| {
        matches!(event, EventMsg::TaskComplete(_))
    })
    .await;

    // Request 1: must carry the user message that triggers the function call.
    let first_request_input = first_turn_mock.single_request().input();
    let saw_trigger_message = first_request_input.iter().any(|item| {
        let is_message = item.get("type").and_then(|kind| kind.as_str()) == Some("message");
        let first_text = item
            .get("content")
            .and_then(|content| content.as_array())
            .and_then(|entries| entries.first())
            .and_then(|entry| entry.get("text"))
            .and_then(|text| text.as_str());
        is_message && first_text == Some(FUNCTION_CALL_LIMIT_MSG)
    });
    assert!(
        saw_trigger_message,
        "first request should include the user message that triggers the function call"
    );

    // Request 2: must carry the function call output, i.e. it was sent before compaction.
    let follow_up_output = follow_up_mock
        .single_request()
        .function_call_output(DUMMY_CALL_ID);
    let output_text = follow_up_output
        .get("output")
        .and_then(|value| value.as_str())
        .unwrap_or("");
    let mentions_function = output_text.contains(DUMMY_FUNCTION_NAME);
    assert!(
        mentions_function,
        "function call output should be sent before auto compact"
    );

    // Request 3: must be the auto-compact request, recognised by the summarization prompt text.
    let auto_compact_body = auto_compact_mock.single_request().body_json().to_string();
    let has_summarization_prompt =
        auto_compact_body.contains("You have exceeded the maximum number of tokens");
    assert!(
        has_summarization_prompt,
        "auto compact request should include the summarization prompt after exceeding 95% (limit {limit})"
    );
}
-												Move CodexAuth and AuthManager to the core crate (#3074)

Fix a long standing layering issue.
											
										
										
											2025-09-02 18:36:19 -07:00
+								use codex_core::CodexAuth;
-												chore: introduce ConversationManager as a clearinghouse for all conversations (#2240)

This PR does two things because after I got deep into the first one I
started pulling on the thread to the second:

- Makes `ConversationManager` the place where all in-memory
conversations are created and stored. Previously, `MessageProcessor` in
the `codex-mcp-server` crate was doing this via its `session_map`, but
this is something that should be done in `codex-core`.
- It unwinds the `ctrl_c: tokio::sync::Notify` that was threaded
throughout our code. I think this made sense at one time, but now that
we handle Ctrl-C within the TUI and have a proper `Op::Interrupt` event,
I don't think this was quite right, so I removed it. For `codex exec`
and `codex proto`, we now use `tokio::signal::ctrl_c()` directly, but we
no longer make `Notify` a field of `Codex` or `CodexConversation`.

Changes of note:

- Adds the files `conversation_manager.rs` and `codex_conversation.rs`
to `codex-core`.
- `Codex` and `CodexSpawnOk` are no longer exported from `codex-core`:
other crates must use `CodexConversation` instead (which is created via
`ConversationManager`).
- `core/src/codex_wrapper.rs` has been deleted in favor of
`ConversationManager`.
- `ConversationManager::new_conversation()` returns `NewConversation`,
which is in line with the `new_conversation` tool we want to add to the
MCP server. Note `NewConversation` includes `SessionConfiguredEvent`, so
we eliminate checks in cases like `codex-rs/core/tests/client.rs` to
verify `SessionConfiguredEvent` is the first event because that is now
internal to `ConversationManager`.
- Quite a bit of code was deleted from
`codex-rs/mcp-server/src/message_processor.rs` since it no longer has to
manage multiple conversations itself: it goes through
`ConversationManager` instead.
- `core/tests/live_agent.rs` has been deleted because I had to update a
bunch of tests and all the tests in here were ignored, and I don't think
anyone ever ran them, so this was just technical debt, at this point.
- Removed `notify_on_sigint()` from `util.rs` (and in a follow-up, I
hope to refactor the blandly-named `util.rs` into more descriptive
files).
- In general, I started replacing local variables named `codex` with
`conversation`, where appropriate, though admittedly I didn't do it
through all the integration tests because that would have added a lot of
noise to this PR.




---
[//]: # (BEGIN SAPLING FOOTER)
Stack created with [Sapling](https://sapling-scm.com). Best reviewed
with [ReviewStack](https://reviewstack.dev/openai/codex/pull/2240).
* #2264
* #2263
* __->__ #2240
											
										
										
											2025-08-13 13:38:18 -07:00
+								use codex_core::ConversationManager;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								use codex_core::ModelProviderInfo;
-												Add Compact and Turn Context to the rollout items (#3444)

Adding compact and turn context to the rollout items

based on #3440
											
										
										
											2025-09-11 11:08:51 -07:00
+								use codex_core::NewConversation;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								use codex_core::built_in_model_providers;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								use codex_core::protocol::ErrorEvent;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								use codex_core::protocol::EventMsg;
 								use codex_core::protocol::Op;
-												Add Compact and Turn Context to the rollout items (#3444)

Adding compact and turn context to the rollout items

based on #3440
											
										
										
											2025-09-11 11:08:51 -07:00
+								use codex_core::protocol::RolloutItem;
 								use codex_core::protocol::RolloutLine;
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								use codex_protocol::user_input::UserInput;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								use core_test_support::load_default_config_for_test;
-												make tests pass cleanly in sandbox (#4067)

This changes the reqwest client used in tests to be sandbox-friendly,
and skips a bunch of other tests that don't work inside the
sandbox/without network.
											
										
										
											2025-09-25 13:11:14 -07:00
+								use core_test_support::skip_if_no_network;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								use core_test_support::wait_for_event;
 								use tempfile::TempDir;
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								use codex_core::codex::compact::SUMMARIZATION_PROMPT;
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								use core_test_support::responses::ev_assistant_message;
 								use core_test_support::responses::ev_completed;
 								use core_test_support::responses::ev_completed_with_tokens;
 								use core_test_support::responses::ev_function_call;
-												Auto compact at ~90% (#5292)

Users now hit a context-window-exceeded limit and usually don't know what
to do. This starts auto compact at ~90% of the window.
											
										
										
											2025-10-20 11:29:49 -07:00
+								use core_test_support::responses::mount_sse_once;
-												OpenTelemetry events (#2103)

### Title

## otel

Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.

```toml
[otel]
environment = "staging"   # defaults to "dev"
exporter = "none"          # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false    # defaults to false; redact prompt text unless explicitly enabled
```

Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.

### Event catalog

Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.

With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):

- `codex.api_request`
  - `cf_ray` (optional)
  - `attempt`
  - `duration_ms`
  - `http.response.status_code` (optional)
  - `error.message` (failures)
- `codex.sse_event`
  - `event.kind`
  - `duration_ms`
  - `error.message` (failures)
  - `input_token_count` (completion only)
  - `output_token_count` (completion only)
  - `cached_token_count` (completion only, optional)
  - `reasoning_token_count` (completion only, optional)
  - `tool_token_count` (completion only)
- `codex.user_prompt`
  - `prompt_length`
  - `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
  - `tool_name`
  - `call_id`
  - `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
  - `source` (`config` or `user`)
- `codex.tool_result`
  - `tool_name`
  - `call_id`
  - `arguments`
  - `duration_ms` (execution time for the tool)
  - `success` (`"true"` or `"false"`)
  - `output`

### Choosing an exporter

Set `otel.exporter` to control where events go:

- `none` – leaves instrumentation active but skips exporting. This is
the
  default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
  endpoint, protocol, and headers your collector expects:

  ```toml
  [otel]
  exporter = { otlp-http = {
    endpoint = "https://otel.example.com/v1/logs",
    protocol = "binary",
    headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
  }}
  ```

- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
  metadata headers:

  ```toml
  [otel]
  exporter = { otlp-grpc = {
    endpoint = "https://otel.example.com:4317",
    headers = { "x-otlp-meta" = "abc123" }
  }}
  ```

If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.

If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.

---------

Co-authored-by: Anton Panasenko <apanasenko@openai.com>
											
										
										
											2025-09-29 19:30:55 +01:00
+								use core_test_support::responses::mount_sse_once_match;
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								use core_test_support::responses::mount_sse_sequence;
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								use core_test_support::responses::sse;
-												feat: truncate on compact (#4942)

Truncate the message during compaction if it is just too large.
Do it iteratively, as tokenization is basically free on the server side.
											
										
										
											2025-10-08 18:11:08 +01:00
+								use core_test_support::responses::sse_failed;
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								use core_test_support::responses::start_mock_server;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								use pretty_assertions::assert_eq;
 								// --- Test helpers -----------------------------------------------------------
-												Handle resuming/forking after compact (#3533)

We need to construct the history differently when a compaction happens. For
this, we only need to consider the history after the compaction and convert
the compaction to a response item.

This needs to change to use `build_compact_history` once #3446 is
merged.
											
										
										
											2025-09-14 09:23:31 -04:00
+								pub(super) const FIRST_REPLY: &str = "FIRST_REPLY";
 								pub(super) const SUMMARY_TEXT: &str = "SUMMARY_ONLY_CONTEXT";
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								const THIRD_USER_MSG: &str = "next turn";
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								const AUTO_SUMMARY_TEXT: &str = "AUTO_SUMMARY";
 								const FIRST_AUTO_MSG: &str = "token limit start";
 								const SECOND_AUTO_MSG: &str = "token limit push";
 								const STILL_TOO_BIG_REPLY: &str = "STILL_TOO_BIG";
 								const MULTI_AUTO_MSG: &str = "multi auto";
 								const SECOND_LARGE_REPLY: &str = "SECOND_LARGE_REPLY";
 								const FIRST_AUTO_SUMMARY: &str = "FIRST_AUTO_SUMMARY";
 								const SECOND_AUTO_SUMMARY: &str = "SECOND_AUTO_SUMMARY";
 								const FINAL_REPLY: &str = "FINAL_REPLY";
-												feat: truncate on compact (#4942)

Truncate the message during compaction if it is just too large.
Do it iteratively, as tokenization is basically free on the server side.
											
										
										
											2025-10-08 18:11:08 +01:00
+								const CONTEXT_LIMIT_MESSAGE: &str =
 								    "Your input exceeds the context window of this model. Please adjust your input and try again.";
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								const DUMMY_FUNCTION_NAME: &str = "unsupported_tool";
 								const DUMMY_CALL_ID: &str = "call-multi-auto";
-												Auto compact at ~90% (#5292)

Users now hit a context-window-exceeded limit and usually don't know what
to do. This starts auto compact at ~90% of the window.
											
										
										
											2025-10-20 11:29:49 -07:00
+								const FUNCTION_CALL_LIMIT_MSG: &str = "function call limit push";
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
 								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn summarize_context_three_requests_and_instructions() {
-												make tests pass cleanly in sandbox (#4067)

This changes the reqwest client used in tests to be sandbox-friendly,
and skips a bunch of other tests that don't work inside the
sandbox/without network.
											
										
										
											2025-09-25 13:11:14 -07:00
+								    skip_if_no_network!();
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
 								    // Set up a mock server that we can inspect after the run.
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    let server = start_mock_server().await;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
 								    // SSE 1: assistant replies normally so it is recorded in history.
 								    let sse1 = sse(vec![
 								        ev_assistant_message("m1", FIRST_REPLY),
 								        ev_completed("r1"),
 								    ]);
 								    // SSE 2: summarizer returns a summary message.
 								    let sse2 = sse(vec![
 								        ev_assistant_message("m2", SUMMARY_TEXT),
 								        ev_completed("r2"),
 								    ]);
 								    // SSE 3: minimal completed; we only need to capture the request body.
 								    let sse3 = sse(vec![ev_completed("r3")]);
 								    // Mount three expectations, one per request, matched by body content.
 								    let first_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains("\"text\":\"hello world\"")
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								            && !body.contains("You have exceeded the maximum number of tokens")
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    };
-												OpenTelemetry events (#2103)

### Title

## otel

Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.

```toml
[otel]
environment = "staging"   # defaults to "dev"
exporter = "none"          # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false    # defaults to false; redact prompt text unless explicitly enabled
```

Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.

### Event catalog

Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.

With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):

- `codex.api_request`
  - `cf_ray` (optional)
  - `attempt`
  - `duration_ms`
  - `http.response.status_code` (optional)
  - `error.message` (failures)
- `codex.sse_event`
  - `event.kind`
  - `duration_ms`
  - `error.message` (failures)
  - `input_token_count` (completion only)
  - `output_token_count` (completion only)
  - `cached_token_count` (completion only, optional)
  - `reasoning_token_count` (completion only, optional)
  - `tool_token_count` (completion only)
- `codex.user_prompt`
  - `prompt_length`
  - `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
  - `tool_name`
  - `call_id`
  - `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
  - `source` (`config` or `user`)
- `codex.tool_result`
  - `tool_name`
  - `call_id`
  - `arguments`
  - `duration_ms` (execution time for the tool)
  - `success` (`"true"` or `"false"`)
  - `output`

### Choosing an exporter

Set `otel.exporter` to control where events go:

- `none` – leaves instrumentation active but skips exporting. This is
the
  default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
  endpoint, protocol, and headers your collector expects:

  ```toml
  [otel]
  exporter = { otlp-http = {
    endpoint = "https://otel.example.com/v1/logs",
    protocol = "binary",
    headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
  }}
  ```

- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
  metadata headers:

  ```toml
  [otel]
  exporter = { otlp-grpc = {
    endpoint = "https://otel.example.com:4317",
    headers = { "x-otlp-meta" = "abc123" }
  }}
  ```

If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.

If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.

---------

Co-authored-by: Anton Panasenko <apanasenko@openai.com>
											
										
										
											2025-09-29 19:30:55 +01:00
+								    mount_sse_once_match(&server, first_matcher, sse1).await;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
 								    let second_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								        body.contains("You have exceeded the maximum number of tokens")
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    };
-												OpenTelemetry events (#2103)

### Title

## otel

Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.

```toml
[otel]
environment = "staging"   # defaults to "dev"
exporter = "none"          # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false    # defaults to false; redact prompt text unless explicitly enabled
```

Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.

### Event catalog

Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.

With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):

- `codex.api_request`
  - `cf_ray` (optional)
  - `attempt`
  - `duration_ms`
  - `http.response.status_code` (optional)
  - `error.message` (failures)
- `codex.sse_event`
  - `event.kind`
  - `duration_ms`
  - `error.message` (failures)
  - `input_token_count` (completion only)
  - `output_token_count` (completion only)
  - `cached_token_count` (completion only, optional)
  - `reasoning_token_count` (completion only, optional)
  - `tool_token_count` (completion only)
- `codex.user_prompt`
  - `prompt_length`
  - `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
  - `tool_name`
  - `call_id`
  - `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
  - `source` (`config` or `user`)
- `codex.tool_result`
  - `tool_name`
  - `call_id`
  - `arguments`
  - `duration_ms` (execution time for the tool)
  - `success` (`"true"` or `"false"`)
  - `output`

### Choosing an exporter

Set `otel.exporter` to control where events go:

- `none` – leaves instrumentation active but skips exporting. This is
the
  default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
  endpoint, protocol, and headers your collector expects:

  ```toml
  [otel]
  exporter = { otlp-http = {
    endpoint = "https://otel.example.com/v1/logs",
    protocol = "binary",
    headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
  }}
  ```

- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
  metadata headers:

  ```toml
  [otel]
  exporter = { otlp-grpc = {
    endpoint = "https://otel.example.com:4317",
    headers = { "x-otlp-meta" = "abc123" }
  }}
  ```

If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.

If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.

---------

Co-authored-by: Anton Panasenko <apanasenko@openai.com>
											
										
										
											2025-09-29 19:30:55 +01:00
+								    mount_sse_once_match(&server, second_matcher, sse2).await;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clears the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
 								    let third_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains(&format!("\"text\":\"{THIRD_USER_MSG}\""))
 								    };
-												OpenTelemetry events (#2103)

### Title

## otel

Codex can emit [OpenTelemetry](https://opentelemetry.io/) **log events**
that
describe each run: outbound API requests, streamed responses, user
input,
tool-approval decisions, and the result of every tool invocation. Export
is
**disabled by default** so local runs remain self-contained. Opt in by
adding an
`[otel]` table and choosing an exporter.

```toml
[otel]
environment = "staging"   # defaults to "dev"
exporter = "none"          # defaults to "none"; set to otlp-http or otlp-grpc to send events
log_user_prompt = false    # defaults to false; redact prompt text unless explicitly enabled
```

Codex tags every exported event with `service.name = "codex-cli"`, the
CLI
version, and an `env` attribute so downstream collectors can distinguish
dev/staging/prod traffic. Only telemetry produced inside the
`codex_otel`
crate—the events listed below—is forwarded to the exporter.

### Event catalog

Every event shares a common set of metadata fields: `event.timestamp`,
`conversation.id`, `app.version`, `auth_mode` (when available),
`user.account_id` (when available), `terminal.type`, `model`, and
`slug`.

With OTEL enabled Codex emits the following event types (in addition to
the
metadata above):

- `codex.api_request`
  - `cf_ray` (optional)
  - `attempt`
  - `duration_ms`
  - `http.response.status_code` (optional)
  - `error.message` (failures)
- `codex.sse_event`
  - `event.kind`
  - `duration_ms`
  - `error.message` (failures)
  - `input_token_count` (completion only)
  - `output_token_count` (completion only)
  - `cached_token_count` (completion only, optional)
  - `reasoning_token_count` (completion only, optional)
  - `tool_token_count` (completion only)
- `codex.user_prompt`
  - `prompt_length`
  - `prompt` (redacted unless `log_user_prompt = true`)
- `codex.tool_decision`
  - `tool_name`
  - `call_id`
  - `decision` (`approved`, `approved_for_session`, `denied`, or `abort`)
  - `source` (`config` or `user`)
- `codex.tool_result`
  - `tool_name`
  - `call_id`
  - `arguments`
  - `duration_ms` (execution time for the tool)
  - `success` (`"true"` or `"false"`)
  - `output`

### Choosing an exporter

Set `otel.exporter` to control where events go:

- `none` – leaves instrumentation active but skips exporting. This is
the
  default.
- `otlp-http` – posts OTLP log records to an OTLP/HTTP collector.
Specify the
  endpoint, protocol, and headers your collector expects:

  ```toml
  [otel]
  exporter = { otlp-http = {
    endpoint = "https://otel.example.com/v1/logs",
    protocol = "binary",
    headers = { "x-otlp-api-key" = "${OTLP_TOKEN}" }
  }}
  ```

- `otlp-grpc` – streams OTLP log records over gRPC. Provide the endpoint
and any
  metadata headers:

  ```toml
  [otel]
  exporter = { otlp-grpc = {
    endpoint = "https://otel.example.com:4317",
    headers = { "x-otlp-meta" = "abc123" }
  }}
  ```

If the exporter is `none` nothing is written anywhere; otherwise you
must run or point to your
own collector. All exporters run on a background batch worker that is
flushed on
shutdown.

If you build Codex from source the OTEL crate is still behind an `otel`
feature
flag; the official prebuilt binaries ship with the feature enabled. When
the
feature is disabled the telemetry hooks become no-ops so the CLI
continues to
function without the extra dependencies.

---------

Co-authored-by: Anton Panasenko <apanasenko@openai.com>
											
										
										
											2025-09-29 19:30:55 +01:00
+								    mount_sse_once_match(&server, third_matcher, sse3).await;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
 								    // Build config pointing to the mock server and spawn Codex.
 								    let model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = model_provider;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    config.model_auto_compact_token_limit = Some(200_000);
-												Add AuthManager and enhance GetAuthStatus command (#2577)

This PR adds a central `AuthManager` struct that manages the auth
information used across conversations and the MCP server. Prior to this,
each conversation and the MCP server got their own private snapshots of
the auth information, and changes to one (such as a logout or token
refresh) were not seen by others.

This is especially problematic when multiple instances of the CLI are
run. For example, consider the case where you start CLI 1 and log in to
ChatGPT account X and then start CLI 2 and log out and then log in to
ChatGPT account Y. The conversation in CLI 1 is still using account X,
but if you create a new conversation, it will suddenly (and
unexpectedly) switch to account Y.

With the `AuthManager`, auth information is read from disk at the time
the `ConversationManager` is constructed, and it is cached in memory.
All new conversations use this same auth information, as do any token
refreshes.

The `AuthManager` is also used by the MCP server's GetAuthStatus
command, which now returns the auth method currently used by the MCP
server.

This PR also includes an enhancement to the GetAuthStatus command. It
now accepts two new (optional) input parameters: `include_token` and
`refresh_token`. Callers can use this to request the in-use auth token
and can optionally request to refresh the token.

The PR also adds tests for the login and auth APIs that I recently added
to the MCP server.
											
										
										
											2025-08-22 13:10:11 -07:00
+								    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
-												Add Compact and Turn Context to the rollout items (#3444)

Adding compact and turn context to the rollout items

based on #3440
											
										
										
											2025-09-11 11:08:51 -07:00
+								    let NewConversation {
 								        conversation: codex,
 								        session_configured,
 								        ..
 								    } = conversation_manager.new_conversation(config).await.unwrap();
 								    let rollout_path = session_configured.rollout_path;
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
 								    // 1) Normal user input – should hit server once.
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								                text: "hello world".into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								    // 2) Summarize – second hit should include the summarization prompt.
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    codex.submit(Op::Compact).await.unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    // 3) Next user input – third hit; history should include only the summary.
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								                text: THIRD_USER_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    // Inspect the three captured requests.
 								    let requests = server.received_requests().await.unwrap();
 								    assert_eq!(requests.len(), 3, "expected exactly three requests");
 								    let req1 = &requests[0];
 								    let req2 = &requests[1];
 								    let req3 = &requests[2];
 								    let body1 = req1.body_json::<serde_json::Value>().unwrap();
 								    let body2 = req2.body_json::<serde_json::Value>().unwrap();
 								    let body3 = req3.body_json::<serde_json::Value>().unwrap();
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								    // Manual compact should keep the baseline developer instructions.
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    let instr1 = body1.get("instructions").and_then(|v| v.as_str()).unwrap();
 								    let instr2 = body2.get("instructions").and_then(|v| v.as_str()).unwrap();
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								    assert_eq!(
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								        instr1, instr2,
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								        "manual compact should keep the standard developer instructions"
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    );
 								    // The summarization request should include the injected user input marker.
 								    let input2 = body2.get("input").and_then(|v| v.as_array()).unwrap();
 								    // The last item is the user message created from the injected input.
 								    let last2 = input2.last().unwrap();
 								    assert_eq!(last2.get("type").unwrap().as_str().unwrap(), "message");
 								    assert_eq!(last2.get("role").unwrap().as_str().unwrap(), "user");
 								    let text2 = last2["content"][0]["text"].as_str().unwrap();
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								    assert_eq!(
 								        text2, SUMMARIZATION_PROMPT,
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								        "expected summarize trigger, got `{text2}`"
 								    );
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    // Third request must contain the refreshed instructions, bridge summary message and new user msg.
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap();
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    assert!(
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								        input3.len() >= 3,
 								        "expected refreshed context and new user message in third request"
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    );
 								    // Collect all (role, text) message tuples.
 								    let mut messages: Vec<(String, String)> = Vec::new();
 								    for item in input3 {
 								        if item["type"].as_str() == Some("message") {
 								            let role = item["role"].as_str().unwrap_or_default().to_string();
 								            let text = item["content"][0]["text"]
 								                .as_str()
 								                .unwrap_or_default()
 								                .to_string();
 								            messages.push((role, text));
 								        }
 								    }
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    // No previous assistant messages should remain and the new user message is present.
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    let assistant_count = messages.iter().filter(|(r, _)| r == "assistant").count();
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    assert_eq!(assistant_count, 0, "assistant history should be cleared");
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    assert!(
 								        messages
 								            .iter()
 								            .any(|(r, t)| r == "user" && t == THIRD_USER_MSG),
 								        "third request should include the new user message"
 								    );
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    let Some((_, bridge_text)) = messages.iter().find(|(role, text)| {
 								        role == "user"
 								            && (text.contains("Here were the user messages")
 								                || text.contains("Here are all the user messages"))
 								            && text.contains(SUMMARY_TEXT)
 								    }) else {
 								        panic!("expected a bridge message containing the summary");
 								    };
 								    assert!(
 								        bridge_text.contains("hello world"),
 								        "bridge should capture earlier user messages"
 								    );
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    assert!(
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								        !bridge_text.contains(SUMMARIZATION_PROMPT),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								        "bridge text should not echo the summarize trigger"
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								    );
 								    assert!(
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								        !messages
 								            .iter()
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								            .any(|(_, text)| text.contains(SUMMARIZATION_PROMPT)),
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								        "third request should not include the summarize trigger"
 								    );
-												Add Compact and Turn Context to the rollout items (#3444)

Adding compact and turn context to the rollout items

based on #3440
											
										
										
											2025-09-11 11:08:51 -07:00
 								    // Shut down Codex to flush rollout entries before inspecting the file.
 								    codex.submit(Op::Shutdown).await.unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;
 								    // Verify rollout contains APITurn entries for each API call and a Compacted entry.
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    println!("rollout path: {}", rollout_path.display());
-												Add Compact and Turn Context to the rollout items (#3444)

Adding compact and turn context to the rollout items

based on #3440
											
										
										
											2025-09-11 11:08:51 -07:00
+								    let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
 								        panic!(
 								            "failed to read rollout file {}: {e}",
 								            rollout_path.display()
 								        )
 								    });
 								    let mut api_turn_count = 0usize;
 								    let mut saw_compacted_summary = false;
 								    for line in text.lines() {
 								        let trimmed = line.trim();
 								        if trimmed.is_empty() {
 								            continue;
 								        }
 								        let Ok(entry): Result<RolloutLine, _> = serde_json::from_str(trimmed) else {
 								            continue;
 								        };
 								        match entry.item {
 								            RolloutItem::TurnContext(_) => {
 								                api_turn_count += 1;
 								            }
 								            RolloutItem::Compacted(ci) => {
 								                if ci.message == SUMMARY_TEXT {
 								                    saw_compacted_summary = true;
 								                }
 								            }
 								            _ => {}
 								        }
 								    }
 								    assert!(
 								        api_turn_count == 3,
 								        "expected three APITurn entries in rollout"
 								    );
 								    assert!(
 								        saw_compacted_summary,
 								        "expected a Compacted entry containing the summarizer output"
 								    );
-												Add /compact (#1527)

- Add operation to summarize the context so far.
- The operation runs a compact task that summarizes the context.
- The operation clear the previous context to free the context window
- The operation didn't use `run_task` to avoid corrupting the session
- Add /compact in the tui



https://github.com/user-attachments/assets/e06c24e5-dcfb-4806-934a-564d425a919c
											
										
										
											2025-07-31 21:34:32 -07:00
+								}
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
-												feat: compaction prompt configurable (#5959)

```
 codex -c compact_prompt="Summarize in bullet points"
 ```
											
										
										
											2025-10-30 14:24:24 +00:00
+								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn manual_compact_uses_custom_prompt() {
 								    // Checks that a manual `Op::Compact` sends the configured `compact_prompt`
 								    // as a message in the request input, and that `SUMMARIZATION_PROMPT` does
 								    // not appear alongside it.
 								    skip_if_no_network!();
 								    let custom_prompt = "Use this compact prompt instead";
 								    // Mock server with a single SSE response that just completes.
 								    let server = start_mock_server().await;
 								    mount_sse_once(&server, sse(vec![ev_completed("r1")])).await;
 								    // Default test config, redirected to the mock server and carrying the
 								    // custom compaction prompt.
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    config.compact_prompt = Some(custom_prompt.to_owned());
 								    let manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
 								    let codex = manager
 								        .new_conversation(config)
 								        .await
 								        .expect("create conversation")
 								        .conversation;
 								    // Trigger the compaction and wait until the task reports completion.
 								    codex.submit(Op::Compact).await.expect("trigger compact");
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    // Take the first captured request whose body parses as JSON.
 								    let captured = server.received_requests().await.expect("collect requests");
 								    let body = captured
 								        .iter()
 								        .find_map(|req| req.body_json::<serde_json::Value>().ok())
 								        .expect("summary request body");
 								    let input = body
 								        .get("input")
 								        .and_then(|v| v.as_array())
 								        .expect("input array");
 								    // First content text of every `message` item (empty string when absent).
 								    let message_texts: Vec<&str> = input
 								        .iter()
 								        .filter(|item| item["type"].as_str() == Some("message"))
 								        .map(|item| item["content"][0]["text"].as_str().unwrap_or_default())
 								        .collect();
 								    let found_custom_prompt = message_texts.iter().any(|text| *text == custom_prompt);
 								    let found_default_prompt = message_texts
 								        .iter()
 								        .any(|text| *text == SUMMARIZATION_PROMPT);
 								    assert!(found_custom_prompt, "custom prompt should be injected");
 								    assert!(!found_default_prompt, "default prompt should be replaced");
 								}
-												Fix flaky windows test (#3564)

There are exactly 4 types of flaky tests in Windows x86 right now:

1. `review_input_isolated_from_parent_history` => Times out waiting for
closing events
2. `review_does_not_emit_agent_message_on_structured_output` => Times
out waiting for closing events
3. `auto_compact_runs_after_token_limit_hit` => Times out waiting for
closing events
4. `auto_compact_runs_after_token_limit_hit` => Also has a problem where
auto compact should add a third request, but receives 4 requests.

1, 2, and 3 seem to be solved by increasing the worker threads on the
Windows runner from 2 -> 4.

It is not yet known why #4 is happening, but it is probably also caused by
WireMock issues on Windows causing races.
											
										
										
											2025-09-14 16:20:25 -07:00
+								// Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts.
 								#[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
 								#[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								async fn auto_compact_runs_after_token_limit_hit() {
-												make tests pass cleanly in sandbox (#4067)

This changes the reqwest client used in tests to be sandbox-friendly,
and skips a bunch of other tests that don't work inside the
sandbox/without network.
											
										
										
											2025-09-25 13:11:14 -07:00
+								    skip_if_no_network!();
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let server = start_mock_server().await;
 								    let sse1 = sse(vec![
 								        ev_assistant_message("m1", FIRST_REPLY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r1", 70_000),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse2 = sse(vec![
 								        ev_assistant_message("m2", "SECOND_REPLY"),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r2", 330_000),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse3 = sse(vec![
 								        ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r3", 200),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let first_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains(FIRST_AUTO_MSG)
 								            && !body.contains(SECOND_AUTO_MSG)
 								            && !body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, first_matcher, sse1).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let second_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains(SECOND_AUTO_MSG)
 								            && body.contains(FIRST_AUTO_MSG)
 								            && !body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, second_matcher, sse2).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let third_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, third_matcher, sse3).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = model_provider;
 								    config.model_auto_compact_token_limit = Some(200_000);
 								    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
 								    let codex = conversation_manager
 								        .new_conversation(config)
 								        .await
 								        .unwrap()
 								        .conversation;
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								                text: FIRST_AUTO_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
-												Fix flaky windows test (#3564)

There are exactly 4 types of flaky tests in Windows x86 right now:

1. `review_input_isolated_from_parent_history` => Times out waiting for
closing events
2. `review_does_not_emit_agent_message_on_structured_output` => Times
out waiting for closing events
3. `auto_compact_runs_after_token_limit_hit` => Times out waiting for
closing events
4. `auto_compact_runs_after_token_limit_hit` => Also has a problem where
auto compact should add a third request, but receives 4 requests.

1, 2, and 3 seem to be solved with increasing threads on windows runner
from 2 -> 4.

Don't know yet why # 4 is happening, but probably also because of
WireMock issues on windows causing races.
											
										
										
											2025-09-14 16:20:25 -07:00
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								                text: SECOND_AUTO_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
-												Fix flaky windows test (#3564)

There are exactly 4 types of flaky tests in Windows x86 right now:

1. `review_input_isolated_from_parent_history` => Times out waiting for
closing events
2. `review_does_not_emit_agent_message_on_structured_output` => Times
out waiting for closing events
3. `auto_compact_runs_after_token_limit_hit` => Times out waiting for
closing events
4. `auto_compact_runs_after_token_limit_hit` => Also has a problem where
auto compact should add a third request, but receives 4 requests.

1, 2, and 3 seem to be solved with increasing threads on windows runner
from 2 -> 4.

Don't know yet why # 4 is happening, but probably also because of
WireMock issues on windows causing races.
											
										
										
											2025-09-14 16:20:25 -07:00
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    // wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    let requests = server.received_requests().await.unwrap();
-												Fix flaky windows test (#3564)

There are exactly 4 types of flaky tests in Windows x86 right now:

1. `review_input_isolated_from_parent_history` => Times out waiting for
closing events
2. `review_does_not_emit_agent_message_on_structured_output` => Times
out waiting for closing events
3. `auto_compact_runs_after_token_limit_hit` => Times out waiting for
closing events
4. `auto_compact_runs_after_token_limit_hit` => Also has a problem where
auto compact should add a third request, but receives 4 requests.

1, 2, and 3 seem to be solved with increasing threads on windows runner
from 2 -> 4.

Don't know yet why # 4 is happening, but probably also because of
WireMock issues on windows causing races.
											
										
										
											2025-09-14 16:20:25 -07:00
+								    assert!(
 								        requests.len() >= 3,
 								        "auto compact should add at least a third request, got {}",
 								        requests.len()
 								    );
 								    let is_auto_compact = |req: &wiremock::Request| {
 								        std::str::from_utf8(&req.body)
 								            .unwrap_or("")
 								            .contains("You have exceeded the maximum number of tokens")
 								    };
 								    let auto_compact_count = requests.iter().filter(|req| is_auto_compact(req)).count();
 								    assert_eq!(
 								        auto_compact_count, 1,
 								        "expected exactly one auto compact request"
 								    );
 								    let auto_compact_index = requests
 								        .iter()
 								        .enumerate()
 								        .find_map(|(idx, req)| is_auto_compact(req).then_some(idx))
 								        .expect("auto compact request missing");
 								    assert_eq!(
 								        auto_compact_index, 2,
 								        "auto compact should add a third request"
 								    );
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								    let body_first = requests[0].body_json::<serde_json::Value>().unwrap();
-												Fix flaky windows test (#3564)

There are exactly 4 types of flaky tests in Windows x86 right now:

1. `review_input_isolated_from_parent_history` => Times out waiting for
closing events
2. `review_does_not_emit_agent_message_on_structured_output` => Times
out waiting for closing events
3. `auto_compact_runs_after_token_limit_hit` => Times out waiting for
closing events
4. `auto_compact_runs_after_token_limit_hit` => Also has a problem where
auto compact should add a third request, but receives 4 requests.

1, 2, and 3 seem to be solved with increasing threads on windows runner
from 2 -> 4.

Don't know yet why # 4 is happening, but probably also because of
WireMock issues on windows causing races.
											
										
										
											2025-09-14 16:20:25 -07:00
+								    let body3 = requests[auto_compact_index]
 								        .body_json::<serde_json::Value>()
 								        .unwrap();
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    let instructions = body3
 								        .get("instructions")
 								        .and_then(|v| v.as_str())
 								        .unwrap_or_default();
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								    let baseline_instructions = body_first
 								        .get("instructions")
 								        .and_then(|v| v.as_str())
 								        .unwrap_or_default()
 								        .to_string();
 								    assert_eq!(
 								        instructions, baseline_instructions,
 								        "auto compact should keep the standard developer instructions",
 								    );
 								    let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap();
 								    let last3 = input3
 								        .last()
 								        .expect("auto compact request should append a user message");
 								    assert_eq!(last3.get("type").and_then(|v| v.as_str()), Some("message"));
 								    assert_eq!(last3.get("role").and_then(|v| v.as_str()), Some("user"));
 								    let last_text = last3
 								        .get("content")
 								        .and_then(|v| v.as_array())
 								        .and_then(|items| items.first())
 								        .and_then(|item| item.get("text"))
 								        .and_then(|text| text.as_str())
 								        .unwrap_or_default();
 								    assert_eq!(
 								        last_text, SUMMARIZATION_PROMPT,
 								        "auto compact should send the summarization prompt as a user message",
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    );
 								}
 								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn auto_compact_persists_rollout_entries() {
-												make tests pass cleanly in sandbox (#4067)

This changes the reqwest client used in tests to be sandbox-friendly,
and skips a bunch of other tests that don't work inside the
sandbox/without network.
											
										
										
											2025-09-25 13:11:14 -07:00
+								    skip_if_no_network!();
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let server = start_mock_server().await;
 								    let sse1 = sse(vec![
 								        ev_assistant_message("m1", FIRST_REPLY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r1", 70_000),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse2 = sse(vec![
 								        ev_assistant_message("m2", "SECOND_REPLY"),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r2", 330_000),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse3 = sse(vec![
 								        ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r3", 200),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let first_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains(FIRST_AUTO_MSG)
 								            && !body.contains(SECOND_AUTO_MSG)
 								            && !body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, first_matcher, sse1).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let second_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains(SECOND_AUTO_MSG)
 								            && body.contains(FIRST_AUTO_MSG)
 								            && !body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, second_matcher, sse2).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let third_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, third_matcher, sse3).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = model_provider;
 								    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
 								    let NewConversation {
 								        conversation: codex,
 								        session_configured,
 								        ..
 								    } = conversation_manager.new_conversation(config).await.unwrap();
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Starts sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								                text: FIRST_AUTO_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								                text: SECOND_AUTO_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    codex.submit(Op::Shutdown).await.unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;
 								    let rollout_path = session_configured.rollout_path;
 								    let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
 								        panic!(
 								            "failed to read rollout file {}: {e}",
 								            rollout_path.display()
 								        )
 								    });
 								    let mut turn_context_count = 0usize;
 								    for line in text.lines() {
 								        let trimmed = line.trim();
 								        if trimmed.is_empty() {
 								            continue;
 								        }
 								        let Ok(entry): Result<RolloutLine, _> = serde_json::from_str(trimmed) else {
 								            continue;
 								        };
 								        match entry.item {
 								            RolloutItem::TurnContext(_) => {
 								                turn_context_count += 1;
 								            }
 								            RolloutItem::Compacted(_) => {}
 								            _ => {}
 								        }
 								    }
 								    assert!(
 								        turn_context_count >= 2,
 								        "expected at least two turn context entries, got {turn_context_count}"
 								    );
 								}
 								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn auto_compact_stops_after_failed_attempt() {
-												make tests pass cleanly in sandbox (#4067)

This changes the reqwest client used in tests to be sandbox-friendly,
and skips a bunch of other tests that don't work inside the
sandbox/without network.
											
										
										
											2025-09-25 13:11:14 -07:00
+								    skip_if_no_network!();
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let server = start_mock_server().await;
 								    let sse1 = sse(vec![
 								        ev_assistant_message("m1", FIRST_REPLY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r1", 500),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse2 = sse(vec![
 								        ev_assistant_message("m2", SUMMARY_TEXT),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r2", 50),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse3 = sse(vec![
 								        ev_assistant_message("m3", STILL_TOO_BIG_REPLY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r3", 500),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let first_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains(FIRST_AUTO_MSG)
 								            && !body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, first_matcher, sse1.clone()).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let second_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        body.contains("You have exceeded the maximum number of tokens")
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, second_matcher, sse2.clone()).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let third_matcher = |req: &wiremock::Request| {
 								        let body = std::str::from_utf8(&req.body).unwrap_or("");
 								        !body.contains("You have exceeded the maximum number of tokens")
 								            && body.contains(SUMMARY_TEXT)
 								    };
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_once_match(&server, third_matcher, sse3.clone()).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = model_provider;
 								    config.model_auto_compact_token_limit = Some(200);
 								    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
 								    let codex = conversation_manager
 								        .new_conversation(config)
 								        .await
 								        .unwrap()
 								        .conversation;
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								                text: FIRST_AUTO_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
 								    let error_event = wait_for_event(&codex, |ev| matches!(ev, EventMsg::Error(_))).await;
 								    let EventMsg::Error(ErrorEvent { message }) = error_event else {
 								        panic!("expected error event");
 								    };
 								    assert!(
 								        message.contains("limit"),
 								        "error message should include limit information: {message}"
 								    );
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    let requests = server.received_requests().await.unwrap();
 								    assert_eq!(
 								        requests.len(),
 ,
 								        "auto compact should attempt at most one summarization before erroring"
 								    );
 								    let last_body = requests[2].body_json::<serde_json::Value>().unwrap();
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								    let input = last_body
 								        .get("input")
 								        .and_then(|v| v.as_array())
 								        .unwrap_or_else(|| panic!("unexpected request format: {last_body}"));
 								    let contains_prompt = input.iter().any(|item| {
 								        item.get("type").and_then(|v| v.as_str()) == Some("message")
 								            && item.get("role").and_then(|v| v.as_str()) == Some("user")
 								            && item
 								                .get("content")
 								                .and_then(|v| v.as_array())
 								                .and_then(|items| items.first())
 								                .and_then(|entry| entry.get("text"))
 								                .and_then(|text| text.as_str())
 								                .map(|text| text == SUMMARIZATION_PROMPT)
 								                .unwrap_or(false)
 								    });
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    assert!(
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								        !contains_prompt,
 								        "third request should be the follow-up turn, not another summarization",
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    );
 								}
-												feat: truncate on compact (#4942)

Truncate the message during compaction if it is just too large
Do it iteratively as tokenization is basically free on server-side
											
										
										
											2025-10-08 18:11:08 +01:00
+								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn manual_compact_retries_after_context_window_error() {
 								    skip_if_no_network!();
 								    let server = start_mock_server().await;
 								    let user_turn = sse(vec![
 								        ev_assistant_message("m1", FIRST_REPLY),
 								        ev_completed("r1"),
 								    ]);
 								    let compact_failed = sse_failed(
 								        "resp-fail",
 								        "context_length_exceeded",
 								        CONTEXT_LIMIT_MESSAGE,
 								    );
 								    let compact_succeeds = sse(vec![
 								        ev_assistant_message("m2", SUMMARY_TEXT),
 								        ev_completed("r2"),
 								    ]);
 								    let request_log = mount_sse_sequence(
 								        &server,
 								        vec![
 								            user_turn.clone(),
 								            compact_failed.clone(),
 								            compact_succeeds.clone(),
 								        ],
 								    )
 								    .await;
 								    let model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = model_provider;
 								    config.model_auto_compact_token_limit = Some(200_000);
 								    let codex = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"))
 								        .new_conversation(config)
 								        .await
 								        .unwrap()
 								        .conversation;
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												feat: truncate on compact (#4942)

Truncate the message during compaction if it is just too large
Do it iteratively as tokenization is basically free on server-side
											
										
										
											2025-10-08 18:11:08 +01:00
+								                text: "first turn".into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    codex.submit(Op::Compact).await.unwrap();
 								    let EventMsg::BackgroundEvent(event) =
 								        wait_for_event(&codex, |ev| matches!(ev, EventMsg::BackgroundEvent(_))).await
 								    else {
 								        panic!("expected background event after compact retry");
 								    };
 								    assert!(
 								        event.message.contains("Trimmed 1 older conversation item"),
 								        "background event should mention trimmed item count: {}",
 								        event.message
 								    );
 								    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 								    let requests = request_log.requests();
 								    assert_eq!(
 								        requests.len(),
 ,
 								        "expected user turn and two compact attempts"
 								    );
 								    let compact_attempt = requests[1].body_json();
 								    let retry_attempt = requests[2].body_json();
 								    let compact_input = compact_attempt["input"]
 								        .as_array()
 								        .unwrap_or_else(|| panic!("compact attempt missing input array: {compact_attempt}"));
 								    let retry_input = retry_attempt["input"]
 								        .as_array()
 								        .unwrap_or_else(|| panic!("retry attempt missing input array: {retry_attempt}"));
 								    assert_eq!(
 								        compact_input
 								            .last()
 								            .and_then(|item| item.get("content"))
 								            .and_then(|v| v.as_array())
 								            .and_then(|items| items.first())
 								            .and_then(|entry| entry.get("text"))
 								            .and_then(|text| text.as_str()),
 								        Some(SUMMARIZATION_PROMPT),
 								        "compact attempt should include summarization prompt"
 								    );
 								    assert_eq!(
 								        retry_input
 								            .last()
 								            .and_then(|item| item.get("content"))
 								            .and_then(|v| v.as_array())
 								            .and_then(|items| items.first())
 								            .and_then(|entry| entry.get("text"))
 								            .and_then(|text| text.as_str()),
 								        Some(SUMMARIZATION_PROMPT),
 								        "retry attempt should include summarization prompt"
 								    );
 								    assert_eq!(
 								        retry_input.len(),
 								        compact_input.len().saturating_sub(1),
 								        "retry should drop exactly one history item (before {} vs after {})",
 								        compact_input.len(),
 								        retry_input.len()
 								    );
 								    if let (Some(first_before), Some(first_after)) = (compact_input.first(), retry_input.first()) {
 								        assert_ne!(
 								            first_before, first_after,
 								            "retry should drop the oldest conversation item"
 								        );
 								    } else {
 								        panic!("expected non-empty compact inputs");
 								    }
 								}
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_events() {
-												make tests pass cleanly in sandbox (#4067)

This changes the reqwest client used in tests to be sandbox-friendly,
and skips a bunch of other tests that don't work inside the
sandbox/without network.
											
										
										
											2025-09-25 13:11:14 -07:00
+								    skip_if_no_network!();
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let server = start_mock_server().await;
 								    let sse1 = sse(vec![
 								        ev_assistant_message("m1", FIRST_REPLY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r1", 500),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse2 = sse(vec![
 								        ev_assistant_message("m2", FIRST_AUTO_SUMMARY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r2", 50),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse3 = sse(vec![
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
 								        ev_completed_with_tokens("r3", 150),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse4 = sse(vec![
 								        ev_assistant_message("m4", SECOND_LARGE_REPLY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r4", 450),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse5 = sse(vec![
 								        ev_assistant_message("m5", SECOND_AUTO_SUMMARY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r5", 60),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
 								    let sse6 = sse(vec![
 								        ev_assistant_message("m6", FINAL_REPLY),
-												Add non_sandbox_test helper (#3880)

Makes tests shorter
											
										
										
											2025-09-22 07:50:41 -07:00
+								        ev_completed_with_tokens("r6", 120),
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    ]);
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    mount_sse_sequence(&server, vec![sse1, sse2, sse3, sse4, sse5, sse6]).await;
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window become too large
2. Add a user turn, asking for the model to summarize
3. Build a bridge that contains all the previous user message + the
summary. Rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
 								    let model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = model_provider;
 								    config.model_auto_compact_token_limit = Some(200);
 								    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
 								    let codex = conversation_manager
 								        .new_conversation(config)
 								        .await
 								        .unwrap()
 								        .conversation;
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								                text: MULTI_AUTO_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
-												Make ESC button work when auto-compaction (#3857)

Only emit a task finished when the compaction comes from a `/compact`
											
										
										
											2025-09-18 16:34:16 +01:00
+								    let mut auto_compact_lifecycle_events = Vec::new();
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    loop {
 								        let event = codex.next_event().await.unwrap();
-												Make ESC button work when auto-compaction (#3857)

Only emit a task finished when the compaction comes from a `/compact`
											
										
										
											2025-09-18 16:34:16 +01:00
+								        if event.id.starts_with("auto-compact-")
 								            && matches!(
 								                event.msg,
 								                EventMsg::TaskStarted(_) | EventMsg::TaskComplete(_)
 								            )
 								        {
 								            auto_compact_lifecycle_events.push(event);
 								            continue;
 								        }
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								        if let EventMsg::TaskComplete(_) = &event.msg
 								            && !event.id.starts_with("auto-compact-")
 								        {
 								            break;
 								        }
 								    }
-												Make ESC button work when auto-compaction (#3857)

Only emit a task finished when the compaction comes from a `/compact`
											
										
										
											2025-09-18 16:34:16 +01:00
+								    assert!(
 								        auto_compact_lifecycle_events.is_empty(),
 								        "auto compact should not emit task lifecycle events"
 								    );
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								    let request_bodies: Vec<String> = server
 								        .received_requests()
 								        .await
 								        .unwrap()
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								        .into_iter()
-												Use response helpers when mounting SSE test responses (#4783)

## Summary
- replace manual wiremock SSE mounts in the compact suite with the
shared response helpers
- simplify the exec auth_env integration test by using the
mount_sse_once_match helper
- rely on mount_sse_sequence plus server request collection to replace
the bespoke SeqResponder utility in tests

## Testing
- just fmt

------
https://chatgpt.com/codex/tasks/task_i_68e2e238f2a88320a337f0b9e4098093
											
										
										
											2025-10-05 14:58:16 -07:00
+								        .map(|request| String::from_utf8(request.body).unwrap_or_default())
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								        .collect();
 								    assert_eq!(
 								        request_bodies.len(),
 ,
 								        "expected six requests including two auto compactions"
 								    );
 								    assert!(
 								        request_bodies[0].contains(MULTI_AUTO_MSG),
 								        "first request should contain the user input"
 								    );
 								    assert!(
 								        request_bodies[1].contains("You have exceeded the maximum number of tokens"),
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								        "first auto compact request should include the summarization prompt"
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    );
 								    assert!(
 								        request_bodies[3].contains(&format!("unsupported call: {DUMMY_FUNCTION_NAME}")),
 								        "function call output should be sent before the second auto compact"
 								    );
 								    assert!(
 								        request_bodies[4].contains("You have exceeded the maximum number of tokens"),
-												chore: compact do not modify instructions  (#4088)

Keep the developer instruction and insert the summarisation message as a
user message instead
											
										
										
											2025-09-23 17:59:17 +01:00
+								        "second auto compact request should include the summarization prompt"
-												feat: context compaction (#3446)

## Compact feature:
1. Stops the model when the context window becomes too large
2. Adds a user turn asking the model to summarize
3. Builds a bridge that contains all the previous user messages + the
summary, rendered from a template
4. Start sampling again from a clean conversation with only that bridge
											
										
										
											2025-09-12 13:07:10 -07:00
+								    );
 								}
-												Auto compact at ~90% (#5292)

Users now hit a window exceeded limit and they usually don't know what
to do. This starts auto compact at ~90% of the window.
											
										
										
											2025-10-20 11:29:49 -07:00
 								#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 								async fn auto_compact_triggers_after_function_call_over_95_percent_usage() {
 								    skip_if_no_network!();
 								    let server = start_mock_server().await;
 								    let context_window = 100;
 								    let limit = context_window * 90 / 100;
 								    let over_limit_tokens = context_window * 95 / 100 + 1;
 								    let first_turn = sse(vec![
 								        ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
 								        ev_completed_with_tokens("r1", 50),
 								    ]);
 								    let function_call_follow_up = sse(vec![
 								        ev_assistant_message("m2", FINAL_REPLY),
 								        ev_completed_with_tokens("r2", over_limit_tokens),
 								    ]);
 								    let auto_compact_turn = sse(vec![
 								        ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
 								        ev_completed_with_tokens("r3", 10),
 								    ]);
 								    let post_auto_compact_turn = sse(vec![ev_completed_with_tokens("r4", 10)]);
 								    // Mount responses in order and keep mocks only for the ones we assert on.
 								    let first_turn_mock = mount_sse_once(&server, first_turn).await;
 								    let follow_up_mock = mount_sse_once(&server, function_call_follow_up).await;
 								    let auto_compact_mock = mount_sse_once(&server, auto_compact_turn).await;
 								    // We don't assert on the post-compact request, so no need to keep its mock.
 								    mount_sse_once(&server, post_auto_compact_turn).await;
 								    let model_provider = ModelProviderInfo {
 								        base_url: Some(format!("{}/v1", server.uri())),
 								        ..built_in_model_providers()["openai"].clone()
 								    };
 								    let home = TempDir::new().unwrap();
 								    let mut config = load_default_config_for_test(&home);
 								    config.model_provider = model_provider;
 								    config.model_context_window = Some(context_window);
 								    config.model_auto_compact_token_limit = Some(limit);
 								    let codex = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"))
 								        .new_conversation(config)
 								        .await
 								        .unwrap()
 								        .conversation;
 								    codex
 								        .submit(Op::UserInput {
-												Add ItemStarted/ItemCompleted events for UserInputItem (#5306)

Adds a new ItemStarted event and delivers UserMessage as the first item
type (more to come).


Renames `InputItem` to `UserInput` considering we're using the `Item`
suffix for actual items.
											
										
										
											2025-10-20 13:34:44 -07:00
+								            items: vec![UserInput::Text {
-												Auto compact at ~90% (#5292)

Users now hit a window exceeded limit and they usually don't know what
to do. This starts auto compact at ~90% of the window.
											
										
										
											2025-10-20 11:29:49 -07:00
+								                text: FUNCTION_CALL_LIMIT_MSG.into(),
 								            }],
 								        })
 								        .await
 								        .unwrap();
 								    wait_for_event(&codex, |msg| matches!(msg, EventMsg::TaskComplete(_))).await;
 								    // Assert first request captured expected user message that triggers function call.
 								    let first_request = first_turn_mock.single_request().input();
 								    assert!(
 								        first_request.iter().any(|item| {
 								            item.get("type").and_then(|value| value.as_str()) == Some("message")
 								                && item
 								                    .get("content")
 								                    .and_then(|content| content.as_array())
 								                    .and_then(|entries| entries.first())
 								                    .and_then(|entry| entry.get("text"))
 								                    .and_then(|value| value.as_str())
 								                    == Some(FUNCTION_CALL_LIMIT_MSG)
 								        }),
 								        "first request should include the user message that triggers the function call"
 								    );
 								    let function_call_output = follow_up_mock
 								        .single_request()
 								        .function_call_output(DUMMY_CALL_ID);
 								    let output_text = function_call_output
 								        .get("output")
 								        .and_then(|value| value.as_str())
 								        .unwrap_or_default();
 								    assert!(
 								        output_text.contains(DUMMY_FUNCTION_NAME),
 								        "function call output should be sent before auto compact"
 								    );
 								    let auto_compact_body = auto_compact_mock.single_request().body_json().to_string();
 								    assert!(
 								        auto_compact_body.contains("You have exceeded the maximum number of tokens"),
 								        "auto compact request should include the summarization prompt after exceeding 95% (limit {limit})"
 								    );
 								}