feat: context compaction (#3446)

## Compact feature

1. Stops the model when the context window becomes too large.
2. Adds a user turn asking the model to summarize the conversation.
3. Builds a bridge message, rendered from a template, that contains all previous user messages plus the summary.
4. Starts sampling again from a clean conversation that contains only that bridge.
2025-09-12 13:07:10 -07:00
parent d4848e558b
commit ea225df22e
14 changed files with 1243 additions and 326 deletions
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -212,6 +212,50 @@ dependencies = [
 "term",
 ]

+[[package]]
+name = "askama"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b79091df18a97caea757e28cd2d5fda49c6cd4bd01ddffd7ff01ace0c0ad2c28"
+dependencies = [
+ "askama_derive",
+ "askama_escape",
+ "humansize",
+ "num-traits",
+ "percent-encoding",
+]
+
+[[package]]
+name = "askama_derive"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19fe8d6cb13c4714962c072ea496f3392015f0989b1a2847bb4b2d9effd71d83"
+dependencies = [
+ "askama_parser",
+ "basic-toml",
+ "mime",
+ "mime_guess",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "syn 2.0.104",
+]
+
+[[package]]
+name = "askama_escape"
+version = "0.10.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
+
+[[package]]
+name = "askama_parser"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acb1161c6b64d1c3d83108213c2a2533a342ac225aabd0bda218278c2ddb00c0"
+dependencies = [
+ "nom",
+]
+
 [[package]]
 name = "assert-json-diff"
 version = "2.0.2"
@@ -305,6 +349,15 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"

+[[package]]
+name = "basic-toml"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba62675e8242a4c4e806d12f11d136e626e6c8361d6b829310732241652a178a"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "beef"
 version = "0.5.2"
@@ -606,6 +659,7 @@ name = "codex-core"
 version = "0.0.0"
 dependencies = [
 "anyhow",
+ "askama",
 "assert_cmd",
 "async-channel",
 "base64",
@@ -1981,6 +2035,15 @@ version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"

+[[package]]
+name = "humansize"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6cb51c9a029ddc91b07a787f1d86b53ccfa49b0e86688c946ebe8d3555685dd7"
+dependencies = [
+ "libm",
+]
+
 [[package]]
 name = "hyper"
 version = "1.7.0"
@@ -2536,6 +2599,12 @@ version = "0.2.175"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543"

+[[package]]
+name = "libm"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"
+
 [[package]]
 name = "libredox"
 version = "0.1.6"
--- a/codex-rs/core/Cargo.toml
+++ b/codex-rs/core/Cargo.toml
@@ -13,6 +13,7 @@ workspace = true

 [dependencies]
 anyhow = "1"
+askama = "0.12"
 async-channel = "2.3.1"
 base64 = "0.22"
 bytes = "1.10.1"
--- a/codex-rs/core/src/client.rs
+++ b/codex-rs/core/src/client.rs
@@ -104,6 +104,12 @@ impl ModelClient {
            .or_else(|| get_model_info(&self.config.model_family).map(|info| info.context_window))
    }

+    pub fn get_auto_compact_token_limit(&self) -> Option<i64> {
+        self.config.model_auto_compact_token_limit.or_else(|| {
+            get_model_info(&self.config.model_family).and_then(|info| info.auto_compact_token_limit)
+        })
+    }
+
    /// Dispatches to either the Responses or Chat implementation depending on
    /// the provider config.  Public callers always invoke `stream()` – the
    /// specialised helpers are private to avoid accidental misuse.
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -16,7 +16,6 @@ use codex_apply_patch::ApplyPatchAction;
 use codex_apply_patch::MaybeApplyPatchVerified;
 use codex_apply_patch::maybe_parse_apply_patch_verified;
 use codex_protocol::mcp_protocol::ConversationId;
-use codex_protocol::protocol::CompactedItem;
 use codex_protocol::protocol::ConversationPathResponseEvent;
 use codex_protocol::protocol::RolloutItem;
 use codex_protocol::protocol::TaskStartedEvent;
@@ -77,7 +76,6 @@ use crate::parse_command::parse_command;
 use crate::plan_tool::handle_update_plan;
 use crate::project_doc::get_user_instructions;
 use crate::protocol::AgentMessageDeltaEvent;
-use crate::protocol::AgentMessageEvent;
 use crate::protocol::AgentReasoningDeltaEvent;
 use crate::protocol::AgentReasoningRawContentDeltaEvent;
 use crate::protocol::AgentReasoningSectionBreakEvent;
@@ -102,6 +100,7 @@ use crate::protocol::SessionConfiguredEvent;
 use crate::protocol::StreamErrorEvent;
 use crate::protocol::Submission;
 use crate::protocol::TaskCompleteEvent;
+use crate::protocol::TokenUsage;
 use crate::protocol::TokenUsageInfo;
 use crate::protocol::TurnDiffEvent;
 use crate::protocol::WebSearchBeginEvent;
@@ -127,6 +126,8 @@ use codex_protocol::models::ResponseItem;
 use codex_protocol::models::ShellToolCallParams;
 use codex_protocol::protocol::InitialHistory;

+mod compact;
+
 // A convenience extension trait for acquiring mutex locks where poisoning is
 // unrecoverable and should abort the program. This avoids scattered `.unwrap()`
 // calls on `lock()` while still surfacing a clear panic message when a lock is
@@ -264,6 +265,7 @@ struct State {
    pending_input: Vec<ResponseInputItem>,
    history: ConversationHistory,
    token_info: Option<TokenUsageInfo>,
+    next_internal_sub_id: u64,
 }

 /// Context for an initialized model agent
@@ -534,6 +536,13 @@ impl Session {
        }
    }

+    fn next_internal_sub_id(&self) -> String {
+        let mut state = self.state.lock_unchecked();
+        let id = state.next_internal_sub_id;
+        state.next_internal_sub_id += 1;
+        format!("auto-compact-{id}")
+    }
+
    async fn record_initial_history(
        &self,
        turn_context: &TurnContext,
@@ -707,6 +716,21 @@ impl Session {
        }
    }

+    fn update_token_usage_info(
+        &self,
+        turn_context: &TurnContext,
+        token_usage: &Option<TokenUsage>,
+    ) -> Option<TokenUsageInfo> {
+        let mut state = self.state.lock_unchecked();
+        let info = TokenUsageInfo::new_or_append(
+            &state.token_info,
+            token_usage,
+            turn_context.client.get_model_context_window(),
+        );
+        state.token_info = info.clone();
+        info
+    }
+
    /// Record a user input item to conversation history and also persist a
    /// corresponding UserMessage EventMsg to rollout.
    async fn record_input_and_rollout_usermsg(&self, response_input: &ResponseInputItem) {
@@ -1026,8 +1050,7 @@ impl AgentTask {
            let sess = sess.clone();
            let sub_id = sub_id.clone();
            let tc = Arc::clone(&turn_context);
-            tokio::spawn(async move { run_task(sess, tc.as_ref(), sub_id, input).await })
-                .abort_handle()
+            tokio::spawn(async move { run_task(sess, tc, sub_id, input).await }).abort_handle()
        };
        Self {
            sess,
@@ -1048,7 +1071,7 @@ impl AgentTask {
            let sub_id = sub_id.clone();
            let tc = Arc::clone(&turn_context);
            tokio::spawn(async move {
-                run_compact_task(sess, tc.as_ref(), sub_id, input, compact_instructions).await
+                compact::run_compact_task(sess, tc, sub_id, input, compact_instructions).await
            })
            .abort_handle()
        };
@@ -1342,21 +1365,16 @@ async fn submission_loop(
                sess.send_event(event).await;
            }
            Op::Compact => {
-                // Create a summarization request as user input
-                const SUMMARIZATION_PROMPT: &str = include_str!("prompt_for_compact_command.md");
-
                // Attempt to inject input into current task
                if let Err(items) = sess.inject_input(vec![InputItem::Text {
-                    text: "Start Summarization".to_string(),
+                    text: compact::COMPACT_TRIGGER_TEXT.to_string(),
                }]) {
-                    let task = AgentTask::compact(
+                    compact::spawn_compact_task(
                        sess.clone(),
                        Arc::clone(&turn_context),
                        sub.id,
                        items,
-                        SUMMARIZATION_PROMPT.to_string(),
                    );
-                    sess.set_task(task);
                }
            }
            Op::Shutdown => {
@@ -1435,7 +1453,7 @@ async fn submission_loop(
 ///   conversation history and consider the task complete.
 async fn run_task(
    sess: Arc<Session>,
-    turn_context: &TurnContext,
+    turn_context: Arc<TurnContext>,
    sub_id: String,
    input: Vec<InputItem>,
 ) {
@@ -1458,6 +1476,7 @@ async fn run_task(
    // Although from the perspective of codex.rs, TurnDiffTracker has the lifecycle of a Task which contains
    // many turns, from the perspective of the user, it is a single turn.
    let mut turn_diff_tracker = TurnDiffTracker::new();
+    let mut auto_compact_recently_attempted = false;

    loop {
        // Note that pending_input would be something like a message the user
@@ -1492,7 +1511,7 @@ async fn run_task(
            .collect();
        match run_turn(
            &sess,
-            turn_context,
+            turn_context.as_ref(),
            &mut turn_diff_tracker,
            sub_id.clone(),
            turn_input,
@@ -1500,9 +1519,23 @@ async fn run_task(
        .await
        {
            Ok(turn_output) => {
+                let TurnRunResult {
+                    processed_items,
+                    total_token_usage,
+                } = turn_output;
+                let limit = turn_context
+                    .client
+                    .get_auto_compact_token_limit()
+                    .unwrap_or(i64::MAX);
+                let total_usage_tokens = total_token_usage
+                    .as_ref()
+                    .map(|usage| usage.tokens_in_context_window());
+                let token_limit_reached = total_usage_tokens
+                    .map(|tokens| (tokens as i64) >= limit)
+                    .unwrap_or(false);
                let mut items_to_record_in_conversation_history = Vec::<ResponseItem>::new();
                let mut responses = Vec::<ResponseInputItem>::new();
-                for processed_response_item in turn_output {
+                for processed_response_item in processed_items {
                    let ProcessedResponseItem { item, response } = processed_response_item;
                    match (&item, &response) {
                        (ResponseItem::Message { role, .. }, None) if role == "assistant" => {
@@ -1599,8 +1632,31 @@ async fn run_task(
                        .await;
                }

+                if token_limit_reached {
+                    if auto_compact_recently_attempted {
+                        let limit_str = limit.to_string();
+                        let current_tokens = total_usage_tokens
+                            .map(|tokens| tokens.to_string())
+                            .unwrap_or_else(|| "unknown".to_string());
+                        let event = Event {
+                            id: sub_id.clone(),
+                            msg: EventMsg::Error(ErrorEvent {
+                                message: format!(
+                                    "Conversation is still above the token limit after automatic summarization (limit {limit_str}, current {current_tokens}). Please start a new session or trim your input."
+                                ),
+                            }),
+                        };
+                        sess.send_event(event).await;
+                        break;
+                    }
+                    auto_compact_recently_attempted = true;
+                    compact::run_inline_auto_compact_task(sess.clone(), turn_context.clone()).await;
+                    continue;
+                }
+
+                auto_compact_recently_attempted = false;
+
                if responses.is_empty() {
-                    debug!("Turn completed");
                    last_agent_message = get_last_assistant_message_from_turn(
                        &items_to_record_in_conversation_history,
                    );
@@ -1611,6 +1667,7 @@ async fn run_task(
                    });
                    break;
                }
+                continue;
            }
            Err(e) => {
                info!("Turn error: {e:#}");
@@ -1640,7 +1697,7 @@ async fn run_turn(
    turn_diff_tracker: &mut TurnDiffTracker,
    sub_id: String,
    input: Vec<ResponseItem>,
-) -> CodexResult<Vec<ProcessedResponseItem>> {
+) -> CodexResult<TurnRunResult> {
    let tools = get_openai_tools(
        &turn_context.tools_config,
        Some(sess.mcp_connection_manager.list_all_tools()),
@@ -1704,13 +1761,19 @@ struct ProcessedResponseItem {
    response: Option<ResponseInputItem>,
 }

+#[derive(Debug)]
+struct TurnRunResult {
+    processed_items: Vec<ProcessedResponseItem>,
+    total_token_usage: Option<TokenUsage>,
+}
+
 async fn try_run_turn(
    sess: &Session,
    turn_context: &TurnContext,
    turn_diff_tracker: &mut TurnDiffTracker,
    sub_id: &str,
    prompt: &Prompt,
-) -> CodexResult<Vec<ProcessedResponseItem>> {
+) -> CodexResult<TurnRunResult> {
    // call_ids that are part of this response.
    let completed_call_ids = prompt
        .input
@@ -1828,16 +1891,7 @@ async fn try_run_turn(
                response_id: _,
                token_usage,
            } => {
-                let info = {
-                    let mut st = sess.state.lock_unchecked();
-                    let info = TokenUsageInfo::new_or_append(
-                        &st.token_info,
-                        &token_usage,
-                        turn_context.client.get_model_context_window(),
-                    );
-                    st.token_info = info.clone();
-                    info
-                };
+                let info = sess.update_token_usage_info(turn_context, &token_usage);
                let _ = sess
                    .send_event(Event {
                        id: sub_id.to_string(),
@@ -1855,7 +1909,12 @@ async fn try_run_turn(
                    sess.send_event(event).await;
                }

-                return Ok(output);
+                let result = TurnRunResult {
+                    processed_items: output,
+                    total_token_usage: token_usage.clone(),
+                };
+
+                return Ok(result);
            }
            ResponseEvent::OutputTextDelta(delta) => {
                let event = Event {
@@ -1893,95 +1952,6 @@ async fn try_run_turn(
    }
 }

-async fn run_compact_task(
-    sess: Arc<Session>,
-    turn_context: &TurnContext,
-    sub_id: String,
-    input: Vec<InputItem>,
-    compact_instructions: String,
-) {
-    let model_context_window = turn_context.client.get_model_context_window();
-    let start_event = Event {
-        id: sub_id.clone(),
-        msg: EventMsg::TaskStarted(TaskStartedEvent {
-            model_context_window,
-        }),
-    };
-    sess.send_event(start_event).await;
-
-    let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input);
-    let turn_input: Vec<ResponseItem> =
-        sess.turn_input_with_history(vec![initial_input_for_turn.clone().into()]);
-
-    let prompt = Prompt {
-        input: turn_input,
-        tools: Vec::new(),
-        base_instructions_override: Some(compact_instructions.clone()),
-    };
-
-    let max_retries = turn_context.client.get_provider().stream_max_retries();
-    let mut retries = 0;
-
-    loop {
-        let attempt_result = drain_to_completed(&sess, turn_context, &sub_id, &prompt).await;
-
-        match attempt_result {
-            Ok(()) => break,
-            Err(CodexErr::Interrupted) => return,
-            Err(e) => {
-                if retries < max_retries {
-                    retries += 1;
-                    let delay = backoff(retries);
-                    sess.notify_stream_error(
-                        &sub_id,
-                        format!(
-                            "stream error: {e}; retrying {retries}/{max_retries} in {delay:?}…"
-                        ),
-                    )
-                    .await;
-                    tokio::time::sleep(delay).await;
-                    continue;
-                } else {
-                    let event = Event {
-                        id: sub_id.clone(),
-                        msg: EventMsg::Error(ErrorEvent {
-                            message: e.to_string(),
-                        }),
-                    };
-                    sess.send_event(event).await;
-                    return;
-                }
-            }
-        }
-    }
-
-    sess.remove_task(&sub_id);
-
-    let rollout_item = {
-        let mut state = sess.state.lock_unchecked();
-        state.history.keep_last_messages(1);
-        RolloutItem::Compacted(CompactedItem {
-            message: state.history.last_agent_message(),
-        })
-    };
-    sess.persist_rollout_items(&[rollout_item]).await;
-
-    let event = Event {
-        id: sub_id.clone(),
-        msg: EventMsg::AgentMessage(AgentMessageEvent {
-            message: "Compact task completed".to_string(),
-        }),
-    };
-    sess.send_event(event).await;
-    let event = Event {
-        id: sub_id.clone(),
-        msg: EventMsg::TaskComplete(TaskCompleteEvent {
-            last_agent_message: None,
-        }),
-    };
-    sess.send_event(event).await;
-}
-
 async fn handle_response_item(
    sess: &Session,
    turn_context: &TurnContext,
@@ -2964,7 +2934,7 @@ fn format_exec_output(exec_output: &ExecToolCallOutput) -> String {
    serde_json::to_string(&payload).expect("serialize ExecOutput")
 }

-fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -> Option<String> {
+pub(super) fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -> Option<String> {
    responses.iter().rev().find_map(|item| {
        if let ResponseItem::Message { role, content, .. } = item {
            if role == "assistant" {
@@ -2983,68 +2953,6 @@ fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -> Option<St
        }
    })
 }
-
-async fn drain_to_completed(
-    sess: &Session,
-    turn_context: &TurnContext,
-    sub_id: &str,
-    prompt: &Prompt,
-) -> CodexResult<()> {
-    let rollout_item = RolloutItem::TurnContext(TurnContextItem {
-        cwd: turn_context.cwd.clone(),
-        approval_policy: turn_context.approval_policy,
-        sandbox_policy: turn_context.sandbox_policy.clone(),
-        model: turn_context.client.get_model(),
-        effort: turn_context.client.get_reasoning_effort(),
-        summary: turn_context.client.get_reasoning_summary(),
-    });
-    sess.persist_rollout_items(&[rollout_item]).await;
-    let mut stream = turn_context.client.clone().stream(prompt).await?;
-    loop {
-        let maybe_event = stream.next().await;
-        let Some(event) = maybe_event else {
-            return Err(CodexErr::Stream(
-                "stream closed before response.completed".into(),
-                None,
-            ));
-        };
-        match event {
-            Ok(ResponseEvent::OutputItemDone(item)) => {
-                // Record only to in-memory conversation history; avoid state snapshot.
-                let mut state = sess.state.lock_unchecked();
-                state.history.record_items(std::slice::from_ref(&item));
-            }
-            Ok(ResponseEvent::Completed {
-                response_id: _,
-                token_usage,
-            }) => {
-                let info = {
-                    let mut st = sess.state.lock_unchecked();
-                    let info = TokenUsageInfo::new_or_append(
-                        &st.token_info,
-                        &token_usage,
-                        turn_context.client.get_model_context_window(),
-                    );
-                    st.token_info = info.clone();
-                    info
-                };
-
-                sess.tx_event
-                    .send(Event {
-                        id: sub_id.to_string(),
-                        msg: EventMsg::TokenCount(crate::protocol::TokenCountEvent { info }),
-                    })
-                    .await
-                    .ok();
-
-                return Ok(());
-            }
-            Ok(_) => continue,
-            Err(e) => return Err(e),
-        }
-    }
-}
-
 fn convert_call_tool_result_to_function_call_output_payload(
    call_tool_result: &CallToolResult,
 ) -> FunctionCallOutputPayload {
--- a/codex-rs/core/src/codex/compact.rs
+++ b/codex-rs/core/src/codex/compact.rs
@@ -0,0 +1,401 @@
+use std::sync::Arc;
+
+use super::AgentTask;
+use super::MutexExt;
+use super::Session;
+use super::TurnContext;
+use super::get_last_assistant_message_from_turn;
+use crate::Prompt;
+use crate::client_common::ResponseEvent;
+use crate::error::CodexErr;
+use crate::error::Result as CodexResult;
+use crate::protocol::AgentMessageEvent;
+use crate::protocol::CompactedItem;
+use crate::protocol::ErrorEvent;
+use crate::protocol::Event;
+use crate::protocol::EventMsg;
+use crate::protocol::InputItem;
+use crate::protocol::InputMessageKind;
+use crate::protocol::TaskCompleteEvent;
+use crate::protocol::TaskStartedEvent;
+use crate::protocol::TurnContextItem;
+use crate::util::backoff;
+use askama::Template;
+use codex_protocol::models::ContentItem;
+use codex_protocol::models::ResponseInputItem;
+use codex_protocol::models::ResponseItem;
+use codex_protocol::protocol::RolloutItem;
+use futures::prelude::*;
+
+pub(super) const COMPACT_TRIGGER_TEXT: &str = "Start Summarization";
+const SUMMARIZATION_PROMPT: &str = include_str!("../../templates/compact/prompt.md");
+
+#[derive(Template)]
+#[template(path = "compact/history_bridge.md", escape = "none")]
+struct HistoryBridgeTemplate<'a> {
+    user_messages_text: &'a str,
+    summary_text: &'a str,
+}
+
+pub(super) fn spawn_compact_task(
+    sess: Arc<Session>,
+    turn_context: Arc<TurnContext>,
+    sub_id: String,
+    input: Vec<InputItem>,
+) {
+    let task = AgentTask::compact(
+        sess.clone(),
+        turn_context,
+        sub_id,
+        input,
+        SUMMARIZATION_PROMPT.to_string(),
+    );
+    sess.set_task(task);
+}
+
+pub(super) async fn run_inline_auto_compact_task(
+    sess: Arc<Session>,
+    turn_context: Arc<TurnContext>,
+) {
+    let sub_id = sess.next_internal_sub_id();
+    let input = vec![InputItem::Text {
+        text: COMPACT_TRIGGER_TEXT.to_string(),
+    }];
+    run_compact_task_inner(
+        sess,
+        turn_context,
+        sub_id,
+        input,
+        SUMMARIZATION_PROMPT.to_string(),
+        false,
+    )
+    .await;
+}
+
+pub(super) async fn run_compact_task(
+    sess: Arc<Session>,
+    turn_context: Arc<TurnContext>,
+    sub_id: String,
+    input: Vec<InputItem>,
+    compact_instructions: String,
+) {
+    run_compact_task_inner(
+        sess,
+        turn_context,
+        sub_id,
+        input,
+        compact_instructions,
+        true,
+    )
+    .await;
+}
+
+async fn run_compact_task_inner(
+    sess: Arc<Session>,
+    turn_context: Arc<TurnContext>,
+    sub_id: String,
+    input: Vec<InputItem>,
+    compact_instructions: String,
+    remove_task_on_completion: bool,
+) {
+    let model_context_window = turn_context.client.get_model_context_window();
+    let start_event = Event {
+        id: sub_id.clone(),
+        msg: EventMsg::TaskStarted(TaskStartedEvent {
+            model_context_window,
+        }),
+    };
+    sess.send_event(start_event).await;
+
+    let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input);
+    let instructions_override = compact_instructions;
+    let turn_input = sess.turn_input_with_history(vec![initial_input_for_turn.clone().into()]);
+
+    let prompt = Prompt {
+        input: turn_input,
+        tools: Vec::new(),
+        base_instructions_override: Some(instructions_override),
+    };
+
+    let max_retries = turn_context.client.get_provider().stream_max_retries();
+    let mut retries = 0;
+
+    let rollout_item = RolloutItem::TurnContext(TurnContextItem {
+        cwd: turn_context.cwd.clone(),
+        approval_policy: turn_context.approval_policy,
+        sandbox_policy: turn_context.sandbox_policy.clone(),
+        model: turn_context.client.get_model(),
+        effort: turn_context.client.get_reasoning_effort(),
+        summary: turn_context.client.get_reasoning_summary(),
+    });
+    sess.persist_rollout_items(&[rollout_item]).await;
+
+    loop {
+        let attempt_result = drain_to_completed(&sess, turn_context.as_ref(), &prompt).await;
+
+        match attempt_result {
+            Ok(()) => {
+                break;
+            }
+            Err(CodexErr::Interrupted) => {
+                return;
+            }
+            Err(e) => {
+                if retries < max_retries {
+                    retries += 1;
+                    let delay = backoff(retries);
+                    sess.notify_stream_error(
+                        &sub_id,
+                        format!(
+                            "stream error: {e}; retrying {retries}/{max_retries} in {delay:?}…"
+                        ),
+                    )
+                    .await;
+                    tokio::time::sleep(delay).await;
+                    continue;
+                } else {
+                    let event = Event {
+                        id: sub_id.clone(),
+                        msg: EventMsg::Error(ErrorEvent {
+                            message: e.to_string(),
+                        }),
+                    };
+                    sess.send_event(event).await;
+                    return;
+                }
+            }
+        }
+    }
+
+    if remove_task_on_completion {
+        sess.remove_task(&sub_id);
+    }
+    let history_snapshot = {
+        let state = sess.state.lock_unchecked();
+        state.history.contents()
+    };
+    let summary_text = get_last_assistant_message_from_turn(&history_snapshot).unwrap_or_default();
+    let user_messages = collect_user_messages(&history_snapshot);
+    let new_history =
+        build_compacted_history(&sess, turn_context.as_ref(), &user_messages, &summary_text);
+    {
+        let mut state = sess.state.lock_unchecked();
+        state.history.replace(new_history);
+    }
+
+    let rollout_item = RolloutItem::Compacted(CompactedItem {
+        message: summary_text.clone(),
+    });
+    sess.persist_rollout_items(&[rollout_item]).await;
+
+    let event = Event {
+        id: sub_id.clone(),
+        msg: EventMsg::AgentMessage(AgentMessageEvent {
+            message: "Compact task completed".to_string(),
+        }),
+    };
+    sess.send_event(event).await;
+    let event = Event {
+        id: sub_id.clone(),
+        msg: EventMsg::TaskComplete(TaskCompleteEvent {
+            last_agent_message: None,
+        }),
+    };
+    sess.send_event(event).await;
+}
+
+fn content_items_to_text(content: &[ContentItem]) -> Option<String> {
+    let mut pieces = Vec::new();
+    for item in content {
+        match item {
+            ContentItem::InputText { text } | ContentItem::OutputText { text } => {
+                if !text.is_empty() {
+                    pieces.push(text.as_str());
+                }
+            }
+            ContentItem::InputImage { .. } => {}
+        }
+    }
+    if pieces.is_empty() {
+        None
+    } else {
+        Some(pieces.join("\n"))
+    }
+}
+
+fn collect_user_messages(items: &[ResponseItem]) -> Vec<String> {
+    items
+        .iter()
+        .filter_map(|item| match item {
+            ResponseItem::Message { role, content, .. } if role == "user" => {
+                content_items_to_text(content)
+            }
+            _ => None,
+        })
+        .filter(|text| !is_session_prefix_message(text))
+        .collect()
+}
+
+fn is_session_prefix_message(text: &str) -> bool {
+    matches!(
+        InputMessageKind::from(("user", text)),
+        InputMessageKind::UserInstructions | InputMessageKind::EnvironmentContext
+    )
+}
+
+fn build_compacted_history(
+    sess: &Session,
+    turn_context: &TurnContext,
+    user_messages: &[String],
+    summary_text: &str,
+) -> Vec<ResponseItem> {
+    let mut history = sess.build_initial_context(turn_context);
+    let user_messages_text = if user_messages.is_empty() {
+        "(none)".to_string()
+    } else {
+        user_messages.join("\n\n")
+    };
+    let summary_text = if summary_text.is_empty() {
+        "(no summary available)".to_string()
+    } else {
+        summary_text.to_string()
+    };
+    let Ok(bridge) = HistoryBridgeTemplate {
+        user_messages_text: &user_messages_text,
+        summary_text: &summary_text,
+    }
+    .render() else {
+        return vec![];
+    };
+    history.push(ResponseItem::Message {
+        id: None,
+        role: "user".to_string(),
+        content: vec![ContentItem::InputText { text: bridge }],
+    });
+    history
+}
+
+async fn drain_to_completed(
+    sess: &Session,
+    turn_context: &TurnContext,
+    prompt: &Prompt,
+) -> CodexResult<()> {
+    let mut stream = turn_context.client.clone().stream(prompt).await?;
+    loop {
+        let maybe_event = stream.next().await;
+        let Some(event) = maybe_event else {
+            return Err(CodexErr::Stream(
+                "stream closed before response.completed".into(),
+                None,
+            ));
+        };
+        match event {
+            Ok(ResponseEvent::OutputItemDone(item)) => {
+                let mut state = sess.state.lock_unchecked();
+                state.history.record_items(std::slice::from_ref(&item));
+            }
+            Ok(ResponseEvent::Completed { .. }) => {
+                return Ok(());
+            }
+            Ok(_) => continue,
+            Err(e) => return Err(e),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    #[test]
+    fn content_items_to_text_joins_non_empty_segments() {
+        let items = vec![
+            ContentItem::InputText {
+                text: "hello".to_string(),
+            },
+            ContentItem::OutputText {
+                text: String::new(),
+            },
+            ContentItem::OutputText {
+                text: "world".to_string(),
+            },
+        ];
+
+        let joined = content_items_to_text(&items);
+
+        assert_eq!(Some("hello\nworld".to_string()), joined);
+    }
+
+    #[test]
+    fn content_items_to_text_ignores_image_only_content() {
+        let items = vec![ContentItem::InputImage {
+            image_url: "file://image.png".to_string(),
+        }];
+
+        let joined = content_items_to_text(&items);
+
+        assert_eq!(None, joined);
+    }
+
+    #[test]
+    fn collect_user_messages_extracts_user_text_only() {
+        let items = vec![
+            ResponseItem::Message {
+                id: Some("assistant".to_string()),
+                role: "assistant".to_string(),
+                content: vec![ContentItem::OutputText {
+                    text: "ignored".to_string(),
+                }],
+            },
+            ResponseItem::Message {
+                id: Some("user".to_string()),
+                role: "user".to_string(),
+                content: vec![
+                    ContentItem::InputText {
+                        text: "first".to_string(),
+                    },
+                    ContentItem::OutputText {
+                        text: "second".to_string(),
+                    },
+                ],
+            },
+            ResponseItem::Other,
+        ];
+
+        let collected = collect_user_messages(&items);
+
+        assert_eq!(vec!["first\nsecond".to_string()], collected);
+    }
+
+    #[test]
+    fn collect_user_messages_filters_session_prefix_entries() {
+        let items = vec![
+            ResponseItem::Message {
+                id: None,
+                role: "user".to_string(),
+                content: vec![ContentItem::InputText {
+                    text: "<user_instructions>do things</user_instructions>".to_string(),
+                }],
+            },
+            ResponseItem::Message {
+                id: None,
+                role: "user".to_string(),
+                content: vec![ContentItem::InputText {
+                    text: "<ENVIRONMENT_CONTEXT>cwd=/tmp</ENVIRONMENT_CONTEXT>".to_string(),
+                }],
+            },
+            ResponseItem::Message {
+                id: None,
+                role: "user".to_string(),
+                content: vec![ContentItem::InputText {
+                    text: "real user message".to_string(),
+                }],
+            },
+        ];
+
+        let collected = collect_user_messages(&items);
+
+        assert_eq!(vec!["real user message".to_string()], collected);
+    }
+}
--- a/codex-rs/core/src/config.rs
+++ b/codex-rs/core/src/config.rs
@@ -55,6 +55,9 @@ pub struct Config {
    /// Maximum number of output tokens.
    pub model_max_output_tokens: Option<u64>,

+    /// Token usage threshold triggering auto-compaction of conversation history.
+    pub model_auto_compact_token_limit: Option<i64>,
+
    /// Key into the model_providers map that specifies which provider to use.
    pub model_provider_id: String,

@@ -519,6 +522,9 @@ pub struct ConfigToml {
    /// Maximum number of output tokens.
    pub model_max_output_tokens: Option<u64>,

+    /// Token usage threshold triggering auto-compaction of conversation history.
+    pub model_auto_compact_token_limit: Option<i64>,
+
    /// Default approval policy for executing commands.
    pub approval_policy: Option<AskForApproval>,

@@ -877,6 +883,11 @@ impl Config {
                .as_ref()
                .map(|info| info.max_output_tokens)
        });
+        let model_auto_compact_token_limit = cfg.model_auto_compact_token_limit.or_else(|| {
+            openai_model_info
+                .as_ref()
+                .and_then(|info| info.auto_compact_token_limit)
+        });

        let experimental_resume = cfg.experimental_resume;

@@ -896,6 +907,7 @@ impl Config {
            model_family,
            model_context_window,
            model_max_output_tokens,
+            model_auto_compact_token_limit,
            model_provider_id,
            model_provider,
            cwd: resolved_cwd,
@@ -1430,6 +1442,7 @@ model_verbosity = "high"
                model_family: find_family_for_model("o3").expect("known model slug"),
                model_context_window: Some(200_000),
                model_max_output_tokens: Some(100_000),
+                model_auto_compact_token_limit: None,
                model_provider_id: "openai".to_string(),
                model_provider: fixture.openai_provider.clone(),
                approval_policy: AskForApproval::Never,
@@ -1486,6 +1499,7 @@ model_verbosity = "high"
            model_family: find_family_for_model("gpt-3.5-turbo").expect("known model slug"),
            model_context_window: Some(16_385),
            model_max_output_tokens: Some(4_096),
+            model_auto_compact_token_limit: None,
            model_provider_id: "openai-chat-completions".to_string(),
            model_provider: fixture.openai_chat_completions_provider.clone(),
            approval_policy: AskForApproval::UnlessTrusted,
@@ -1557,6 +1571,7 @@ model_verbosity = "high"
            model_family: find_family_for_model("o3").expect("known model slug"),
            model_context_window: Some(200_000),
            model_max_output_tokens: Some(100_000),
+            model_auto_compact_token_limit: None,
            model_provider_id: "openai".to_string(),
            model_provider: fixture.openai_provider.clone(),
            approval_policy: AskForApproval::OnFailure,
@@ -1614,6 +1629,7 @@ model_verbosity = "high"
            model_family: find_family_for_model("gpt-5").expect("known model slug"),
            model_context_window: Some(272_000),
            model_max_output_tokens: Some(128_000),
+            model_auto_compact_token_limit: None,
            model_provider_id: "openai".to_string(),
            model_provider: fixture.openai_provider.clone(),
            approval_policy: AskForApproval::OnFailure,
--- a/codex-rs/core/src/conversation_history.rs
+++ b/codex-rs/core/src/conversation_history.rs
@@ -1,4 +1,3 @@
-use codex_protocol::models::ContentItem;
 use codex_protocol::models::ResponseItem;

 /// Transcript of conversation history
@@ -33,52 +32,8 @@ impl ConversationHistory {
        }
    }

-    pub(crate) fn keep_last_messages(&mut self, n: usize) {
-        if n == 0 {
-            self.items.clear();
-            return;
-        }
-
-        // Collect the last N message items (assistant/user), newest to oldest.
-        let mut kept: Vec<ResponseItem> = Vec::with_capacity(n);
-        for item in self.items.iter().rev() {
-            if let ResponseItem::Message { role, content, .. } = item {
-                kept.push(ResponseItem::Message {
-                    // we need to remove the id or the model will complain that messages are sent without
-                    // their reasonings
-                    id: None,
-                    role: role.clone(),
-                    content: content.clone(),
-                });
-                if kept.len() == n {
-                    break;
-                }
-            }
-        }
-
-        // Preserve chronological order (oldest to newest) within the kept slice.
-        kept.reverse();
-        self.items = kept;
-    }
-
-    pub(crate) fn last_agent_message(&self) -> String {
-        for item in self.items.iter().rev() {
-            if let ResponseItem::Message { role, content, .. } = item
-                && role == "assistant"
-            {
-                return content
-                    .iter()
-                    .find_map(|ci| {
-                        if let ContentItem::OutputText { text } = ci {
-                            Some(text.clone())
-                        } else {
-                            None
-                        }
-                    })
-                    .unwrap_or_default();
-            }
-        }
-        String::new()
+    /// Discards the current transcript and stores `items` in its place.
+    pub(crate) fn replace(&mut self, items: Vec<ResponseItem>) {
+        self.items = items;
    }
 }

--- a/codex-rs/core/src/openai_model_info.rs
+++ b/codex-rs/core/src/openai_model_info.rs
@@ -12,6 +12,19 @@ pub(crate) struct ModelInfo {

    /// Maximum number of output tokens that can be generated for the model.
    pub(crate) max_output_tokens: u64,
+
+    /// Token threshold where we should automatically compact conversation history.
+    pub(crate) auto_compact_token_limit: Option<i64>,
+}
+
+impl ModelInfo {
+    /// Builds a `ModelInfo` from the two token limits, leaving
+    /// `auto_compact_token_limit` unset (`None`).
+    const fn new(context_window: u64, max_output_tokens: u64) -> Self {
+        Self {
+            context_window,
+            max_output_tokens,
+            auto_compact_token_limit: None,
+        }
+    }
 }

 pub(crate) fn get_model_info(model_family: &ModelFamily) -> Option<ModelInfo> {
@@ -20,73 +33,37 @@ pub(crate) fn get_model_info(model_family: &ModelFamily) -> Option<ModelInfo> {
        // OSS models have a 128k shared token pool.
        // Arbitrarily splitting it: 3/4 input context, 1/4 output.
        // https://openai.com/index/gpt-oss-model-card/
-        "gpt-oss-20b" => Some(ModelInfo {
-            context_window: 96_000,
-            max_output_tokens: 32_000,
-        }),
-        "gpt-oss-120b" => Some(ModelInfo {
-            context_window: 96_000,
-            max_output_tokens: 32_000,
-        }),
+        "gpt-oss-20b" => Some(ModelInfo::new(96_000, 32_000)),
+        "gpt-oss-120b" => Some(ModelInfo::new(96_000, 32_000)),
        // https://platform.openai.com/docs/models/o3
-        "o3" => Some(ModelInfo {
-            context_window: 200_000,
-            max_output_tokens: 100_000,
-        }),
+        "o3" => Some(ModelInfo::new(200_000, 100_000)),

        // https://platform.openai.com/docs/models/o4-mini
-        "o4-mini" => Some(ModelInfo {
-            context_window: 200_000,
-            max_output_tokens: 100_000,
-        }),
+        "o4-mini" => Some(ModelInfo::new(200_000, 100_000)),

        // https://platform.openai.com/docs/models/codex-mini-latest
-        "codex-mini-latest" => Some(ModelInfo {
-            context_window: 200_000,
-            max_output_tokens: 100_000,
-        }),
+        "codex-mini-latest" => Some(ModelInfo::new(200_000, 100_000)),

        // As of Jun 25, 2025, gpt-4.1 defaults to gpt-4.1-2025-04-14.
        // https://platform.openai.com/docs/models/gpt-4.1
-        "gpt-4.1" | "gpt-4.1-2025-04-14" => Some(ModelInfo {
-            context_window: 1_047_576,
-            max_output_tokens: 32_768,
-        }),
+        "gpt-4.1" | "gpt-4.1-2025-04-14" => Some(ModelInfo::new(1_047_576, 32_768)),

        // As of Jun 25, 2025, gpt-4o defaults to gpt-4o-2024-08-06.
        // https://platform.openai.com/docs/models/gpt-4o
-        "gpt-4o" | "gpt-4o-2024-08-06" => Some(ModelInfo {
-            context_window: 128_000,
-            max_output_tokens: 16_384,
-        }),
+        "gpt-4o" | "gpt-4o-2024-08-06" => Some(ModelInfo::new(128_000, 16_384)),

        // https://platform.openai.com/docs/models/gpt-4o?snapshot=gpt-4o-2024-05-13
-        "gpt-4o-2024-05-13" => Some(ModelInfo {
-            context_window: 128_000,
-            max_output_tokens: 4_096,
-        }),
+        "gpt-4o-2024-05-13" => Some(ModelInfo::new(128_000, 4_096)),

        // https://platform.openai.com/docs/models/gpt-4o?snapshot=gpt-4o-2024-11-20
-        "gpt-4o-2024-11-20" => Some(ModelInfo {
-            context_window: 128_000,
-            max_output_tokens: 16_384,
-        }),
+        "gpt-4o-2024-11-20" => Some(ModelInfo::new(128_000, 16_384)),

        // https://platform.openai.com/docs/models/gpt-3.5-turbo
-        "gpt-3.5-turbo" => Some(ModelInfo {
-            context_window: 16_385,
-            max_output_tokens: 4_096,
-        }),
+        "gpt-3.5-turbo" => Some(ModelInfo::new(16_385, 4_096)),

-        _ if slug.starts_with("gpt-5") => Some(ModelInfo {
-            context_window: 272_000,
-            max_output_tokens: 128_000,
-        }),
+        _ if slug.starts_with("gpt-5") => Some(ModelInfo::new(272_000, 128_000)),

-        _ if slug.starts_with("codex-") => Some(ModelInfo {
-            context_window: 272_000,
-            max_output_tokens: 128_000,
-        }),
+        _ if slug.starts_with("codex-") => Some(ModelInfo::new(272_000, 128_000)),

        _ => None,
    }
--- a/codex-rs/core/src/prompt_for_compact_command.md
+++ b/codex-rs/core/src/prompt_for_compact_command.md
@@ -1,21 +0,0 @@
-You are a summarization assistant. A conversation follows between a user and a coding-focused AI (Codex). Your task is to generate a clear summary capturing:
-
-• High-level objective or problem being solved  
-• Key instructions or design decisions given by the user  
-• Main code actions or behaviors from the AI  
-• Important variables, functions, modules, or outputs discussed  
-• Any unresolved questions or next steps
-
-Produce the summary in a structured format like:
-
-**Objective:** …
-
-**User instructions:** … (bulleted)
-
-**AI actions / code behavior:** … (bulleted)
-
-**Important entities:** … (e.g. function names, variables, files)
-
-**Open issues / next steps:** … (if any)
-
-**Summary (concise):** (one or two sentences)
--- a/codex-rs/core/src/unified_exec/mod.rs
+++ b/codex-rs/core/src/unified_exec/mod.rs
@@ -421,7 +421,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: None,
                input_chunks: &["bash".to_string(), "-i".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;
        let session_id = open_shell.session_id.expect("expected session_id");
@@ -441,7 +441,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: Some(session_id),
                input_chunks: &["echo $CODEX_INTERACTIVE_SHELL_VAR\n".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;
        assert!(out_2.output.contains("codex"));
@@ -458,7 +458,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: None,
                input_chunks: &["/bin/bash".to_string(), "-i".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;
        let session_a = shell_a.session_id.expect("expected session id");
@@ -467,7 +467,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: Some(session_a),
                input_chunks: &["export CODEX_INTERACTIVE_SHELL_VAR=codex\n".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;

@@ -478,7 +478,7 @@ mod tests {
                    "echo".to_string(),
                    "$CODEX_INTERACTIVE_SHELL_VAR\n".to_string(),
                ],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;
        assert!(!out_2.output.contains("codex"));
@@ -487,7 +487,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: Some(session_a),
                input_chunks: &["echo $CODEX_INTERACTIVE_SHELL_VAR\n".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;
        assert!(out_3.output.contains("codex"));
@@ -504,7 +504,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: None,
                input_chunks: &["bash".to_string(), "-i".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;
        let session_id = open_shell.session_id.expect("expected session id");
@@ -516,7 +516,7 @@ mod tests {
                    "export".to_string(),
                    "CODEX_INTERACTIVE_SHELL_VAR=codex\n".to_string(),
                ],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;

@@ -574,7 +574,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: None,
                input_chunks: &["/bin/echo".to_string(), "codex".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;

@@ -595,7 +595,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: None,
                input_chunks: &["/bin/bash".to_string(), "-i".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;
        let session_id = open_shell.session_id.expect("expected session id");
@@ -604,7 +604,7 @@ mod tests {
            .handle_request(UnifiedExecRequest {
                session_id: Some(session_id),
                input_chunks: &["exit\n".to_string()],
-                timeout_ms: Some(1_500),
+                timeout_ms: Some(2_500),
            })
            .await?;

--- a/codex-rs/core/templates/compact/history_bridge.md
+++ b/codex-rs/core/templates/compact/history_bridge.md
@@ -0,0 +1,7 @@
+You were originally given instructions from a user over one or more turns. Here were the user messages:
+
+{{ user_messages_text }}
+
+Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model; use the information in this summary to assist with your own analysis:
+
+{{ summary_text }}
--- a/codex-rs/core/templates/compact/prompt.md
+++ b/codex-rs/core/templates/compact/prompt.md
@@ -0,0 +1,5 @@
+You have exceeded the maximum number of tokens, please stop coding and instead write a short memento message for the next agent. Your note should:
+- Summarize what you finished and what still needs work. If there was a recent update_plan call, repeat its steps verbatim.
+- List outstanding TODOs with file paths / line numbers so they're easy to find.
+- Flag code that needs more tests (edge cases, performance, integration, etc.).
+- Record any open bugs, quirks, or setup steps that will make it easier for the next agent to pick up where you left off.
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -5,6 +5,7 @@ use codex_core::ConversationManager;
 use codex_core::ModelProviderInfo;
 use codex_core::NewConversation;
 use codex_core::built_in_model_providers;
+use codex_core::protocol::ErrorEvent;
 use codex_core::protocol::EventMsg;
 use codex_core::protocol::InputItem;
 use codex_core::protocol::Op;
@@ -15,13 +16,20 @@ use core_test_support::load_default_config_for_test;
 use core_test_support::wait_for_event;
 use serde_json::Value;
 use tempfile::TempDir;
+use wiremock::BodyPrintLimit;
 use wiremock::Mock;
 use wiremock::MockServer;
+use wiremock::Request;
+use wiremock::Respond;
 use wiremock::ResponseTemplate;
 use wiremock::matchers::method;
 use wiremock::matchers::path;

 use pretty_assertions::assert_eq;
+use std::sync::Arc;
+use std::sync::Mutex;
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering;

 // --- Test helpers -----------------------------------------------------------

@@ -52,6 +60,22 @@ fn ev_completed(id: &str) -> Value {
    })
 }

+/// SSE `response.completed` event whose usage block reports `total_tokens` as both the input
+/// count and the overall total, with zero output tokens and null detail fields.
+fn ev_completed_with_tokens(id: &str, total_tokens: u64) -> Value {
+    let usage = serde_json::json!({
+        "input_tokens": total_tokens,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": total_tokens
+    });
+    serde_json::json!({
+        "type": "response.completed",
+        "response": {
+            "id": id,
+            "usage": usage
+        }
+    })
+}
+
 /// Convenience: SSE event for a single assistant message output item.
 fn ev_assistant_message(id: &str, text: &str) -> Value {
    serde_json::json!({
@@ -65,6 +89,18 @@ fn ev_assistant_message(id: &str, text: &str) -> Value {
    })
 }

+/// SSE `response.output_item.done` event wrapping a single `function_call` item with the given
+/// call id, function name and raw argument string.
+fn ev_function_call(call_id: &str, name: &str, arguments: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "function_call",
+        "call_id": call_id,
+        "name": name,
+        "arguments": arguments
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item
+    })
+}
+
 fn sse_response(body: String) -> ResponseTemplate {
    ResponseTemplate::new(200)
        .insert_header("content-type", "text/event-stream")
@@ -84,10 +120,28 @@ where
        .await;
 }

+/// Starts a wiremock server with its body print limit set to `Limited(80_000)`.
+async fn start_mock_server() -> MockServer {
+    let builder = MockServer::builder().body_print_limit(BodyPrintLimit::Limited(80_000));
+    builder.start().await
+}
+
 const FIRST_REPLY: &str = "FIRST_REPLY";
 const SUMMARY_TEXT: &str = "SUMMARY_ONLY_CONTEXT";
 const SUMMARIZE_TRIGGER: &str = "Start Summarization";
 const THIRD_USER_MSG: &str = "next turn";
+const AUTO_SUMMARY_TEXT: &str = "AUTO_SUMMARY";
+const FIRST_AUTO_MSG: &str = "token limit start";
+const SECOND_AUTO_MSG: &str = "token limit push";
+const STILL_TOO_BIG_REPLY: &str = "STILL_TOO_BIG";
+const MULTI_AUTO_MSG: &str = "multi auto";
+const SECOND_LARGE_REPLY: &str = "SECOND_LARGE_REPLY";
+const FIRST_AUTO_SUMMARY: &str = "FIRST_AUTO_SUMMARY";
+const SECOND_AUTO_SUMMARY: &str = "SECOND_AUTO_SUMMARY";
+const FINAL_REPLY: &str = "FINAL_REPLY";
+const DUMMY_FUNCTION_NAME: &str = "unsupported_tool";
+const DUMMY_CALL_ID: &str = "call-multi-auto";

 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn summarize_context_three_requests_and_instructions() {
@@ -99,7 +153,7 @@ async fn summarize_context_three_requests_and_instructions() {
    }

    // Set up a mock server that we can inspect after the run.
-    let server = MockServer::start().await;
+    let server = start_mock_server().await;

    // SSE 1: assistant replies normally so it is recorded in history.
    let sse1 = sse(vec![
@@ -144,6 +198,7 @@ async fn summarize_context_three_requests_and_instructions() {
    let home = TempDir::new().unwrap();
    let mut config = load_default_config_for_test(&home);
    config.model_provider = model_provider;
+    config.model_auto_compact_token_limit = Some(200_000);
    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
    let NewConversation {
        conversation: codex,
@@ -198,7 +253,7 @@ async fn summarize_context_three_requests_and_instructions() {
        "summarization should override base instructions"
    );
    assert!(
-        instr2.contains("You are a summarization assistant"),
+        instr2.contains("You have exceeded the maximum number of tokens"),
        "summarization instructions not applied"
    );

@@ -209,14 +264,17 @@ async fn summarize_context_three_requests_and_instructions() {
    assert_eq!(last2.get("type").unwrap().as_str().unwrap(), "message");
    assert_eq!(last2.get("role").unwrap().as_str().unwrap(), "user");
    let text2 = last2["content"][0]["text"].as_str().unwrap();
-    assert!(text2.contains(SUMMARIZE_TRIGGER));
+    assert!(
+        text2.contains(SUMMARIZE_TRIGGER),
+        "expected summarize trigger, got `{text2}`"
+    );

-    // Third request must contain only the summary from step 2 as prior history plus new user msg.
+    // Third request must contain the refreshed instructions, bridge summary message and new user msg.
    let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap();
    println!("third request body: {body3}");
    assert!(
-        input3.len() >= 2,
-        "expected summary + new user message in third request"
+        input3.len() >= 3,
+        "expected refreshed context and new user message in third request"
    );

    // Collect all (role, text) message tuples.
@@ -232,24 +290,35 @@ async fn summarize_context_three_requests_and_instructions() {
        }
    }

-    // Exactly one assistant message should remain after compaction and the new user message is present.
+    // No previous assistant messages should remain and the new user message is present.
    let assistant_count = messages.iter().filter(|(r, _)| r == "assistant").count();
-    assert_eq!(
-        assistant_count, 1,
-        "exactly one assistant message should remain after compaction"
-    );
+    assert_eq!(assistant_count, 0, "assistant history should be cleared");
    assert!(
        messages
            .iter()
            .any(|(r, t)| r == "user" && t == THIRD_USER_MSG),
        "third request should include the new user message"
    );
+    let Some((_, bridge_text)) = messages.iter().find(|(role, text)| {
+        role == "user"
+            && (text.contains("Here were the user messages")
+                || text.contains("Here are all the user messages"))
+            && text.contains(SUMMARY_TEXT)
+    }) else {
+        panic!("expected a bridge message containing the summary");
+    };
    assert!(
-        !messages.iter().any(|(_, t)| t.contains("hello world")),
-        "third request should not include the original user input"
+        bridge_text.contains("hello world"),
+        "bridge should capture earlier user messages"
    );
    assert!(
-        !messages.iter().any(|(_, t)| t.contains(SUMMARIZE_TRIGGER)),
+        !bridge_text.contains(SUMMARIZE_TRIGGER),
+        "bridge text should not echo the summarize trigger"
+    );
+    assert!(
+        !messages
+            .iter()
+            .any(|(_, text)| text.contains(SUMMARIZE_TRIGGER)),
        "third request should not include the summarize trigger"
    );

@@ -258,6 +327,7 @@ async fn summarize_context_three_requests_and_instructions() {
    wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;

    // Verify rollout contains APITurn entries for each API call and a Compacted entry.
+    println!("rollout path: {}", rollout_path.display());
    let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
        panic!(
            "failed to read rollout file {}: {e}",
@@ -296,3 +366,506 @@ async fn summarize_context_three_requests_and_instructions() {
        "expected a Compacted entry containing the summarizer output"
    );
 }
+
+/// Drives two user turns against a mocked `/v1/responses` endpoint. The second turn reports
+/// 330_000 total tokens, which is above the configured 200_000 auto-compact limit, so a third
+/// request carrying the summarization instructions is expected.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn auto_compact_runs_after_token_limit_hit() {
+    // Opening words of the compaction prompt; the mocks use them to tell a summarization
+    // request apart from an ordinary user turn.
+    const COMPACT_PROMPT_MARKER: &str = "You have exceeded the maximum number of tokens";
+
+    if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
+        println!(
+            "Skipping test because it cannot execute when network is disabled in a Codex sandbox."
+        );
+        return;
+    }
+
+    let server = start_mock_server().await;
+
+    // Turn 1 stays well below the limit.
+    let sse1 = sse(vec![
+        ev_assistant_message("m1", FIRST_REPLY),
+        ev_completed_with_tokens("r1", 70_000),
+    ]);
+
+    // Turn 2 reports usage above the 200_000 limit configured below.
+    let sse2 = sse(vec![
+        ev_assistant_message("m2", "SECOND_REPLY"),
+        ev_completed_with_tokens("r2", 330_000),
+    ]);
+
+    // Reply served to the summarization request.
+    let sse3 = sse(vec![
+        ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
+        ev_completed_with_tokens("r3", 200),
+    ]);
+
+    // First user turn: only the first message is present and no compaction prompt.
+    let first_matcher = |req: &wiremock::Request| {
+        let body = std::str::from_utf8(&req.body).unwrap_or("");
+        body.contains(FIRST_AUTO_MSG)
+            && !body.contains(SECOND_AUTO_MSG)
+            && !body.contains(COMPACT_PROMPT_MARKER)
+    };
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(first_matcher)
+        .respond_with(sse_response(sse1))
+        .mount(&server)
+        .await;
+
+    // Second user turn: both user messages are present, still no compaction prompt.
+    let second_matcher = |req: &wiremock::Request| {
+        let body = std::str::from_utf8(&req.body).unwrap_or("");
+        body.contains(SECOND_AUTO_MSG)
+            && body.contains(FIRST_AUTO_MSG)
+            && !body.contains(COMPACT_PROMPT_MARKER)
+    };
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(second_matcher)
+        .respond_with(sse_response(sse2))
+        .mount(&server)
+        .await;
+
+    // Summarization request: identified solely by the compaction prompt.
+    let third_matcher = |req: &wiremock::Request| {
+        let body = std::str::from_utf8(&req.body).unwrap_or("");
+        body.contains(COMPACT_PROMPT_MARKER)
+    };
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(third_matcher)
+        .respond_with(sse_response(sse3))
+        .mount(&server)
+        .await;
+
+    let model_provider = ModelProviderInfo {
+        base_url: Some(format!("{}/v1", server.uri())),
+        ..built_in_model_providers()["openai"].clone()
+    };
+
+    let home = TempDir::new().unwrap();
+    let mut config = load_default_config_for_test(&home);
+    config.model_provider = model_provider;
+    config.model_auto_compact_token_limit = Some(200_000);
+    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
+    let codex = conversation_manager
+        .new_conversation(config)
+        .await
+        .unwrap()
+        .conversation;
+
+    codex
+        .submit(Op::UserInput {
+            items: vec![InputItem::Text {
+                text: FIRST_AUTO_MSG.into(),
+            }],
+        })
+        .await
+        .unwrap();
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    codex
+        .submit(Op::UserInput {
+            items: vec![InputItem::Text {
+                text: SECOND_AUTO_MSG.into(),
+            }],
+        })
+        .await
+        .unwrap();
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    let requests = server.received_requests().await.unwrap();
+    assert_eq!(requests.len(), 3, "auto compact should add a third request");
+
+    let body3 = requests[2].body_json::<serde_json::Value>().unwrap();
+    let instructions = body3
+        .get("instructions")
+        .and_then(|v| v.as_str())
+        .unwrap_or_default();
+    assert!(
+        instructions.contains(COMPACT_PROMPT_MARKER),
+        "auto compact should reuse summarization instructions"
+    );
+}
+
+/// Runs two user turns against the mock server, shuts the conversation down, and checks that
+/// the rollout file on disk holds at least two `TurnContext` entries.
+///
+/// NOTE(review): unlike `auto_compact_runs_after_token_limit_hit`, this test never sets
+/// `config.model_auto_compact_token_limit`, and nothing below inspects `Compacted` rollout
+/// entries. Confirm the default config actually triggers compaction here.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn auto_compact_persists_rollout_entries() {
+    // Opening words of the compaction prompt, used by the mocks to recognise a summarization
+    // request.
+    const COMPACT_PROMPT_MARKER: &str = "You have exceeded the maximum number of tokens";
+
+    if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
+        println!(
+            "Skipping test because it cannot execute when network is disabled in a Codex sandbox."
+        );
+        return;
+    }
+
+    let server = start_mock_server().await;
+
+    let first_turn_sse = sse(vec![
+        ev_assistant_message("m1", FIRST_REPLY),
+        ev_completed_with_tokens("r1", 70_000),
+    ]);
+    let second_turn_sse = sse(vec![
+        ev_assistant_message("m2", "SECOND_REPLY"),
+        ev_completed_with_tokens("r2", 330_000),
+    ]);
+    let summary_sse = sse(vec![
+        ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
+        ev_completed_with_tokens("r3", 200),
+    ]);
+
+    // First user turn: only the first message, no compaction prompt.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(|req: &wiremock::Request| {
+            let body = std::str::from_utf8(&req.body).unwrap_or("");
+            body.contains(FIRST_AUTO_MSG)
+                && !body.contains(SECOND_AUTO_MSG)
+                && !body.contains(COMPACT_PROMPT_MARKER)
+        })
+        .respond_with(sse_response(first_turn_sse))
+        .mount(&server)
+        .await;
+
+    // Second user turn: both user messages, still no compaction prompt.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(|req: &wiremock::Request| {
+            let body = std::str::from_utf8(&req.body).unwrap_or("");
+            body.contains(SECOND_AUTO_MSG)
+                && body.contains(FIRST_AUTO_MSG)
+                && !body.contains(COMPACT_PROMPT_MARKER)
+        })
+        .respond_with(sse_response(second_turn_sse))
+        .mount(&server)
+        .await;
+
+    // Summarization request: recognised by the compaction prompt alone.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(|req: &wiremock::Request| {
+            let body = std::str::from_utf8(&req.body).unwrap_or("");
+            body.contains(COMPACT_PROMPT_MARKER)
+        })
+        .respond_with(sse_response(summary_sse))
+        .mount(&server)
+        .await;
+
+    let model_provider = ModelProviderInfo {
+        base_url: Some(format!("{}/v1", server.uri())),
+        ..built_in_model_providers()["openai"].clone()
+    };
+
+    let home = TempDir::new().unwrap();
+    let mut config = load_default_config_for_test(&home);
+    config.model_provider = model_provider;
+    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
+    let NewConversation {
+        conversation: codex,
+        session_configured,
+        ..
+    } = conversation_manager.new_conversation(config).await.unwrap();
+
+    // Submit the two user messages one after the other, waiting for each task to finish.
+    codex
+        .submit(Op::UserInput {
+            items: vec![InputItem::Text {
+                text: FIRST_AUTO_MSG.into(),
+            }],
+        })
+        .await
+        .unwrap();
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    codex
+        .submit(Op::UserInput {
+            items: vec![InputItem::Text {
+                text: SECOND_AUTO_MSG.into(),
+            }],
+        })
+        .await
+        .unwrap();
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    codex.submit(Op::Shutdown).await.unwrap();
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::ShutdownComplete)).await;
+
+    let rollout_path = session_configured.rollout_path;
+    let text = std::fs::read_to_string(&rollout_path).unwrap_or_else(|e| {
+        panic!(
+            "failed to read rollout file {}: {e}",
+            rollout_path.display()
+        )
+    });
+
+    // Count the rollout lines that parse as a `RolloutLine` carrying a `TurnContext` item;
+    // blank lines and lines that fail to parse are ignored.
+    let turn_context_count = text
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty())
+        .filter_map(|line| serde_json::from_str::<RolloutLine>(line).ok())
+        .filter(|entry| matches!(entry.item, RolloutItem::TurnContext(_)))
+        .count();
+
+    assert!(
+        turn_context_count >= 2,
+        "expected at least two turn context entries, got {turn_context_count}"
+    );
+}
+
+/// Auto compaction is attempted once; when the follow-up turn still completes with a
+/// token count above the configured limit, the task surfaces an error instead of
+/// issuing another summarization request.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn auto_compact_stops_after_failed_attempt() {
+    if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
+        println!(
+            "Skipping test because it cannot execute when network is disabled in a Codex sandbox."
+        );
+        return;
+    }
+
+    // Substring the matchers below use to recognise a summarization request.
+    const SUMMARIZE_MARKER: &str = "You have exceeded the maximum number of tokens";
+
+    // Request body as text; a body that is not valid UTF-8 is treated as empty.
+    fn body_text(req: &wiremock::Request) -> &str {
+        std::str::from_utf8(&req.body).unwrap_or("")
+    }
+
+    let server = start_mock_server().await;
+
+    // Scripted responses, in the order the requests are expected to arrive.
+    let initial_turn = sse(vec![
+        ev_assistant_message("m1", FIRST_REPLY),
+        ev_completed_with_tokens("r1", 500),
+    ]);
+    let summary_turn = sse(vec![
+        ev_assistant_message("m2", SUMMARY_TEXT),
+        ev_completed_with_tokens("r2", 50),
+    ]);
+    let follow_up_turn = sse(vec![
+        ev_assistant_message("m3", STILL_TOO_BIG_REPLY),
+        ev_completed_with_tokens("r3", 500),
+    ]);
+
+    // 1) The user message, sent without the summarization prompt.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(|req: &wiremock::Request| {
+            let body = body_text(req);
+            body.contains(FIRST_AUTO_MSG) && !body.contains(SUMMARIZE_MARKER)
+        })
+        .respond_with(sse_response(initial_turn))
+        .mount(&server)
+        .await;
+
+    // 2) The summarization request itself.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(|req: &wiremock::Request| body_text(req).contains(SUMMARIZE_MARKER))
+        .respond_with(sse_response(summary_turn))
+        .mount(&server)
+        .await;
+
+    // 3) The follow-up request that carries the summary but not the prompt.
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .and(|req: &wiremock::Request| {
+            let body = body_text(req);
+            !body.contains(SUMMARIZE_MARKER) && body.contains(SUMMARY_TEXT)
+        })
+        .respond_with(sse_response(follow_up_turn))
+        .mount(&server)
+        .await;
+
+    let home = TempDir::new().unwrap();
+    let mut config = load_default_config_for_test(&home);
+    config.model_provider = ModelProviderInfo {
+        base_url: Some(format!("{}/v1", server.uri())),
+        ..built_in_model_providers()["openai"].clone()
+    };
+    // Both 500-token completions above exceed this limit; the 50-token summary does not.
+    config.model_auto_compact_token_limit = Some(200);
+
+    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
+    let NewConversation {
+        conversation: codex,
+        ..
+    } = conversation_manager.new_conversation(config).await.unwrap();
+
+    let user_turn = Op::UserInput {
+        items: vec![InputItem::Text {
+            text: FIRST_AUTO_MSG.into(),
+        }],
+    };
+    codex.submit(user_turn).await.unwrap();
+
+    // An error event is expected before the task completes.
+    let message = match wait_for_event(&codex, |ev| matches!(ev, EventMsg::Error(_))).await {
+        EventMsg::Error(ErrorEvent { message }) => message,
+        _ => panic!("expected error event"),
+    };
+    assert!(
+        message.contains("limit"),
+        "error message should include limit information: {message}"
+    );
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    let requests = server.received_requests().await.unwrap();
+    assert_eq!(
+        requests.len(),
+        3,
+        "auto compact should attempt at most one summarization before erroring"
+    );
+
+    // The final request must be the follow-up turn rather than a second summarization.
+    let last_body: serde_json::Value = requests[2].body_json().unwrap();
+    let instructions = last_body
+        .get("instructions")
+        .and_then(serde_json::Value::as_str)
+        .unwrap_or_default();
+    assert!(
+        !instructions.contains(SUMMARIZE_MARKER),
+        "third request should be the follow-up turn, not another summarization"
+    );
+}
+
+/// A single task may auto compact more than once when other turn activity (here a
+/// function call and its output) happens between the two compactions.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_events() {
+    if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
+        println!(
+            "Skipping test because it cannot execute when network is disabled in a Codex sandbox."
+        );
+        return;
+    }
+
+    /// Replays a fixed list of response bodies in arrival order and records the raw
+    /// body of every request it is asked to answer.
+    #[derive(Clone)]
+    struct SeqResponder {
+        scripted: Arc<Vec<String>>,
+        cursor: Arc<AtomicUsize>,
+        seen: Arc<Mutex<Vec<Vec<u8>>>>,
+    }
+
+    impl SeqResponder {
+        fn new(bodies: Vec<String>) -> Self {
+            Self {
+                scripted: Arc::new(bodies),
+                cursor: Arc::new(AtomicUsize::new(0)),
+                seen: Arc::new(Mutex::new(Vec::new())),
+            }
+        }
+
+        /// Snapshot of the request bodies received so far.
+        fn recorded_requests(&self) -> Vec<Vec<u8>> {
+            self.seen.lock().unwrap().clone()
+        }
+    }
+
+    impl Respond for SeqResponder {
+        fn respond(&self, req: &Request) -> ResponseTemplate {
+            let idx = self.cursor.fetch_add(1, Ordering::SeqCst);
+            // Record the request before looking up its response so that an unexpected
+            // extra request is still captured when the lookup below panics.
+            self.seen.lock().unwrap().push(req.body.clone());
+            let Some(body) = self.scripted.get(idx) else {
+                panic!("unexpected request index {idx}");
+            };
+            ResponseTemplate::new(200)
+                .insert_header("content-type", "text/event-stream")
+                .set_body_raw(body.clone(), "text/event-stream")
+        }
+    }
+
+    let server = start_mock_server().await;
+
+    // Six scripted responses, served strictly in request order.
+    let responder = SeqResponder::new(vec![
+        sse(vec![
+            ev_assistant_message("m1", FIRST_REPLY),
+            ev_completed_with_tokens("r1", 500),
+        ]),
+        sse(vec![
+            ev_assistant_message("m2", FIRST_AUTO_SUMMARY),
+            ev_completed_with_tokens("r2", 50),
+        ]),
+        sse(vec![
+            ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
+            ev_completed_with_tokens("r3", 150),
+        ]),
+        sse(vec![
+            ev_assistant_message("m4", SECOND_LARGE_REPLY),
+            ev_completed_with_tokens("r4", 450),
+        ]),
+        sse(vec![
+            ev_assistant_message("m5", SECOND_AUTO_SUMMARY),
+            ev_completed_with_tokens("r5", 60),
+        ]),
+        sse(vec![
+            ev_assistant_message("m6", FINAL_REPLY),
+            ev_completed_with_tokens("r6", 120),
+        ]),
+    ]);
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(responder.clone())
+        .expect(6)
+        .mount(&server)
+        .await;
+
+    let home = TempDir::new().unwrap();
+    let mut config = load_default_config_for_test(&home);
+    config.model_provider = ModelProviderInfo {
+        base_url: Some(format!("{}/v1", server.uri())),
+        ..built_in_model_providers()["openai"].clone()
+    };
+    config.model_auto_compact_token_limit = Some(200);
+
+    let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
+    let NewConversation {
+        conversation: codex,
+        ..
+    } = conversation_manager.new_conversation(config).await.unwrap();
+
+    let user_turn = Op::UserInput {
+        items: vec![InputItem::Text {
+            text: MULTI_AUTO_MSG.into(),
+        }],
+    };
+    codex.submit(user_turn).await.unwrap();
+
+    // Drain events until a TaskComplete arrives whose id does not carry the
+    // "auto-compact-" prefix.
+    loop {
+        let event = codex.next_event().await.unwrap();
+        let is_task_complete = matches!(event.msg, EventMsg::TaskComplete(_));
+        if is_task_complete && !event.id.starts_with("auto-compact-") {
+            break;
+        }
+    }
+
+    let mut request_bodies: Vec<String> = Vec::new();
+    for raw in responder.recorded_requests() {
+        request_bodies.push(String::from_utf8(raw).unwrap_or_default());
+    }
+    assert_eq!(
+        request_bodies.len(),
+        6,
+        "expected six requests including two auto compactions"
+    );
+    assert!(
+        request_bodies[0].contains(MULTI_AUTO_MSG),
+        "first request should contain the user input"
+    );
+    assert!(
+        request_bodies[1].contains("You have exceeded the maximum number of tokens"),
+        "first auto compact request should use summarization instructions"
+    );
+    assert!(
+        request_bodies[3].contains(&format!("unsupported call: {DUMMY_FUNCTION_NAME}")),
+        "function call output should be sent before the second auto compact"
+    );
+    assert!(
+        request_bodies[4].contains("You have exceeded the maximum number of tokens"),
+        "second auto compact request should reuse summarization instructions"
+    );
+}
--- a/codex-rs/protocol/src/protocol.rs
+++ b/codex-rs/protocol/src/protocol.rs
@@ -712,12 +712,12 @@ where
        let (_role, message) = value;
        let message = message.as_ref();
        let trimmed = message.trim();
-        if trimmed.starts_with(ENVIRONMENT_CONTEXT_OPEN_TAG)
-            && trimmed.ends_with(ENVIRONMENT_CONTEXT_CLOSE_TAG)
+        if starts_with_ignore_ascii_case(trimmed, ENVIRONMENT_CONTEXT_OPEN_TAG)
+            && ends_with_ignore_ascii_case(trimmed, ENVIRONMENT_CONTEXT_CLOSE_TAG)
        {
            InputMessageKind::EnvironmentContext
-        } else if trimmed.starts_with(USER_INSTRUCTIONS_OPEN_TAG)
-            && trimmed.ends_with(USER_INSTRUCTIONS_CLOSE_TAG)
+        } else if starts_with_ignore_ascii_case(trimmed, USER_INSTRUCTIONS_OPEN_TAG)
+            && ends_with_ignore_ascii_case(trimmed, USER_INSTRUCTIONS_CLOSE_TAG)
        {
            InputMessageKind::UserInstructions
        } else {
@@ -726,6 +726,26 @@ where
    }
 }

+/// Reports whether `text` begins with `prefix`, treating ASCII letters as
+/// case-insensitive. The comparison is byte-wise, so non-ASCII bytes must
+/// match exactly.
+fn starts_with_ignore_ascii_case(text: &str, prefix: &str) -> bool {
+    let needle = prefix.as_bytes();
+    text.as_bytes()
+        .get(..needle.len())
+        .is_some_and(|head| head.eq_ignore_ascii_case(needle))
+}
+
+/// Reports whether `text` ends with `suffix`, ignoring ASCII case. Bytes are
+/// compared directly, so non-ASCII bytes must match exactly.
+fn ends_with_ignore_ascii_case(text: &str, suffix: &str) -> bool {
+    let (haystack, needle) = (text.as_bytes(), suffix.as_bytes());
+    haystack
+        .len()
+        .checked_sub(needle.len())
+        .is_some_and(|start| haystack[start..].eq_ignore_ascii_case(needle))
+}
+
 #[derive(Debug, Clone, Deserialize, Serialize, TS)]
 pub struct AgentMessageDeltaEvent {
    pub delta: String,