Auto compact at ~90% (#5292)

Users currently run into a "context window exceeded" error and usually don't know what to do next. This change starts auto-compaction once token usage reaches ~90% of the model's context window.
2025-10-20 11:29:49 -07:00
parent cda6db6ccf
commit 049a61bcfc
21 changed files with 236 additions and 110 deletions
--- a/codex-rs/core/src/client.rs
+++ b/codex-rs/core/src/client.rs
@@ -112,10 +112,12 @@ impl ModelClient {
        }
    }

-    pub fn get_model_context_window(&self) -> Option<u64> {
+    pub fn get_model_context_window(&self) -> Option<i64> {
+        let pct = self.config.model_family.effective_context_window_percent;
        self.config
            .model_context_window
            .or_else(|| get_model_info(&self.config.model_family).map(|info| info.context_window))
+            .map(|w| w.saturating_mul(pct) / 100)
    }

    pub fn get_auto_compact_token_limit(&self) -> Option<i64> {
@@ -544,11 +546,11 @@ struct ResponseCompleted {

 #[derive(Debug, Deserialize)]
 struct ResponseCompletedUsage {
-    input_tokens: u64,
+    input_tokens: i64,
    input_tokens_details: Option<ResponseCompletedInputTokensDetails>,
-    output_tokens: u64,
+    output_tokens: i64,
    output_tokens_details: Option<ResponseCompletedOutputTokensDetails>,
-    total_tokens: u64,
+    total_tokens: i64,
 }

 impl From<ResponseCompletedUsage> for TokenUsage {
@@ -571,12 +573,12 @@ impl From<ResponseCompletedUsage> for TokenUsage {

 #[derive(Debug, Deserialize)]
 struct ResponseCompletedInputTokensDetails {
-    cached_tokens: u64,
+    cached_tokens: i64,
 }

 #[derive(Debug, Deserialize)]
 struct ResponseCompletedOutputTokensDetails {
-    reasoning_tokens: u64,
+    reasoning_tokens: i64,
 }

 fn attach_item_ids(payload_json: &mut Value, original_items: &[ResponseItem]) {
@@ -633,7 +635,7 @@ fn parse_rate_limit_window(
    let used_percent: Option<f64> = parse_header_f64(headers, used_percent_header);

    used_percent.and_then(|used_percent| {
-        let window_minutes = parse_header_u64(headers, window_minutes_header);
+        let window_minutes = parse_header_i64(headers, window_minutes_header);
        let resets_at = parse_header_str(headers, resets_header)
            .map(str::trim)
            .filter(|value| !value.is_empty())
@@ -658,8 +660,8 @@ fn parse_header_f64(headers: &HeaderMap, name: &str) -> Option<f64> {
        .filter(|v| v.is_finite())
 }

-fn parse_header_u64(headers: &HeaderMap, name: &str) -> Option<u64> {
-    parse_header_str(headers, name)?.parse::<u64>().ok()
+fn parse_header_i64(headers: &HeaderMap, name: &str) -> Option<i64> {
+    parse_header_str(headers, name)?.parse::<i64>().ok()
 }

 fn parse_header_str<'a>(headers: &'a HeaderMap, name: &str) -> Option<&'a str> {
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -1778,7 +1778,7 @@ pub(crate) async fn run_task(
                    .as_ref()
                    .map(TokenUsage::tokens_in_context_window);
                let token_limit_reached = total_usage_tokens
-                    .map(|tokens| (tokens as i64) >= limit)
+                    .map(|tokens| tokens >= limit)
                    .unwrap_or(false);
                let mut items_to_record_in_conversation_history = Vec::<ResponseItem>::new();
                let mut responses = Vec::<ResponseInputItem>::new();
--- a/codex-rs/core/src/config.rs
+++ b/codex-rs/core/src/config.rs
@@ -85,10 +85,10 @@ pub struct Config {
    pub model_family: ModelFamily,

    /// Size of the context window for the model, in tokens.
-    pub model_context_window: Option<u64>,
+    pub model_context_window: Option<i64>,

    /// Maximum number of output tokens.
-    pub model_max_output_tokens: Option<u64>,
+    pub model_max_output_tokens: Option<i64>,

    /// Token usage threshold triggering auto-compaction of conversation history.
    pub model_auto_compact_token_limit: Option<i64>,
@@ -824,10 +824,10 @@ pub struct ConfigToml {
    pub model_provider: Option<String>,

    /// Size of the context window for the model, in tokens.
-    pub model_context_window: Option<u64>,
+    pub model_context_window: Option<i64>,

    /// Maximum number of output tokens.
-    pub model_max_output_tokens: Option<u64>,
+    pub model_max_output_tokens: Option<i64>,

    /// Token usage threshold triggering auto-compaction of conversation history.
    pub model_auto_compact_token_limit: Option<i64>,
@@ -2805,7 +2805,7 @@ model_verbosity = "high"
                model_family: find_family_for_model("o3").expect("known model slug"),
                model_context_window: Some(200_000),
                model_max_output_tokens: Some(100_000),
-                model_auto_compact_token_limit: None,
+                model_auto_compact_token_limit: Some(180_000),
                model_provider_id: "openai".to_string(),
                model_provider: fixture.openai_provider.clone(),
                approval_policy: AskForApproval::Never,
@@ -2874,7 +2874,7 @@ model_verbosity = "high"
            model_family: find_family_for_model("gpt-3.5-turbo").expect("known model slug"),
            model_context_window: Some(16_385),
            model_max_output_tokens: Some(4_096),
-            model_auto_compact_token_limit: None,
+            model_auto_compact_token_limit: Some(14_746),
            model_provider_id: "openai-chat-completions".to_string(),
            model_provider: fixture.openai_chat_completions_provider.clone(),
            approval_policy: AskForApproval::UnlessTrusted,
@@ -2958,7 +2958,7 @@ model_verbosity = "high"
            model_family: find_family_for_model("o3").expect("known model slug"),
            model_context_window: Some(200_000),
            model_max_output_tokens: Some(100_000),
-            model_auto_compact_token_limit: None,
+            model_auto_compact_token_limit: Some(180_000),
            model_provider_id: "openai".to_string(),
            model_provider: fixture.openai_provider.clone(),
            approval_policy: AskForApproval::OnFailure,
@@ -3028,7 +3028,7 @@ model_verbosity = "high"
            model_family: find_family_for_model("gpt-5").expect("known model slug"),
            model_context_window: Some(272_000),
            model_max_output_tokens: Some(128_000),
-            model_auto_compact_token_limit: None,
+            model_auto_compact_token_limit: Some(244_800),
            model_provider_id: "openai".to_string(),
            model_provider: fixture.openai_provider.clone(),
            approval_policy: AskForApproval::OnFailure,
--- a/codex-rs/core/src/model_family.rs
+++ b/codex-rs/core/src/model_family.rs
@@ -48,6 +48,12 @@ pub struct ModelFamily {

    /// Names of beta tools that should be exposed to this model family.
    pub experimental_supported_tools: Vec<String>,
+
+    /// Percentage of the context window considered usable for inputs, after
+    /// reserving headroom for system prompts, tool overhead, and model output.
+    /// This is applied when computing the effective context window seen by
+    /// consumers.
+    pub effective_context_window_percent: i64,
 }

 macro_rules! model_family {
@@ -66,6 +72,7 @@ macro_rules! model_family {
            apply_patch_tool_type: None,
            base_instructions: BASE_INSTRUCTIONS.to_string(),
            experimental_supported_tools: Vec::new(),
+            effective_context_window_percent: 95,
        };
        // apply overrides
        $(
@@ -175,5 +182,6 @@ pub fn derive_default_model_family(model: &str) -> ModelFamily {
        apply_patch_tool_type: None,
        base_instructions: BASE_INSTRUCTIONS.to_string(),
        experimental_supported_tools: Vec::new(),
+        effective_context_window_percent: 95,
    }
 }
--- a/codex-rs/core/src/openai_model_info.rs
+++ b/codex-rs/core/src/openai_model_info.rs
@@ -1,5 +1,9 @@
 use crate::model_family::ModelFamily;

+// Shared constants for commonly used window/token sizes.
+pub(crate) const CONTEXT_WINDOW_272K: i64 = 272_000;
+pub(crate) const MAX_OUTPUT_TOKENS_128K: i64 = 128_000;
+
 /// Metadata about a model, particularly OpenAI models.
 /// We may want to consider including details like the pricing for
 /// input tokens, output tokens, etc., though users will need to be able to
@@ -8,10 +12,10 @@ use crate::model_family::ModelFamily;
 #[derive(Debug)]
 pub(crate) struct ModelInfo {
    /// Size of the context window in tokens. This is the maximum size of the input context.
-    pub(crate) context_window: u64,
+    pub(crate) context_window: i64,

    /// Maximum number of output tokens that can be generated for the model.
-    pub(crate) max_output_tokens: u64,
+    pub(crate) max_output_tokens: i64,

    /// Token threshold where we should automatically compact conversation history. This considers
    /// input tokens + output tokens of this turn.
@@ -19,13 +23,17 @@ pub(crate) struct ModelInfo {
 }

 impl ModelInfo {
-    const fn new(context_window: u64, max_output_tokens: u64) -> Self {
+    const fn new(context_window: i64, max_output_tokens: i64) -> Self {
        Self {
            context_window,
            max_output_tokens,
-            auto_compact_token_limit: None,
+            auto_compact_token_limit: Some(Self::default_auto_compact_limit(context_window)),
        }
    }
+
+    const fn default_auto_compact_limit(context_window: i64) -> i64 {
+        (context_window * 9) / 10
+    }
 }

 pub(crate) fn get_model_info(model_family: &ModelFamily) -> Option<ModelInfo> {
@@ -62,15 +70,17 @@ pub(crate) fn get_model_info(model_family: &ModelFamily) -> Option<ModelInfo> {
        // https://platform.openai.com/docs/models/gpt-3.5-turbo
        "gpt-3.5-turbo" => Some(ModelInfo::new(16_385, 4_096)),

-        _ if slug.starts_with("gpt-5-codex") => Some(ModelInfo {
-            context_window: 272_000,
-            max_output_tokens: 128_000,
-            auto_compact_token_limit: Some(350_000),
-        }),
+        _ if slug.starts_with("gpt-5-codex") => {
+            Some(ModelInfo::new(CONTEXT_WINDOW_272K, MAX_OUTPUT_TOKENS_128K))
+        }

-        _ if slug.starts_with("gpt-5") => Some(ModelInfo::new(272_000, 128_000)),
+        _ if slug.starts_with("gpt-5") => {
+            Some(ModelInfo::new(CONTEXT_WINDOW_272K, MAX_OUTPUT_TOKENS_128K))
+        }

-        _ if slug.starts_with("codex-") => Some(ModelInfo::new(272_000, 128_000)),
+        _ if slug.starts_with("codex-") => {
+            Some(ModelInfo::new(CONTEXT_WINDOW_272K, MAX_OUTPUT_TOKENS_128K))
+        }

        _ => None,
    }
--- a/codex-rs/core/src/state/session.rs
+++ b/codex-rs/core/src/state/session.rs
@@ -48,7 +48,7 @@ impl SessionState {
    pub(crate) fn update_token_info_from_usage(
        &mut self,
        usage: &TokenUsage,
-        model_context_window: Option<u64>,
+        model_context_window: Option<i64>,
    ) {
        self.token_info = TokenUsageInfo::new_or_append(
            &self.token_info,
@@ -67,7 +67,7 @@ impl SessionState {
        (self.token_info.clone(), self.latest_rate_limits.clone())
    }

-    pub(crate) fn set_token_usage_full(&mut self, context_window: u64) {
+    pub(crate) fn set_token_usage_full(&mut self, context_window: i64) {
        match &mut self.token_info {
            Some(info) => info.fill_to_context_window(context_window),
            None => {