Auto compact at ~90% (#5292)

Users currently hit a context-window-exceeded error and usually don't know what
to do. This change starts auto compact at ~90% of the window.
Ahmed Ibrahim
2025-10-20 11:29:49 -07:00
committed by GitHub
parent cda6db6ccf
commit 049a61bcfc
21 changed files with 236 additions and 110 deletions

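The new behavior isn't spelled out in prose, so here is a rough sketch of the check the tests below exercise. The config fields model_context_window and model_auto_compact_token_limit appear in the diff; the helper name and the ~90% default are illustrative, taken from the commit title, not from the actual implementation.

// Minimal sketch, assuming the limit defaults to ~90% of the context window.
fn should_auto_compact(
    total_tokens: i64,
    model_context_window: i64,
    auto_compact_token_limit: Option<i64>,
) -> bool {
    // Fall back to ~90% of the window when no explicit limit is configured.
    let limit = auto_compact_token_limit.unwrap_or(model_context_window * 90 / 100);
    total_tokens > limit
}

With a 100-token window this yields a limit of 90 tokens, which matches the constants used in the new test at the bottom of the diff.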
View File

@@ -138,7 +138,7 @@ pub fn ev_response_created(id: &str) -> Value {
})
}
-pub fn ev_completed_with_tokens(id: &str, total_tokens: u64) -> Value {
+pub fn ev_completed_with_tokens(id: &str, total_tokens: i64) -> Value {
serde_json::json!({
"type": "response.completed",
"response": {

View File

@@ -858,8 +858,8 @@ async fn token_count_includes_rate_limits_snapshot() {
"reasoning_output_tokens": 0,
"total_tokens": 123
},
-// Default model is gpt-5-codex in tests → 272000 context window
-"model_context_window": 272000
+// Default model is gpt-5-codex in tests → 95% usable context window
+"model_context_window": 258400
},
"rate_limits": {
"primary": {
@@ -985,6 +985,8 @@ async fn context_window_error_sets_total_tokens_to_model_window() -> anyhow::Res
skip_if_no_network!(Ok(()));
let server = MockServer::start().await;
+const EFFECTIVE_CONTEXT_WINDOW: i64 = (272_000 * 95) / 100;
responses::mount_sse_once_match(
&server,
body_string_contains("trigger context window"),
@@ -1056,8 +1058,11 @@ async fn context_window_error_sets_total_tokens_to_model_window() -> anyhow::Res
.info
.expect("token usage info present when context window is exceeded");
-assert_eq!(info.model_context_window, Some(272_000));
-assert_eq!(info.total_token_usage.total_tokens, 272_000);
+assert_eq!(info.model_context_window, Some(EFFECTIVE_CONTEXT_WINDOW));
+assert_eq!(
+    info.total_token_usage.total_tokens,
+    EFFECTIVE_CONTEXT_WINDOW
+);
let error_event = wait_for_event(&codex, |ev| matches!(ev, EventMsg::Error(_))).await;
let expected_context_window_message = CodexErr::ContextWindowExceeded.to_string();
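For reference, the snapshot value and the new constant agree: (272_000 * 95) / 100 = 258_400, i.e. the tests now treat 95% of the raw 272_000-token gpt-5-codex window as the usable context window.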

View File

@@ -19,6 +19,7 @@ use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_completed_with_tokens;
use core_test_support::responses::ev_function_call;
+use core_test_support::responses::mount_sse_once;
use core_test_support::responses::mount_sse_once_match;
use core_test_support::responses::mount_sse_sequence;
use core_test_support::responses::sse;
@@ -43,6 +44,7 @@ const CONTEXT_LIMIT_MESSAGE: &str =
"Your input exceeds the context window of this model. Please adjust your input and try again.";
const DUMMY_FUNCTION_NAME: &str = "unsupported_tool";
const DUMMY_CALL_ID: &str = "call-multi-auto";
+const FUNCTION_CALL_LIMIT_MSG: &str = "function call limit push";
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn summarize_context_three_requests_and_instructions() {
@@ -860,3 +862,97 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_
"second auto compact request should include the summarization prompt"
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn auto_compact_triggers_after_function_call_over_95_percent_usage() {
skip_if_no_network!();
let server = start_mock_server().await;
let context_window = 100;
let limit = context_window * 90 / 100;
let over_limit_tokens = context_window * 95 / 100 + 1;
let first_turn = sse(vec![
ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
ev_completed_with_tokens("r1", 50),
]);
let function_call_follow_up = sse(vec![
ev_assistant_message("m2", FINAL_REPLY),
ev_completed_with_tokens("r2", over_limit_tokens),
]);
let auto_compact_turn = sse(vec![
ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
ev_completed_with_tokens("r3", 10),
]);
let post_auto_compact_turn = sse(vec![ev_completed_with_tokens("r4", 10)]);
// Mount responses in order and keep mocks only for the ones we assert on.
let first_turn_mock = mount_sse_once(&server, first_turn).await;
let follow_up_mock = mount_sse_once(&server, function_call_follow_up).await;
let auto_compact_mock = mount_sse_once(&server, auto_compact_turn).await;
// We don't assert on the post-compact request, so no need to keep its mock.
mount_sse_once(&server, post_auto_compact_turn).await;
let model_provider = ModelProviderInfo {
base_url: Some(format!("{}/v1", server.uri())),
..built_in_model_providers()["openai"].clone()
};
let home = TempDir::new().unwrap();
let mut config = load_default_config_for_test(&home);
config.model_provider = model_provider;
config.model_context_window = Some(context_window);
config.model_auto_compact_token_limit = Some(limit);
let codex = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"))
.new_conversation(config)
.await
.unwrap()
.conversation;
codex
.submit(Op::UserInput {
items: vec![InputItem::Text {
text: FUNCTION_CALL_LIMIT_MSG.into(),
}],
})
.await
.unwrap();
wait_for_event(&codex, |msg| matches!(msg, EventMsg::TaskComplete(_))).await;
// Assert first request captured expected user message that triggers function call.
let first_request = first_turn_mock.single_request().input();
assert!(
first_request.iter().any(|item| {
item.get("type").and_then(|value| value.as_str()) == Some("message")
&& item
.get("content")
.and_then(|content| content.as_array())
.and_then(|entries| entries.first())
.and_then(|entry| entry.get("text"))
.and_then(|value| value.as_str())
== Some(FUNCTION_CALL_LIMIT_MSG)
}),
"first request should include the user message that triggers the function call"
);
let function_call_output = follow_up_mock
.single_request()
.function_call_output(DUMMY_CALL_ID);
let output_text = function_call_output
.get("output")
.and_then(|value| value.as_str())
.unwrap_or_default();
assert!(
output_text.contains(DUMMY_FUNCTION_NAME),
"function call output should be sent before auto compact"
);
let auto_compact_body = auto_compact_mock.single_request().body_json().to_string();
assert!(
auto_compact_body.contains("You have exceeded the maximum number of tokens"),
"auto compact request should include the summarization prompt after exceeding 95% (limit {limit})"
);
}
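With context_window = 100, the constants above work out to limit = 100 * 90 / 100 = 90 and over_limit_tokens = 100 * 95 / 100 + 1 = 96, so the second turn's 96-token usage crosses the configured auto-compact limit and forces the summarization request that the final assertion checks for.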