Auto compact at ~90% (#5292)
Users now hit a window exceeded limit and they usually don't know what to do. This starts auto compact at ~90% of the window.
This commit is contained in:
@@ -112,10 +112,12 @@ impl ModelClient {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_model_context_window(&self) -> Option<u64> {
|
||||
pub fn get_model_context_window(&self) -> Option<i64> {
|
||||
let pct = self.config.model_family.effective_context_window_percent;
|
||||
self.config
|
||||
.model_context_window
|
||||
.or_else(|| get_model_info(&self.config.model_family).map(|info| info.context_window))
|
||||
.map(|w| w.saturating_mul(pct) / 100)
|
||||
}
|
||||
|
||||
pub fn get_auto_compact_token_limit(&self) -> Option<i64> {
|
||||
@@ -544,11 +546,11 @@ struct ResponseCompleted {
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ResponseCompletedUsage {
|
||||
input_tokens: u64,
|
||||
input_tokens: i64,
|
||||
input_tokens_details: Option<ResponseCompletedInputTokensDetails>,
|
||||
output_tokens: u64,
|
||||
output_tokens: i64,
|
||||
output_tokens_details: Option<ResponseCompletedOutputTokensDetails>,
|
||||
total_tokens: u64,
|
||||
total_tokens: i64,
|
||||
}
|
||||
|
||||
impl From<ResponseCompletedUsage> for TokenUsage {
|
||||
@@ -571,12 +573,12 @@ impl From<ResponseCompletedUsage> for TokenUsage {
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ResponseCompletedInputTokensDetails {
|
||||
cached_tokens: u64,
|
||||
cached_tokens: i64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ResponseCompletedOutputTokensDetails {
|
||||
reasoning_tokens: u64,
|
||||
reasoning_tokens: i64,
|
||||
}
|
||||
|
||||
fn attach_item_ids(payload_json: &mut Value, original_items: &[ResponseItem]) {
|
||||
@@ -633,7 +635,7 @@ fn parse_rate_limit_window(
|
||||
let used_percent: Option<f64> = parse_header_f64(headers, used_percent_header);
|
||||
|
||||
used_percent.and_then(|used_percent| {
|
||||
let window_minutes = parse_header_u64(headers, window_minutes_header);
|
||||
let window_minutes = parse_header_i64(headers, window_minutes_header);
|
||||
let resets_at = parse_header_str(headers, resets_header)
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
@@ -658,8 +660,8 @@ fn parse_header_f64(headers: &HeaderMap, name: &str) -> Option<f64> {
|
||||
.filter(|v| v.is_finite())
|
||||
}
|
||||
|
||||
fn parse_header_u64(headers: &HeaderMap, name: &str) -> Option<u64> {
|
||||
parse_header_str(headers, name)?.parse::<u64>().ok()
|
||||
fn parse_header_i64(headers: &HeaderMap, name: &str) -> Option<i64> {
|
||||
parse_header_str(headers, name)?.parse::<i64>().ok()
|
||||
}
|
||||
|
||||
fn parse_header_str<'a>(headers: &'a HeaderMap, name: &str) -> Option<&'a str> {
|
||||
|
||||
@@ -1778,7 +1778,7 @@ pub(crate) async fn run_task(
|
||||
.as_ref()
|
||||
.map(TokenUsage::tokens_in_context_window);
|
||||
let token_limit_reached = total_usage_tokens
|
||||
.map(|tokens| (tokens as i64) >= limit)
|
||||
.map(|tokens| tokens >= limit)
|
||||
.unwrap_or(false);
|
||||
let mut items_to_record_in_conversation_history = Vec::<ResponseItem>::new();
|
||||
let mut responses = Vec::<ResponseInputItem>::new();
|
||||
|
||||
@@ -85,10 +85,10 @@ pub struct Config {
|
||||
pub model_family: ModelFamily,
|
||||
|
||||
/// Size of the context window for the model, in tokens.
|
||||
pub model_context_window: Option<u64>,
|
||||
pub model_context_window: Option<i64>,
|
||||
|
||||
/// Maximum number of output tokens.
|
||||
pub model_max_output_tokens: Option<u64>,
|
||||
pub model_max_output_tokens: Option<i64>,
|
||||
|
||||
/// Token usage threshold triggering auto-compaction of conversation history.
|
||||
pub model_auto_compact_token_limit: Option<i64>,
|
||||
@@ -824,10 +824,10 @@ pub struct ConfigToml {
|
||||
pub model_provider: Option<String>,
|
||||
|
||||
/// Size of the context window for the model, in tokens.
|
||||
pub model_context_window: Option<u64>,
|
||||
pub model_context_window: Option<i64>,
|
||||
|
||||
/// Maximum number of output tokens.
|
||||
pub model_max_output_tokens: Option<u64>,
|
||||
pub model_max_output_tokens: Option<i64>,
|
||||
|
||||
/// Token usage threshold triggering auto-compaction of conversation history.
|
||||
pub model_auto_compact_token_limit: Option<i64>,
|
||||
@@ -2805,7 +2805,7 @@ model_verbosity = "high"
|
||||
model_family: find_family_for_model("o3").expect("known model slug"),
|
||||
model_context_window: Some(200_000),
|
||||
model_max_output_tokens: Some(100_000),
|
||||
model_auto_compact_token_limit: None,
|
||||
model_auto_compact_token_limit: Some(180_000),
|
||||
model_provider_id: "openai".to_string(),
|
||||
model_provider: fixture.openai_provider.clone(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
@@ -2874,7 +2874,7 @@ model_verbosity = "high"
|
||||
model_family: find_family_for_model("gpt-3.5-turbo").expect("known model slug"),
|
||||
model_context_window: Some(16_385),
|
||||
model_max_output_tokens: Some(4_096),
|
||||
model_auto_compact_token_limit: None,
|
||||
model_auto_compact_token_limit: Some(14_746),
|
||||
model_provider_id: "openai-chat-completions".to_string(),
|
||||
model_provider: fixture.openai_chat_completions_provider.clone(),
|
||||
approval_policy: AskForApproval::UnlessTrusted,
|
||||
@@ -2958,7 +2958,7 @@ model_verbosity = "high"
|
||||
model_family: find_family_for_model("o3").expect("known model slug"),
|
||||
model_context_window: Some(200_000),
|
||||
model_max_output_tokens: Some(100_000),
|
||||
model_auto_compact_token_limit: None,
|
||||
model_auto_compact_token_limit: Some(180_000),
|
||||
model_provider_id: "openai".to_string(),
|
||||
model_provider: fixture.openai_provider.clone(),
|
||||
approval_policy: AskForApproval::OnFailure,
|
||||
@@ -3028,7 +3028,7 @@ model_verbosity = "high"
|
||||
model_family: find_family_for_model("gpt-5").expect("known model slug"),
|
||||
model_context_window: Some(272_000),
|
||||
model_max_output_tokens: Some(128_000),
|
||||
model_auto_compact_token_limit: None,
|
||||
model_auto_compact_token_limit: Some(244_800),
|
||||
model_provider_id: "openai".to_string(),
|
||||
model_provider: fixture.openai_provider.clone(),
|
||||
approval_policy: AskForApproval::OnFailure,
|
||||
|
||||
@@ -48,6 +48,12 @@ pub struct ModelFamily {
|
||||
|
||||
/// Names of beta tools that should be exposed to this model family.
|
||||
pub experimental_supported_tools: Vec<String>,
|
||||
|
||||
/// Percentage of the context window considered usable for inputs, after
|
||||
/// reserving headroom for system prompts, tool overhead, and model output.
|
||||
/// This is applied when computing the effective context window seen by
|
||||
/// consumers.
|
||||
pub effective_context_window_percent: i64,
|
||||
}
|
||||
|
||||
macro_rules! model_family {
|
||||
@@ -66,6 +72,7 @@ macro_rules! model_family {
|
||||
apply_patch_tool_type: None,
|
||||
base_instructions: BASE_INSTRUCTIONS.to_string(),
|
||||
experimental_supported_tools: Vec::new(),
|
||||
effective_context_window_percent: 95,
|
||||
};
|
||||
// apply overrides
|
||||
$(
|
||||
@@ -175,5 +182,6 @@ pub fn derive_default_model_family(model: &str) -> ModelFamily {
|
||||
apply_patch_tool_type: None,
|
||||
base_instructions: BASE_INSTRUCTIONS.to_string(),
|
||||
experimental_supported_tools: Vec::new(),
|
||||
effective_context_window_percent: 95,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
use crate::model_family::ModelFamily;
|
||||
|
||||
// Shared constants for commonly used window/token sizes.
|
||||
pub(crate) const CONTEXT_WINDOW_272K: i64 = 272_000;
|
||||
pub(crate) const MAX_OUTPUT_TOKENS_128K: i64 = 128_000;
|
||||
|
||||
/// Metadata about a model, particularly OpenAI models.
|
||||
/// We may want to consider including details like the pricing for
|
||||
/// input tokens, output tokens, etc., though users will need to be able to
|
||||
@@ -8,10 +12,10 @@ use crate::model_family::ModelFamily;
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct ModelInfo {
|
||||
/// Size of the context window in tokens. This is the maximum size of the input context.
|
||||
pub(crate) context_window: u64,
|
||||
pub(crate) context_window: i64,
|
||||
|
||||
/// Maximum number of output tokens that can be generated for the model.
|
||||
pub(crate) max_output_tokens: u64,
|
||||
pub(crate) max_output_tokens: i64,
|
||||
|
||||
/// Token threshold where we should automatically compact conversation history. This considers
|
||||
/// input tokens + output tokens of this turn.
|
||||
@@ -19,13 +23,17 @@ pub(crate) struct ModelInfo {
|
||||
}
|
||||
|
||||
impl ModelInfo {
|
||||
const fn new(context_window: u64, max_output_tokens: u64) -> Self {
|
||||
const fn new(context_window: i64, max_output_tokens: i64) -> Self {
|
||||
Self {
|
||||
context_window,
|
||||
max_output_tokens,
|
||||
auto_compact_token_limit: None,
|
||||
auto_compact_token_limit: Some(Self::default_auto_compact_limit(context_window)),
|
||||
}
|
||||
}
|
||||
|
||||
const fn default_auto_compact_limit(context_window: i64) -> i64 {
|
||||
(context_window * 9) / 10
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_model_info(model_family: &ModelFamily) -> Option<ModelInfo> {
|
||||
@@ -62,15 +70,17 @@ pub(crate) fn get_model_info(model_family: &ModelFamily) -> Option<ModelInfo> {
|
||||
// https://platform.openai.com/docs/models/gpt-3.5-turbo
|
||||
"gpt-3.5-turbo" => Some(ModelInfo::new(16_385, 4_096)),
|
||||
|
||||
_ if slug.starts_with("gpt-5-codex") => Some(ModelInfo {
|
||||
context_window: 272_000,
|
||||
max_output_tokens: 128_000,
|
||||
auto_compact_token_limit: Some(350_000),
|
||||
}),
|
||||
_ if slug.starts_with("gpt-5-codex") => {
|
||||
Some(ModelInfo::new(CONTEXT_WINDOW_272K, MAX_OUTPUT_TOKENS_128K))
|
||||
}
|
||||
|
||||
_ if slug.starts_with("gpt-5") => Some(ModelInfo::new(272_000, 128_000)),
|
||||
_ if slug.starts_with("gpt-5") => {
|
||||
Some(ModelInfo::new(CONTEXT_WINDOW_272K, MAX_OUTPUT_TOKENS_128K))
|
||||
}
|
||||
|
||||
_ if slug.starts_with("codex-") => Some(ModelInfo::new(272_000, 128_000)),
|
||||
_ if slug.starts_with("codex-") => {
|
||||
Some(ModelInfo::new(CONTEXT_WINDOW_272K, MAX_OUTPUT_TOKENS_128K))
|
||||
}
|
||||
|
||||
_ => None,
|
||||
}
|
||||
|
||||
@@ -48,7 +48,7 @@ impl SessionState {
|
||||
pub(crate) fn update_token_info_from_usage(
|
||||
&mut self,
|
||||
usage: &TokenUsage,
|
||||
model_context_window: Option<u64>,
|
||||
model_context_window: Option<i64>,
|
||||
) {
|
||||
self.token_info = TokenUsageInfo::new_or_append(
|
||||
&self.token_info,
|
||||
@@ -67,7 +67,7 @@ impl SessionState {
|
||||
(self.token_info.clone(), self.latest_rate_limits.clone())
|
||||
}
|
||||
|
||||
pub(crate) fn set_token_usage_full(&mut self, context_window: u64) {
|
||||
pub(crate) fn set_token_usage_full(&mut self, context_window: i64) {
|
||||
match &mut self.token_info {
|
||||
Some(info) => info.fill_to_context_window(context_window),
|
||||
None => {
|
||||
|
||||
@@ -138,7 +138,7 @@ pub fn ev_response_created(id: &str) -> Value {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn ev_completed_with_tokens(id: &str, total_tokens: u64) -> Value {
|
||||
pub fn ev_completed_with_tokens(id: &str, total_tokens: i64) -> Value {
|
||||
serde_json::json!({
|
||||
"type": "response.completed",
|
||||
"response": {
|
||||
|
||||
@@ -858,8 +858,8 @@ async fn token_count_includes_rate_limits_snapshot() {
|
||||
"reasoning_output_tokens": 0,
|
||||
"total_tokens": 123
|
||||
},
|
||||
// Default model is gpt-5-codex in tests → 272000 context window
|
||||
"model_context_window": 272000
|
||||
// Default model is gpt-5-codex in tests → 95% usable context window
|
||||
"model_context_window": 258400
|
||||
},
|
||||
"rate_limits": {
|
||||
"primary": {
|
||||
@@ -985,6 +985,8 @@ async fn context_window_error_sets_total_tokens_to_model_window() -> anyhow::Res
|
||||
skip_if_no_network!(Ok(()));
|
||||
let server = MockServer::start().await;
|
||||
|
||||
const EFFECTIVE_CONTEXT_WINDOW: i64 = (272_000 * 95) / 100;
|
||||
|
||||
responses::mount_sse_once_match(
|
||||
&server,
|
||||
body_string_contains("trigger context window"),
|
||||
@@ -1056,8 +1058,11 @@ async fn context_window_error_sets_total_tokens_to_model_window() -> anyhow::Res
|
||||
.info
|
||||
.expect("token usage info present when context window is exceeded");
|
||||
|
||||
assert_eq!(info.model_context_window, Some(272_000));
|
||||
assert_eq!(info.total_token_usage.total_tokens, 272_000);
|
||||
assert_eq!(info.model_context_window, Some(EFFECTIVE_CONTEXT_WINDOW));
|
||||
assert_eq!(
|
||||
info.total_token_usage.total_tokens,
|
||||
EFFECTIVE_CONTEXT_WINDOW
|
||||
);
|
||||
|
||||
let error_event = wait_for_event(&codex, |ev| matches!(ev, EventMsg::Error(_))).await;
|
||||
let expected_context_window_message = CodexErr::ContextWindowExceeded.to_string();
|
||||
|
||||
@@ -19,6 +19,7 @@ use core_test_support::responses::ev_assistant_message;
|
||||
use core_test_support::responses::ev_completed;
|
||||
use core_test_support::responses::ev_completed_with_tokens;
|
||||
use core_test_support::responses::ev_function_call;
|
||||
use core_test_support::responses::mount_sse_once;
|
||||
use core_test_support::responses::mount_sse_once_match;
|
||||
use core_test_support::responses::mount_sse_sequence;
|
||||
use core_test_support::responses::sse;
|
||||
@@ -43,6 +44,7 @@ const CONTEXT_LIMIT_MESSAGE: &str =
|
||||
"Your input exceeds the context window of this model. Please adjust your input and try again.";
|
||||
const DUMMY_FUNCTION_NAME: &str = "unsupported_tool";
|
||||
const DUMMY_CALL_ID: &str = "call-multi-auto";
|
||||
const FUNCTION_CALL_LIMIT_MSG: &str = "function call limit push";
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn summarize_context_three_requests_and_instructions() {
|
||||
@@ -860,3 +862,97 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_
|
||||
"second auto compact request should include the summarization prompt"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn auto_compact_triggers_after_function_call_over_95_percent_usage() {
|
||||
skip_if_no_network!();
|
||||
|
||||
let server = start_mock_server().await;
|
||||
|
||||
let context_window = 100;
|
||||
let limit = context_window * 90 / 100;
|
||||
let over_limit_tokens = context_window * 95 / 100 + 1;
|
||||
|
||||
let first_turn = sse(vec![
|
||||
ev_function_call(DUMMY_CALL_ID, DUMMY_FUNCTION_NAME, "{}"),
|
||||
ev_completed_with_tokens("r1", 50),
|
||||
]);
|
||||
let function_call_follow_up = sse(vec![
|
||||
ev_assistant_message("m2", FINAL_REPLY),
|
||||
ev_completed_with_tokens("r2", over_limit_tokens),
|
||||
]);
|
||||
let auto_compact_turn = sse(vec![
|
||||
ev_assistant_message("m3", AUTO_SUMMARY_TEXT),
|
||||
ev_completed_with_tokens("r3", 10),
|
||||
]);
|
||||
let post_auto_compact_turn = sse(vec![ev_completed_with_tokens("r4", 10)]);
|
||||
|
||||
// Mount responses in order and keep mocks only for the ones we assert on.
|
||||
let first_turn_mock = mount_sse_once(&server, first_turn).await;
|
||||
let follow_up_mock = mount_sse_once(&server, function_call_follow_up).await;
|
||||
let auto_compact_mock = mount_sse_once(&server, auto_compact_turn).await;
|
||||
// We don't assert on the post-compact request, so no need to keep its mock.
|
||||
mount_sse_once(&server, post_auto_compact_turn).await;
|
||||
|
||||
let model_provider = ModelProviderInfo {
|
||||
base_url: Some(format!("{}/v1", server.uri())),
|
||||
..built_in_model_providers()["openai"].clone()
|
||||
};
|
||||
|
||||
let home = TempDir::new().unwrap();
|
||||
let mut config = load_default_config_for_test(&home);
|
||||
config.model_provider = model_provider;
|
||||
config.model_context_window = Some(context_window);
|
||||
config.model_auto_compact_token_limit = Some(limit);
|
||||
|
||||
let codex = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"))
|
||||
.new_conversation(config)
|
||||
.await
|
||||
.unwrap()
|
||||
.conversation;
|
||||
|
||||
codex
|
||||
.submit(Op::UserInput {
|
||||
items: vec![InputItem::Text {
|
||||
text: FUNCTION_CALL_LIMIT_MSG.into(),
|
||||
}],
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
wait_for_event(&codex, |msg| matches!(msg, EventMsg::TaskComplete(_))).await;
|
||||
|
||||
// Assert first request captured expected user message that triggers function call.
|
||||
let first_request = first_turn_mock.single_request().input();
|
||||
assert!(
|
||||
first_request.iter().any(|item| {
|
||||
item.get("type").and_then(|value| value.as_str()) == Some("message")
|
||||
&& item
|
||||
.get("content")
|
||||
.and_then(|content| content.as_array())
|
||||
.and_then(|entries| entries.first())
|
||||
.and_then(|entry| entry.get("text"))
|
||||
.and_then(|value| value.as_str())
|
||||
== Some(FUNCTION_CALL_LIMIT_MSG)
|
||||
}),
|
||||
"first request should include the user message that triggers the function call"
|
||||
);
|
||||
|
||||
let function_call_output = follow_up_mock
|
||||
.single_request()
|
||||
.function_call_output(DUMMY_CALL_ID);
|
||||
let output_text = function_call_output
|
||||
.get("output")
|
||||
.and_then(|value| value.as_str())
|
||||
.unwrap_or_default();
|
||||
assert!(
|
||||
output_text.contains(DUMMY_FUNCTION_NAME),
|
||||
"function call output should be sent before auto compact"
|
||||
);
|
||||
|
||||
let auto_compact_body = auto_compact_mock.single_request().body_json().to_string();
|
||||
assert!(
|
||||
auto_compact_body.contains("You have exceeded the maximum number of tokens"),
|
||||
"auto compact request should include the summarization prompt after exceeding 95% (limit {limit})"
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user