Correctly calculate remaining context size (#3190)
We had multiple issues with the context size calculation:

1. The `initial_prompt_tokens` calculation based on cache size is not reliable; cache misses can set it to a much higher value. For now it is hardcoded to a safer constant.
2. The input context size for GPT-5 is 272k, not 400k (that's where the 33% came from).

Fixes.
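As a back-of-the-envelope illustration of point 2 (a sketch, not project code: the 400_000 and 272_000 figures come from this diff, and the reading that the old meter counted the 128k output reservation as usable input is an inference):

```rust
// If usage is measured against a 400k window while input is actually capped
// at 272k, the meter still reports roughly a third of the context as free at
// the moment the real input window is exhausted.
fn main() {
    let displayed_window: f32 = 400_000.0; // old value in get_model_info
    let real_input_window: f32 = 272_000.0; // corrected value in this commit
    let shown_when_full = (1.0 - real_input_window / displayed_window) * 100.0;
    println!("meter still shows {shown_when_full:.0}% left"); // prints 32
}
```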
@@ -1382,7 +1382,7 @@ model_verbosity = "high"
     let expected_gpt5_profile_config = Config {
         model: "gpt-5".to_string(),
         model_family: find_family_for_model("gpt-5").expect("known model slug"),
-        model_context_window: Some(400_000),
+        model_context_window: Some(272_000),
         model_max_output_tokens: Some(128_000),
         model_provider_id: "openai".to_string(),
         model_provider: fixture.openai_provider.clone(),

@@ -79,12 +79,12 @@ pub(crate) fn get_model_info(model_family: &ModelFamily) -> Option<ModelInfo> {
        }),

        "gpt-5" => Some(ModelInfo {
-            context_window: 400_000,
+            context_window: 272_000,
            max_output_tokens: 128_000,
        }),

        _ if slug.starts_with("codex-") => Some(ModelInfo {
-            context_window: 400_000,
+            context_window: 272_000,
            max_output_tokens: 128_000,
        }),

@@ -527,6 +527,9 @@ pub struct TokenUsage {
     pub total_tokens: u64,
 }

+// Includes prompts, tools and space to call compact.
+const BASELINE_TOKENS: u64 = 12000;
+
 impl TokenUsage {
     pub fn is_zero(&self) -> bool {
         self.total_tokens == 0

@@ -557,26 +560,22 @@ impl TokenUsage {
     /// Estimate the remaining user-controllable percentage of the model's context window.
     ///
     /// `context_window` is the total size of the model's context window.
-    /// `baseline_used_tokens` should capture tokens that are always present in
+    /// `BASELINE_TOKENS` should capture tokens that are always present in
     /// the context (e.g., system prompt and fixed tool instructions) so that
     /// the percentage reflects the portion the user can influence.
     ///
     /// This normalizes both the numerator and denominator by subtracting the
     /// baseline, so immediately after the first prompt the UI shows 100% left
     /// and trends toward 0% as the user fills the effective window.
-    pub fn percent_of_context_window_remaining(
-        &self,
-        context_window: u64,
-        baseline_used_tokens: u64,
-    ) -> u8 {
-        if context_window <= baseline_used_tokens {
+    pub fn percent_of_context_window_remaining(&self, context_window: u64) -> u8 {
+        if context_window <= BASELINE_TOKENS {
             return 0;
         }

-        let effective_window = context_window - baseline_used_tokens;
+        let effective_window = context_window - BASELINE_TOKENS;
         let used = self
             .tokens_in_context_window()
-            .saturating_sub(baseline_used_tokens);
+            .saturating_sub(BASELINE_TOKENS);
         let remaining = effective_window.saturating_sub(used);
         ((remaining as f32 / effective_window as f32) * 100.0).clamp(0.0, 100.0) as u8
     }

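Below, a minimal standalone sketch of the normalized calculation above; the 272_000 window and 12_000 baseline come from this diff, the 142_000-token usage figure is purely illustrative, and the free function is a stand-in for the real `TokenUsage` method:

```rust
// Mirrors percent_of_context_window_remaining as a plain function so the
// arithmetic can be checked in isolation; not the real TokenUsage API.
const BASELINE_TOKENS: u64 = 12_000;

fn percent_remaining(context_window: u64, tokens_in_context: u64) -> u8 {
    if context_window <= BASELINE_TOKENS {
        return 0;
    }
    // Drop the fixed overhead from both numerator and denominator so the
    // percentage tracks only the user-controllable part of the window.
    let effective_window = context_window - BASELINE_TOKENS;
    let used = tokens_in_context.saturating_sub(BASELINE_TOKENS);
    let remaining = effective_window.saturating_sub(used);
    ((remaining as f32 / effective_window as f32) * 100.0).clamp(0.0, 100.0) as u8
}

fn main() {
    assert_eq!(percent_remaining(272_000, 12_000), 100); // right after the first prompt
    assert_eq!(percent_remaining(272_000, 142_000), 50); // halfway through the effective window
    assert_eq!(percent_remaining(10_000, 0), 0);         // window no larger than the baseline
}
```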
@@ -67,15 +67,6 @@ struct TokenUsageInfo {
     total_token_usage: TokenUsage,
     last_token_usage: TokenUsage,
     model_context_window: Option<u64>,
-    /// Baseline token count present in the context before the user's first
-    /// message content is considered. This is used to normalize the
-    /// "context left" percentage so it reflects the portion the user can
-    /// influence rather than fixed prompt overhead (system prompt, tool
-    /// instructions, etc.).
-    ///
-    /// Preferred source is `cached_input_tokens` from the first turn (when
-    /// available), otherwise we fall back to 0.
-    initial_prompt_tokens: u64,
 }

 pub(crate) struct ChatComposer {

@@ -181,17 +172,10 @@ impl ChatComposer {
         last_token_usage: TokenUsage,
         model_context_window: Option<u64>,
     ) {
-        let initial_prompt_tokens = self
-            .token_usage_info
-            .as_ref()
-            .map(|info| info.initial_prompt_tokens)
-            .unwrap_or_else(|| last_token_usage.cached_input_tokens.unwrap_or(0));
-
         self.token_usage_info = Some(TokenUsageInfo {
             total_token_usage,
             last_token_usage,
             model_context_window,
-            initial_prompt_tokens,
         });
     }

@@ -1302,10 +1286,7 @@ impl WidgetRef for ChatComposer {
        let last_token_usage = &token_usage_info.last_token_usage;
        if let Some(context_window) = token_usage_info.model_context_window {
            let percent_remaining: u8 = if context_window > 0 {
-                last_token_usage.percent_of_context_window_remaining(
-                    context_window,
-                    token_usage_info.initial_prompt_tokens,
-                )
+                last_token_usage.percent_of_context_window_remaining(context_window)
            } else {
                100
            };