Auto compact at ~90% (#5292)
Users now hit a window exceeded limit and they usually don't know what to do. This starts auto compact at ~90% of the window.
This commit is contained in:
@@ -22,27 +22,28 @@ fn formatter() -> &'static DecimalFormatter {
|
||||
FORMATTER.get_or_init(|| make_local_formatter().unwrap_or_else(make_en_us_formatter))
|
||||
}
|
||||
|
||||
/// Format a u64 with locale-aware digit separators (e.g. "12345" -> "12,345"
|
||||
/// Format an i64 with locale-aware digit separators (e.g. "12345" -> "12,345"
|
||||
/// for en-US).
|
||||
pub fn format_with_separators(n: u64) -> String {
|
||||
pub fn format_with_separators(n: i64) -> String {
|
||||
formatter().format(&Decimal::from(n)).to_string()
|
||||
}
|
||||
|
||||
fn format_si_suffix_with_formatter(n: u64, formatter: &DecimalFormatter) -> String {
|
||||
fn format_si_suffix_with_formatter(n: i64, formatter: &DecimalFormatter) -> String {
|
||||
let n = n.max(0);
|
||||
if n < 1000 {
|
||||
return formatter.format(&Decimal::from(n)).to_string();
|
||||
}
|
||||
|
||||
// Format `n / scale` with the requested number of fractional digits.
|
||||
let format_scaled = |n: u64, scale: u64, frac_digits: u32| -> String {
|
||||
let format_scaled = |n: i64, scale: i64, frac_digits: u32| -> String {
|
||||
let value = n as f64 / scale as f64;
|
||||
let scaled: u64 = (value * 10f64.powi(frac_digits as i32)).round() as u64;
|
||||
let scaled: i64 = (value * 10f64.powi(frac_digits as i32)).round() as i64;
|
||||
let mut dec = Decimal::from(scaled);
|
||||
dec.multiply_pow10(-(frac_digits as i16));
|
||||
formatter.format(&dec).to_string()
|
||||
};
|
||||
|
||||
const UNITS: [(u64, &str); 3] = [(1_000, "K"), (1_000_000, "M"), (1_000_000_000, "G")];
|
||||
const UNITS: [(i64, &str); 3] = [(1_000, "K"), (1_000_000, "M"), (1_000_000_000, "G")];
|
||||
let f = n as f64;
|
||||
for &(scale, suffix) in &UNITS {
|
||||
if (100.0 * f / scale as f64).round() < 1000.0 {
|
||||
@@ -57,7 +58,7 @@ fn format_si_suffix_with_formatter(n: u64, formatter: &DecimalFormatter) -> Stri
|
||||
// Above 1000G, keep whole‑G precision.
|
||||
format!(
|
||||
"{}G",
|
||||
format_with_separators(((n as f64) / 1e9).round() as u64)
|
||||
format_with_separators(((n as f64) / 1e9).round() as i64)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -67,7 +68,7 @@ fn format_si_suffix_with_formatter(n: u64, formatter: &DecimalFormatter) -> Stri
|
||||
/// - 999 -> "999"
|
||||
/// - 1200 -> "1.20K"
|
||||
/// - 123456789 -> "123M"
|
||||
pub fn format_si_suffix(n: u64) -> String {
|
||||
pub fn format_si_suffix(n: i64) -> String {
|
||||
format_si_suffix_with_formatter(n, formatter())
|
||||
}
|
||||
|
||||
@@ -78,7 +79,7 @@ mod tests {
|
||||
#[test]
|
||||
fn kmg() {
|
||||
let formatter = make_en_us_formatter();
|
||||
let fmt = |n: u64| format_si_suffix_with_formatter(n, &formatter);
|
||||
let fmt = |n: i64| format_si_suffix_with_formatter(n, &formatter);
|
||||
assert_eq!(fmt(0), "0");
|
||||
assert_eq!(fmt(999), "999");
|
||||
assert_eq!(fmt(1_000), "1.00K");
|
||||
|
||||
@@ -545,21 +545,21 @@ pub struct TaskCompleteEvent {
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, TS)]
|
||||
pub struct TaskStartedEvent {
|
||||
pub model_context_window: Option<u64>,
|
||||
pub model_context_window: Option<i64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, Default, TS)]
|
||||
pub struct TokenUsage {
|
||||
#[ts(type = "number")]
|
||||
pub input_tokens: u64,
|
||||
pub input_tokens: i64,
|
||||
#[ts(type = "number")]
|
||||
pub cached_input_tokens: u64,
|
||||
pub cached_input_tokens: i64,
|
||||
#[ts(type = "number")]
|
||||
pub output_tokens: u64,
|
||||
pub output_tokens: i64,
|
||||
#[ts(type = "number")]
|
||||
pub reasoning_output_tokens: u64,
|
||||
pub reasoning_output_tokens: i64,
|
||||
#[ts(type = "number")]
|
||||
pub total_tokens: u64,
|
||||
pub total_tokens: i64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, TS)]
|
||||
@@ -567,14 +567,14 @@ pub struct TokenUsageInfo {
|
||||
pub total_token_usage: TokenUsage,
|
||||
pub last_token_usage: TokenUsage,
|
||||
#[ts(type = "number | null")]
|
||||
pub model_context_window: Option<u64>,
|
||||
pub model_context_window: Option<i64>,
|
||||
}
|
||||
|
||||
impl TokenUsageInfo {
|
||||
pub fn new_or_append(
|
||||
info: &Option<TokenUsageInfo>,
|
||||
last: &Option<TokenUsage>,
|
||||
model_context_window: Option<u64>,
|
||||
model_context_window: Option<i64>,
|
||||
) -> Option<Self> {
|
||||
if info.is_none() && last.is_none() {
|
||||
return None;
|
||||
@@ -599,9 +599,9 @@ impl TokenUsageInfo {
|
||||
self.last_token_usage = last.clone();
|
||||
}
|
||||
|
||||
pub fn fill_to_context_window(&mut self, context_window: u64) {
|
||||
pub fn fill_to_context_window(&mut self, context_window: i64) {
|
||||
let previous_total = self.total_token_usage.total_tokens;
|
||||
let delta = context_window.saturating_sub(previous_total);
|
||||
let delta = (context_window - previous_total).max(0);
|
||||
|
||||
self.model_context_window = Some(context_window);
|
||||
self.total_token_usage = TokenUsage {
|
||||
@@ -614,7 +614,7 @@ impl TokenUsageInfo {
|
||||
};
|
||||
}
|
||||
|
||||
pub fn full_context_window(context_window: u64) -> Self {
|
||||
pub fn full_context_window(context_window: i64) -> Self {
|
||||
let mut info = Self {
|
||||
total_token_usage: TokenUsage::default(),
|
||||
last_token_usage: TokenUsage::default(),
|
||||
@@ -643,40 +643,39 @@ pub struct RateLimitWindow {
|
||||
pub used_percent: f64,
|
||||
/// Rolling window duration, in minutes.
|
||||
#[ts(type = "number | null")]
|
||||
pub window_minutes: Option<u64>,
|
||||
pub window_minutes: Option<i64>,
|
||||
/// Timestamp (RFC3339) when the window resets.
|
||||
#[ts(type = "string | null")]
|
||||
pub resets_at: Option<String>,
|
||||
}
|
||||
|
||||
// Includes prompts, tools and space to call compact.
|
||||
const BASELINE_TOKENS: u64 = 12000;
|
||||
const BASELINE_TOKENS: i64 = 12000;
|
||||
|
||||
impl TokenUsage {
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.total_tokens == 0
|
||||
}
|
||||
|
||||
pub fn cached_input(&self) -> u64 {
|
||||
self.cached_input_tokens
|
||||
pub fn cached_input(&self) -> i64 {
|
||||
self.cached_input_tokens.max(0)
|
||||
}
|
||||
|
||||
pub fn non_cached_input(&self) -> u64 {
|
||||
self.input_tokens.saturating_sub(self.cached_input())
|
||||
pub fn non_cached_input(&self) -> i64 {
|
||||
(self.input_tokens - self.cached_input()).max(0)
|
||||
}
|
||||
|
||||
/// Primary count for display as a single absolute value: non-cached input + output.
|
||||
pub fn blended_total(&self) -> u64 {
|
||||
self.non_cached_input() + self.output_tokens
|
||||
pub fn blended_total(&self) -> i64 {
|
||||
(self.non_cached_input() + self.output_tokens.max(0)).max(0)
|
||||
}
|
||||
|
||||
/// For estimating what % of the model's context window is used, we need to account
|
||||
/// for reasoning output tokens from prior turns being dropped from the context window.
|
||||
/// We approximate this here by subtracting reasoning output tokens from the total.
|
||||
/// This will be off for the current turn and pending function calls.
|
||||
pub fn tokens_in_context_window(&self) -> u64 {
|
||||
self.total_tokens
|
||||
.saturating_sub(self.reasoning_output_tokens)
|
||||
pub fn tokens_in_context_window(&self) -> i64 {
|
||||
(self.total_tokens - self.reasoning_output_tokens).max(0)
|
||||
}
|
||||
|
||||
/// Estimate the remaining user-controllable percentage of the model's context window.
|
||||
@@ -689,17 +688,17 @@ impl TokenUsage {
|
||||
/// This normalizes both the numerator and denominator by subtracting the
|
||||
/// baseline, so immediately after the first prompt the UI shows 100% left
|
||||
/// and trends toward 0% as the user fills the effective window.
|
||||
pub fn percent_of_context_window_remaining(&self, context_window: u64) -> u8 {
|
||||
pub fn percent_of_context_window_remaining(&self, context_window: i64) -> i64 {
|
||||
if context_window <= BASELINE_TOKENS {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let effective_window = context_window - BASELINE_TOKENS;
|
||||
let used = self
|
||||
.tokens_in_context_window()
|
||||
.saturating_sub(BASELINE_TOKENS);
|
||||
let remaining = effective_window.saturating_sub(used);
|
||||
((remaining as f32 / effective_window as f32) * 100.0).clamp(0.0, 100.0) as u8
|
||||
let used = (self.tokens_in_context_window() - BASELINE_TOKENS).max(0);
|
||||
let remaining = (effective_window - used).max(0);
|
||||
((remaining as f64 / effective_window as f64) * 100.0)
|
||||
.clamp(0.0, 100.0)
|
||||
.round() as i64
|
||||
}
|
||||
|
||||
/// In-place element-wise sum of token counts.
|
||||
|
||||
Reference in New Issue
Block a user