Surface context window error to the client (#4675)
Previously, we treated `input exceeded context window` as a streaming error and retried on it. Retrying is pointless because it won't change the outcome. This PR surfaces the error to the client without retrying and also sends a token count event to indicate that the context window is full. <img width="650" height="125" alt="image" src="https://github.com/user-attachments/assets/c26b1213-4c27-4bfc-90f4-51a270a3efd5" />
This commit is contained in:
@@ -590,6 +590,31 @@ impl TokenUsageInfo {
|
||||
self.total_token_usage.add_assign(last);
|
||||
self.last_token_usage = last.clone();
|
||||
}
|
||||
|
||||
pub fn fill_to_context_window(&mut self, context_window: u64) {
|
||||
let previous_total = self.total_token_usage.total_tokens;
|
||||
let delta = context_window.saturating_sub(previous_total);
|
||||
|
||||
self.model_context_window = Some(context_window);
|
||||
self.total_token_usage = TokenUsage {
|
||||
total_tokens: context_window,
|
||||
..TokenUsage::default()
|
||||
};
|
||||
self.last_token_usage = TokenUsage {
|
||||
total_tokens: delta,
|
||||
..TokenUsage::default()
|
||||
};
|
||||
}
|
||||
|
||||
pub fn full_context_window(context_window: u64) -> Self {
|
||||
let mut info = Self {
|
||||
total_token_usage: TokenUsage::default(),
|
||||
last_token_usage: TokenUsage::default(),
|
||||
model_context_window: Some(context_window),
|
||||
};
|
||||
info.fill_to_context_window(context_window);
|
||||
info
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, TS)]
|
||||
|
||||
Reference in New Issue
Block a user