llmx/codex-rs/core/src/util.rs

use std::time::Duration;
use rand::Rng;
use tracing::debug;
use tracing::error;
const INITIAL_DELAY_MS: u64 = 200;
const BACKOFF_FACTOR: f64 = 2.0;
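
/// Delay before retry number `attempt` (1-based): exponential backoff from
/// `INITIAL_DELAY_MS`, with a random jitter factor in `[0.9, 1.1)` applied so
/// concurrent clients do not retry in lockstep.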
pub(crate) fn backoff(attempt: u64) -> Duration {
let exp = BACKOFF_FACTOR.powi(attempt.saturating_sub(1) as i32);
let base = (INITIAL_DELAY_MS as f64 * exp) as u64;
let jitter = rand::rng().random_range(0.9..1.1);
Duration::from_millis((base as f64 * jitter) as u64)
}
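
/// Fail loudly (panic) in debug or alpha builds so bugs surface during
/// development; in release builds, log at `error` level instead.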
pub(crate) fn error_or_panic(message: String) {
if cfg!(debug_assertions) || env!("CARGO_PKG_VERSION").contains("alpha") {
panic!("{message}");
} else {
error!("{message}");
}
}
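
/// Best-effort extraction of a human-readable message from a server error
/// body. Returns `error.message` when the body is JSON of that shape, the raw
/// text otherwise, or "Unknown error" when the body is empty.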
pub(crate) fn try_parse_error_message(text: &str) -> String {
debug!("Parsing server error response: {}", text);
let json = serde_json::from_str::<serde_json::Value>(text).unwrap_or_default();
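    // Prefer the structured `error.message` field when the body matches the
    // `{"error": {"message": ...}}` envelope.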
if let Some(error) = json.get("error")
&& let Some(message) = error.get("message")
&& let Some(message_str) = message.as_str()
{
return message_str.to_string();
}
if text.is_empty() {
return "Unknown error".to_string();
}
text.to_string()
}

#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_try_parse_error_message() {
let text = r#"{
"error": {
"message": "Your refresh token has already been used to generate a new access token. Please try signing in again.",
"type": "invalid_request_error",
"param": null,
"code": "refresh_token_reused"
}
}"#;
let message = try_parse_error_message(text);
assert_eq!(
message,
"Your refresh token has already been used to generate a new access token. Please try signing in again."
);
}
#[test]
fn test_try_parse_error_message_no_error() {
let text = r#"{"message": "test"}"#;
let message = try_parse_error_message(text);
assert_eq!(message, r#"{"message": "test"}"#);
}
}