Files
llmx/codex-rs/core/src/util.rs

67 lines
1.9 KiB
Rust
Raw Normal View History

use std::sync::Arc;
use std::time::Duration;
use rand::Rng;
use tokio::sync::Notify;
use tracing::debug;
use crate::config::Config;
[codex-rs] Reliability pass on networking (#658) We currently see a behavior that looks like this: ``` 2025-04-25T16:52:24.552789Z WARN codex_core::codex: stream disconnected - retrying turn (1/10 in 232ms)... codex> event: BackgroundEvent { message: "stream error: stream disconnected before completion: Transport error: error decoding response body; retrying 1/10 in 232ms…" } 2025-04-25T16:52:54.789885Z WARN codex_core::codex: stream disconnected - retrying turn (2/10 in 418ms)... codex> event: BackgroundEvent { message: "stream error: stream disconnected before completion: Transport error: error decoding response body; retrying 2/10 in 418ms…" } ``` This PR contains a few different fixes that attempt to resolve/improve this: 1. **Remove overall client timeout.** I think [this](https://github.com/openai/codex/pull/658/files#diff-c39945d3c42f29b506ff54b7fa2be0795b06d7ad97f1bf33956f60e3c6f19c19L173) is perhaps the big fix -- it looks to me like this was actually timing out even if events were still coming through, and that was causing a disconnect right in the middle of a healthy stream. 2. **Cap response sizes.** We were frequently sending MUCH larger responses than the upstream typescript `codex`, and that was definitely not helping. [Fix here](https://github.com/openai/codex/pull/658/files#diff-d792bef59aa3ee8cb0cbad8b176dbfefe451c227ac89919da7c3e536a9d6cdc0R21-R26) for that one. 3. **Much higher idle timeout.** Our idle timeout value was much lower than typescript. 4. **Sub-linear backoff.** We were much too aggressively backing off, [this](https://github.com/openai/codex/pull/658/files#diff-5d5959b95c6239e6188516da5c6b7eb78154cd9cfedfb9f753d30a7b6d6b8b06R30-R33) makes it sub-exponential but maintains the jitter and such. I was seeing that `stream error: stream disconnected` behavior constantly, and anecdotally I can no longer reproduce. It feels much snappier.
2025-04-25 11:44:22 -07:00
/// Base delay, in milliseconds, that `backoff` scales for each retry attempt.
const INITIAL_DELAY_MS: u64 = 200;
/// Growth factor that `backoff` raises to the power `attempt - 1`, so every
/// additional attempt waits roughly 1.3x longer than the one before it.
const BACKOFF_FACTOR: f64 = 1.3;
/// Returns a shared [`Notify`] that is signalled every time the process
/// receives SIGINT (Ctrl-C).
///
/// A background task is spawned with `tokio::spawn`, so this must be called
/// from within a Tokio runtime. On each interrupt the task calls
/// [`Notify::notify_waiters`], which wakes the tasks that are waiting on the
/// returned handle at that moment.
///
/// If the Ctrl-C listener cannot be installed, the task logs the error and
/// stops instead of reporting interrupts that never happened.
pub fn notify_on_sigint() -> Arc<Notify> {
    let notify = Arc::new(Notify::new());

    tokio::spawn({
        let notify = Arc::clone(&notify);
        async move {
            loop {
                // `ctrl_c` resolves with `Err` when the signal handler could
                // not be registered. Ignoring that would turn this loop into
                // a busy spin that wakes waiters with phantom interrupts.
                if let Err(err) = tokio::signal::ctrl_c().await {
                    debug!("failed to listen for Ctrl-C: {}", err);
                    break;
                }
                debug!("Keyboard interrupt");
                notify.notify_waiters();
            }
        }
    });

    notify
}
pub(crate) fn backoff(attempt: u64) -> Duration {
[codex-rs] Reliability pass on networking (#658) We currently see a behavior that looks like this: ``` 2025-04-25T16:52:24.552789Z WARN codex_core::codex: stream disconnected - retrying turn (1/10 in 232ms)... codex> event: BackgroundEvent { message: "stream error: stream disconnected before completion: Transport error: error decoding response body; retrying 1/10 in 232ms…" } 2025-04-25T16:52:54.789885Z WARN codex_core::codex: stream disconnected - retrying turn (2/10 in 418ms)... codex> event: BackgroundEvent { message: "stream error: stream disconnected before completion: Transport error: error decoding response body; retrying 2/10 in 418ms…" } ``` This PR contains a few different fixes that attempt to resolve/improve this: 1. **Remove overall client timeout.** I think [this](https://github.com/openai/codex/pull/658/files#diff-c39945d3c42f29b506ff54b7fa2be0795b06d7ad97f1bf33956f60e3c6f19c19L173) is perhaps the big fix -- it looks to me like this was actually timing out even if events were still coming through, and that was causing a disconnect right in the middle of a healthy stream. 2. **Cap response sizes.** We were frequently sending MUCH larger responses than the upstream typescript `codex`, and that was definitely not helping. [Fix here](https://github.com/openai/codex/pull/658/files#diff-d792bef59aa3ee8cb0cbad8b176dbfefe451c227ac89919da7c3e536a9d6cdc0R21-R26) for that one. 3. **Much higher idle timeout.** Our idle timeout value was much lower than typescript. 4. **Sub-linear backoff.** We were much too aggressively backing off, [this](https://github.com/openai/codex/pull/658/files#diff-5d5959b95c6239e6188516da5c6b7eb78154cd9cfedfb9f753d30a7b6d6b8b06R30-R33) makes it sub-exponential but maintains the jitter and such. I was seeing that `stream error: stream disconnected` behavior constantly, and anecdotally I can no longer reproduce. It feels much snappier.
2025-04-25 11:44:22 -07:00
let exp = BACKOFF_FACTOR.powi(attempt.saturating_sub(1) as i32);
let base = (INITIAL_DELAY_MS as f64 * exp) as u64;
let jitter = rand::rng().random_range(0.9..1.1);
Duration::from_millis((base as f64 * jitter) as u64)
}
/// Reports whether the working directory recorded in `config` lives inside a
/// Git repository.
///
/// Starting at `config.cwd`, the directory itself and then each of its
/// ancestors is probed for an entry named `.git`; the first hit ends the
/// search with `true`. Both a directory and a plain file qualify (a `.git`
/// file holds a `gitdir` entry). Only the filesystem is consulted, so neither
/// the `git` binary nor the `git2` crate is required.
///
/// Caveat carried over from the earlier documentation: *worktrees* created
/// with `git worktree add` whose checkout lives outside the main repository
/// directory are described as **not** detected. For such a checkout, pass the
/// `--allow-no-git-exec` CLI flag, which disables the repo requirement.
/// NOTE(review): confirm this caveat still holds, since any `.git` entry on
/// the ancestor chain satisfies the check.
pub fn is_inside_git_repo(config: &Config) -> bool {
    let start = config.cwd.to_path_buf();
    // `ancestors` yields `start` first and then every parent in turn until
    // no parent is left, which is the same walk as repeatedly popping the
    // last path component.
    start
        .ancestors()
        .any(|candidate| candidate.join(".git").exists())
}