[codex-rs] Reliability pass on networking (#658)
We currently see a behavior that looks like this:
```
2025-04-25T16:52:24.552789Z WARN codex_core::codex: stream disconnected - retrying turn (1/10 in 232ms)...
codex> event: BackgroundEvent { message: "stream error: stream disconnected before completion: Transport error: error decoding response body; retrying 1/10 in 232ms…" }
2025-04-25T16:52:54.789885Z WARN codex_core::codex: stream disconnected - retrying turn (2/10 in 418ms)...
codex> event: BackgroundEvent { message: "stream error: stream disconnected before completion: Transport error: error decoding response body; retrying 2/10 in 418ms…" }
```
This PR contains a few different fixes that attempt to resolve/improve
this:
1. **Remove overall client timeout.** I think
[this](https://github.com/openai/codex/pull/658/files#diff-c39945d3c42f29b506ff54b7fa2be0795b06d7ad97f1bf33956f60e3c6f19c19L173)
is perhaps the big fix -- it looks to me like this was actually timing
out even if events were still coming through, and that was causing a
disconnect right in the middle of a healthy stream.
2. **Cap response sizes.** We were frequently sending MUCH larger
responses than the upstream typescript `codex`, and that was definitely
not helping. [Fix
here](https://github.com/openai/codex/pull/658/files#diff-d792bef59aa3ee8cb0cbad8b176dbfefe451c227ac89919da7c3e536a9d6cdc0R21-R26)
for that one.
3. **Much higher idle timeout.** Our idle timeout value was much lower
than typescript.
4. **Sub-linear backoff.** We were much too aggressively backing off,
[this](https://github.com/openai/codex/pull/658/files#diff-5d5959b95c6239e6188516da5c6b7eb78154cd9cfedfb9f753d30a7b6d6b8b06R30-R33)
makes it sub-exponential but maintains the jitter and such.
I was seeing that `stream error: stream disconnected` behavior
constantly, and anecdotally I can no longer reproduce. It feels much
snappier.
This commit is contained in:
@@ -7,6 +7,7 @@ use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use serde::Deserialize;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::process::Command;
|
||||
@@ -17,9 +18,11 @@ use crate::error::Result;
|
||||
use crate::error::SandboxErr;
|
||||
use crate::protocol::SandboxPolicy;
|
||||
|
||||
/// Maximum we keep for each stream (100 KiB).
|
||||
/// TODO(ragona) this should be reduced
|
||||
const MAX_STREAM_OUTPUT: usize = 100 * 1024;
|
||||
/// Maximum we send for each stream, which is either:
|
||||
/// - 10KiB OR
|
||||
/// - 256 lines
|
||||
const MAX_STREAM_OUTPUT: usize = 10 * 1024;
|
||||
const MAX_STREAM_OUTPUT_LINES: usize = 256;
|
||||
|
||||
const DEFAULT_TIMEOUT_MS: u64 = 10_000;
|
||||
|
||||
@@ -222,10 +225,12 @@ pub async fn exec(
|
||||
let stdout_handle = tokio::spawn(read_capped(
|
||||
BufReader::new(child.stdout.take().expect("stdout is not piped")),
|
||||
MAX_STREAM_OUTPUT,
|
||||
MAX_STREAM_OUTPUT_LINES,
|
||||
));
|
||||
let stderr_handle = tokio::spawn(read_capped(
|
||||
BufReader::new(child.stderr.take().expect("stderr is not piped")),
|
||||
MAX_STREAM_OUTPUT,
|
||||
MAX_STREAM_OUTPUT_LINES,
|
||||
));
|
||||
|
||||
let interrupted = ctrl_c.notified();
|
||||
@@ -259,23 +264,41 @@ pub async fn exec(
|
||||
})
|
||||
}
|
||||
|
||||
async fn read_capped<R: AsyncReadExt + Unpin>(
|
||||
async fn read_capped<R: AsyncRead + Unpin>(
|
||||
mut reader: R,
|
||||
max_output: usize,
|
||||
max_lines: usize,
|
||||
) -> io::Result<Vec<u8>> {
|
||||
let mut buf = Vec::with_capacity(max_output.min(8 * 1024));
|
||||
let mut tmp = [0u8; 8192];
|
||||
|
||||
let mut remaining_bytes = max_output;
|
||||
let mut remaining_lines = max_lines;
|
||||
|
||||
loop {
|
||||
let n = reader.read(&mut tmp).await?;
|
||||
if n == 0 {
|
||||
break;
|
||||
}
|
||||
if buf.len() < max_output {
|
||||
let remaining = max_output - buf.len();
|
||||
buf.extend_from_slice(&tmp[..remaining.min(n)]);
|
||||
|
||||
// Copy into the buffer only while we still have byte and line budget.
|
||||
if remaining_bytes > 0 && remaining_lines > 0 {
|
||||
let mut copy_len = 0;
|
||||
for &b in &tmp[..n] {
|
||||
if remaining_bytes == 0 || remaining_lines == 0 {
|
||||
break;
|
||||
}
|
||||
copy_len += 1;
|
||||
remaining_bytes -= 1;
|
||||
if b == b'\n' {
|
||||
remaining_lines -= 1;
|
||||
}
|
||||
}
|
||||
buf.extend_from_slice(&tmp[..copy_len]);
|
||||
}
|
||||
// Continue reading to EOF to avoid back-pressure, but discard once caps are hit.
|
||||
}
|
||||
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user