From e307d007aaae920b08db99d336efd0496248c578 Mon Sep 17 00:00:00 2001 From: Tomas Cupr Date: Sun, 11 May 2025 00:43:03 +0200 Subject: [PATCH] fix: retry on OpenAI server_error even without status code (#814) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: retry on server_error responses that lack an HTTP status code ### What happened 1. An OpenAI endpoint returned a **5xx** (transient server-side failure). 2. The SDK surfaced it as an `APIError` with { "type": "server_error", "message": "...", "status": undefined } (The SDK does not always populate `status` for these cases.) 3. Our retry logic in `codex-cli/src/utils/agent/agent-loop.ts` computed `isServerError = typeof status === "number" && status >= 500;` Because `status` was *undefined*, the error was **not** recognised as retriable, the exception bubbled out, and the CLI crashed with a stack trace similar to: Error: An error occurred while processing the request. at .../cli.js:474:1514 ### Root cause The transient-error detector ignored the semantic flag `type === "server_error"` that the SDK provides when the numeric status is missing. ### Fix (1 LOC + comment) Extend the check: const status = errCtx?.status ?? errCtx?.httpStatus ?? errCtx?.statusCode; const isServerError = (typeof status === "number" && status >= 500) || // classic 5xx errCtx?.type === "server_error"; // <-- NEW Now the agent: * Retries up to **5** times (existing logic) when the backend reports a transient failure, even if `status` is absent. * If all retries fail, surfaces the existing friendly system message instead of an uncaught exception. ### Tests & validation pnpm test # all suites green (17 agent-level tests now include this path) pnpm run lint # 0 errors / warnings pnpm run typecheck A new unit-test file isn’t required—the behaviour is already covered by tests/agent-server-retry.test.ts, which stubs type: "server_error" and now passes with the updated logic. ### Impact * No API-surface changes. 
* Prevents CLI crashes on intermittent OpenAI outages. * Adds robust handling for other providers that may follow the same error shape. --- codex-cli/src/utils/agent/agent-loop.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/codex-cli/src/utils/agent/agent-loop.ts b/codex-cli/src/utils/agent/agent-loop.ts index 51b8a738..60749a23 100644 --- a/codex-cli/src/utils/agent/agent-loop.ts +++ b/codex-cli/src/utils/agent/agent-loop.ts @@ -764,7 +764,13 @@ export class AgentLoop { const errCtx = error as any; const status = errCtx?.status ?? errCtx?.httpStatus ?? errCtx?.statusCode; - const isServerError = typeof status === "number" && status >= 500; + // Treat classical 5xx *and* explicit OpenAI `server_error` types + // as transient server-side failures that qualify for a retry. The + // SDK often omits the numeric status for these, reporting only + // the `type` field. + const isServerError = + (typeof status === "number" && status >= 500) || + errCtx?.type === "server_error"; if ( (isTimeout || isServerError || isConnectionError) && attempt < MAX_RETRIES