Improved token refresh handling to address "Re-connecting" behavior (#6231)

Currently, when the access token expires, we attempt to use the refresh token to acquire a new access token. This works most of the time. However, there are situations where the refresh token is expired, exhausted (already used to perform a refresh), or revoked. In those cases, the current logic treats the error as transient and attempts to retry it repeatedly. This PR changes the token refresh logic to differentiate between permanent and transient errors. It also changes callers to treat the permanent errors as fatal rather than retrying them. And it provides better error messages to users so they understand how to address the problem. These error messages should also help us further understand why we're seeing examples of refresh token exhaustion. Here is the error message in the CLI. The same text appears within the extension. <img width="863" height="38" alt="image" src="https://github.com/user-attachments/assets/7ffc0d08-ebf0-4900-b9a9-265064202f4f" /> I also correct the spelling of "Re-connecting", which shouldn't have a hyphen in it. Testing: I manually tested these code paths by adding temporary code to programmatically cause my refresh token to be exhausted (by calling the token refresh endpoint in a tight loop more than 50 times). I then simulated an access token expiration, which caused the token refresh logic to be invoked. I confirmed that the updated logic properly handled the error condition. Note: We earlier discussed the idea of forcefully logging out the user at the point where token refresh failed. I made several attempts to do this, and all of them resulted in a bad UX. It's important to surface this error to users in a way that explains the problem and tells them that they need to log in again. We also previously discussed deleting the auth.json file when this condition is detected. That also creates problems because it effectively changes the auth status from logged in to logged out, and this causes odd failures and inconsistent UX. 
I think it's therefore better not to delete auth.json in this case. If the user closes the CLI or VSCE and starts it again, we properly detect that the access token is expired and the refresh token is "dead", and we force the user to go through the login flow at that time. This should address aspects of #6191, #5679, and #5505.
2025-11-05 12:51:57 -06:00
parent 1a89f70015
commit c4ebe4b078
6 changed files with 458 additions and 32 deletions
--- a/codex-rs/core/src/auth.rs
+++ b/codex-rs/core/src/auth.rs
@@ -1,12 +1,14 @@
 mod storage;

 use chrono::Utc;
+use reqwest::StatusCode;
 use serde::Deserialize;
 use serde::Serialize;
 #[cfg(test)]
 use serial_test::serial;
 use std::env;
 use std::fmt::Debug;
+use std::io::ErrorKind;
 use std::path::Path;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -22,10 +24,14 @@ use crate::auth::storage::AuthStorageBackend;
 use crate::auth::storage::create_auth_storage;
 use crate::config::Config;
 use crate::default_client::CodexHttpClient;
+use crate::error::RefreshTokenFailedError;
+use crate::error::RefreshTokenFailedReason;
 use crate::token_data::PlanType;
 use crate::token_data::TokenData;
 use crate::token_data::parse_id_token;
 use crate::util::try_parse_error_message;
+use serde_json::Value;
+use thiserror::Error;

 #[derive(Debug, Clone)]
 pub struct CodexAuth {
@@ -46,18 +52,54 @@ impl PartialEq for CodexAuth {
 // TODO(pakrym): use token exp field to check for expiration instead
 const TOKEN_REFRESH_INTERVAL: i64 = 8;

+const REFRESH_TOKEN_EXPIRED_MESSAGE: &str = "Your access token could not be refreshed because your refresh token has expired. Please log out and sign in again.";
+const REFRESH_TOKEN_REUSED_MESSAGE: &str = "Your access token could not be refreshed because your refresh token was already used. Please log out and sign in again.";
+const REFRESH_TOKEN_INVALIDATED_MESSAGE: &str = "Your access token could not be refreshed because your refresh token was revoked. Please log out and sign in again.";
+const REFRESH_TOKEN_UNKNOWN_MESSAGE: &str =
+    "Your access token could not be refreshed. Please log out and sign in again.";
+const REFRESH_TOKEN_URL: &str = "https://auth.openai.com/oauth/token";
+pub const REFRESH_TOKEN_URL_OVERRIDE_ENV_VAR: &str = "CODEX_REFRESH_TOKEN_URL_OVERRIDE";
+
+#[derive(Debug, Error)]
+pub enum RefreshTokenError {
+    #[error("{0}")]
+    Permanent(#[from] RefreshTokenFailedError),
+    #[error(transparent)]
+    Transient(#[from] std::io::Error),
+}
+
+impl RefreshTokenError {
+    pub fn failed_reason(&self) -> Option<RefreshTokenFailedReason> {
+        match self {
+            Self::Permanent(error) => Some(error.reason),
+            Self::Transient(_) => None,
+        }
+    }
+
+    fn other_with_message(message: impl Into<String>) -> Self {
+        Self::Transient(std::io::Error::other(message.into()))
+    }
+}
+
+impl From<RefreshTokenError> for std::io::Error {
+    fn from(err: RefreshTokenError) -> Self {
+        match err {
+            RefreshTokenError::Permanent(failed) => std::io::Error::other(failed),
+            RefreshTokenError::Transient(inner) => inner,
+        }
+    }
+}
+
 impl CodexAuth {
-    pub async fn refresh_token(&self) -> Result<String, std::io::Error> {
+    pub async fn refresh_token(&self) -> Result<String, RefreshTokenError> {
        tracing::info!("Refreshing token");

-        let token_data = self
-            .get_current_token_data()
-            .ok_or(std::io::Error::other("Token data is not available."))?;
+        let token_data = self.get_current_token_data().ok_or_else(|| {
+            RefreshTokenError::Transient(std::io::Error::other("Token data is not available."))
+        })?;
        let token = token_data.refresh_token;

-        let refresh_response = try_refresh_token(token, &self.client)
-            .await
-            .map_err(std::io::Error::other)?;
+        let refresh_response = try_refresh_token(token, &self.client).await?;

        let updated = update_tokens(
            &self.storage,
@@ -65,7 +107,8 @@ impl CodexAuth {
            refresh_response.access_token,
            refresh_response.refresh_token,
        )
-        .await?;
+        .await
+        .map_err(RefreshTokenError::from)?;

        if let Ok(mut auth_lock) = self.auth_dot_json.lock() {
            *auth_lock = Some(updated.clone());
@@ -74,7 +117,7 @@ impl CodexAuth {
        let access = match updated.tokens {
            Some(t) => t.access_token,
            None => {
-                return Err(std::io::Error::other(
+                return Err(RefreshTokenError::other_with_message(
                    "Token data is not available after refresh.",
                ));
            }
@@ -99,15 +142,21 @@ impl CodexAuth {
                ..
            }) => {
                if last_refresh < Utc::now() - chrono::Duration::days(TOKEN_REFRESH_INTERVAL) {
-                    let refresh_response = tokio::time::timeout(
+                    let refresh_result = tokio::time::timeout(
                        Duration::from_secs(60),
                        try_refresh_token(tokens.refresh_token.clone(), &self.client),
                    )
-                    .await
-                    .map_err(|_| {
-                        std::io::Error::other("timed out while refreshing OpenAI API key")
-                    })?
-                    .map_err(std::io::Error::other)?;
+                    .await;
+                    let refresh_response = match refresh_result {
+                        Ok(Ok(response)) => response,
+                        Ok(Err(err)) => return Err(err.into()),
+                        Err(_) => {
+                            return Err(std::io::Error::new(
+                                ErrorKind::TimedOut,
+                                "timed out while refreshing OpenAI API key",
+                            ));
+                        }
+                    };

                    let updated_auth_dot_json = update_tokens(
                        &self.storage,
@@ -425,7 +474,7 @@ async fn update_tokens(
 async fn try_refresh_token(
    refresh_token: String,
    client: &CodexHttpClient,
-) -> std::io::Result<RefreshResponse> {
+) -> Result<RefreshResponse, RefreshTokenError> {
    let refresh_request = RefreshRequest {
        client_id: CLIENT_ID,
        grant_type: "refresh_token",
@@ -433,30 +482,93 @@ async fn try_refresh_token(
        scope: "openid profile email",
    };

+    let endpoint = refresh_token_endpoint();
+
    // Use shared client factory to include standard headers
    let response = client
-        .post("https://auth.openai.com/oauth/token")
+        .post(endpoint.as_str())
        .header("Content-Type", "application/json")
        .json(&refresh_request)
        .send()
        .await
-        .map_err(std::io::Error::other)?;
+        .map_err(|err| RefreshTokenError::Transient(std::io::Error::other(err)))?;

-    if response.status().is_success() {
+    let status = response.status();
+    if status.is_success() {
        let refresh_response = response
            .json::<RefreshResponse>()
            .await
-            .map_err(std::io::Error::other)?;
+            .map_err(|err| RefreshTokenError::Transient(std::io::Error::other(err)))?;
        Ok(refresh_response)
    } else {
-        Err(std::io::Error::other(format!(
-            "Failed to refresh token: {}: {}",
-            response.status(),
-            try_parse_error_message(&response.text().await.unwrap_or_default()),
-        )))
+        let body = response.text().await.unwrap_or_default();
+        if status == StatusCode::UNAUTHORIZED {
+            let failed = classify_refresh_token_failure(&body);
+            Err(RefreshTokenError::Permanent(failed))
+        } else {
+            let message = try_parse_error_message(&body);
+            Err(RefreshTokenError::Transient(std::io::Error::other(
+                format!("Failed to refresh token: {status}: {message}"),
+            )))
+        }
    }
 }

+fn classify_refresh_token_failure(body: &str) -> RefreshTokenFailedError {
+    let code = extract_refresh_token_error_code(body);
+
+    let normalized_code = code.as_deref().map(str::to_ascii_lowercase);
+    let reason = match normalized_code.as_deref() {
+        Some("refresh_token_expired") => RefreshTokenFailedReason::Expired,
+        Some("refresh_token_reused") => RefreshTokenFailedReason::Exhausted,
+        Some("refresh_token_invalidated") => RefreshTokenFailedReason::Revoked,
+        _ => RefreshTokenFailedReason::Other,
+    };
+
+    if reason == RefreshTokenFailedReason::Other {
+        tracing::warn!(
+            backend_code = normalized_code.as_deref(),
+            backend_body = body,
+            "Encountered unknown 401 response while refreshing token"
+        );
+    }
+
+    let message = match reason {
+        RefreshTokenFailedReason::Expired => REFRESH_TOKEN_EXPIRED_MESSAGE.to_string(),
+        RefreshTokenFailedReason::Exhausted => REFRESH_TOKEN_REUSED_MESSAGE.to_string(),
+        RefreshTokenFailedReason::Revoked => REFRESH_TOKEN_INVALIDATED_MESSAGE.to_string(),
+        RefreshTokenFailedReason::Other => REFRESH_TOKEN_UNKNOWN_MESSAGE.to_string(),
+    };
+
+    RefreshTokenFailedError::new(reason, message)
+}
+
+fn extract_refresh_token_error_code(body: &str) -> Option<String> {
+    if body.trim().is_empty() {
+        return None;
+    }
+
+    let Value::Object(map) = serde_json::from_str::<Value>(body).ok()? else {
+        return None;
+    };
+
+    if let Some(error_value) = map.get("error") {
+        match error_value {
+            Value::Object(obj) => {
+                if let Some(code) = obj.get("code").and_then(Value::as_str) {
+                    return Some(code.to_string());
+                }
+            }
+            Value::String(code) => {
+                return Some(code.to_string());
+            }
+            _ => {}
+        }
+    }
+
+    map.get("code").and_then(Value::as_str).map(str::to_string)
+}
+
 #[derive(Serialize)]
 struct RefreshRequest {
    client_id: &'static str,
@@ -475,6 +587,11 @@ struct RefreshResponse {
 // Shared constant for token refresh (client id used for oauth token refresh flow)
 pub const CLIENT_ID: &str = "app_EMoamEEZ73f0CkXaXp7hrann";

+fn refresh_token_endpoint() -> String {
+    std::env::var(REFRESH_TOKEN_URL_OVERRIDE_ENV_VAR)
+        .unwrap_or_else(|_| REFRESH_TOKEN_URL.to_string())
+}
+
 use std::sync::RwLock;

 /// Internal cached auth state.
@@ -965,7 +1082,9 @@ impl AuthManager {

    /// Attempt to refresh the current auth token (if any). On success, reload
    /// the auth state from disk so other components observe refreshed token.
-    pub async fn refresh_token(&self) -> std::io::Result<Option<String>> {
+    /// If the token refresh fails, the error is returned to the caller; a
+    /// permanent (non-transient) failure does not log the user out.
+    pub async fn refresh_token(&self) -> Result<Option<String>, RefreshTokenError> {
        let auth = match self.auth() {
            Some(a) => a,
            None => return Ok(None),
--- a/codex-rs/core/src/client.rs
+++ b/codex-rs/core/src/client.rs
@@ -31,6 +31,7 @@ use tracing::warn;

 use crate::AuthManager;
 use crate::auth::CodexAuth;
+use crate::auth::RefreshTokenError;
 use crate::chat_completions::AggregateStreamExt;
 use crate::chat_completions::stream_chat_completions;
 use crate::client_common::Prompt;
@@ -389,12 +390,17 @@ impl ModelClient {
                    && let Some(manager) = auth_manager.as_ref()
                    && let Some(auth) = auth.as_ref()
                    && auth.mode == AuthMode::ChatGPT
+                    && let Err(err) = manager.refresh_token().await
                {
-                    manager.refresh_token().await.map_err(|err| {
-                        StreamAttemptError::Fatal(CodexErr::Fatal(format!(
-                            "Failed to refresh ChatGPT credentials: {err}"
-                        )))
-                    })?;
+                    let stream_error = match err {
+                        RefreshTokenError::Permanent(failed) => {
+                            StreamAttemptError::Fatal(CodexErr::RefreshTokenFailed(failed))
+                        }
+                        RefreshTokenError::Transient(other) => {
+                            StreamAttemptError::RetryableTransportError(CodexErr::Io(other))
+                        }
+                    };
+                    return Err(stream_error);
                }

                // The OpenAI Responses endpoint returns structured JSON bodies even for 4xx/5xx
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -1928,6 +1928,7 @@ async fn run_turn(
                return Err(CodexErr::UsageLimitReached(e));
            }
            Err(CodexErr::UsageNotIncluded) => return Err(CodexErr::UsageNotIncluded),
+            Err(e @ CodexErr::RefreshTokenFailed(_)) => return Err(e),
            Err(e) => {
                // Use the configured provider-specific stream retry budget.
                let max_retries = turn_context.client.get_provider().stream_max_retries();
@@ -1946,7 +1947,7 @@ async fn run_turn(
                    // at a seemingly frozen screen.
                    sess.notify_stream_error(
                        &turn_context,
-                        format!("Re-connecting... {retries}/{max_retries}"),
+                        format!("Reconnecting... {retries}/{max_retries}"),
                    )
                    .await;

--- a/codex-rs/core/src/error.rs
+++ b/codex-rs/core/src/error.rs
@@ -135,6 +135,9 @@ pub enum CodexErr {
    #[error("unsupported operation: {0}")]
    UnsupportedOperation(String),

+    #[error("{0}")]
+    RefreshTokenFailed(RefreshTokenFailedError),
+
    #[error("Fatal error: {0}")]
    Fatal(String),

@@ -201,6 +204,30 @@ impl std::fmt::Display for ResponseStreamFailed {
    }
 }

+#[derive(Debug, Clone, PartialEq, Eq, Error)]
+#[error("{message}")]
+pub struct RefreshTokenFailedError {
+    pub reason: RefreshTokenFailedReason,
+    pub message: String,
+}
+
+impl RefreshTokenFailedError {
+    pub fn new(reason: RefreshTokenFailedReason, message: impl Into<String>) -> Self {
+        Self {
+            reason,
+            message: message.into(),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RefreshTokenFailedReason {
+    Expired,
+    Exhausted,
+    Revoked,
+    Other,
+}
+
 #[derive(Debug)]
 pub struct UnexpectedResponseError {
    pub status: StatusCode,