Handle cancelling/aborting while processing a turn (#5543)

Currently we collect all all turn items in a vector, then we add it to the history on success. This result in losing those items on errors including aborting `ctrl+c`. This PR: - Adds the ability for the tool call to handle cancellation - bubble the turn items up to where we are recording this info Admittedly, this logic is an ad-hoc logic that doesn't handle a lot of error edge cases. The right thing to do is recording to the history on the spot as `items`/`tool calls output` come. However, this isn't possible because of having different `task_kind` that has different `conversation_histories`. The `try_run_turn` has no idea what thread are we using. We cannot also pass an `arc` to the `conversation_histories` because it's a private element of `state`. That's said, `abort` is the most common case and we should cover it until we remove `task kind`
2025-10-23 08:47:10 -07:00
parent 3ab6028e80
commit f59978ed3d
7 changed files with 339 additions and 128 deletions
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -10,6 +10,7 @@ use crate::function_tool::FunctionCallError;
 use crate::mcp::auth::McpAuthStatusEntry;
 use crate::parse_command::parse_command;
 use crate::parse_turn_item;
+use crate::response_processing::process_items;
 use crate::review_format::format_review_findings_block;
 use crate::terminal;
 use crate::user_notification::UserNotifier;
@@ -855,7 +856,7 @@ impl Session {

    /// Records input items: always append to conversation history and
    /// persist these response items to rollout.
-    async fn record_conversation_items(&self, items: &[ResponseItem]) {
+    pub(crate) async fn record_conversation_items(&self, items: &[ResponseItem]) {
        self.record_into_history(items).await;
        self.persist_rollout_response_items(items).await;
    }
@@ -1608,109 +1609,13 @@ pub(crate) async fn run_task(
                let token_limit_reached = total_usage_tokens
                    .map(|tokens| tokens >= limit)
                    .unwrap_or(false);
-                let mut items_to_record_in_conversation_history = Vec::<ResponseItem>::new();
-                let mut responses = Vec::<ResponseInputItem>::new();
-                for processed_response_item in processed_items {
-                    let ProcessedResponseItem { item, response } = processed_response_item;
-                    match (&item, &response) {
-                        (ResponseItem::Message { role, .. }, None) if role == "assistant" => {
-                            // If the model returned a message, we need to record it.
-                            items_to_record_in_conversation_history.push(item);
-                        }
-                        (
-                            ResponseItem::LocalShellCall { .. },
-                            Some(ResponseInputItem::FunctionCallOutput { call_id, output }),
-                        ) => {
-                            items_to_record_in_conversation_history.push(item);
-                            items_to_record_in_conversation_history.push(
-                                ResponseItem::FunctionCallOutput {
-                                    call_id: call_id.clone(),
-                                    output: output.clone(),
-                                },
-                            );
-                        }
-                        (
-                            ResponseItem::FunctionCall { .. },
-                            Some(ResponseInputItem::FunctionCallOutput { call_id, output }),
-                        ) => {
-                            items_to_record_in_conversation_history.push(item);
-                            items_to_record_in_conversation_history.push(
-                                ResponseItem::FunctionCallOutput {
-                                    call_id: call_id.clone(),
-                                    output: output.clone(),
-                                },
-                            );
-                        }
-                        (
-                            ResponseItem::CustomToolCall { .. },
-                            Some(ResponseInputItem::CustomToolCallOutput { call_id, output }),
-                        ) => {
-                            items_to_record_in_conversation_history.push(item);
-                            items_to_record_in_conversation_history.push(
-                                ResponseItem::CustomToolCallOutput {
-                                    call_id: call_id.clone(),
-                                    output: output.clone(),
-                                },
-                            );
-                        }
-                        (
-                            ResponseItem::FunctionCall { .. },
-                            Some(ResponseInputItem::McpToolCallOutput { call_id, result }),
-                        ) => {
-                            items_to_record_in_conversation_history.push(item);
-                            let output = match result {
-                                Ok(call_tool_result) => {
-                                    convert_call_tool_result_to_function_call_output_payload(
-                                        call_tool_result,
-                                    )
-                                }
-                                Err(err) => FunctionCallOutputPayload {
-                                    content: err.clone(),
-                                    success: Some(false),
-                                },
-                            };
-                            items_to_record_in_conversation_history.push(
-                                ResponseItem::FunctionCallOutput {
-                                    call_id: call_id.clone(),
-                                    output,
-                                },
-                            );
-                        }
-                        (
-                            ResponseItem::Reasoning {
-                                id,
-                                summary,
-                                content,
-                                encrypted_content,
-                            },
-                            None,
-                        ) => {
-                            items_to_record_in_conversation_history.push(ResponseItem::Reasoning {
-                                id: id.clone(),
-                                summary: summary.clone(),
-                                content: content.clone(),
-                                encrypted_content: encrypted_content.clone(),
-                            });
-                        }
-                        _ => {
-                            warn!("Unexpected response item: {item:?} with response: {response:?}");
-                        }
-                    };
-                    if let Some(response) = response {
-                        responses.push(response);
-                    }
-                }
-
-                // Only attempt to take the lock if there is something to record.
-                if !items_to_record_in_conversation_history.is_empty() {
-                    if is_review_mode {
-                        review_thread_history
-                            .record_items(items_to_record_in_conversation_history.iter());
-                    } else {
-                        sess.record_conversation_items(&items_to_record_in_conversation_history)
-                            .await;
-                    }
-                }
+                let (responses, items_to_record_in_conversation_history) = process_items(
+                    processed_items,
+                    is_review_mode,
+                    &mut review_thread_history,
+                    &sess,
+                )
+                .await;

                if token_limit_reached {
                    if auto_compact_recently_attempted {
@@ -1749,7 +1654,16 @@ pub(crate) async fn run_task(
                }
                continue;
            }
-            Err(CodexErr::TurnAborted) => {
+            Err(CodexErr::TurnAborted {
+                dangling_artifacts: processed_items,
+            }) => {
+                let _ = process_items(
+                    processed_items,
+                    is_review_mode,
+                    &mut review_thread_history,
+                    &sess,
+                )
+                .await;
                // Aborted turn is reported via a different event.
                break;
            }
@@ -1850,7 +1764,13 @@ async fn run_turn(
        .await
        {
            Ok(output) => return Ok(output),
-            Err(CodexErr::TurnAborted) => return Err(CodexErr::TurnAborted),
+            Err(CodexErr::TurnAborted {
+                dangling_artifacts: processed_items,
+            }) => {
+                return Err(CodexErr::TurnAborted {
+                    dangling_artifacts: processed_items,
+                });
+            }
            Err(CodexErr::Interrupted) => return Err(CodexErr::Interrupted),
            Err(CodexErr::EnvVar(var)) => return Err(CodexErr::EnvVar(var)),
            Err(e @ CodexErr::Fatal(_)) => return Err(e),
@@ -1903,9 +1823,9 @@ async fn run_turn(
 /// "handled" such that it produces a `ResponseInputItem` that needs to be
 /// sent back to the model on the next turn.
 #[derive(Debug)]
-pub(crate) struct ProcessedResponseItem {
-    pub(crate) item: ResponseItem,
-    pub(crate) response: Option<ResponseInputItem>,
+pub struct ProcessedResponseItem {
+    pub item: ResponseItem,
+    pub response: Option<ResponseInputItem>,
 }

 #[derive(Debug)]
@@ -1954,7 +1874,15 @@ async fn try_run_turn(
        // Poll the next item from the model stream. We must inspect *both* Ok and Err
        // cases so that transient stream failures (e.g., dropped SSE connection before
        // `response.completed`) bubble up and trigger the caller's retry logic.
-        let event = stream.next().or_cancel(&cancellation_token).await?;
+        let event = match stream.next().or_cancel(&cancellation_token).await {
+            Ok(event) => event,
+            Err(codex_async_utils::CancelErr::Cancelled) => {
+                let processed_items = output.try_collect().await?;
+                return Err(CodexErr::TurnAborted {
+                    dangling_artifacts: processed_items,
+                });
+            }
+        };

        let event = match event {
            Some(res) => res?,
@@ -1978,7 +1906,8 @@ async fn try_run_turn(
                        let payload_preview = call.payload.log_payload().into_owned();
                        tracing::info!("ToolCall: {} {}", call.tool_name, payload_preview);

-                        let response = tool_runtime.handle_tool_call(call);
+                        let response =
+                            tool_runtime.handle_tool_call(call, cancellation_token.child_token());

                        output.push_back(
                            async move {
@@ -2060,12 +1989,7 @@ async fn try_run_turn(
            } => {
                sess.update_token_usage_info(turn_context.as_ref(), token_usage.as_ref())
                    .await;
-
-                let processed_items = output
-                    .try_collect()
-                    .or_cancel(&cancellation_token)
-                    .await??;
-
+                let processed_items = output.try_collect().await?;
                let unified_diff = {
                    let mut tracker = turn_diff_tracker.lock().await;
                    tracker.get_unified_diff()
@@ -2169,7 +2093,7 @@ pub(super) fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) -
        }
    })
 }
-fn convert_call_tool_result_to_function_call_output_payload(
+pub(crate) fn convert_call_tool_result_to_function_call_output_payload(
    call_tool_result: &CallToolResult,
 ) -> FunctionCallOutputPayload {
    let CallToolResult {
--- a/codex-rs/core/src/error.rs
+++ b/codex-rs/core/src/error.rs
@@ -1,3 +1,4 @@
+use crate::codex::ProcessedResponseItem;
 use crate::exec::ExecToolCallOutput;
 use crate::token_data::KnownPlan;
 use crate::token_data::PlanType;
@@ -53,8 +54,11 @@ pub enum SandboxErr {

 #[derive(Error, Debug)]
 pub enum CodexErr {
+    // todo(aibrahim): git rid of this error carrying the dangling artifacts
    #[error("turn aborted")]
-    TurnAborted,
+    TurnAborted {
+        dangling_artifacts: Vec<ProcessedResponseItem>,
+    },

    /// Returned by ResponsesClient when the SSE stream disconnects or errors out **after** the HTTP
    /// handshake has succeeded but **before** it finished emitting `response.completed`.
@@ -158,7 +162,9 @@ pub enum CodexErr {

 impl From<CancelErr> for CodexErr {
    fn from(_: CancelErr) -> Self {
-        CodexErr::TurnAborted
+        CodexErr::TurnAborted {
+            dangling_artifacts: Vec::new(),
+        }
    }
 }

--- a/codex-rs/core/src/lib.rs
+++ b/codex-rs/core/src/lib.rs
@@ -36,6 +36,7 @@ mod mcp_tool_call;
 mod message_history;
 mod model_provider_info;
 pub mod parse_command;
+mod response_processing;
 pub mod sandboxing;
 pub mod token_data;
 mod truncate;
--- a/codex-rs/core/src/response_processing.rs
+++ b/codex-rs/core/src/response_processing.rs
@@ -0,0 +1,112 @@
+use crate::codex::Session;
+use crate::conversation_history::ConversationHistory;
+use codex_protocol::models::FunctionCallOutputPayload;
+use codex_protocol::models::ResponseInputItem;
+use codex_protocol::models::ResponseItem;
+use tracing::warn;
+
+/// Process streamed `ResponseItem`s from the model into the pair of:
+/// - items we should record in conversation history; and
+/// - `ResponseInputItem`s to send back to the model on the next turn.
+pub(crate) async fn process_items(
+    processed_items: Vec<crate::codex::ProcessedResponseItem>,
+    is_review_mode: bool,
+    review_thread_history: &mut ConversationHistory,
+    sess: &Session,
+) -> (Vec<ResponseInputItem>, Vec<ResponseItem>) {
+    let mut items_to_record_in_conversation_history = Vec::<ResponseItem>::new();
+    let mut responses = Vec::<ResponseInputItem>::new();
+    for processed_response_item in processed_items {
+        let crate::codex::ProcessedResponseItem { item, response } = processed_response_item;
+        match (&item, &response) {
+            (ResponseItem::Message { role, .. }, None) if role == "assistant" => {
+                // If the model returned a message, we need to record it.
+                items_to_record_in_conversation_history.push(item);
+            }
+            (
+                ResponseItem::LocalShellCall { .. },
+                Some(ResponseInputItem::FunctionCallOutput { call_id, output }),
+            ) => {
+                items_to_record_in_conversation_history.push(item);
+                items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
+                    call_id: call_id.clone(),
+                    output: output.clone(),
+                });
+            }
+            (
+                ResponseItem::FunctionCall { .. },
+                Some(ResponseInputItem::FunctionCallOutput { call_id, output }),
+            ) => {
+                items_to_record_in_conversation_history.push(item);
+                items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
+                    call_id: call_id.clone(),
+                    output: output.clone(),
+                });
+            }
+            (
+                ResponseItem::CustomToolCall { .. },
+                Some(ResponseInputItem::CustomToolCallOutput { call_id, output }),
+            ) => {
+                items_to_record_in_conversation_history.push(item);
+                items_to_record_in_conversation_history.push(ResponseItem::CustomToolCallOutput {
+                    call_id: call_id.clone(),
+                    output: output.clone(),
+                });
+            }
+            (
+                ResponseItem::FunctionCall { .. },
+                Some(ResponseInputItem::McpToolCallOutput { call_id, result }),
+            ) => {
+                items_to_record_in_conversation_history.push(item);
+                let output = match result {
+                    Ok(call_tool_result) => {
+                        crate::codex::convert_call_tool_result_to_function_call_output_payload(
+                            call_tool_result,
+                        )
+                    }
+                    Err(err) => FunctionCallOutputPayload {
+                        content: err.clone(),
+                        success: Some(false),
+                    },
+                };
+                items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput {
+                    call_id: call_id.clone(),
+                    output,
+                });
+            }
+            (
+                ResponseItem::Reasoning {
+                    id,
+                    summary,
+                    content,
+                    encrypted_content,
+                },
+                None,
+            ) => {
+                items_to_record_in_conversation_history.push(ResponseItem::Reasoning {
+                    id: id.clone(),
+                    summary: summary.clone(),
+                    content: content.clone(),
+                    encrypted_content: encrypted_content.clone(),
+                });
+            }
+            _ => {
+                warn!("Unexpected response item: {item:?} with response: {response:?}");
+            }
+        };
+        if let Some(response) = response {
+            responses.push(response);
+        }
+    }
+
+    // Only attempt to take the lock if there is something to record.
+    if !items_to_record_in_conversation_history.is_empty() {
+        if is_review_mode {
+            review_thread_history.record_items(items_to_record_in_conversation_history.iter());
+        } else {
+            sess.record_conversation_items(&items_to_record_in_conversation_history)
+                .await;
+        }
+    }
+    (responses, items_to_record_in_conversation_history)
+}
--- a/codex-rs/core/src/tools/parallel.rs
+++ b/codex-rs/core/src/tools/parallel.rs
@@ -2,6 +2,7 @@ use std::sync::Arc;

 use tokio::sync::RwLock;
 use tokio_util::either::Either;
+use tokio_util::sync::CancellationToken;
 use tokio_util::task::AbortOnDropHandle;

 use crate::codex::Session;
@@ -9,8 +10,10 @@ use crate::codex::TurnContext;
 use crate::error::CodexErr;
 use crate::function_tool::FunctionCallError;
 use crate::tools::context::SharedTurnDiffTracker;
+use crate::tools::context::ToolPayload;
 use crate::tools::router::ToolCall;
 use crate::tools::router::ToolRouter;
+use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ResponseInputItem;

 pub(crate) struct ToolCallRuntime {
@@ -40,6 +43,7 @@ impl ToolCallRuntime {
    pub(crate) fn handle_tool_call(
        &self,
        call: ToolCall,
+        cancellation_token: CancellationToken,
    ) -> impl std::future::Future<Output = Result<ResponseInputItem, CodexErr>> {
        let supports_parallel = self.router.tool_supports_parallel(&call.tool_name);

@@ -48,18 +52,24 @@ impl ToolCallRuntime {
        let turn = Arc::clone(&self.turn_context);
        let tracker = Arc::clone(&self.tracker);
        let lock = Arc::clone(&self.parallel_execution);
+        let aborted_response = Self::aborted_response(&call);

        let handle: AbortOnDropHandle<Result<ResponseInputItem, FunctionCallError>> =
            AbortOnDropHandle::new(tokio::spawn(async move {
-                let _guard = if supports_parallel {
-                    Either::Left(lock.read().await)
-                } else {
-                    Either::Right(lock.write().await)
-                };
+                tokio::select! {
+                    _ = cancellation_token.cancelled() => Ok(aborted_response),
+                    res = async {
+                        let _guard = if supports_parallel {
+                            Either::Left(lock.read().await)
+                        } else {
+                            Either::Right(lock.write().await)
+                        };

-                router
-                    .dispatch_tool_call(session, turn, tracker, call)
-                    .await
+                        router
+                            .dispatch_tool_call(session, turn, tracker, call)
+                            .await
+                    } => res,
+                }
            }));

        async move {
@@ -74,3 +84,25 @@ impl ToolCallRuntime {
        }
    }
 }
+
+impl ToolCallRuntime {
+    fn aborted_response(call: &ToolCall) -> ResponseInputItem {
+        match &call.payload {
+            ToolPayload::Custom { .. } => ResponseInputItem::CustomToolCallOutput {
+                call_id: call.call_id.clone(),
+                output: "aborted".to_string(),
+            },
+            ToolPayload::Mcp { .. } => ResponseInputItem::McpToolCallOutput {
+                call_id: call.call_id.clone(),
+                result: Err("aborted".to_string()),
+            },
+            _ => ResponseInputItem::FunctionCallOutput {
+                call_id: call.call_id.clone(),
+                output: FunctionCallOutputPayload {
+                    content: "aborted".to_string(),
+                    success: None,
+                },
+            },
+        }
+    }
+}
--- a/codex-rs/core/tests/common/responses.rs
+++ b/codex-rs/core/tests/common/responses.rs
@@ -35,6 +35,22 @@ impl ResponseMock {
    pub fn requests(&self) -> Vec<ResponsesRequest> {
        self.requests.lock().unwrap().clone()
    }
+
+    /// Returns true if any captured request contains a `function_call` with the
+    /// provided `call_id`.
+    pub fn saw_function_call(&self, call_id: &str) -> bool {
+        self.requests()
+            .iter()
+            .any(|req| req.has_function_call(call_id))
+    }
+
+    /// Returns the `output` string for a matching `function_call_output` with
+    /// the provided `call_id`, searching across all captured requests.
+    pub fn function_call_output_text(&self, call_id: &str) -> Option<String> {
+        self.requests()
+            .iter()
+            .find_map(|req| req.function_call_output_text(call_id))
+    }
 }

 #[derive(Debug, Clone)]
@@ -70,6 +86,28 @@ impl ResponsesRequest {
            .unwrap_or_else(|| panic!("function call output {call_id} item not found in request"))
    }

+    /// Returns true if this request's `input` contains a `function_call` with
+    /// the specified `call_id`.
+    pub fn has_function_call(&self, call_id: &str) -> bool {
+        self.input().iter().any(|item| {
+            item.get("type").and_then(Value::as_str) == Some("function_call")
+                && item.get("call_id").and_then(Value::as_str) == Some(call_id)
+        })
+    }
+
+    /// If present, returns the `output` string of the `function_call_output`
+    /// entry matching `call_id` in this request's `input`.
+    pub fn function_call_output_text(&self, call_id: &str) -> Option<String> {
+        let binding = self.input();
+        let item = binding.iter().find(|item| {
+            item.get("type").and_then(Value::as_str) == Some("function_call_output")
+                && item.get("call_id").and_then(Value::as_str) == Some(call_id)
+        })?;
+        item.get("output")
+            .and_then(Value::as_str)
+            .map(str::to_string)
+    }
+
    pub fn header(&self, name: &str) -> Option<String> {
        self.0
            .headers
--- a/codex-rs/core/tests/suite/abort_tasks.rs
+++ b/codex-rs/core/tests/suite/abort_tasks.rs
@@ -1,3 +1,4 @@
+use std::sync::Arc;
 use std::time::Duration;

 use codex_core::protocol::EventMsg;
@@ -5,7 +6,9 @@ use codex_core::protocol::Op;
 use codex_protocol::user_input::UserInput;
 use core_test_support::responses::ev_completed;
 use core_test_support::responses::ev_function_call;
+use core_test_support::responses::ev_response_created;
 use core_test_support::responses::mount_sse_once;
+use core_test_support::responses::mount_sse_sequence;
 use core_test_support::responses::sse;
 use core_test_support::responses::start_mock_server;
 use core_test_support::test_codex::test_codex;
@@ -67,3 +70,98 @@ async fn interrupt_long_running_tool_emits_turn_aborted() {
    )
    .await;
 }
+
+/// After an interrupt we expect the next request to the model to include both
+/// the original tool call and an `"aborted"` `function_call_output`. This test
+/// exercises the follow-up flow: it sends another user turn, inspects the mock
+/// responses server, and ensures the model receives the synthesized abort.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn interrupt_tool_records_history_entries() {
+    let command = vec![
+        "bash".to_string(),
+        "-lc".to_string(),
+        "sleep 60".to_string(),
+    ];
+    let call_id = "call-history";
+
+    let args = json!({
+        "command": command,
+        "timeout_ms": 60_000
+    })
+    .to_string();
+    let first_body = sse(vec![
+        ev_response_created("resp-history"),
+        ev_function_call(call_id, "shell", &args),
+        ev_completed("resp-history"),
+    ]);
+    let follow_up_body = sse(vec![
+        ev_response_created("resp-followup"),
+        ev_completed("resp-followup"),
+    ]);
+
+    let server = start_mock_server().await;
+    let response_mock = mount_sse_sequence(&server, vec![first_body, follow_up_body]).await;
+
+    let fixture = test_codex().build(&server).await.unwrap();
+    let codex = Arc::clone(&fixture.codex);
+
+    let wait_timeout = Duration::from_millis(100);
+
+    codex
+        .submit(Op::UserInput {
+            items: vec![UserInput::Text {
+                text: "start history recording".into(),
+            }],
+        })
+        .await
+        .unwrap();
+
+    wait_for_event_with_timeout(
+        &codex,
+        |ev| matches!(ev, EventMsg::ExecCommandBegin(_)),
+        wait_timeout,
+    )
+    .await;
+
+    codex.submit(Op::Interrupt).await.unwrap();
+
+    wait_for_event_with_timeout(
+        &codex,
+        |ev| matches!(ev, EventMsg::TurnAborted(_)),
+        wait_timeout,
+    )
+    .await;
+
+    codex
+        .submit(Op::UserInput {
+            items: vec![UserInput::Text {
+                text: "follow up".into(),
+            }],
+        })
+        .await
+        .unwrap();
+
+    wait_for_event_with_timeout(
+        &codex,
+        |ev| matches!(ev, EventMsg::TaskComplete(_)),
+        wait_timeout,
+    )
+    .await;
+
+    let requests = response_mock.requests();
+    assert!(
+        requests.len() == 2,
+        "expected two calls to the responses API, got {}",
+        requests.len()
+    );
+
+    assert!(
+        response_mock.saw_function_call(call_id),
+        "function call not recorded in responses payload"
+    );
+    assert_eq!(
+        response_mock.function_call_output_text(call_id).as_deref(),
+        Some("aborted"),
+        "aborted function call output not recorded in responses payload"
+    );
+}