feat: Freeform apply_patch with simple shell output (#4718)

## Summary This PR is an alternative approach to #4711, but instead of changing our storage, parses out shell calls in the client and reserializes them on the fly before we send them out as part of the request. What this changes: 1. Adds additional serialization logic when the ApplyPatchToolType::Freeform is in use. 2. Adds a --custom-apply-patch flag to enable this setting on a session-by-session basis. This change is delicate, but is not meant to be permanent. It is meant to be the first step in a migration: 1. (This PR) Add in-flight serialization with config 2. Update model_family default 3. Update serialization logic to store turn outputs in a structured format, with logic to serialize based on model_family setting. 4. Remove this rewrite in-flight logic. ## Test Plan - [x] Additional unit tests added - [x] Integration tests added - [x] Tested locally
2025-10-04 19:16:36 -07:00
parent 90ef94d3b3
commit 4764fc1ee7
11 changed files with 420 additions and 15 deletions
--- a/codex-rs/core/src/client_common.rs
+++ b/codex-rs/core/src/client_common.rs
@@ -9,9 +9,11 @@ use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig;
 use codex_protocol::config_types::Verbosity as VerbosityConfig;
 use codex_protocol::models::ResponseItem;
 use futures::Stream;
+use serde::Deserialize;
 use serde::Serialize;
 use serde_json::Value;
 use std::borrow::Cow;
+use std::collections::HashSet;
 use std::ops::Deref;
 use std::pin::Pin;
 use std::task::Context;
@@ -64,10 +66,114 @@ impl Prompt {
    }

    pub(crate) fn get_formatted_input(&self) -> Vec<ResponseItem> {
-        self.input.clone()
+        let mut input = self.input.clone();
+
+        // when using the *Freeform* apply_patch tool specifically, tool outputs
+        // should be structured text, not json. Do NOT reserialize when using
+        // the Function tool - note that this differs from the check above for
+        // instructions. We declare the result as a named variable for clarity.
+        let is_freeform_apply_patch_tool_present = self.tools.iter().any(|tool| match tool {
+            ToolSpec::Freeform(f) => f.name == "apply_patch",
+            _ => false,
+        });
+        if is_freeform_apply_patch_tool_present {
+            reserialize_shell_outputs(&mut input);
+        }
+
+        input
    }
 }

+fn reserialize_shell_outputs(items: &mut [ResponseItem]) {
+    let mut shell_call_ids: HashSet<String> = HashSet::new();
+
+    items.iter_mut().for_each(|item| match item {
+        ResponseItem::LocalShellCall { call_id, id, .. } => {
+            if let Some(identifier) = call_id.clone().or_else(|| id.clone()) {
+                shell_call_ids.insert(identifier);
+            }
+        }
+        ResponseItem::CustomToolCall {
+            id: _,
+            status: _,
+            call_id,
+            name,
+            input: _,
+        } => {
+            if name == "apply_patch" {
+                shell_call_ids.insert(call_id.clone());
+            }
+        }
+        ResponseItem::CustomToolCallOutput { call_id, output } => {
+            if shell_call_ids.remove(call_id)
+                && let Some(structured) = parse_structured_shell_output(output)
+            {
+                *output = structured
+            }
+        }
+        ResponseItem::FunctionCall { name, call_id, .. }
+            if is_shell_tool_name(name) || name == "apply_patch" =>
+        {
+            shell_call_ids.insert(call_id.clone());
+        }
+        ResponseItem::FunctionCallOutput { call_id, output } => {
+            if shell_call_ids.remove(call_id)
+                && let Some(structured) = parse_structured_shell_output(&output.content)
+            {
+                output.content = structured
+            }
+        }
+        _ => {}
+    })
+}
+
+fn is_shell_tool_name(name: &str) -> bool {
+    matches!(name, "shell" | "container.exec")
+}
+
+#[derive(Deserialize)]
+struct ExecOutputJson {
+    output: String,
+    metadata: ExecOutputMetadataJson,
+}
+
+#[derive(Deserialize)]
+struct ExecOutputMetadataJson {
+    exit_code: i32,
+    duration_seconds: f32,
+}
+
+fn parse_structured_shell_output(raw: &str) -> Option<String> {
+    let parsed: ExecOutputJson = serde_json::from_str(raw).ok()?;
+    Some(build_structured_output(&parsed))
+}
+
+fn build_structured_output(parsed: &ExecOutputJson) -> String {
+    let mut sections = Vec::new();
+    sections.push(format!("Exit code: {}", parsed.metadata.exit_code));
+    sections.push(format!(
+        "Wall time: {} seconds",
+        parsed.metadata.duration_seconds
+    ));
+
+    if let Some(total_lines) = extract_total_output_lines(&parsed.output) {
+        sections.push(format!("Total output lines: {total_lines}"));
+    }
+
+    sections.push("Output:".to_string());
+    sections.push(parsed.output.clone());
+
+    sections.join("\n")
+}
+
+fn extract_total_output_lines(output: &str) -> Option<u32> {
+    let marker_start = output.find("[... omitted ")?;
+    let marker = &output[marker_start..];
+    let (_, after_of) = marker.split_once(" of ")?;
+    let (total_segment, _) = after_of.split_once(' ')?;
+    total_segment.parse::<u32>().ok()
+}
+
 #[derive(Debug)]
 pub enum ResponseEvent {
    Created,
--- a/codex-rs/core/src/config.rs
+++ b/codex-rs/core/src/config.rs
@@ -750,6 +750,7 @@ pub struct ConfigToml {
    pub experimental_use_exec_command_tool: Option<bool>,
    pub experimental_use_unified_exec_tool: Option<bool>,
    pub experimental_use_rmcp_client: Option<bool>,
+    pub experimental_use_freeform_apply_patch: Option<bool>,

    pub projects: Option<HashMap<String, ProjectConfig>>,

@@ -1111,7 +1112,9 @@ impl Config {
                .or(cfg.chatgpt_base_url)
                .unwrap_or("https://chatgpt.com/backend-api/".to_string()),
            include_plan_tool: include_plan_tool.unwrap_or(false),
-            include_apply_patch_tool: include_apply_patch_tool.unwrap_or(false),
+            include_apply_patch_tool: include_apply_patch_tool
+                .or(cfg.experimental_use_freeform_apply_patch)
+                .unwrap_or(false),
            tools_web_search_request,
            use_experimental_streamable_shell_tool: cfg
                .experimental_use_exec_command_tool
--- a/codex-rs/core/src/model_family.rs
+++ b/codex-rs/core/src/model_family.rs
@@ -110,6 +110,7 @@ pub fn find_family_for_model(slug: &str) -> Option<ModelFamily> {
            reasoning_summary_format: ReasoningSummaryFormat::Experimental,
            base_instructions: GPT_5_CODEX_INSTRUCTIONS.to_string(),
            experimental_supported_tools: vec!["read_file".to_string()],
+            apply_patch_tool_type: Some(ApplyPatchToolType::Freeform),
        )
    } else if slug.starts_with("gpt-5") {
        model_family!(
--- a/codex-rs/core/src/tools/handlers/apply_patch.rs
+++ b/codex-rs/core/src/tools/handlers/apply_patch.rs
@@ -106,7 +106,7 @@ pub enum ApplyPatchToolType {
 pub(crate) fn create_apply_patch_freeform_tool() -> ToolSpec {
    ToolSpec::Freeform(FreeformTool {
        name: "apply_patch".to_string(),
-        description: "Use the `apply_patch` tool to edit files".to_string(),
+        description: "Use the `apply_patch` tool to edit files. This is a FREEFORM tool, so do not wrap the patch in JSON.".to_string(),
        format: FreeformToolFormat {
            r#type: "grammar".to_string(),
            syntax: "lark".to_string(),
--- a/codex-rs/core/src/tools/spec.rs
+++ b/codex-rs/core/src/tools/spec.rs
@@ -695,7 +695,7 @@ mod tests {
        });
        let (tools, _) = build_specs(&config, Some(HashMap::new())).build();

-        assert_eq_tool_names(&tools, &["unified_exec", "read_file"]);
+        assert_eq_tool_names(&tools, &["unified_exec", "apply_patch", "read_file"]);
    }

    #[test]
@@ -921,6 +921,7 @@ mod tests {
            &tools,
            &[
                "unified_exec",
+                "apply_patch",
                "read_file",
                "web_search",
                "view_image",
@@ -929,7 +930,7 @@ mod tests {
        );

        assert_eq!(
-            tools[4],
+            tools[5],
            ToolSpec::Function(ResponsesApiTool {
                name: "dash/search".to_string(),
                parameters: JsonSchema::Object {
@@ -988,6 +989,7 @@ mod tests {
            &tools,
            &[
                "unified_exec",
+                "apply_patch",
                "read_file",
                "web_search",
                "view_image",
@@ -995,7 +997,7 @@ mod tests {
            ],
        );
        assert_eq!(
-            tools[4],
+            tools[5],
            ToolSpec::Function(ResponsesApiTool {
                name: "dash/paginate".to_string(),
                parameters: JsonSchema::Object {
@@ -1019,7 +1021,7 @@ mod tests {
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            include_plan_tool: false,
-            include_apply_patch_tool: false,
+            include_apply_patch_tool: true,
            include_web_search_request: true,
            use_streamable_shell_tool: false,
            include_view_image_tool: true,
@@ -1052,6 +1054,7 @@ mod tests {
            &tools,
            &[
                "unified_exec",
+                "apply_patch",
                "read_file",
                "web_search",
                "view_image",
@@ -1059,7 +1062,7 @@ mod tests {
            ],
        );
        assert_eq!(
-            tools[4],
+            tools[5],
            ToolSpec::Function(ResponsesApiTool {
                name: "dash/tags".to_string(),
                parameters: JsonSchema::Object {
@@ -1119,6 +1122,7 @@ mod tests {
            &tools,
            &[
                "unified_exec",
+                "apply_patch",
                "read_file",
                "web_search",
                "view_image",
@@ -1126,7 +1130,7 @@ mod tests {
            ],
        );
        assert_eq!(
-            tools[4],
+            tools[5],
            ToolSpec::Function(ResponsesApiTool {
                name: "dash/value".to_string(),
                parameters: JsonSchema::Object {
@@ -1223,6 +1227,7 @@ mod tests {
            &tools,
            &[
                "unified_exec",
+                "apply_patch",
                "read_file",
                "web_search",
                "view_image",
@@ -1231,7 +1236,7 @@ mod tests {
        );

        assert_eq!(
-            tools[4],
+            tools[5],
            ToolSpec::Function(ResponsesApiTool {
                name: "test_server/do_something_cool".to_string(),
                parameters: JsonSchema::Object {
--- a/codex-rs/core/tests/suite/mod.rs
+++ b/codex-rs/core/tests/suite/mod.rs
@@ -20,6 +20,7 @@ mod review;
 mod rmcp_client;
 mod rollout_list_find;
 mod seatbelt;
+mod shell_serialization;
 mod stream_error_allows_next_turn;
 mod stream_no_completed;
 mod tool_harness;
--- a/codex-rs/core/tests/suite/model_tools.rs
+++ b/codex-rs/core/tests/suite/model_tools.rs
@@ -125,7 +125,11 @@ async fn model_selects_expected_tools() {
    let gpt5_codex_tools = collect_tool_identifiers_for_model("gpt-5-codex").await;
    assert_eq!(
        gpt5_codex_tools,
-        vec!["shell".to_string(), "read_file".to_string()],
+        vec![
+            "shell".to_string(),
+            "apply_patch".to_string(),
+            "read_file".to_string()
+        ],
        "gpt-5-codex should expose the beta read_file tool",
    );
 }
--- a/codex-rs/core/tests/suite/shell_serialization.rs
+++ b/codex-rs/core/tests/suite/shell_serialization.rs
@@ -0,0 +1,276 @@
+#![cfg(not(target_os = "windows"))]
+
+use anyhow::Result;
+use codex_core::model_family::find_family_for_model;
+use codex_core::protocol::AskForApproval;
+use codex_core::protocol::EventMsg;
+use codex_core::protocol::InputItem;
+use codex_core::protocol::Op;
+use codex_core::protocol::SandboxPolicy;
+use codex_protocol::config_types::ReasoningSummary;
+use core_test_support::responses::ev_assistant_message;
+use core_test_support::responses::ev_completed;
+use core_test_support::responses::ev_function_call;
+use core_test_support::responses::mount_sse_sequence;
+use core_test_support::responses::sse;
+use core_test_support::responses::start_mock_server;
+use core_test_support::skip_if_no_network;
+use core_test_support::test_codex::TestCodex;
+use core_test_support::test_codex::test_codex;
+use serde_json::Value;
+use serde_json::json;
+
+async fn submit_turn(test: &TestCodex, prompt: &str, sandbox_policy: SandboxPolicy) -> Result<()> {
+    let session_model = test.session_configured.model.clone();
+
+    test.codex
+        .submit(Op::UserTurn {
+            items: vec![InputItem::Text {
+                text: prompt.into(),
+            }],
+            final_output_json_schema: None,
+            cwd: test.cwd.path().to_path_buf(),
+            approval_policy: AskForApproval::Never,
+            sandbox_policy,
+            model: session_model,
+            effort: None,
+            summary: ReasoningSummary::Auto,
+        })
+        .await?;
+
+    loop {
+        let event = test.codex.next_event().await?;
+        if matches!(event.msg, EventMsg::TaskComplete(_)) {
+            break;
+        }
+    }
+
+    Ok(())
+}
+
+fn request_bodies(requests: &[wiremock::Request]) -> Result<Vec<Value>> {
+    requests
+        .iter()
+        .map(|req| Ok(serde_json::from_slice::<Value>(&req.body)?))
+        .collect()
+}
+
+fn find_function_call_output<'a>(bodies: &'a [Value], call_id: &str) -> Option<&'a Value> {
+    for body in bodies {
+        if let Some(items) = body.get("input").and_then(Value::as_array) {
+            for item in items {
+                if item.get("type").and_then(Value::as_str) == Some("function_call_output")
+                    && item.get("call_id").and_then(Value::as_str) == Some(call_id)
+                {
+                    return Some(item);
+                }
+            }
+        }
+    }
+    None
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn shell_output_stays_json_without_freeform_apply_patch() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+    let mut builder = test_codex().with_config(|config| {
+        config.include_apply_patch_tool = false;
+        config.model = "gpt-5".to_string();
+        config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a model family");
+    });
+    let test = builder.build(&server).await?;
+
+    let call_id = "shell-json";
+    let args = json!({
+        "command": ["/bin/echo", "shell json"],
+        "timeout_ms": 1_000,
+    });
+    let responses = vec![
+        sse(vec![
+            json!({"type": "response.created", "response": {"id": "resp-1"}}),
+            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_completed("resp-1"),
+        ]),
+        sse(vec![
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    ];
+    mount_sse_sequence(&server, responses).await;
+
+    submit_turn(
+        &test,
+        "run the json shell command",
+        SandboxPolicy::DangerFullAccess,
+    )
+    .await?;
+
+    let requests = server
+        .received_requests()
+        .await
+        .expect("recorded requests present");
+    let bodies = request_bodies(&requests)?;
+    let output_item = find_function_call_output(&bodies, call_id).expect("shell output present");
+    let output = output_item
+        .get("output")
+        .and_then(Value::as_str)
+        .expect("shell output string");
+
+    let parsed: Value = serde_json::from_str(output)?;
+    assert_eq!(
+        parsed
+            .get("metadata")
+            .and_then(|metadata| metadata.get("exit_code"))
+            .and_then(Value::as_i64),
+        Some(0),
+        "expected zero exit code in unformatted JSON output",
+    );
+    let stdout = parsed
+        .get("output")
+        .and_then(Value::as_str)
+        .unwrap_or_default();
+    assert!(
+        stdout.contains("shell json"),
+        "expected stdout to include command output, got {stdout:?}"
+    );
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn shell_output_is_structured_with_freeform_apply_patch() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+    let mut builder = test_codex().with_config(|config| {
+        config.include_apply_patch_tool = true;
+    });
+    let test = builder.build(&server).await?;
+
+    let call_id = "shell-structured";
+    let args = json!({
+        "command": ["/bin/echo", "freeform shell"],
+        "timeout_ms": 1_000,
+    });
+    let responses = vec![
+        sse(vec![
+            json!({"type": "response.created", "response": {"id": "resp-1"}}),
+            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_completed("resp-1"),
+        ]),
+        sse(vec![
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    ];
+    mount_sse_sequence(&server, responses).await;
+
+    submit_turn(
+        &test,
+        "run the structured shell command",
+        SandboxPolicy::DangerFullAccess,
+    )
+    .await?;
+
+    let requests = server
+        .received_requests()
+        .await
+        .expect("recorded requests present");
+    let bodies = request_bodies(&requests)?;
+    let output_item =
+        find_function_call_output(&bodies, call_id).expect("structured output present");
+    let output = output_item
+        .get("output")
+        .and_then(Value::as_str)
+        .expect("structured output string");
+
+    assert!(
+        serde_json::from_str::<Value>(output).is_err(),
+        "expected structured shell output to be plain text",
+    );
+    assert!(
+        output.starts_with("Exit code: 0\n"),
+        "expected exit code prefix, got {output:?}",
+    );
+    assert!(
+        output.contains("\nOutput:\n"),
+        "expected Output section, got {output:?}"
+    );
+    assert!(
+        output.contains("freeform shell"),
+        "expected stdout content, got {output:?}"
+    );
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn shell_output_reserializes_truncated_content() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+    let mut builder = test_codex().with_config(|config| {
+        config.model = "gpt-5-codex".to_string();
+        config.model_family =
+            find_family_for_model("gpt-5-codex").expect("gpt-5 is a model family");
+    });
+    let test = builder.build(&server).await?;
+
+    let call_id = "shell-truncated";
+    let args = json!({
+        "command": ["/bin/sh", "-c", "seq 1 400"],
+        "timeout_ms": 1_000,
+    });
+    let responses = vec![
+        sse(vec![
+            json!({"type": "response.created", "response": {"id": "resp-1"}}),
+            ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            ev_completed("resp-1"),
+        ]),
+        sse(vec![
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    ];
+    mount_sse_sequence(&server, responses).await;
+
+    submit_turn(
+        &test,
+        "run the truncation shell command",
+        SandboxPolicy::DangerFullAccess,
+    )
+    .await?;
+
+    let requests = server
+        .received_requests()
+        .await
+        .expect("recorded requests present");
+    let bodies = request_bodies(&requests)?;
+    let output_item =
+        find_function_call_output(&bodies, call_id).expect("truncated output present");
+    let output = output_item
+        .get("output")
+        .and_then(Value::as_str)
+        .expect("truncated output string");
+
+    assert!(
+        serde_json::from_str::<Value>(output).is_err(),
+        "expected truncated shell output to be plain text",
+    );
+    assert!(
+        output.starts_with("Exit code: 0\n"),
+        "expected exit code prefix, got {output:?}",
+    );
+    assert!(
+        output.lines().any(|line| line == "Total output lines: 400"),
+        "expected total output lines marker, got {output:?}",
+    );
+    assert!(
+        output.contains("[... omitted"),
+        "expected truncated marker, got {output:?}",
+    );
+
+    Ok(())
+}
--- a/codex-rs/core/tests/suite/tool_harness.rs
+++ b/codex-rs/core/tests/suite/tool_harness.rs
@@ -1,5 +1,6 @@
 #![cfg(not(target_os = "windows"))]

+use codex_core::model_family::find_family_for_model;
 use codex_core::protocol::AskForApproval;
 use codex_core::protocol::EventMsg;
 use codex_core::protocol::InputItem;
@@ -53,7 +54,8 @@ async fn shell_tool_executes_command_and_streams_output() -> anyhow::Result<()>
    let server = start_mock_server().await;

    let mut builder = test_codex().with_config(|config| {
-        config.include_apply_patch_tool = true;
+        config.model = "gpt-5".to_string();
+        config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a valid model");
    });
    let TestCodex {
        codex,
--- a/codex-rs/core/tests/suite/tools.rs
+++ b/codex-rs/core/tests/suite/tools.rs
@@ -2,6 +2,7 @@
 #![allow(clippy::unwrap_used, clippy::expect_used)]

 use anyhow::Result;
+use codex_core::model_family::find_family_for_model;
 use codex_core::protocol::AskForApproval;
 use codex_core::protocol::EventMsg;
 use codex_core::protocol::InputItem;
@@ -147,7 +148,10 @@ async fn shell_escalated_permissions_rejected_then_ok() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;
-    let mut builder = test_codex();
+    let mut builder = test_codex().with_config(|config| {
+        config.model = "gpt-5".to_string();
+        config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a valid model");
+    });
    let test = builder.build(&server).await?;

    let command = ["/bin/echo", "shell ok"];
@@ -375,7 +379,10 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;
-    let mut builder = test_codex();
+    let mut builder = test_codex().with_config(|config| {
+        config.model = "gpt-5".to_string();
+        config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a valid model");
+    });
    let test = builder.build(&server).await?;

    let call_id = "shell-timeout";
--- a/codex-rs/exec/src/lib.rs
+++ b/codex-rs/exec/src/lib.rs
@@ -177,7 +177,7 @@ pub async fn run_main(cli: Cli, codex_linux_sandbox_exe: Option<PathBuf>) -> any
        codex_linux_sandbox_exe,
        base_instructions: None,
        include_plan_tool: Some(include_plan_tool),
-        include_apply_patch_tool: Some(true),
+        include_apply_patch_tool: None,
        include_view_image_tool: None,
        show_raw_agent_reasoning: oss.then_some(true),
        tools_web_search_request: None,