From 4764fc1ee737738a6d5bf5b41a434ee2d02faa63 Mon Sep 17 00:00:00 2001 From: Dylan Date: Sat, 4 Oct 2025 19:16:36 -0700 Subject: [PATCH] feat: Freeform apply_patch with simple shell output (#4718) ## Summary This PR is an alternative approach to #4711, but instead of changing our storage, parses out shell calls in the client and reserializes them on the fly before we send them out as part of the request. What this changes: 1. Adds additional serialization logic when the ApplyPatchToolType::Freeform is in use. 2. Adds a --custom-apply-patch flag to enable this setting on a session-by-session basis. This change is delicate, but is not meant to be permanent. It is meant to be the first step in a migration: 1. (This PR) Add in-flight serialization with config 2. Update model_family default 3. Update serialization logic to store turn outputs in a structured format, with logic to serialize based on model_family setting. 4. Remove this rewrite in-flight logic. ## Test Plan - [x] Additional unit tests added - [x] Integration tests added - [x] Tested locally --- codex-rs/core/src/client_common.rs | 108 ++++++- codex-rs/core/src/config.rs | 5 +- codex-rs/core/src/model_family.rs | 1 + .../core/src/tools/handlers/apply_patch.rs | 2 +- codex-rs/core/src/tools/spec.rs | 19 +- codex-rs/core/tests/suite/mod.rs | 1 + codex-rs/core/tests/suite/model_tools.rs | 6 +- .../core/tests/suite/shell_serialization.rs | 276 ++++++++++++++++++ codex-rs/core/tests/suite/tool_harness.rs | 4 +- codex-rs/core/tests/suite/tools.rs | 11 +- codex-rs/exec/src/lib.rs | 2 +- 11 files changed, 420 insertions(+), 15 deletions(-) create mode 100644 codex-rs/core/tests/suite/shell_serialization.rs diff --git a/codex-rs/core/src/client_common.rs b/codex-rs/core/src/client_common.rs index 624a4ccb..dcd244db 100644 --- a/codex-rs/core/src/client_common.rs +++ b/codex-rs/core/src/client_common.rs @@ -9,9 +9,11 @@ use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::config_types::Verbosity as VerbosityConfig; use codex_protocol::models::ResponseItem; use futures::Stream; +use serde::Deserialize; use serde::Serialize; use serde_json::Value; use std::borrow::Cow; +use std::collections::HashSet; use std::ops::Deref; use std::pin::Pin; use std::task::Context; @@ -64,10 +66,114 @@ impl Prompt { } pub(crate) fn get_formatted_input(&self) -> Vec { - self.input.clone() + let mut input = self.input.clone(); + + // when using the *Freeform* apply_patch tool specifically, tool outputs + // should be structured text, not json. Do NOT reserialize when using + // the Function tool - note that this differs from the check above for + // instructions. We declare the result as a named variable for clarity. + let is_freeform_apply_patch_tool_present = self.tools.iter().any(|tool| match tool { + ToolSpec::Freeform(f) => f.name == "apply_patch", + _ => false, + }); + if is_freeform_apply_patch_tool_present { + reserialize_shell_outputs(&mut input); + } + + input } } +fn reserialize_shell_outputs(items: &mut [ResponseItem]) { + let mut shell_call_ids: HashSet = HashSet::new(); + + items.iter_mut().for_each(|item| match item { + ResponseItem::LocalShellCall { call_id, id, .. } => { + if let Some(identifier) = call_id.clone().or_else(|| id.clone()) { + shell_call_ids.insert(identifier); + } + } + ResponseItem::CustomToolCall { + id: _, + status: _, + call_id, + name, + input: _, + } => { + if name == "apply_patch" { + shell_call_ids.insert(call_id.clone()); + } + } + ResponseItem::CustomToolCallOutput { call_id, output } => { + if shell_call_ids.remove(call_id) + && let Some(structured) = parse_structured_shell_output(output) + { + *output = structured + } + } + ResponseItem::FunctionCall { name, call_id, .. } + if is_shell_tool_name(name) || name == "apply_patch" => + { + shell_call_ids.insert(call_id.clone()); + } + ResponseItem::FunctionCallOutput { call_id, output } => { + if shell_call_ids.remove(call_id) + && let Some(structured) = parse_structured_shell_output(&output.content) + { + output.content = structured + } + } + _ => {} + }) +} + +fn is_shell_tool_name(name: &str) -> bool { + matches!(name, "shell" | "container.exec") +} + +#[derive(Deserialize)] +struct ExecOutputJson { + output: String, + metadata: ExecOutputMetadataJson, +} + +#[derive(Deserialize)] +struct ExecOutputMetadataJson { + exit_code: i32, + duration_seconds: f32, +} + +fn parse_structured_shell_output(raw: &str) -> Option { + let parsed: ExecOutputJson = serde_json::from_str(raw).ok()?; + Some(build_structured_output(&parsed)) +} + +fn build_structured_output(parsed: &ExecOutputJson) -> String { + let mut sections = Vec::new(); + sections.push(format!("Exit code: {}", parsed.metadata.exit_code)); + sections.push(format!( + "Wall time: {} seconds", + parsed.metadata.duration_seconds + )); + + if let Some(total_lines) = extract_total_output_lines(&parsed.output) { + sections.push(format!("Total output lines: {total_lines}")); + } + + sections.push("Output:".to_string()); + sections.push(parsed.output.clone()); + + sections.join("\n") +} + +fn extract_total_output_lines(output: &str) -> Option { + let marker_start = output.find("[... omitted ")?; + let marker = &output[marker_start..]; + let (_, after_of) = marker.split_once(" of ")?; + let (total_segment, _) = after_of.split_once(' ')?; + total_segment.parse::().ok() +} + #[derive(Debug)] pub enum ResponseEvent { Created, diff --git a/codex-rs/core/src/config.rs b/codex-rs/core/src/config.rs index 61512127..6fb83cee 100644 --- a/codex-rs/core/src/config.rs +++ b/codex-rs/core/src/config.rs @@ -750,6 +750,7 @@ pub struct ConfigToml { pub experimental_use_exec_command_tool: Option, pub experimental_use_unified_exec_tool: Option, pub experimental_use_rmcp_client: Option, + pub experimental_use_freeform_apply_patch: Option, pub projects: Option>, @@ -1111,7 +1112,9 @@ impl Config { .or(cfg.chatgpt_base_url) .unwrap_or("https://chatgpt.com/backend-api/".to_string()), include_plan_tool: include_plan_tool.unwrap_or(false), - include_apply_patch_tool: include_apply_patch_tool.unwrap_or(false), + include_apply_patch_tool: include_apply_patch_tool + .or(cfg.experimental_use_freeform_apply_patch) + .unwrap_or(false), tools_web_search_request, use_experimental_streamable_shell_tool: cfg .experimental_use_exec_command_tool diff --git a/codex-rs/core/src/model_family.rs b/codex-rs/core/src/model_family.rs index 5ddaaca2..71d2f9b3 100644 --- a/codex-rs/core/src/model_family.rs +++ b/codex-rs/core/src/model_family.rs @@ -110,6 +110,7 @@ pub fn find_family_for_model(slug: &str) -> Option { reasoning_summary_format: ReasoningSummaryFormat::Experimental, base_instructions: GPT_5_CODEX_INSTRUCTIONS.to_string(), experimental_supported_tools: vec!["read_file".to_string()], + apply_patch_tool_type: Some(ApplyPatchToolType::Freeform), ) } else if slug.starts_with("gpt-5") { model_family!( diff --git a/codex-rs/core/src/tools/handlers/apply_patch.rs b/codex-rs/core/src/tools/handlers/apply_patch.rs index ced0898a..1ad8a95d 100644 --- a/codex-rs/core/src/tools/handlers/apply_patch.rs +++ b/codex-rs/core/src/tools/handlers/apply_patch.rs @@ -106,7 +106,7 @@ pub enum ApplyPatchToolType { pub(crate) fn create_apply_patch_freeform_tool() -> ToolSpec { ToolSpec::Freeform(FreeformTool { name: "apply_patch".to_string(), - description: "Use the `apply_patch` tool to edit files".to_string(), + description: "Use the `apply_patch` tool to edit files. This is a FREEFORM tool, so do not wrap the patch in JSON.".to_string(), format: FreeformToolFormat { r#type: "grammar".to_string(), syntax: "lark".to_string(), diff --git a/codex-rs/core/src/tools/spec.rs b/codex-rs/core/src/tools/spec.rs index 5e24e46e..7bd622a3 100644 --- a/codex-rs/core/src/tools/spec.rs +++ b/codex-rs/core/src/tools/spec.rs @@ -695,7 +695,7 @@ mod tests { }); let (tools, _) = build_specs(&config, Some(HashMap::new())).build(); - assert_eq_tool_names(&tools, &["unified_exec", "read_file"]); + assert_eq_tool_names(&tools, &["unified_exec", "apply_patch", "read_file"]); } #[test] @@ -921,6 +921,7 @@ mod tests { &tools, &[ "unified_exec", + "apply_patch", "read_file", "web_search", "view_image", @@ -929,7 +930,7 @@ mod tests { ); assert_eq!( - tools[4], + tools[5], ToolSpec::Function(ResponsesApiTool { name: "dash/search".to_string(), parameters: JsonSchema::Object { @@ -988,6 +989,7 @@ mod tests { &tools, &[ "unified_exec", + "apply_patch", "read_file", "web_search", "view_image", @@ -995,7 +997,7 @@ mod tests { ], ); assert_eq!( - tools[4], + tools[5], ToolSpec::Function(ResponsesApiTool { name: "dash/paginate".to_string(), parameters: JsonSchema::Object { @@ -1019,7 +1021,7 @@ mod tests { let config = ToolsConfig::new(&ToolsConfigParams { model_family: &model_family, include_plan_tool: false, - include_apply_patch_tool: false, + include_apply_patch_tool: true, include_web_search_request: true, use_streamable_shell_tool: false, include_view_image_tool: true, @@ -1052,6 +1054,7 @@ mod tests { &tools, &[ "unified_exec", + "apply_patch", "read_file", "web_search", "view_image", @@ -1059,7 +1062,7 @@ mod tests { ], ); assert_eq!( - tools[4], + tools[5], ToolSpec::Function(ResponsesApiTool { name: "dash/tags".to_string(), parameters: JsonSchema::Object { @@ -1119,6 +1122,7 @@ mod tests { &tools, &[ "unified_exec", + "apply_patch", "read_file", "web_search", "view_image", @@ -1126,7 +1130,7 @@ mod tests { ], ); assert_eq!( - tools[4], + tools[5], ToolSpec::Function(ResponsesApiTool { name: "dash/value".to_string(), parameters: JsonSchema::Object { @@ -1223,6 +1227,7 @@ mod tests { &tools, &[ "unified_exec", + "apply_patch", "read_file", "web_search", "view_image", @@ -1231,7 +1236,7 @@ mod tests { ); assert_eq!( - tools[4], + tools[5], ToolSpec::Function(ResponsesApiTool { name: "test_server/do_something_cool".to_string(), parameters: JsonSchema::Object { diff --git a/codex-rs/core/tests/suite/mod.rs b/codex-rs/core/tests/suite/mod.rs index 5f2892cb..27bddb7e 100644 --- a/codex-rs/core/tests/suite/mod.rs +++ b/codex-rs/core/tests/suite/mod.rs @@ -20,6 +20,7 @@ mod review; mod rmcp_client; mod rollout_list_find; mod seatbelt; +mod shell_serialization; mod stream_error_allows_next_turn; mod stream_no_completed; mod tool_harness; diff --git a/codex-rs/core/tests/suite/model_tools.rs b/codex-rs/core/tests/suite/model_tools.rs index 9c29e469..c4594d46 100644 --- a/codex-rs/core/tests/suite/model_tools.rs +++ b/codex-rs/core/tests/suite/model_tools.rs @@ -125,7 +125,11 @@ async fn model_selects_expected_tools() { let gpt5_codex_tools = collect_tool_identifiers_for_model("gpt-5-codex").await; assert_eq!( gpt5_codex_tools, - vec!["shell".to_string(), "read_file".to_string()], + vec![ + "shell".to_string(), + "apply_patch".to_string(), + "read_file".to_string() + ], "gpt-5-codex should expose the beta read_file tool", ); } diff --git a/codex-rs/core/tests/suite/shell_serialization.rs b/codex-rs/core/tests/suite/shell_serialization.rs new file mode 100644 index 00000000..f8f9960b --- /dev/null +++ b/codex-rs/core/tests/suite/shell_serialization.rs @@ -0,0 +1,276 @@ +#![cfg(not(target_os = "windows"))] + +use anyhow::Result; +use codex_core::model_family::find_family_for_model; +use codex_core::protocol::AskForApproval; +use codex_core::protocol::EventMsg; +use codex_core::protocol::InputItem; +use codex_core::protocol::Op; +use codex_core::protocol::SandboxPolicy; +use codex_protocol::config_types::ReasoningSummary; +use core_test_support::responses::ev_assistant_message; +use core_test_support::responses::ev_completed; +use core_test_support::responses::ev_function_call; +use core_test_support::responses::mount_sse_sequence; +use core_test_support::responses::sse; +use core_test_support::responses::start_mock_server; +use core_test_support::skip_if_no_network; +use core_test_support::test_codex::TestCodex; +use core_test_support::test_codex::test_codex; +use serde_json::Value; +use serde_json::json; + +async fn submit_turn(test: &TestCodex, prompt: &str, sandbox_policy: SandboxPolicy) -> Result<()> { + let session_model = test.session_configured.model.clone(); + + test.codex + .submit(Op::UserTurn { + items: vec![InputItem::Text { + text: prompt.into(), + }], + final_output_json_schema: None, + cwd: test.cwd.path().to_path_buf(), + approval_policy: AskForApproval::Never, + sandbox_policy, + model: session_model, + effort: None, + summary: ReasoningSummary::Auto, + }) + .await?; + + loop { + let event = test.codex.next_event().await?; + if matches!(event.msg, EventMsg::TaskComplete(_)) { + break; + } + } + + Ok(()) +} + +fn request_bodies(requests: &[wiremock::Request]) -> Result> { + requests + .iter() + .map(|req| Ok(serde_json::from_slice::(&req.body)?)) + .collect() +} + +fn find_function_call_output<'a>(bodies: &'a [Value], call_id: &str) -> Option<&'a Value> { + for body in bodies { + if let Some(items) = body.get("input").and_then(Value::as_array) { + for item in items { + if item.get("type").and_then(Value::as_str) == Some("function_call_output") + && item.get("call_id").and_then(Value::as_str) == Some(call_id) + { + return Some(item); + } + } + } + } + None +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn shell_output_stays_json_without_freeform_apply_patch() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let mut builder = test_codex().with_config(|config| { + config.include_apply_patch_tool = false; + config.model = "gpt-5".to_string(); + config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a model family"); + }); + let test = builder.build(&server).await?; + + let call_id = "shell-json"; + let args = json!({ + "command": ["/bin/echo", "shell json"], + "timeout_ms": 1_000, + }); + let responses = vec![ + sse(vec![ + json!({"type": "response.created", "response": {"id": "resp-1"}}), + ev_function_call(call_id, "shell", &serde_json::to_string(&args)?), + ev_completed("resp-1"), + ]), + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ]; + mount_sse_sequence(&server, responses).await; + + submit_turn( + &test, + "run the json shell command", + SandboxPolicy::DangerFullAccess, + ) + .await?; + + let requests = server + .received_requests() + .await + .expect("recorded requests present"); + let bodies = request_bodies(&requests)?; + let output_item = find_function_call_output(&bodies, call_id).expect("shell output present"); + let output = output_item + .get("output") + .and_then(Value::as_str) + .expect("shell output string"); + + let parsed: Value = serde_json::from_str(output)?; + assert_eq!( + parsed + .get("metadata") + .and_then(|metadata| metadata.get("exit_code")) + .and_then(Value::as_i64), + Some(0), + "expected zero exit code in unformatted JSON output", + ); + let stdout = parsed + .get("output") + .and_then(Value::as_str) + .unwrap_or_default(); + assert!( + stdout.contains("shell json"), + "expected stdout to include command output, got {stdout:?}" + ); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn shell_output_is_structured_with_freeform_apply_patch() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let mut builder = test_codex().with_config(|config| { + config.include_apply_patch_tool = true; + }); + let test = builder.build(&server).await?; + + let call_id = "shell-structured"; + let args = json!({ + "command": ["/bin/echo", "freeform shell"], + "timeout_ms": 1_000, + }); + let responses = vec![ + sse(vec![ + json!({"type": "response.created", "response": {"id": "resp-1"}}), + ev_function_call(call_id, "shell", &serde_json::to_string(&args)?), + ev_completed("resp-1"), + ]), + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ]; + mount_sse_sequence(&server, responses).await; + + submit_turn( + &test, + "run the structured shell command", + SandboxPolicy::DangerFullAccess, + ) + .await?; + + let requests = server + .received_requests() + .await + .expect("recorded requests present"); + let bodies = request_bodies(&requests)?; + let output_item = + find_function_call_output(&bodies, call_id).expect("structured output present"); + let output = output_item + .get("output") + .and_then(Value::as_str) + .expect("structured output string"); + + assert!( + serde_json::from_str::(output).is_err(), + "expected structured shell output to be plain text", + ); + assert!( + output.starts_with("Exit code: 0\n"), + "expected exit code prefix, got {output:?}", + ); + assert!( + output.contains("\nOutput:\n"), + "expected Output section, got {output:?}" + ); + assert!( + output.contains("freeform shell"), + "expected stdout content, got {output:?}" + ); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn shell_output_reserializes_truncated_content() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let mut builder = test_codex().with_config(|config| { + config.model = "gpt-5-codex".to_string(); + config.model_family = + find_family_for_model("gpt-5-codex").expect("gpt-5 is a model family"); + }); + let test = builder.build(&server).await?; + + let call_id = "shell-truncated"; + let args = json!({ + "command": ["/bin/sh", "-c", "seq 1 400"], + "timeout_ms": 1_000, + }); + let responses = vec![ + sse(vec![ + json!({"type": "response.created", "response": {"id": "resp-1"}}), + ev_function_call(call_id, "shell", &serde_json::to_string(&args)?), + ev_completed("resp-1"), + ]), + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ]; + mount_sse_sequence(&server, responses).await; + + submit_turn( + &test, + "run the truncation shell command", + SandboxPolicy::DangerFullAccess, + ) + .await?; + + let requests = server + .received_requests() + .await + .expect("recorded requests present"); + let bodies = request_bodies(&requests)?; + let output_item = + find_function_call_output(&bodies, call_id).expect("truncated output present"); + let output = output_item + .get("output") + .and_then(Value::as_str) + .expect("truncated output string"); + + assert!( + serde_json::from_str::(output).is_err(), + "expected truncated shell output to be plain text", + ); + assert!( + output.starts_with("Exit code: 0\n"), + "expected exit code prefix, got {output:?}", + ); + assert!( + output.lines().any(|line| line == "Total output lines: 400"), + "expected total output lines marker, got {output:?}", + ); + assert!( + output.contains("[... omitted"), + "expected truncated marker, got {output:?}", + ); + + Ok(()) +} diff --git a/codex-rs/core/tests/suite/tool_harness.rs b/codex-rs/core/tests/suite/tool_harness.rs index 317e530c..156a7a95 100644 --- a/codex-rs/core/tests/suite/tool_harness.rs +++ b/codex-rs/core/tests/suite/tool_harness.rs @@ -1,5 +1,6 @@ #![cfg(not(target_os = "windows"))] +use codex_core::model_family::find_family_for_model; use codex_core::protocol::AskForApproval; use codex_core::protocol::EventMsg; use codex_core::protocol::InputItem; @@ -53,7 +54,8 @@ async fn shell_tool_executes_command_and_streams_output() -> anyhow::Result<()> let server = start_mock_server().await; let mut builder = test_codex().with_config(|config| { - config.include_apply_patch_tool = true; + config.model = "gpt-5".to_string(); + config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a valid model"); }); let TestCodex { codex, diff --git a/codex-rs/core/tests/suite/tools.rs b/codex-rs/core/tests/suite/tools.rs index 824a629b..b3c067ec 100644 --- a/codex-rs/core/tests/suite/tools.rs +++ b/codex-rs/core/tests/suite/tools.rs @@ -2,6 +2,7 @@ #![allow(clippy::unwrap_used, clippy::expect_used)] use anyhow::Result; +use codex_core::model_family::find_family_for_model; use codex_core::protocol::AskForApproval; use codex_core::protocol::EventMsg; use codex_core::protocol::InputItem; @@ -147,7 +148,10 @@ async fn shell_escalated_permissions_rejected_then_ok() -> Result<()> { skip_if_no_network!(Ok(())); let server = start_mock_server().await; - let mut builder = test_codex(); + let mut builder = test_codex().with_config(|config| { + config.model = "gpt-5".to_string(); + config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a valid model"); + }); let test = builder.build(&server).await?; let command = ["/bin/echo", "shell ok"]; @@ -375,7 +379,10 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> { skip_if_no_network!(Ok(())); let server = start_mock_server().await; - let mut builder = test_codex(); + let mut builder = test_codex().with_config(|config| { + config.model = "gpt-5".to_string(); + config.model_family = find_family_for_model("gpt-5").expect("gpt-5 is a valid model"); + }); let test = builder.build(&server).await?; let call_id = "shell-timeout"; diff --git a/codex-rs/exec/src/lib.rs b/codex-rs/exec/src/lib.rs index e3bf6e52..18e09778 100644 --- a/codex-rs/exec/src/lib.rs +++ b/codex-rs/exec/src/lib.rs @@ -177,7 +177,7 @@ pub async fn run_main(cli: Cli, codex_linux_sandbox_exe: Option) -> any codex_linux_sandbox_exe, base_instructions: None, include_plan_tool: Some(include_plan_tool), - include_apply_patch_tool: Some(true), + include_apply_patch_tool: None, include_view_image_tool: None, show_raw_agent_reasoning: oss.then_some(true), tools_web_search_request: None,