Use a unified shell tell to not break cache (#3814)

Currently, we change the tool description according to the sandbox policy and approval policy. This breaks the cache when the user hits `/approvals`. This PR does the following: - Always use the shell with escalation parameter: - removes `create_shell_tool_for_sandbox` and always uses unified tool via `create_shell_tool` - Reject the func call when the model uses escalation parameter when it cannot.
2025-09-18 17:08:28 -07:00
parent de64f5f007
commit a7fda70053
3 changed files with 133 additions and 128 deletions
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -456,8 +456,6 @@ impl Session {
            client,
            tools_config: ToolsConfig::new(&ToolsConfigParams {
                model_family: &config.model_family,
                approval_policy,
                sandbox_policy: sandbox_policy.clone(),
                include_plan_tool: config.include_plan_tool,
                include_apply_patch_tool: config.include_apply_patch_tool,
                include_web_search_request: config.tools_web_search_request,
@@ -1237,8 +1235,6 @@ async fn submission_loop(
                let tools_config = ToolsConfig::new(&ToolsConfigParams {
                    model_family: &effective_family,
                    approval_policy: new_approval_policy,
                    sandbox_policy: new_sandbox_policy.clone(),
                    include_plan_tool: config.include_plan_tool,
                    include_apply_patch_tool: config.include_apply_patch_tool,
                    include_web_search_request: config.tools_web_search_request,
@@ -1325,8 +1321,6 @@ async fn submission_loop(
                        client,
                        tools_config: ToolsConfig::new(&ToolsConfigParams {
                            model_family: &model_family,
                            approval_policy,
                            sandbox_policy: sandbox_policy.clone(),
                            include_plan_tool: config.include_plan_tool,
                            include_apply_patch_tool: config.include_apply_patch_tool,
                            include_web_search_request: config.tools_web_search_request,
@@ -1553,8 +1547,6 @@ async fn spawn_review_thread(
        .unwrap_or_else(|| parent_turn_context.client.get_model_family());
    let tools_config = ToolsConfig::new(&ToolsConfigParams {
        model_family: &review_model_family,
        approval_policy: parent_turn_context.approval_policy,
        sandbox_policy: parent_turn_context.sandbox_policy.clone(),
        include_plan_tool: false,
        include_apply_patch_tool: config.include_apply_patch_tool,
        include_web_search_request: false,
@@ -2724,6 +2716,21 @@ async fn handle_container_exec_with_params(
    sub_id: String,
    call_id: String,
 ) -> ResponseInputItem {
    if params.with_escalated_permissions.unwrap_or(false)
        && !matches!(turn_context.approval_policy, AskForApproval::OnRequest)
    {
        return ResponseInputItem::FunctionCallOutput {
            call_id,
            output: FunctionCallOutputPayload {
                content: format!(
                    "approval policy is {policy:?}; reject command — you should not ask for escalated permissions if the approval policy is {policy:?}",
                    policy = turn_context.approval_policy
                ),
                success: None,
            },
        };
    }
    // check if this was a patch, and apply it if so
    let apply_patch_exec = match maybe_parse_apply_patch_verified(&params.command, &params.cwd) {
        MaybeApplyPatchVerified::Body(changes) => {
@@ -3345,6 +3352,7 @@ mod tests {
    use mcp_types::ContentBlock;
    use mcp_types::TextContent;
    use pretty_assertions::assert_eq;
    use serde::Deserialize;
    use serde_json::json;
    use std::path::PathBuf;
    use std::sync::Arc;
@@ -3594,8 +3602,6 @@ mod tests {
        );
        let tools_config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &config.model_family,
            approval_policy: config.approval_policy,
            sandbox_policy: config.sandbox_policy.clone(),
            include_plan_tool: config.include_plan_tool,
            include_apply_patch_tool: config.include_apply_patch_tool,
            include_web_search_request: config.tools_web_search_request,
@@ -3735,4 +3741,105 @@ mod tests {
        (rollout_items, live_history.contents())
    }
    #[tokio::test]
    async fn rejects_escalated_permissions_when_policy_not_on_request() {
        use crate::exec::ExecParams;
        use crate::protocol::AskForApproval;
        use crate::protocol::SandboxPolicy;
        use crate::turn_diff_tracker::TurnDiffTracker;
        use std::collections::HashMap;
        let (session, mut turn_context) = make_session_and_context();
        // Ensure policy is NOT OnRequest so the early rejection path triggers
        turn_context.approval_policy = AskForApproval::OnFailure;
        let params = ExecParams {
            command: if cfg!(windows) {
                vec![
                    "cmd.exe".to_string(),
                    "/C".to_string(),
                    "echo hi".to_string(),
                ]
            } else {
                vec![
                    "/bin/sh".to_string(),
                    "-c".to_string(),
                    "echo hi".to_string(),
                ]
            },
            cwd: turn_context.cwd.clone(),
            timeout_ms: Some(1000),
            env: HashMap::new(),
            with_escalated_permissions: Some(true),
            justification: Some("test".to_string()),
        };
        let params2 = ExecParams {
            with_escalated_permissions: Some(false),
            ..params.clone()
        };
        let mut turn_diff_tracker = TurnDiffTracker::new();
        let sub_id = "test-sub".to_string();
        let call_id = "test-call".to_string();
        let resp = handle_container_exec_with_params(
            params,
            &session,
            &turn_context,
            &mut turn_diff_tracker,
            sub_id,
            call_id,
        )
        .await;
        let ResponseInputItem::FunctionCallOutput { output, .. } = resp else {
            panic!("expected FunctionCallOutput");
        };
        let expected = format!(
            "approval policy is {policy:?}; reject command — you should not ask for escalated permissions if the approval policy is {policy:?}",
            policy = turn_context.approval_policy
        );
        pretty_assertions::assert_eq!(output.content, expected);
        // Now retry the same command WITHOUT escalated permissions; should succeed.
        // Force DangerFullAccess to avoid platform sandbox dependencies in tests.
        turn_context.sandbox_policy = SandboxPolicy::DangerFullAccess;
        let resp2 = handle_container_exec_with_params(
            params2,
            &session,
            &turn_context,
            &mut turn_diff_tracker,
            "test-sub".to_string(),
            "test-call-2".to_string(),
        )
        .await;
        let ResponseInputItem::FunctionCallOutput { output, .. } = resp2 else {
            panic!("expected FunctionCallOutput on retry");
        };
        #[derive(Deserialize, PartialEq, Eq, Debug)]
        struct ResponseExecMetadata {
            exit_code: i32,
        }
        #[derive(Deserialize)]
        struct ResponseExecOutput {
            output: String,
            metadata: ResponseExecMetadata,
        }
        let exec_output: ResponseExecOutput =
            serde_json::from_str(&output.content).expect("valid exec output json");
        pretty_assertions::assert_eq!(exec_output.metadata, ResponseExecMetadata { exit_code: 0 });
        assert!(exec_output.output.contains("hi"));
        pretty_assertions::assert_eq!(output.success, Some(true));
    }
 }
--- a/codex-rs/core/src/exec.rs
+++ b/codex-rs/core/src/exec.rs
@@ -45,7 +45,7 @@ const AGGREGATE_BUFFER_INITIAL_CAPACITY: usize = 8 * 1024; // 8 KiB
 /// Aggregation still collects full output; only the live event stream is capped.
 pub(crate) const MAX_EXEC_OUTPUT_DELTAS_PER_CALL: usize = 10_000;
-#[derive(Debug, Clone)]
+#[derive(Clone, Debug)]
 pub struct ExecParams {
    pub command: Vec<String>,
    pub cwd: PathBuf,
--- a/codex-rs/core/src/openai_tools.rs
+++ b/codex-rs/core/src/openai_tools.rs
@@ -7,8 +7,6 @@ use std::collections::HashMap;
 use crate::model_family::ModelFamily;
 use crate::plan_tool::PLAN_TOOL;
 use crate::protocol::AskForApproval;
 use crate::protocol::SandboxPolicy;
 use crate::tool_apply_patch::ApplyPatchToolType;
 use crate::tool_apply_patch::create_apply_patch_freeform_tool;
 use crate::tool_apply_patch::create_apply_patch_json_tool;
@@ -57,10 +55,9 @@ pub(crate) enum OpenAiTool {
 #[derive(Debug, Clone)]
 pub enum ConfigShellToolType {
-    DefaultShell,
+    Default,
-    ShellWithRequest { sandbox_policy: SandboxPolicy },
+    Local,
-    LocalShell,
+    Streamable,
    StreamableShell,
 }
 #[derive(Debug, Clone)]
@@ -75,8 +72,6 @@ pub(crate) struct ToolsConfig {
 pub(crate) struct ToolsConfigParams<'a> {
    pub(crate) model_family: &'a ModelFamily,
    pub(crate) approval_policy: AskForApproval,
    pub(crate) sandbox_policy: SandboxPolicy,
    pub(crate) include_plan_tool: bool,
    pub(crate) include_apply_patch_tool: bool,
    pub(crate) include_web_search_request: bool,
@@ -89,8 +84,6 @@ impl ToolsConfig {
    pub fn new(params: &ToolsConfigParams) -> Self {
        let ToolsConfigParams {
            model_family,
            approval_policy,
            sandbox_policy,
            include_plan_tool,
            include_apply_patch_tool,
            include_web_search_request,
@@ -98,18 +91,13 @@ impl ToolsConfig {
            include_view_image_tool,
            experimental_unified_exec_tool,
        } = params;
-        let mut shell_type = if *use_streamable_shell_tool {
+        let shell_type = if *use_streamable_shell_tool {
-            ConfigShellToolType::StreamableShell
+            ConfigShellToolType::Streamable
        } else if model_family.uses_local_shell_tool {
-            ConfigShellToolType::LocalShell
+            ConfigShellToolType::Local
        } else {
-            ConfigShellToolType::DefaultShell
+            ConfigShellToolType::Default
        };
        if matches!(approval_policy, AskForApproval::OnRequest) && !use_streamable_shell_tool {
            shell_type = ConfigShellToolType::ShellWithRequest {
                sandbox_policy: sandbox_policy.clone(),
            }
        }
        let apply_patch_tool_type = match model_family.apply_patch_tool_type {
            Some(ApplyPatchToolType::Freeform) => Some(ApplyPatchToolType::Freeform),
@@ -170,40 +158,6 @@ pub(crate) enum JsonSchema {
    },
 }
 fn create_shell_tool() -> OpenAiTool {
    let mut properties = BTreeMap::new();
    properties.insert(
        "command".to_string(),
        JsonSchema::Array {
            items: Box::new(JsonSchema::String { description: None }),
            description: Some("The command to execute".to_string()),
        },
    );
    properties.insert(
        "workdir".to_string(),
        JsonSchema::String {
            description: Some("The working directory to execute the command in".to_string()),
        },
    );
    properties.insert(
        "timeout_ms".to_string(),
        JsonSchema::Number {
            description: Some("The timeout for the command in milliseconds".to_string()),
        },
    );
    OpenAiTool::Function(ResponsesApiTool {
        name: "shell".to_string(),
        description: "Runs a shell command and returns its output".to_string(),
        strict: false,
        parameters: JsonSchema::Object {
            properties,
            required: Some(vec!["command".to_string()]),
            additional_properties: Some(false),
        },
    })
 }
 fn create_unified_exec_tool() -> OpenAiTool {
    let mut properties = BTreeMap::new();
    properties.insert(
@@ -251,7 +205,7 @@ fn create_unified_exec_tool() -> OpenAiTool {
    })
 }
-fn create_shell_tool_for_sandbox(sandbox_policy: &SandboxPolicy) -> OpenAiTool {
+fn create_shell_tool() -> OpenAiTool {
    let mut properties = BTreeMap::new();
    properties.insert(
        "command".to_string(),
@@ -273,20 +227,18 @@ fn create_shell_tool_for_sandbox(sandbox_policy: &SandboxPolicy) -> OpenAiTool {
        },
    );
-    if !matches!(sandbox_policy, SandboxPolicy::DangerFullAccess) {
+    properties.insert(
        properties.insert(
        "with_escalated_permissions".to_string(),
        JsonSchema::Boolean {
            description: Some("Whether to request escalated permissions. Set to true if command needs to be run without sandbox restrictions".to_string()),
        },
    );
-        properties.insert(
+    properties.insert(
        "justification".to_string(),
        JsonSchema::String {
            description: Some("Only set if with_escalated_permissions is true. 1-sentence explanation of why we want to run this command.".to_string()),
        },
    );
    }
    OpenAiTool::Function(ResponsesApiTool {
        name: "shell".to_string(),
@@ -537,16 +489,13 @@ pub(crate) fn get_openai_tools(
        tools.push(create_unified_exec_tool());
    } else {
        match &config.shell_type {
-            ConfigShellToolType::DefaultShell => {
+            ConfigShellToolType::Default => {
                tools.push(create_shell_tool());
            }
-            ConfigShellToolType::ShellWithRequest { sandbox_policy } => {
+            ConfigShellToolType::Local => {
                tools.push(create_shell_tool_for_sandbox(sandbox_policy));
            }
            ConfigShellToolType::LocalShell => {
                tools.push(OpenAiTool::LocalShell {});
            }
-            ConfigShellToolType::StreamableShell => {
+            ConfigShellToolType::Streamable => {
                tools.push(OpenAiTool::Function(
                    crate::exec_command::create_exec_command_tool_for_responses_api(),
                ));
@@ -636,8 +585,6 @@ mod tests {
            .expect("codex-mini-latest should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: true,
            include_apply_patch_tool: false,
            include_web_search_request: true,
@@ -658,8 +605,6 @@ mod tests {
        let model_family = find_family_for_model("o3").expect("o3 should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: true,
            include_apply_patch_tool: false,
            include_web_search_request: true,
@@ -680,8 +625,6 @@ mod tests {
        let model_family = find_family_for_model("o3").expect("o3 should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: false,
            include_apply_patch_tool: false,
            include_web_search_request: true,
@@ -786,8 +729,6 @@ mod tests {
        let model_family = find_family_for_model("o3").expect("o3 should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: false,
            include_apply_patch_tool: false,
            include_web_search_request: false,
@@ -864,8 +805,6 @@ mod tests {
        let model_family = find_family_for_model("o3").expect("o3 should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: false,
            include_apply_patch_tool: false,
            include_web_search_request: true,
@@ -927,8 +866,6 @@ mod tests {
        let model_family = find_family_for_model("o3").expect("o3 should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: false,
            include_apply_patch_tool: false,
            include_web_search_request: true,
@@ -985,8 +922,6 @@ mod tests {
        let model_family = find_family_for_model("o3").expect("o3 should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: false,
            include_apply_patch_tool: false,
            include_web_search_request: true,
@@ -1046,8 +981,6 @@ mod tests {
        let model_family = find_family_for_model("o3").expect("o3 should be a valid model family");
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_family: &model_family,
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::ReadOnly,
            include_plan_tool: false,
            include_apply_patch_tool: false,
            include_web_search_request: true,
@@ -1100,14 +1033,8 @@ mod tests {
    }
    #[test]
-    fn test_shell_tool_for_sandbox_workspace_write() {
+    fn test_shell_tool() {
-        let sandbox_policy = SandboxPolicy::WorkspaceWrite {
+        let tool = super::create_shell_tool();
            writable_roots: vec!["workspace".into()],
            network_access: false,
            exclude_tmpdir_env_var: false,
            exclude_slash_tmp: false,
        };
        let tool = super::create_shell_tool_for_sandbox(&sandbox_policy);
        let OpenAiTool::Function(ResponsesApiTool {
            description, name, ..
        }) = &tool
@@ -1119,33 +1046,4 @@ mod tests {
        let expected = "Runs a shell command and returns its output.";
        assert_eq!(description, expected);
    }
    #[test]
    fn test_shell_tool_for_sandbox_readonly() {
        let tool = super::create_shell_tool_for_sandbox(&SandboxPolicy::ReadOnly);
        let OpenAiTool::Function(ResponsesApiTool {
            description, name, ..
        }) = &tool
        else {
            panic!("expected function tool");
        };
        assert_eq!(name, "shell");
        let expected = "Runs a shell command and returns its output.";
        assert_eq!(description, expected);
    }
    #[test]
    fn test_shell_tool_for_sandbox_danger_full_access() {
        let tool = super::create_shell_tool_for_sandbox(&SandboxPolicy::DangerFullAccess);
        let OpenAiTool::Function(ResponsesApiTool {
            description, name, ..
        }) = &tool
        else {
            panic!("expected function tool");
        };
        assert_eq!(name, "shell");
        assert_eq!(description, "Runs a shell command and returns its output.");
    }
 }