Added model summary and risk assessment for commands that violate sandbox policy (#5536)

This PR adds support for a model-based summary and risk assessment for commands that violate the sandbox policy and require user approval. This aids the user in evaluating whether the command should be approved. The feature works by taking a failed command and passing it back to the model and asking it to summarize the command, give it a risk level (low, medium, high) and a risk category (e.g. "data deletion" or "data exfiltration"). It uses a new conversation thread so the context in the existing thread doesn't influence the answer. If the call to the model fails or takes longer than 5 seconds, it falls back to the current behavior. For now, this is an experimental feature and is gated by a config key `experimental_sandbox_command_assessment`. Here is a screen shot of the approval prompt showing the risk assessment and summary. <img width="723" height="282" alt="image" src="https://github.com/user-attachments/assets/4597dd7c-d5a0-4e9f-9d13-414bd082fd6b" />
2025-10-24 17:23:44 -05:00
parent a4be4d78b9
commit f8af4f5c8d
31 changed files with 701 additions and 34 deletions
--- a/codex-rs/protocol/src/protocol.rs
+++ b/codex-rs/protocol/src/protocol.rs
@@ -34,6 +34,12 @@ use serde_with::serde_as;
 use strum_macros::Display;
 use ts_rs::TS;

+pub use crate::approvals::ApplyPatchApprovalRequestEvent;
+pub use crate::approvals::ExecApprovalRequestEvent;
+pub use crate::approvals::SandboxCommandAssessment;
+pub use crate::approvals::SandboxRiskCategory;
+pub use crate::approvals::SandboxRiskLevel;
+
 /// Open/close tags for special user-input blocks. Used across crates to avoid
 /// duplicated hardcoded strings.
 pub const USER_INSTRUCTIONS_OPEN_TAG: &str = "<user_instructions>";
@@ -1126,33 +1132,6 @@ pub struct ExecCommandOutputDeltaEvent {
    pub chunk: Vec<u8>,
 }

-#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
-pub struct ExecApprovalRequestEvent {
-    /// Identifier for the associated exec call, if available.
-    pub call_id: String,
-    /// The command to be executed.
-    pub command: Vec<String>,
-    /// The command's working directory.
-    pub cwd: PathBuf,
-    /// Optional human-readable reason for the approval (e.g. retry without sandbox).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub reason: Option<String>,
-    pub parsed_cmd: Vec<ParsedCommand>,
-}
-
-#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
-pub struct ApplyPatchApprovalRequestEvent {
-    /// Responses API call id for the associated patch apply call, if available.
-    pub call_id: String,
-    pub changes: HashMap<PathBuf, FileChange>,
-    /// Optional explanatory reason (e.g. request for extra write access).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub reason: Option<String>,
-    /// When set, the agent is asking the user to allow writes under this root for the remainder of the session.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub grant_root: Option<PathBuf>,
-}
-
 #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
 pub struct BackgroundEventEvent {
    pub message: String,