From d5853d9c47b1badad183f62622745cf47e6ff0f4 Mon Sep 17 00:00:00 2001
From: Eric Traut <etraut@openai.com>
Date: Sat, 1 Nov 2025 16:52:23 -0500
Subject: [PATCH] Changes to sandbox command assessment feature based on
 initial experiment feedback (#6091)

* Removed sandbox risk categories; feedback indicates that these are not
that useful and "less is more"
* Tweaked the assessment prompt to generate terser answers
* Fixed bug in orchestrator that prevents this feature from being
exposed in the extension
---
 codex-rs/core/src/sandboxing/assessment.rs    | 28 ++++---------------
 codex-rs/core/src/tools/orchestrator.rs       | 11 +++++++-
 .../templates/sandboxing/assessment_prompt.md | 11 +++-----
 codex-rs/otel/src/otel_event_manager.rs       | 12 --------
 codex-rs/protocol/src/approvals.rs            | 28 -------------------
 codex-rs/protocol/src/protocol.rs             |  1 -
 .../tui/src/bottom_pane/approval_overlay.rs   | 27 +-----------------
 7 files changed, 20 insertions(+), 98 deletions(-)
diff --git a/codex-rs/core/src/sandboxing/assessment.rs b/codex-rs/core/src/sandboxing/assessment.rs
index c7310c1f..31e76777 100644
--- a/codex-rs/core/src/sandboxing/assessment.rs
+++ b/codex-rs/core/src/sandboxing/assessment.rs
@@ -25,16 +25,6 @@ use tracing::warn;
 
 const SANDBOX_ASSESSMENT_TIMEOUT: Duration = Duration::from_secs(5);
 
-const SANDBOX_RISK_CATEGORY_VALUES: &[&str] = &[
-    "data_deletion",
-    "data_exfiltration",
-    "privilege_escalation",
-    "system_modification",
-    "network_access",
-    "resource_exhaustion",
-    "compliance",
-];
-
 #[derive(Template)]
 #[template(path = "sandboxing/assessment_prompt.md", escape = "none")]
 struct SandboxAssessmentPromptTemplate<'a> {
@@ -176,27 +166,26 @@ pub(crate) async fn assess_command(
                     call_id,
                     "success",
                     Some(assessment.risk_level),
-                    &assessment.risk_categories,
                     duration,
                 );
                 return Some(assessment);
             }
             Err(err) => {
                 warn!("failed to parse sandbox assessment JSON: {err}");
-                parent_otel.sandbox_assessment(call_id, "parse_error", None, &[], duration);
+                parent_otel.sandbox_assessment(call_id, "parse_error", None, duration);
             }
         },
         Ok(Ok(None)) => {
             warn!("sandbox assessment response did not include any message");
-            parent_otel.sandbox_assessment(call_id, "no_output", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "no_output", None, duration);
         }
         Ok(Err(err)) => {
             warn!("sandbox assessment failed: {err}");
-            parent_otel.sandbox_assessment(call_id, "model_error", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "model_error", None, duration);
         }
         Err(_) => {
             warn!("sandbox assessment timed out");
-            parent_otel.sandbox_assessment(call_id, "timeout", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "timeout", None, duration);
         }
     }
 
@@ -229,7 +218,7 @@ fn sandbox_roots_for_prompt(policy: &SandboxPolicy, cwd: &Path) -> Vec<PathBuf>
 fn sandbox_assessment_schema() -> serde_json::Value {
     json!({
         "type": "object",
-        "required": ["description", "risk_level", "risk_categories"],
+        "required": ["description", "risk_level"],
         "properties": {
             "description": {
                 "type": "string",
@@ -240,13 +229,6 @@ fn sandbox_assessment_schema() -> serde_json::Value {
                 "type": "string",
                 "enum": ["low", "medium", "high"]
             },
-            "risk_categories": {
-                "type": "array",
-                "items": {
-                    "type": "string",
-                    "enum": SANDBOX_RISK_CATEGORY_VALUES
-                }
-            }
         },
         "additionalProperties": false
     })
diff --git a/codex-rs/core/src/tools/orchestrator.rs b/codex-rs/core/src/tools/orchestrator.rs
index f334c4ef..878e48e8 100644
--- a/codex-rs/core/src/tools/orchestrator.rs
+++ b/codex-rs/core/src/tools/orchestrator.rs
@@ -54,12 +54,21 @@ impl ToolOrchestrator {
         let mut already_approved = false;
 
         if needs_initial_approval {
+            let mut risk = None;
+
+            if let Some(metadata) = req.sandbox_retry_data() {
+                risk = tool_ctx
+                    .session
+                    .assess_sandbox_command(turn_ctx, &tool_ctx.call_id, &metadata.command, None)
+                    .await;
+            }
+
             let approval_ctx = ApprovalCtx {
                 session: tool_ctx.session,
                 turn: turn_ctx,
                 call_id: &tool_ctx.call_id,
                 retry_reason: None,
-                risk: None,
+                risk,
             };
             let decision = tool.start_approval_async(req, approval_ctx).await;
 
diff --git a/codex-rs/core/templates/sandboxing/assessment_prompt.md b/codex-rs/core/templates/sandboxing/assessment_prompt.md
index 7e5664a0..dc59f47d 100644
--- a/codex-rs/core/templates/sandboxing/assessment_prompt.md
+++ b/codex-rs/core/templates/sandboxing/assessment_prompt.md
@@ -1,13 +1,10 @@
-You are a security analyst evaluating shell commands that were blocked by a sandbox. Given the provided metadata, summarize the command's likely intent and assess the risk. Return strictly valid JSON with the keys:
-- description (concise summary, at most two sentences)
+You are a security analyst evaluating shell commands that were blocked by a sandbox. Given the provided metadata, summarize the command's likely intent and assess the risk to help the user decide whether to approve command execution. Return strictly valid JSON with the keys:
+- description (concise summary of command intent and potential effects, no more than one sentence, use present tense)
 - risk_level ("low", "medium", or "high")
-- risk_categories (optional array of zero or more category strings)
 Risk level examples:
-- low: read-only inspections, listing files, printing configuration
-- medium: modifying project files, installing dependencies, fetching artifacts from trusted sources
+- low: read-only inspections, listing files, printing configuration, fetching artifacts from trusted sources
+- medium: modifying project files, installing dependencies
 - high: deleting or overwriting data, exfiltrating secrets, escalating privileges, or disabling security controls
-Recognized risk_categories: data_deletion, data_exfiltration, privilege_escalation, system_modification, network_access, resource_exhaustion, compliance.
-Use multiple categories when appropriate.
 If information is insufficient, choose the most cautious risk level supported by the evidence.
 Respond with JSON only, without markdown code fences or extra commentary.
 
diff --git a/codex-rs/otel/src/otel_event_manager.rs b/codex-rs/otel/src/otel_event_manager.rs
index 4006df17..5d9cbd49 100644
--- a/codex-rs/otel/src/otel_event_manager.rs
+++ b/codex-rs/otel/src/otel_event_manager.rs
@@ -8,7 +8,6 @@ use codex_protocol::models::ResponseItem;
 use codex_protocol::protocol::AskForApproval;
 use codex_protocol::protocol::ReviewDecision;
 use codex_protocol::protocol::SandboxPolicy;
-use codex_protocol::protocol::SandboxRiskCategory;
 use codex_protocol::protocol::SandboxRiskLevel;
 use codex_protocol::user_input::UserInput;
 use eventsource_stream::Event as StreamEvent;
@@ -373,19 +372,9 @@ impl OtelEventManager {
         call_id: &str,
         status: &str,
         risk_level: Option<SandboxRiskLevel>,
-        risk_categories: &[SandboxRiskCategory],
         duration: Duration,
     ) {
         let level = risk_level.map(|level| level.as_str());
-        let categories = if risk_categories.is_empty() {
-            String::new()
-        } else {
-            risk_categories
-                .iter()
-                .map(SandboxRiskCategory::as_str)
-                .collect::<Vec<_>>()
-                .join(", ")
-        };
 
         tracing::event!(
             tracing::Level::INFO,
@@ -402,7 +391,6 @@ impl OtelEventManager {
             call_id = %call_id,
             status = %status,
             risk_level = level,
-            risk_categories = categories,
             duration_ms = %duration.as_millis(),
         );
     }
diff --git a/codex-rs/protocol/src/approvals.rs b/codex-rs/protocol/src/approvals.rs
index d608dba6..3227ddd1 100644
--- a/codex-rs/protocol/src/approvals.rs
+++ b/codex-rs/protocol/src/approvals.rs
@@ -16,24 +16,10 @@ pub enum SandboxRiskLevel {
     High,
 }
 
-#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, Hash, JsonSchema, TS)]
-#[serde(rename_all = "snake_case")]
-pub enum SandboxRiskCategory {
-    DataDeletion,
-    DataExfiltration,
-    PrivilegeEscalation,
-    SystemModification,
-    NetworkAccess,
-    ResourceExhaustion,
-    Compliance,
-}
-
 #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
 pub struct SandboxCommandAssessment {
     pub description: String,
     pub risk_level: SandboxRiskLevel,
-    #[serde(default, skip_serializing_if = "Vec::is_empty")]
-    pub risk_categories: Vec<SandboxRiskCategory>,
 }
 
 impl SandboxRiskLevel {
@@ -46,20 +32,6 @@ impl SandboxRiskLevel {
     }
 }
 
-impl SandboxRiskCategory {
-    pub fn as_str(&self) -> &'static str {
-        match self {
-            Self::DataDeletion => "data_deletion",
-            Self::DataExfiltration => "data_exfiltration",
-            Self::PrivilegeEscalation => "privilege_escalation",
-            Self::SystemModification => "system_modification",
-            Self::NetworkAccess => "network_access",
-            Self::ResourceExhaustion => "resource_exhaustion",
-            Self::Compliance => "compliance",
-        }
-    }
-}
-
 #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
 pub struct ExecApprovalRequestEvent {
     /// Identifier for the associated exec call, if available.
diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs
index 413a3aea..c30ba42b 100644
--- a/codex-rs/protocol/src/protocol.rs
+++ b/codex-rs/protocol/src/protocol.rs
@@ -37,7 +37,6 @@ use ts_rs::TS;
 pub use crate::approvals::ApplyPatchApprovalRequestEvent;
 pub use crate::approvals::ExecApprovalRequestEvent;
 pub use crate::approvals::SandboxCommandAssessment;
-pub use crate::approvals::SandboxRiskCategory;
 pub use crate::approvals::SandboxRiskLevel;
 
 /// Open/close tags for special user-input blocks. Used across crates to avoid
diff --git a/codex-rs/tui/src/bottom_pane/approval_overlay.rs b/codex-rs/tui/src/bottom_pane/approval_overlay.rs
index ba368700..140b5386 100644
--- a/codex-rs/tui/src/bottom_pane/approval_overlay.rs
+++ b/codex-rs/tui/src/bottom_pane/approval_overlay.rs
@@ -20,7 +20,6 @@ use codex_core::protocol::FileChange;
 use codex_core::protocol::Op;
 use codex_core::protocol::ReviewDecision;
 use codex_core::protocol::SandboxCommandAssessment;
-use codex_core::protocol::SandboxRiskCategory;
 use codex_core::protocol::SandboxRiskLevel;
 use crossterm::event::KeyCode;
 use crossterm::event::KeyEvent;
@@ -356,35 +355,11 @@ fn render_risk_lines(risk: &SandboxCommandAssessment) -> Vec<Line<'static>> {
         ]));
     }
 
-    let mut spans: Vec<Span<'static>> = vec!["Risk: ".into(), level_span];
-    if !risk.risk_categories.is_empty() {
-        spans.push(" (".into());
-        for (idx, category) in risk.risk_categories.iter().enumerate() {
-            if idx > 0 {
-                spans.push(", ".into());
-            }
-            spans.push(risk_category_label(*category).into());
-        }
-        spans.push(")".into());
-    }
-
-    lines.push(Line::from(spans));
+    lines.push(vec!["Risk: ".into(), level_span].into());
     lines.push(Line::from(""));
     lines
 }
 
-fn risk_category_label(category: SandboxRiskCategory) -> &'static str {
-    match category {
-        SandboxRiskCategory::DataDeletion => "data deletion",
-        SandboxRiskCategory::DataExfiltration => "data exfiltration",
-        SandboxRiskCategory::PrivilegeEscalation => "privilege escalation",
-        SandboxRiskCategory::SystemModification => "system modification",
-        SandboxRiskCategory::NetworkAccess => "network access",
-        SandboxRiskCategory::ResourceExhaustion => "resource exhaustion",
-        SandboxRiskCategory::Compliance => "compliance",
-    }
-}
-
 #[derive(Clone)]
 enum ApprovalVariant {
     Exec { id: String, command: Vec<String> },