Changes to sandbox command assessment feature based on initial experiment feedback (#6091)

* Removed sandbox risk categories; feedback indicates that these are not that useful and "less is more".
* Tweaked the assessment prompt to generate terser answers.
* Fixed a bug in the orchestrator that prevented this feature from being exposed in the extension.
2025-11-01 16:52:23 -05:00
parent d9118c04bf
commit d5853d9c47
7 changed files with 20 additions and 98 deletions
--- a/codex-rs/core/src/sandboxing/assessment.rs
+++ b/codex-rs/core/src/sandboxing/assessment.rs
@@ -25,16 +25,6 @@ use tracing::warn;

 const SANDBOX_ASSESSMENT_TIMEOUT: Duration = Duration::from_secs(5);

-const SANDBOX_RISK_CATEGORY_VALUES: &[&str] = &[
-    "data_deletion",
-    "data_exfiltration",
-    "privilege_escalation",
-    "system_modification",
-    "network_access",
-    "resource_exhaustion",
-    "compliance",
-];
-
 #[derive(Template)]
 #[template(path = "sandboxing/assessment_prompt.md", escape = "none")]
 struct SandboxAssessmentPromptTemplate<'a> {
@@ -176,27 +166,26 @@ pub(crate) async fn assess_command(
                    call_id,
                    "success",
                    Some(assessment.risk_level),
-                    &assessment.risk_categories,
                    duration,
                );
                return Some(assessment);
            }
            Err(err) => {
                warn!("failed to parse sandbox assessment JSON: {err}");
-                parent_otel.sandbox_assessment(call_id, "parse_error", None, &[], duration);
+                parent_otel.sandbox_assessment(call_id, "parse_error", None, duration);
            }
        },
        Ok(Ok(None)) => {
            warn!("sandbox assessment response did not include any message");
-            parent_otel.sandbox_assessment(call_id, "no_output", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "no_output", None, duration);
        }
        Ok(Err(err)) => {
            warn!("sandbox assessment failed: {err}");
-            parent_otel.sandbox_assessment(call_id, "model_error", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "model_error", None, duration);
        }
        Err(_) => {
            warn!("sandbox assessment timed out");
-            parent_otel.sandbox_assessment(call_id, "timeout", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "timeout", None, duration);
        }
    }

@@ -229,7 +218,7 @@ fn sandbox_roots_for_prompt(policy: &SandboxPolicy, cwd: &Path) -> Vec<PathBuf>
 fn sandbox_assessment_schema() -> serde_json::Value {
    json!({
        "type": "object",
-        "required": ["description", "risk_level", "risk_categories"],
+        "required": ["description", "risk_level"],
        "properties": {
            "description": {
                "type": "string",
@@ -240,13 +229,6 @@ fn sandbox_assessment_schema() -> serde_json::Value {
                "type": "string",
                "enum": ["low", "medium", "high"]
            },
-            "risk_categories": {
-                "type": "array",
-                "items": {
-                    "type": "string",
-                    "enum": SANDBOX_RISK_CATEGORY_VALUES
-                }
-            }
        },
        "additionalProperties": false
    })
--- a/codex-rs/core/src/tools/orchestrator.rs
+++ b/codex-rs/core/src/tools/orchestrator.rs
@@ -54,12 +54,21 @@ impl ToolOrchestrator {
        let mut already_approved = false;

        if needs_initial_approval {
+            let mut risk = None;
+
+            if let Some(metadata) = req.sandbox_retry_data() {
+                risk = tool_ctx
+                    .session
+                    .assess_sandbox_command(turn_ctx, &tool_ctx.call_id, &metadata.command, None)
+                    .await;
+            }
+
            let approval_ctx = ApprovalCtx {
                session: tool_ctx.session,
                turn: turn_ctx,
                call_id: &tool_ctx.call_id,
                retry_reason: None,
-                risk: None,
+                risk,
            };
            let decision = tool.start_approval_async(req, approval_ctx).await;

--- a/codex-rs/core/templates/sandboxing/assessment_prompt.md
+++ b/codex-rs/core/templates/sandboxing/assessment_prompt.md
@@ -1,13 +1,10 @@
-You are a security analyst evaluating shell commands that were blocked by a sandbox. Given the provided metadata, summarize the command's likely intent and assess the risk. Return strictly valid JSON with the keys:
- description (concise summary, at most two sentences)
+You are a security analyst evaluating shell commands that were blocked by a sandbox. Given the provided metadata, summarize the command's likely intent and assess the risk to help the user decide whether to approve command execution. Return strictly valid JSON with the keys:
+- description (concise summary of command intent and potential effects, no more than one sentence, use present tense)
 - risk_level ("low", "medium", or "high")
- risk_categories (optional array of zero or more category strings)
 Risk level examples:
- low: read-only inspections, listing files, printing configuration
- medium: modifying project files, installing dependencies, fetching artifacts from trusted sources
+- low: read-only inspections, listing files, printing configuration, fetching artifacts from trusted sources
+- medium: modifying project files, installing dependencies
 - high: deleting or overwriting data, exfiltrating secrets, escalating privileges, or disabling security controls
-Recognized risk_categories: data_deletion, data_exfiltration, privilege_escalation, system_modification, network_access, resource_exhaustion, compliance.
-Use multiple categories when appropriate.
 If information is insufficient, choose the most cautious risk level supported by the evidence.
 Respond with JSON only, without markdown code fences or extra commentary.

--- a/codex-rs/otel/src/otel_event_manager.rs
+++ b/codex-rs/otel/src/otel_event_manager.rs
@@ -8,7 +8,6 @@ use codex_protocol::models::ResponseItem;
 use codex_protocol::protocol::AskForApproval;
 use codex_protocol::protocol::ReviewDecision;
 use codex_protocol::protocol::SandboxPolicy;
-use codex_protocol::protocol::SandboxRiskCategory;
 use codex_protocol::protocol::SandboxRiskLevel;
 use codex_protocol::user_input::UserInput;
 use eventsource_stream::Event as StreamEvent;
@@ -373,19 +372,9 @@ impl OtelEventManager {
        call_id: &str,
        status: &str,
        risk_level: Option<SandboxRiskLevel>,
-        risk_categories: &[SandboxRiskCategory],
        duration: Duration,
    ) {
        let level = risk_level.map(|level| level.as_str());
-        let categories = if risk_categories.is_empty() {
-            String::new()
-        } else {
-            risk_categories
-                .iter()
-                .map(SandboxRiskCategory::as_str)
-                .collect::<Vec<_>>()
-                .join(", ")
-        };

        tracing::event!(
            tracing::Level::INFO,
@@ -402,7 +391,6 @@ impl OtelEventManager {
            call_id = %call_id,
            status = %status,
            risk_level = level,
-            risk_categories = categories,
            duration_ms = %duration.as_millis(),
        );
    }
--- a/codex-rs/protocol/src/approvals.rs
+++ b/codex-rs/protocol/src/approvals.rs
@@ -16,24 +16,10 @@ pub enum SandboxRiskLevel {
    High,
 }

-#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, Hash, JsonSchema, TS)]
-#[serde(rename_all = "snake_case")]
-pub enum SandboxRiskCategory {
-    DataDeletion,
-    DataExfiltration,
-    PrivilegeEscalation,
-    SystemModification,
-    NetworkAccess,
-    ResourceExhaustion,
-    Compliance,
-}
-
 #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
 pub struct SandboxCommandAssessment {
    pub description: String,
    pub risk_level: SandboxRiskLevel,
-    #[serde(default, skip_serializing_if = "Vec::is_empty")]
-    pub risk_categories: Vec<SandboxRiskCategory>,
 }

 impl SandboxRiskLevel {
@@ -46,20 +32,6 @@ impl SandboxRiskLevel {
    }
 }

-impl SandboxRiskCategory {
-    pub fn as_str(&self) -> &'static str {
-        match self {
-            Self::DataDeletion => "data_deletion",
-            Self::DataExfiltration => "data_exfiltration",
-            Self::PrivilegeEscalation => "privilege_escalation",
-            Self::SystemModification => "system_modification",
-            Self::NetworkAccess => "network_access",
-            Self::ResourceExhaustion => "resource_exhaustion",
-            Self::Compliance => "compliance",
-        }
-    }
-}
-
 #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
 pub struct ExecApprovalRequestEvent {
    /// Identifier for the associated exec call, if available.
--- a/codex-rs/protocol/src/protocol.rs
+++ b/codex-rs/protocol/src/protocol.rs
@@ -37,7 +37,6 @@ use ts_rs::TS;
 pub use crate::approvals::ApplyPatchApprovalRequestEvent;
 pub use crate::approvals::ExecApprovalRequestEvent;
 pub use crate::approvals::SandboxCommandAssessment;
-pub use crate::approvals::SandboxRiskCategory;
 pub use crate::approvals::SandboxRiskLevel;

 /// Open/close tags for special user-input blocks. Used across crates to avoid
--- a/codex-rs/tui/src/bottom_pane/approval_overlay.rs
+++ b/codex-rs/tui/src/bottom_pane/approval_overlay.rs
@@ -20,7 +20,6 @@ use codex_core::protocol::FileChange;
 use codex_core::protocol::Op;
 use codex_core::protocol::ReviewDecision;
 use codex_core::protocol::SandboxCommandAssessment;
-use codex_core::protocol::SandboxRiskCategory;
 use codex_core::protocol::SandboxRiskLevel;
 use crossterm::event::KeyCode;
 use crossterm::event::KeyEvent;
@@ -356,35 +355,11 @@ fn render_risk_lines(risk: &SandboxCommandAssessment) -> Vec<Line<'static>> {
        ]));
    }

-    let mut spans: Vec<Span<'static>> = vec!["Risk: ".into(), level_span];
-    if !risk.risk_categories.is_empty() {
-        spans.push(" (".into());
-        for (idx, category) in risk.risk_categories.iter().enumerate() {
-            if idx > 0 {
-                spans.push(", ".into());
-            }
-            spans.push(risk_category_label(*category).into());
-        }
-        spans.push(")".into());
-    }
-
-    lines.push(Line::from(spans));
+    lines.push(vec!["Risk: ".into(), level_span].into());
    lines.push(Line::from(""));
    lines
 }

-fn risk_category_label(category: SandboxRiskCategory) -> &'static str {
-    match category {
-        SandboxRiskCategory::DataDeletion => "data deletion",
-        SandboxRiskCategory::DataExfiltration => "data exfiltration",
-        SandboxRiskCategory::PrivilegeEscalation => "privilege escalation",
-        SandboxRiskCategory::SystemModification => "system modification",
-        SandboxRiskCategory::NetworkAccess => "network access",
-        SandboxRiskCategory::ResourceExhaustion => "resource exhaustion",
-        SandboxRiskCategory::Compliance => "compliance",
-    }
-}
-
 #[derive(Clone)]
 enum ApprovalVariant {
    Exec { id: String, command: Vec<String> },