Changes to sandbox command assessment feature based on initial experiment feedback (#6091)

* Removed sandbox risk categories; feedback indicates that they are not that useful and that "less is more".
* Tweaked the assessment prompt to generate terser answers.
* Fixed a bug in the orchestrator that prevented this feature from being exposed in the extension.
2025-11-01 16:52:23 -05:00
parent d9118c04bf
commit d5853d9c47
7 changed files with 20 additions and 98 deletions
--- a/codex-rs/core/src/sandboxing/assessment.rs
+++ b/codex-rs/core/src/sandboxing/assessment.rs
@@ -25,16 +25,6 @@ use tracing::warn;

 const SANDBOX_ASSESSMENT_TIMEOUT: Duration = Duration::from_secs(5);

-const SANDBOX_RISK_CATEGORY_VALUES: &[&str] = &[
-    "data_deletion",
-    "data_exfiltration",
-    "privilege_escalation",
-    "system_modification",
-    "network_access",
-    "resource_exhaustion",
-    "compliance",
-];
-
 #[derive(Template)]
 #[template(path = "sandboxing/assessment_prompt.md", escape = "none")]
 struct SandboxAssessmentPromptTemplate<'a> {
@@ -176,27 +166,26 @@ pub(crate) async fn assess_command(
                    call_id,
                    "success",
                    Some(assessment.risk_level),
-                    &assessment.risk_categories,
                    duration,
                );
                return Some(assessment);
            }
            Err(err) => {
                warn!("failed to parse sandbox assessment JSON: {err}");
-                parent_otel.sandbox_assessment(call_id, "parse_error", None, &[], duration);
+                parent_otel.sandbox_assessment(call_id, "parse_error", None, duration);
            }
        },
        Ok(Ok(None)) => {
            warn!("sandbox assessment response did not include any message");
-            parent_otel.sandbox_assessment(call_id, "no_output", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "no_output", None, duration);
        }
        Ok(Err(err)) => {
            warn!("sandbox assessment failed: {err}");
-            parent_otel.sandbox_assessment(call_id, "model_error", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "model_error", None, duration);
        }
        Err(_) => {
            warn!("sandbox assessment timed out");
-            parent_otel.sandbox_assessment(call_id, "timeout", None, &[], duration);
+            parent_otel.sandbox_assessment(call_id, "timeout", None, duration);
        }
    }

@@ -229,7 +218,7 @@ fn sandbox_roots_for_prompt(policy: &SandboxPolicy, cwd: &Path) -> Vec<PathBuf>
 fn sandbox_assessment_schema() -> serde_json::Value {
    json!({
        "type": "object",
-        "required": ["description", "risk_level", "risk_categories"],
+        "required": ["description", "risk_level"],
        "properties": {
            "description": {
                "type": "string",
@@ -240,13 +229,6 @@ fn sandbox_assessment_schema() -> serde_json::Value {
                "type": "string",
                "enum": ["low", "medium", "high"]
            },
-            "risk_categories": {
-                "type": "array",
-                "items": {
-                    "type": "string",
-                    "enum": SANDBOX_RISK_CATEGORY_VALUES
-                }
-            }
        },
        "additionalProperties": false
    })
--- a/codex-rs/core/src/tools/orchestrator.rs
+++ b/codex-rs/core/src/tools/orchestrator.rs
@@ -54,12 +54,21 @@ impl ToolOrchestrator {
        let mut already_approved = false;

        if needs_initial_approval {
+            let mut risk = None;
+
+            if let Some(metadata) = req.sandbox_retry_data() {
+                risk = tool_ctx
+                    .session
+                    .assess_sandbox_command(turn_ctx, &tool_ctx.call_id, &metadata.command, None)
+                    .await;
+            }
+
            let approval_ctx = ApprovalCtx {
                session: tool_ctx.session,
                turn: turn_ctx,
                call_id: &tool_ctx.call_id,
                retry_reason: None,
-                risk: None,
+                risk,
            };
            let decision = tool.start_approval_async(req, approval_ctx).await;