Changes to sandbox command assessment feature based on initial experiment feedback (#6091)

* Removed sandbox risk categories; feedback indicates that these are not
that useful and "less is more"
* Tweaked the assessment prompt to generate terser answers
* Fixed bug in orchestrator that prevents this feature from being
exposed in the extension
This commit is contained in:
Eric Traut
2025-11-01 16:52:23 -05:00
committed by GitHub
parent d9118c04bf
commit d5853d9c47
7 changed files with 20 additions and 98 deletions

View File

@@ -25,16 +25,6 @@ use tracing::warn;
const SANDBOX_ASSESSMENT_TIMEOUT: Duration = Duration::from_secs(5);
const SANDBOX_RISK_CATEGORY_VALUES: &[&str] = &[
"data_deletion",
"data_exfiltration",
"privilege_escalation",
"system_modification",
"network_access",
"resource_exhaustion",
"compliance",
];
#[derive(Template)]
#[template(path = "sandboxing/assessment_prompt.md", escape = "none")]
struct SandboxAssessmentPromptTemplate<'a> {
@@ -176,27 +166,26 @@ pub(crate) async fn assess_command(
call_id,
"success",
Some(assessment.risk_level),
&assessment.risk_categories,
duration,
);
return Some(assessment);
}
Err(err) => {
warn!("failed to parse sandbox assessment JSON: {err}");
parent_otel.sandbox_assessment(call_id, "parse_error", None, &[], duration);
parent_otel.sandbox_assessment(call_id, "parse_error", None, duration);
}
},
Ok(Ok(None)) => {
warn!("sandbox assessment response did not include any message");
parent_otel.sandbox_assessment(call_id, "no_output", None, &[], duration);
parent_otel.sandbox_assessment(call_id, "no_output", None, duration);
}
Ok(Err(err)) => {
warn!("sandbox assessment failed: {err}");
parent_otel.sandbox_assessment(call_id, "model_error", None, &[], duration);
parent_otel.sandbox_assessment(call_id, "model_error", None, duration);
}
Err(_) => {
warn!("sandbox assessment timed out");
parent_otel.sandbox_assessment(call_id, "timeout", None, &[], duration);
parent_otel.sandbox_assessment(call_id, "timeout", None, duration);
}
}
@@ -229,7 +218,7 @@ fn sandbox_roots_for_prompt(policy: &SandboxPolicy, cwd: &Path) -> Vec<PathBuf>
fn sandbox_assessment_schema() -> serde_json::Value {
json!({
"type": "object",
"required": ["description", "risk_level", "risk_categories"],
"required": ["description", "risk_level"],
"properties": {
"description": {
"type": "string",
@@ -240,13 +229,6 @@ fn sandbox_assessment_schema() -> serde_json::Value {
"type": "string",
"enum": ["low", "medium", "high"]
},
"risk_categories": {
"type": "array",
"items": {
"type": "string",
"enum": SANDBOX_RISK_CATEGORY_VALUES
}
}
},
"additionalProperties": false
})

View File

@@ -54,12 +54,21 @@ impl ToolOrchestrator {
let mut already_approved = false;
if needs_initial_approval {
let mut risk = None;
if let Some(metadata) = req.sandbox_retry_data() {
risk = tool_ctx
.session
.assess_sandbox_command(turn_ctx, &tool_ctx.call_id, &metadata.command, None)
.await;
}
let approval_ctx = ApprovalCtx {
session: tool_ctx.session,
turn: turn_ctx,
call_id: &tool_ctx.call_id,
retry_reason: None,
risk: None,
risk,
};
let decision = tool.start_approval_async(req, approval_ctx).await;