Files
llmx/codex-rs/core/src/bash.rs

220 lines
7.0 KiB
Rust
Raw Normal View History

feat: expand the set of commands that can be safely identified as "trusted" (#1668) This PR updates `is_known_safe_command()` to account for "safe operators" to expand the set of commands that can be run without approval. This concept existed in the TypeScript CLI, and we are [finally!] porting it to the Rust one: https://github.com/openai/codex/blob/c9e2def49487585cfe6f8bb7b2be442e8c0b5e1b/codex-cli/src/approvals.ts#L531-L541 The idea is that if we have `EXPR1 SAFE_OP EXPR2` and `EXPR1` and `EXPR2` are considered safe independently, then `EXPR1 SAFE_OP EXPR2` should be considered safe. Currently, `SAFE_OP` includes `&&`, `||`, `;`, and `|`. In the TypeScript implementation, we relied on https://www.npmjs.com/package/shell-quote to parse the string of Bash, as it could provide a "lightweight" parse tree, parsing `'beep || boop > /byte'` as: ``` [ 'beep', { op: '||' }, 'boop', { op: '>' }, '/byte' ] ``` Though in this PR, we introduce the use of https://crates.io/crates/tree-sitter-bash for parsing (which incidentally we were already using in [`codex-apply-patch`](https://github.com/openai/codex/blob/c9e2def49487585cfe6f8bb7b2be442e8c0b5e1b/codex-rs/apply-patch/Cargo.toml#L18)), which gives us a richer parse tree. (Incidentally, if you have never played with tree-sitter, try the [playground](https://tree-sitter.github.io/tree-sitter/7-playground.html) and select **Bash** from the dropdown to see how it parses various expressions.) As a concrete example, prior to this change, our implementation of `is_known_safe_command()` could verify things like: ``` ["bash", "-lc", "grep -R \"Cargo.toml\" -n"] ``` but not: ``` ["bash", "-lc", "grep -R \"Cargo.toml\" -n || true"] ``` With this change, the version with `|| true` is also accepted. Admittedly, this PR does not expand the safety check to support subshells, so it would reject, e.g. `bash -lc 'ls || (pwd && echo hi)'`, but that can be addressed in a subsequent PR.
2025-07-24 14:13:30 -07:00
use tree_sitter::Parser;
use tree_sitter::Tree;
use tree_sitter_bash::LANGUAGE as BASH;
/// Parse the provided bash source using tree-sitter-bash, returning a Tree on
/// success or None if parsing failed.
pub fn try_parse_bash(bash_lc_arg: &str) -> Option<Tree> {
let lang = BASH.into();
let mut parser = Parser::new();
#[expect(clippy::expect_used)]
parser.set_language(&lang).expect("load bash grammar");
let old_tree: Option<&Tree> = None;
parser.parse(bash_lc_arg, old_tree)
}
/// Parse a script which may contain multiple simple commands joined only by
/// the safe logical/pipe/sequencing operators: `&&`, `||`, `;`, `|`.
///
/// Returns `Some(Vec<command_words>)` if every command is a plain wordonly
/// command and the parse tree does not contain disallowed constructs
/// (parentheses, redirections, substitutions, control flow, etc.). Otherwise
/// returns `None`.
pub fn try_parse_word_only_commands_sequence(tree: &Tree, src: &str) -> Option<Vec<Vec<String>>> {
if tree.root_node().has_error() {
return None;
}
// List of allowed (named) node kinds for a "word only commands sequence".
// If we encounter a named node that is not in this list we reject.
const ALLOWED_KINDS: &[&str] = &[
// top level containers
"program",
"list",
"pipeline",
// commands & words
"command",
"command_name",
"word",
"string",
"string_content",
"raw_string",
"number",
];
// Allow only safe punctuation / operator tokens; anything else causes reject.
const ALLOWED_PUNCT_TOKENS: &[&str] = &["&&", "||", ";", "|", "\"", "'"];
let root = tree.root_node();
let mut cursor = root.walk();
let mut stack = vec![root];
let mut command_nodes = Vec::new();
while let Some(node) = stack.pop() {
let kind = node.kind();
if node.is_named() {
if !ALLOWED_KINDS.contains(&kind) {
return None;
}
if kind == "command" {
command_nodes.push(node);
}
} else {
// Reject any punctuation / operator tokens that are not explicitly allowed.
if kind.chars().any(|c| "&;|".contains(c)) && !ALLOWED_PUNCT_TOKENS.contains(&kind) {
return None;
}
if !(ALLOWED_PUNCT_TOKENS.contains(&kind) || kind.trim().is_empty()) {
// If it's a quote token or operator it's allowed above; we also allow whitespace tokens.
// Any other punctuation like parentheses, braces, redirects, backticks, etc are rejected.
return None;
}
}
for child in node.children(&mut cursor) {
stack.push(child);
}
}
let mut commands = Vec::new();
for node in command_nodes {
if let Some(words) = parse_plain_command_from_node(node, src) {
commands.push(words);
} else {
return None;
}
}
Some(commands)
}
fn parse_plain_command_from_node(cmd: tree_sitter::Node, src: &str) -> Option<Vec<String>> {
if cmd.kind() != "command" {
return None;
}
let mut words = Vec::new();
let mut cursor = cmd.walk();
for child in cmd.named_children(&mut cursor) {
match child.kind() {
"command_name" => {
let word_node = child.named_child(0)?;
if word_node.kind() != "word" {
return None;
}
words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned());
}
"word" | "number" => {
words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned());
}
"string" => {
if child.child_count() == 3
&& child.child(0)?.kind() == "\""
&& child.child(1)?.kind() == "string_content"
&& child.child(2)?.kind() == "\""
{
words.push(child.child(1)?.utf8_text(src.as_bytes()).ok()?.to_owned());
} else {
return None;
}
}
"raw_string" => {
let raw_string = child.utf8_text(src.as_bytes()).ok()?;
let stripped = raw_string
.strip_prefix('\'')
.and_then(|s| s.strip_suffix('\''));
if let Some(s) = stripped {
words.push(s.to_owned());
} else {
return None;
}
}
_ => return None,
}
}
Some(words)
}
#[cfg(test)]
mod tests {
#![allow(clippy::unwrap_used)]
use super::*;
fn parse_seq(src: &str) -> Option<Vec<Vec<String>>> {
let tree = try_parse_bash(src)?;
try_parse_word_only_commands_sequence(&tree, src)
}
#[test]
fn accepts_single_simple_command() {
let cmds = parse_seq("ls -1").unwrap();
assert_eq!(cmds, vec![vec!["ls".to_string(), "-1".to_string()]]);
}
#[test]
fn accepts_multiple_commands_with_allowed_operators() {
let src = "ls && pwd; echo 'hi there' | wc -l";
let cmds = parse_seq(src).unwrap();
let expected: Vec<Vec<String>> = vec![
vec!["wc".to_string(), "-l".to_string()],
vec!["echo".to_string(), "hi there".to_string()],
vec!["pwd".to_string()],
vec!["ls".to_string()],
];
assert_eq!(cmds, expected);
}
#[test]
fn extracts_double_and_single_quoted_strings() {
let cmds = parse_seq("echo \"hello world\"").unwrap();
assert_eq!(
cmds,
vec![vec!["echo".to_string(), "hello world".to_string()]]
);
let cmds2 = parse_seq("echo 'hi there'").unwrap();
assert_eq!(
cmds2,
vec![vec!["echo".to_string(), "hi there".to_string()]]
);
}
#[test]
fn accepts_numbers_as_words() {
let cmds = parse_seq("echo 123 456").unwrap();
assert_eq!(
cmds,
vec![vec![
"echo".to_string(),
"123".to_string(),
"456".to_string()
]]
);
}
#[test]
fn rejects_parentheses_and_subshells() {
assert!(parse_seq("(ls)").is_none());
assert!(parse_seq("ls || (pwd && echo hi)").is_none());
}
#[test]
fn rejects_redirections_and_unsupported_operators() {
assert!(parse_seq("ls > out.txt").is_none());
assert!(parse_seq("echo hi & echo bye").is_none());
}
#[test]
fn rejects_command_and_process_substitutions_and_expansions() {
assert!(parse_seq("echo $(pwd)").is_none());
assert!(parse_seq("echo `pwd`").is_none());
assert!(parse_seq("echo $HOME").is_none());
assert!(parse_seq("echo \"hi $USER\"").is_none());
}
#[test]
fn rejects_variable_assignment_prefix() {
assert!(parse_seq("FOO=bar ls").is_none());
}
#[test]
fn rejects_trailing_operator_parse_error() {
assert!(parse_seq("ls &&").is_none());
}
}