Files
llmx/codex-rs/core/src/bash.rs
Michael Bolin a1641743a8 feat: expand the set of commands that can be safely identified as "trusted" (#1668)
This PR updates `is_known_safe_command()` to account for "safe
operators" to expand the set of commands that can be run without
approval. This concept existed in the TypeScript CLI, and we are
[finally!] porting it to the Rust one:


c9e2def494/codex-cli/src/approvals.ts (L531-L541)

The idea is that if we have `EXPR1 SAFE_OP EXPR2` and `EXPR1` and
`EXPR2` are considered safe independently, then `EXPR1 SAFE_OP EXPR2`
should be considered safe. Currently, `SAFE_OP` includes `&&`, `||`,
`;`, and `|`.

In the TypeScript implementation, we relied on
https://www.npmjs.com/package/shell-quote to parse the string of Bash,
as it could provide a "lightweight" parse tree, parsing `'beep || boop >
/byte'` as:

```
[ 'beep', { op: '||' }, 'boop', { op: '>' }, '/byte' ]
```

Though in this PR, we introduce the use of
https://crates.io/crates/tree-sitter-bash for parsing (which
incidentally we were already using in
[`codex-apply-patch`](c9e2def494/codex-rs/apply-patch/Cargo.toml (L18))),
which gives us a richer parse tree. (Incidentally, if you have never
played with tree-sitter, try the
[playground](https://tree-sitter.github.io/tree-sitter/7-playground.html)
and select **Bash** from the dropdown to see how it parses various
expressions.)

As a concrete example, prior to this change, our implementation of
`is_known_safe_command()` could verify things like:

```
["bash", "-lc", "grep -R \"Cargo.toml\" -n"]
```

but not:

```
["bash", "-lc", "grep -R \"Cargo.toml\" -n || true"]
```

With this change, the version with `|| true` is also accepted.

Admittedly, this PR does not expand the safety check to support
subshells, so it would reject, e.g. `bash -lc 'ls || (pwd && echo hi)'`,
but that can be addressed in a subsequent PR.
2025-07-24 14:13:30 -07:00

220 lines
7.0 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
use tree_sitter::Parser;
use tree_sitter::Tree;
use tree_sitter_bash::LANGUAGE as BASH;
/// Parse the provided bash source using tree-sitter-bash, returning a Tree on
/// success or None if parsing failed.
pub fn try_parse_bash(bash_lc_arg: &str) -> Option<Tree> {
let lang = BASH.into();
let mut parser = Parser::new();
#[expect(clippy::expect_used)]
parser.set_language(&lang).expect("load bash grammar");
let old_tree: Option<&Tree> = None;
parser.parse(bash_lc_arg, old_tree)
}
/// Parse a script which may contain multiple simple commands joined only by
/// the safe logical/pipe/sequencing operators: `&&`, `||`, `;`, `|`.
///
/// Returns `Some(Vec<command_words>)` if every command is a plain wordonly
/// command and the parse tree does not contain disallowed constructs
/// (parentheses, redirections, substitutions, control flow, etc.). Otherwise
/// returns `None`.
pub fn try_parse_word_only_commands_sequence(tree: &Tree, src: &str) -> Option<Vec<Vec<String>>> {
if tree.root_node().has_error() {
return None;
}
// List of allowed (named) node kinds for a "word only commands sequence".
// If we encounter a named node that is not in this list we reject.
const ALLOWED_KINDS: &[&str] = &[
// top level containers
"program",
"list",
"pipeline",
// commands & words
"command",
"command_name",
"word",
"string",
"string_content",
"raw_string",
"number",
];
// Allow only safe punctuation / operator tokens; anything else causes reject.
const ALLOWED_PUNCT_TOKENS: &[&str] = &["&&", "||", ";", "|", "\"", "'"];
let root = tree.root_node();
let mut cursor = root.walk();
let mut stack = vec![root];
let mut command_nodes = Vec::new();
while let Some(node) = stack.pop() {
let kind = node.kind();
if node.is_named() {
if !ALLOWED_KINDS.contains(&kind) {
return None;
}
if kind == "command" {
command_nodes.push(node);
}
} else {
// Reject any punctuation / operator tokens that are not explicitly allowed.
if kind.chars().any(|c| "&;|".contains(c)) && !ALLOWED_PUNCT_TOKENS.contains(&kind) {
return None;
}
if !(ALLOWED_PUNCT_TOKENS.contains(&kind) || kind.trim().is_empty()) {
// If it's a quote token or operator it's allowed above; we also allow whitespace tokens.
// Any other punctuation like parentheses, braces, redirects, backticks, etc are rejected.
return None;
}
}
for child in node.children(&mut cursor) {
stack.push(child);
}
}
let mut commands = Vec::new();
for node in command_nodes {
if let Some(words) = parse_plain_command_from_node(node, src) {
commands.push(words);
} else {
return None;
}
}
Some(commands)
}
fn parse_plain_command_from_node(cmd: tree_sitter::Node, src: &str) -> Option<Vec<String>> {
if cmd.kind() != "command" {
return None;
}
let mut words = Vec::new();
let mut cursor = cmd.walk();
for child in cmd.named_children(&mut cursor) {
match child.kind() {
"command_name" => {
let word_node = child.named_child(0)?;
if word_node.kind() != "word" {
return None;
}
words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned());
}
"word" | "number" => {
words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned());
}
"string" => {
if child.child_count() == 3
&& child.child(0)?.kind() == "\""
&& child.child(1)?.kind() == "string_content"
&& child.child(2)?.kind() == "\""
{
words.push(child.child(1)?.utf8_text(src.as_bytes()).ok()?.to_owned());
} else {
return None;
}
}
"raw_string" => {
let raw_string = child.utf8_text(src.as_bytes()).ok()?;
let stripped = raw_string
.strip_prefix('\'')
.and_then(|s| s.strip_suffix('\''));
if let Some(s) = stripped {
words.push(s.to_owned());
} else {
return None;
}
}
_ => return None,
}
}
Some(words)
}
#[cfg(test)]
mod tests {
#![allow(clippy::unwrap_used)]
use super::*;
fn parse_seq(src: &str) -> Option<Vec<Vec<String>>> {
let tree = try_parse_bash(src)?;
try_parse_word_only_commands_sequence(&tree, src)
}
#[test]
fn accepts_single_simple_command() {
let cmds = parse_seq("ls -1").unwrap();
assert_eq!(cmds, vec![vec!["ls".to_string(), "-1".to_string()]]);
}
#[test]
fn accepts_multiple_commands_with_allowed_operators() {
let src = "ls && pwd; echo 'hi there' | wc -l";
let cmds = parse_seq(src).unwrap();
let expected: Vec<Vec<String>> = vec![
vec!["wc".to_string(), "-l".to_string()],
vec!["echo".to_string(), "hi there".to_string()],
vec!["pwd".to_string()],
vec!["ls".to_string()],
];
assert_eq!(cmds, expected);
}
#[test]
fn extracts_double_and_single_quoted_strings() {
let cmds = parse_seq("echo \"hello world\"").unwrap();
assert_eq!(
cmds,
vec![vec!["echo".to_string(), "hello world".to_string()]]
);
let cmds2 = parse_seq("echo 'hi there'").unwrap();
assert_eq!(
cmds2,
vec![vec!["echo".to_string(), "hi there".to_string()]]
);
}
#[test]
fn accepts_numbers_as_words() {
let cmds = parse_seq("echo 123 456").unwrap();
assert_eq!(
cmds,
vec![vec![
"echo".to_string(),
"123".to_string(),
"456".to_string()
]]
);
}
#[test]
fn rejects_parentheses_and_subshells() {
assert!(parse_seq("(ls)").is_none());
assert!(parse_seq("ls || (pwd && echo hi)").is_none());
}
#[test]
fn rejects_redirections_and_unsupported_operators() {
assert!(parse_seq("ls > out.txt").is_none());
assert!(parse_seq("echo hi & echo bye").is_none());
}
#[test]
fn rejects_command_and_process_substitutions_and_expansions() {
assert!(parse_seq("echo $(pwd)").is_none());
assert!(parse_seq("echo `pwd`").is_none());
assert!(parse_seq("echo $HOME").is_none());
assert!(parse_seq("echo \"hi $USER\"").is_none());
}
#[test]
fn rejects_variable_assignment_prefix() {
assert!(parse_seq("FOO=bar ls").is_none());
}
#[test]
fn rejects_trailing_operator_parse_error() {
assert!(parse_seq("ls &&").is_none());
}
}