From a1641743a8d43af1d0a75c252bab07fab8de57a7 Mon Sep 17 00:00:00 2001 From: Michael Bolin Date: Thu, 24 Jul 2025 14:13:30 -0700 Subject: [PATCH] feat: expand the set of commands that can be safely identified as "trusted" (#1668) This PR updates `is_known_safe_command()` to account for "safe operators" to expand the set of commands that can be run without approval. This concept existed in the TypeScript CLI, and we are [finally!] porting it to the Rust one: https://github.com/openai/codex/blob/c9e2def49487585cfe6f8bb7b2be442e8c0b5e1b/codex-cli/src/approvals.ts#L531-L541 The idea is that if we have `EXPR1 SAFE_OP EXPR2` and `EXPR1` and `EXPR2` are considered safe independently, then `EXPR1 SAFE_OP EXPR2` should be considered safe. Currently, `SAFE_OP` includes `&&`, `||`, `;`, and `|`. In the TypeScript implementation, we relied on https://www.npmjs.com/package/shell-quote to parse the string of Bash, as it could provide a "lightweight" parse tree, parsing `'beep || boop > /byte'` as: ``` [ 'beep', { op: '||' }, 'boop', { op: '>' }, '/byte' ] ``` Though in this PR, we introduce the use of https://crates.io/crates/tree-sitter-bash for parsing (which incidentally we were already using in [`codex-apply-patch`](https://github.com/openai/codex/blob/c9e2def49487585cfe6f8bb7b2be442e8c0b5e1b/codex-rs/apply-patch/Cargo.toml#L18)), which gives us a richer parse tree. (Incidentally, if you have never played with tree-sitter, try the [playground](https://tree-sitter.github.io/tree-sitter/7-playground.html) and select **Bash** from the dropdown to see how it parses various expressions.) As a concrete example, prior to this change, our implementation of `is_known_safe_command()` could verify things like: ``` ["bash", "-lc", "grep -R \"Cargo.toml\" -n"] ``` but not: ``` ["bash", "-lc", "grep -R \"Cargo.toml\" -n || true"] ``` With this change, the version with `|| true` is also accepted. Admittedly, this PR does not expand the safety check to support subshells, so it would reject, e.g. `bash -lc 'ls || (pwd && echo hi)'`, but that can be addressed in a subsequent PR. --- codex-rs/core/src/bash.rs | 219 +++++++++++++++++++++++++++ codex-rs/core/src/is_safe_command.rs | 205 +++++++++---------------- codex-rs/core/src/lib.rs | 1 + 3 files changed, 292 insertions(+), 133 deletions(-) create mode 100644 codex-rs/core/src/bash.rs diff --git a/codex-rs/core/src/bash.rs b/codex-rs/core/src/bash.rs new file mode 100644 index 00000000..b9cd4443 --- /dev/null +++ b/codex-rs/core/src/bash.rs @@ -0,0 +1,219 @@ +use tree_sitter::Parser; +use tree_sitter::Tree; +use tree_sitter_bash::LANGUAGE as BASH; + +/// Parse the provided bash source using tree-sitter-bash, returning a Tree on +/// success or None if parsing failed. +pub fn try_parse_bash(bash_lc_arg: &str) -> Option { + let lang = BASH.into(); + let mut parser = Parser::new(); + #[expect(clippy::expect_used)] + parser.set_language(&lang).expect("load bash grammar"); + let old_tree: Option<&Tree> = None; + parser.parse(bash_lc_arg, old_tree) +} + +/// Parse a script which may contain multiple simple commands joined only by +/// the safe logical/pipe/sequencing operators: `&&`, `||`, `;`, `|`. +/// +/// Returns `Some(Vec)` if every command is a plain word‑only +/// command and the parse tree does not contain disallowed constructs +/// (parentheses, redirections, substitutions, control flow, etc.). Otherwise +/// returns `None`. +pub fn try_parse_word_only_commands_sequence(tree: &Tree, src: &str) -> Option>> { + if tree.root_node().has_error() { + return None; + } + + // List of allowed (named) node kinds for a "word only commands sequence". + // If we encounter a named node that is not in this list we reject. + const ALLOWED_KINDS: &[&str] = &[ + // top level containers + "program", + "list", + "pipeline", + // commands & words + "command", + "command_name", + "word", + "string", + "string_content", + "raw_string", + "number", + ]; + // Allow only safe punctuation / operator tokens; anything else causes reject. + const ALLOWED_PUNCT_TOKENS: &[&str] = &["&&", "||", ";", "|", "\"", "'"]; + + let root = tree.root_node(); + let mut cursor = root.walk(); + let mut stack = vec![root]; + let mut command_nodes = Vec::new(); + while let Some(node) = stack.pop() { + let kind = node.kind(); + if node.is_named() { + if !ALLOWED_KINDS.contains(&kind) { + return None; + } + if kind == "command" { + command_nodes.push(node); + } + } else { + // Reject any punctuation / operator tokens that are not explicitly allowed. + if kind.chars().any(|c| "&;|".contains(c)) && !ALLOWED_PUNCT_TOKENS.contains(&kind) { + return None; + } + if !(ALLOWED_PUNCT_TOKENS.contains(&kind) || kind.trim().is_empty()) { + // If it's a quote token or operator it's allowed above; we also allow whitespace tokens. + // Any other punctuation like parentheses, braces, redirects, backticks, etc are rejected. + return None; + } + } + for child in node.children(&mut cursor) { + stack.push(child); + } + } + + let mut commands = Vec::new(); + for node in command_nodes { + if let Some(words) = parse_plain_command_from_node(node, src) { + commands.push(words); + } else { + return None; + } + } + Some(commands) +} + +fn parse_plain_command_from_node(cmd: tree_sitter::Node, src: &str) -> Option> { + if cmd.kind() != "command" { + return None; + } + let mut words = Vec::new(); + let mut cursor = cmd.walk(); + for child in cmd.named_children(&mut cursor) { + match child.kind() { + "command_name" => { + let word_node = child.named_child(0)?; + if word_node.kind() != "word" { + return None; + } + words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned()); + } + "word" | "number" => { + words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned()); + } + "string" => { + if child.child_count() == 3 + && child.child(0)?.kind() == "\"" + && child.child(1)?.kind() == "string_content" + && child.child(2)?.kind() == "\"" + { + words.push(child.child(1)?.utf8_text(src.as_bytes()).ok()?.to_owned()); + } else { + return None; + } + } + "raw_string" => { + let raw_string = child.utf8_text(src.as_bytes()).ok()?; + let stripped = raw_string + .strip_prefix('\'') + .and_then(|s| s.strip_suffix('\'')); + if let Some(s) = stripped { + words.push(s.to_owned()); + } else { + return None; + } + } + _ => return None, + } + } + Some(words) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + use super::*; + + fn parse_seq(src: &str) -> Option>> { + let tree = try_parse_bash(src)?; + try_parse_word_only_commands_sequence(&tree, src) + } + + #[test] + fn accepts_single_simple_command() { + let cmds = parse_seq("ls -1").unwrap(); + assert_eq!(cmds, vec![vec!["ls".to_string(), "-1".to_string()]]); + } + + #[test] + fn accepts_multiple_commands_with_allowed_operators() { + let src = "ls && pwd; echo 'hi there' | wc -l"; + let cmds = parse_seq(src).unwrap(); + let expected: Vec> = vec![ + vec!["wc".to_string(), "-l".to_string()], + vec!["echo".to_string(), "hi there".to_string()], + vec!["pwd".to_string()], + vec!["ls".to_string()], + ]; + assert_eq!(cmds, expected); + } + + #[test] + fn extracts_double_and_single_quoted_strings() { + let cmds = parse_seq("echo \"hello world\"").unwrap(); + assert_eq!( + cmds, + vec![vec!["echo".to_string(), "hello world".to_string()]] + ); + + let cmds2 = parse_seq("echo 'hi there'").unwrap(); + assert_eq!( + cmds2, + vec![vec!["echo".to_string(), "hi there".to_string()]] + ); + } + + #[test] + fn accepts_numbers_as_words() { + let cmds = parse_seq("echo 123 456").unwrap(); + assert_eq!( + cmds, + vec![vec![ + "echo".to_string(), + "123".to_string(), + "456".to_string() + ]] + ); + } + + #[test] + fn rejects_parentheses_and_subshells() { + assert!(parse_seq("(ls)").is_none()); + assert!(parse_seq("ls || (pwd && echo hi)").is_none()); + } + + #[test] + fn rejects_redirections_and_unsupported_operators() { + assert!(parse_seq("ls > out.txt").is_none()); + assert!(parse_seq("echo hi & echo bye").is_none()); + } + + #[test] + fn rejects_command_and_process_substitutions_and_expansions() { + assert!(parse_seq("echo $(pwd)").is_none()); + assert!(parse_seq("echo `pwd`").is_none()); + assert!(parse_seq("echo $HOME").is_none()); + assert!(parse_seq("echo \"hi $USER\"").is_none()); + } + + #[test] + fn rejects_variable_assignment_prefix() { + assert!(parse_seq("FOO=bar ls").is_none()); + } + + #[test] + fn rejects_trailing_operator_parse_error() { + assert!(parse_seq("ls &&").is_none()); + } +} diff --git a/codex-rs/core/src/is_safe_command.rs b/codex-rs/core/src/is_safe_command.rs index 493650a4..f5f453f8 100644 --- a/codex-rs/core/src/is_safe_command.rs +++ b/codex-rs/core/src/is_safe_command.rs @@ -1,22 +1,34 @@ -use tree_sitter::Parser; -use tree_sitter::Tree; -use tree_sitter_bash::LANGUAGE as BASH; +use crate::bash::try_parse_bash; +use crate::bash::try_parse_word_only_commands_sequence; pub fn is_known_safe_command(command: &[String]) -> bool { if is_safe_to_call_with_exec(command) { return true; } - // TODO(mbolin): Also support safe commands that are piped together such - // as `cat foo | wc -l`. - matches!( - command, - [bash, flag, script] - if bash == "bash" - && flag == "-lc" - && try_parse_bash(script).and_then(|tree| - try_parse_single_word_only_command(&tree, script)).is_some_and(|parsed_bash_command| is_safe_to_call_with_exec(&parsed_bash_command)) - ) + // Support `bash -lc "..."` where the script consists solely of one or + // more "plain" commands (only bare words / quoted strings) combined with + // a conservative allow‑list of shell operators that themselves do not + // introduce side effects ( "&&", "||", ";", and "|" ). If every + // individual command in the script is itself a known‑safe command, then + // the composite expression is considered safe. + if let [bash, flag, script] = command { + if bash == "bash" && flag == "-lc" { + if let Some(tree) = try_parse_bash(script) { + if let Some(all_commands) = try_parse_word_only_commands_sequence(&tree, script) { + if !all_commands.is_empty() + && all_commands + .iter() + .all(|cmd| is_safe_to_call_with_exec(cmd)) + { + return true; + } + } + } + } + } + + false } fn is_safe_to_call_with_exec(command: &[String]) -> bool { @@ -109,90 +121,7 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool { } } -fn try_parse_bash(bash_lc_arg: &str) -> Option { - let lang = BASH.into(); - let mut parser = Parser::new(); - #[expect(clippy::expect_used)] - parser.set_language(&lang).expect("load bash grammar"); - - let old_tree: Option<&Tree> = None; - parser.parse(bash_lc_arg, old_tree) -} - -/// If `tree` represents a single Bash command whose name and every argument is -/// an ordinary `word`, return those words in order; otherwise, return `None`. -/// -/// `src` must be the exact source string that was parsed into `tree`, so we can -/// extract the text for every node. -pub fn try_parse_single_word_only_command(tree: &Tree, src: &str) -> Option> { - // Any parse error is an immediate rejection. - if tree.root_node().has_error() { - return None; - } - - // (program …) with exactly one statement - let root = tree.root_node(); - if root.kind() != "program" || root.named_child_count() != 1 { - return None; - } - - let cmd = root.named_child(0)?; // (command …) - if cmd.kind() != "command" { - return None; - } - - let mut words = Vec::new(); - let mut cursor = cmd.walk(); - - for child in cmd.named_children(&mut cursor) { - match child.kind() { - // The command name node wraps one `word` child. - "command_name" => { - let word_node = child.named_child(0)?; // make sure it's only a word - if word_node.kind() != "word" { - return None; - } - words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned()); - } - // Positional‑argument word (allowed). - "word" | "number" => { - words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned()); - } - "string" => { - if child.child_count() == 3 - && child.child(0)?.kind() == "\"" - && child.child(1)?.kind() == "string_content" - && child.child(2)?.kind() == "\"" - { - words.push(child.child(1)?.utf8_text(src.as_bytes()).ok()?.to_owned()); - } else { - // Anything else means the command is *not* plain words. - return None; - } - } - "concatenation" => { - // TODO: Consider things like `'ab\'a'`. - return None; - } - "raw_string" => { - // Raw string is a single word, but we need to strip the quotes. - let raw_string = child.utf8_text(src.as_bytes()).ok()?; - let stripped = raw_string - .strip_prefix('\'') - .and_then(|s| s.strip_suffix('\'')); - if let Some(stripped) = stripped { - words.push(stripped.to_owned()); - } else { - return None; - } - } - // Anything else means the command is *not* plain words. - _ => return None, - } - } - - Some(words) -} +// (bash parsing helpers implemented in crate::bash) /* ---------------------------------------------------------- Example @@ -230,6 +159,7 @@ fn is_valid_sed_n_arg(arg: Option<&str>) -> bool { _ => false, } } + #[cfg(test)] mod tests { #![allow(clippy::unwrap_used)] @@ -353,6 +283,30 @@ mod tests { ]))); } + #[test] + fn bash_lc_safe_examples_with_operators() { + assert!(is_known_safe_command(&vec_str(&[ + "bash", + "-lc", + "grep -R \"Cargo.toml\" -n || true" + ]))); + assert!(is_known_safe_command(&vec_str(&[ + "bash", + "-lc", + "ls && pwd" + ]))); + assert!(is_known_safe_command(&vec_str(&[ + "bash", + "-lc", + "echo 'hi' ; ls" + ]))); + assert!(is_known_safe_command(&vec_str(&[ + "bash", + "-lc", + "ls | wc -l" + ]))); + } + #[test] fn bash_lc_unsafe_examples() { assert!( @@ -366,44 +320,29 @@ mod tests { assert!( !is_known_safe_command(&vec_str(&["bash", "-lc", "find . -name file.txt -delete"])), - "Unsafe find option should not be auto‑approved." - ); - } - - #[test] - fn test_try_parse_single_word_only_command() { - let script_with_single_quoted_string = "sed -n '1,5p' file.txt"; - let parsed_words = try_parse_bash(script_with_single_quoted_string) - .and_then(|tree| { - try_parse_single_word_only_command(&tree, script_with_single_quoted_string) - }) - .unwrap(); - assert_eq!( - vec![ - "sed".to_string(), - "-n".to_string(), - // Ensure the single quotes are properly removed. - "1,5p".to_string(), - "file.txt".to_string() - ], - parsed_words, + "Unsafe find option should not be auto-approved." ); - let script_with_number_arg = "ls -1"; - let parsed_words = try_parse_bash(script_with_number_arg) - .and_then(|tree| try_parse_single_word_only_command(&tree, script_with_number_arg)) - .unwrap(); - assert_eq!(vec!["ls", "-1"], parsed_words,); + // Disallowed because of unsafe command in sequence. + assert!( + !is_known_safe_command(&vec_str(&["bash", "-lc", "ls && rm -rf /"])), + "Sequence containing unsafe command must be rejected" + ); - let script_with_double_quoted_string_with_no_funny_stuff_arg = "grep -R \"Cargo.toml\" -n"; - let parsed_words = try_parse_bash(script_with_double_quoted_string_with_no_funny_stuff_arg) - .and_then(|tree| { - try_parse_single_word_only_command( - &tree, - script_with_double_quoted_string_with_no_funny_stuff_arg, - ) - }) - .unwrap(); - assert_eq!(vec!["grep", "-R", "Cargo.toml", "-n"], parsed_words); + // Disallowed because of parentheses / subshell. + assert!( + !is_known_safe_command(&vec_str(&["bash", "-lc", "(ls)"])), + "Parentheses (subshell) are not provably safe with the current parser" + ); + assert!( + !is_known_safe_command(&vec_str(&["bash", "-lc", "ls || (pwd && echo hi)"])), + "Nested parentheses are not provably safe with the current parser" + ); + + // Disallowed redirection. + assert!( + !is_known_safe_command(&vec_str(&["bash", "-lc", "ls > out.txt"])), + "> redirection should be rejected" + ); } } diff --git a/codex-rs/core/src/lib.rs b/codex-rs/core/src/lib.rs index 4e69e94b..12321e0a 100644 --- a/codex-rs/core/src/lib.rs +++ b/codex-rs/core/src/lib.rs @@ -5,6 +5,7 @@ // the TUI or the tracing stack). #![deny(clippy::print_stdout, clippy::print_stderr)] +mod bash; mod chat_completions; mod client; mod client_common;