codex-rs/core/src/bash.rs

use tree_sitter::Parser;
use tree_sitter::Tree;
use tree_sitter_bash::LANGUAGE as BASH;

/// Parse the provided bash source using tree-sitter-bash, returning a Tree on
/// success or None if parsing failed.
pub fn try_parse_bash(bash_lc_arg: &str) -> Option<Tree> {
    let lang = BASH.into();
    let mut parser = Parser::new();
    #[expect(clippy::expect_used)]
    parser.set_language(&lang).expect("load bash grammar");
    let old_tree: Option<&Tree> = None;
    parser.parse(bash_lc_arg, old_tree)
}

/// Parse a script which may contain multiple simple commands joined only by
/// the safe logical/pipe/sequencing operators: `&&`, `||`, `;`, `|`.
///
/// Returns `Some(Vec<command_words>)` if every command is a plain word‑only
/// command and the parse tree does not contain disallowed constructs
/// (parentheses, redirections, substitutions, control flow, etc.). Otherwise
/// returns `None`.
pub fn try_parse_word_only_commands_sequence(tree: &Tree, src: &str) -> Option<Vec<Vec<String>>> {
    if tree.root_node().has_error() {
        return None;
    }

    // List of allowed (named) node kinds for a "word only commands sequence".
    // If we encounter a named node that is not in this list we reject.
    const ALLOWED_KINDS: &[&str] = &[
        // top level containers
        "program",
        "list",
        "pipeline",
        // commands & words
        "command",
        "command_name",
        "word",
        "string",
        "string_content",
        "raw_string",
        "number",
    ];
    // Allow only safe punctuation / operator tokens; anything else causes reject.
    const ALLOWED_PUNCT_TOKENS: &[&str] = &["&&", "||", ";", "|", "\"", "'"];

    let root = tree.root_node();
    let mut cursor = root.walk();
    let mut stack = vec![root];
    let mut command_nodes = Vec::new();
    while let Some(node) = stack.pop() {
        let kind = node.kind();
        if node.is_named() {
            if !ALLOWED_KINDS.contains(&kind) {
                return None;
            }
            if kind == "command" {
                command_nodes.push(node);
            }
        } else {
            // Reject any punctuation / operator tokens that are not explicitly allowed.
            if kind.chars().any(|c| "&;|".contains(c)) && !ALLOWED_PUNCT_TOKENS.contains(&kind) {
                return None;
            }
            if !(ALLOWED_PUNCT_TOKENS.contains(&kind) || kind.trim().is_empty()) {
                // If it's a quote token or operator it's allowed above; we also allow whitespace tokens.
                // Any other punctuation like parentheses, braces, redirects, backticks, etc are rejected.
                return None;
            }
        }
        for child in node.children(&mut cursor) {
            stack.push(child);
        }
    }

    let mut commands = Vec::new();
    for node in command_nodes {
        if let Some(words) = parse_plain_command_from_node(node, src) {
            commands.push(words);
        } else {
            return None;
        }
    }
    Some(commands)
}

fn parse_plain_command_from_node(cmd: tree_sitter::Node, src: &str) -> Option<Vec<String>> {
    if cmd.kind() != "command" {
        return None;
    }
    let mut words = Vec::new();
    let mut cursor = cmd.walk();
    for child in cmd.named_children(&mut cursor) {
        match child.kind() {
            "command_name" => {
                let word_node = child.named_child(0)?;
                if word_node.kind() != "word" {
                    return None;
                }
                words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned());
            }
            "word" | "number" => {
                words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned());
            }
            "string" => {
                if child.child_count() == 3
                    && child.child(0)?.kind() == "\""
                    && child.child(1)?.kind() == "string_content"
                    && child.child(2)?.kind() == "\""
                {
                    words.push(child.child(1)?.utf8_text(src.as_bytes()).ok()?.to_owned());
                } else {
                    return None;
                }
            }
            "raw_string" => {
                let raw_string = child.utf8_text(src.as_bytes()).ok()?;
                let stripped = raw_string
                    .strip_prefix('\'')
                    .and_then(|s| s.strip_suffix('\''));
                if let Some(s) = stripped {
                    words.push(s.to_owned());
                } else {
                    return None;
                }
            }
            _ => return None,
        }
    }
    Some(words)
}

#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)]
    use super::*;

    fn parse_seq(src: &str) -> Option<Vec<Vec<String>>> {
        let tree = try_parse_bash(src)?;
        try_parse_word_only_commands_sequence(&tree, src)
    }

    #[test]
    fn accepts_single_simple_command() {
        let cmds = parse_seq("ls -1").unwrap();
        assert_eq!(cmds, vec![vec!["ls".to_string(), "-1".to_string()]]);
    }

    #[test]
    fn accepts_multiple_commands_with_allowed_operators() {
        let src = "ls && pwd; echo 'hi there' | wc -l";
        let cmds = parse_seq(src).unwrap();
        let expected: Vec<Vec<String>> = vec![
            vec!["wc".to_string(), "-l".to_string()],
            vec!["echo".to_string(), "hi there".to_string()],
            vec!["pwd".to_string()],
            vec!["ls".to_string()],
        ];
        assert_eq!(cmds, expected);
    }

    #[test]
    fn extracts_double_and_single_quoted_strings() {
        let cmds = parse_seq("echo \"hello world\"").unwrap();
        assert_eq!(
            cmds,
            vec![vec!["echo".to_string(), "hello world".to_string()]]
        );

        let cmds2 = parse_seq("echo 'hi there'").unwrap();
        assert_eq!(
            cmds2,
            vec![vec!["echo".to_string(), "hi there".to_string()]]
        );
    }

    #[test]
    fn accepts_numbers_as_words() {
        let cmds = parse_seq("echo 123 456").unwrap();
        assert_eq!(
            cmds,
            vec![vec![
                "echo".to_string(),
                "123".to_string(),
                "456".to_string()
            ]]
        );
    }

    #[test]
    fn rejects_parentheses_and_subshells() {
        assert!(parse_seq("(ls)").is_none());
        assert!(parse_seq("ls || (pwd && echo hi)").is_none());
    }

    #[test]
    fn rejects_redirections_and_unsupported_operators() {
        assert!(parse_seq("ls > out.txt").is_none());
        assert!(parse_seq("echo hi & echo bye").is_none());
    }

    #[test]
    fn rejects_command_and_process_substitutions_and_expansions() {
        assert!(parse_seq("echo $(pwd)").is_none());
        assert!(parse_seq("echo `pwd`").is_none());
        assert!(parse_seq("echo $HOME").is_none());
        assert!(parse_seq("echo \"hi $USER\"").is_none());
    }

    #[test]
    fn rejects_variable_assignment_prefix() {
        assert!(parse_seq("FOO=bar ls").is_none());
    }

    #[test]
    fn rejects_trailing_operator_parse_error() {
        assert!(parse_seq("ls &&").is_none());
    }
}
-												feat: expand the set of commands that can be safely identified as "trusted" (#1668)

This PR updates `is_known_safe_command()` to account for "safe
operators" to expand the set of commands that can be run without
approval. This concept existed in the TypeScript CLI, and we are
[finally!] porting it to the Rust one:


https://github.com/openai/codex/blob/c9e2def49487585cfe6f8bb7b2be442e8c0b5e1b/codex-cli/src/approvals.ts#L531-L541

The idea is that if we have `EXPR1 SAFE_OP EXPR2` and `EXPR1` and
`EXPR2` are considered safe independently, then `EXPR1 SAFE_OP EXPR2`
should be considered safe. Currently, `SAFE_OP` includes `&&`, `||`,
`;`, and `|`.

In the TypeScript implementation, we relied on
https://www.npmjs.com/package/shell-quote to parse the string of Bash,
as it could provide a "lightweight" parse tree, parsing `'beep || boop >
/byte'` as:

```
[ 'beep', { op: '||' }, 'boop', { op: '>' }, '/byte' ]
```

Though in this PR, we introduce the use of
https://crates.io/crates/tree-sitter-bash for parsing (which
incidentally we were already using in
[`codex-apply-patch`](https://github.com/openai/codex/blob/c9e2def49487585cfe6f8bb7b2be442e8c0b5e1b/codex-rs/apply-patch/Cargo.toml#L18)),
which gives us a richer parse tree. (Incidentally, if you have never
played with tree-sitter, try the
[playground](https://tree-sitter.github.io/tree-sitter/7-playground.html)
and select **Bash** from the dropdown to see how it parses various
expressions.)

As a concrete example, prior to this change, our implementation of
`is_known_safe_command()` could verify things like:

```
["bash", "-lc", "grep -R \"Cargo.toml\" -n"]
```

but not:

```
["bash", "-lc", "grep -R \"Cargo.toml\" -n || true"]
```

With this change, the version with `|| true` is also accepted.

Admittedly, this PR does not expand the safety check to support
subshells, so it would reject, e.g. `bash -lc 'ls || (pwd && echo hi)'`,
but that can be addressed in a subsequent PR.
											
										
										
											2025-07-24 14:13:30 -07:00
+								use tree_sitter::Parser;
 								use tree_sitter::Tree;
 								use tree_sitter_bash::LANGUAGE as BASH;
 								/// Parse the provided bash source using tree-sitter-bash, returning a Tree on
 								/// success or None if parsing failed.
 								pub fn try_parse_bash(bash_lc_arg: &str) -> Option<Tree> {
 								    let lang = BASH.into();
 								    let mut parser = Parser::new();
 								    #[expect(clippy::expect_used)]
 								    parser.set_language(&lang).expect("load bash grammar");
 								    let old_tree: Option<&Tree> = None;
 								    parser.parse(bash_lc_arg, old_tree)
 								}
 								/// Parse a script which may contain multiple simple commands joined only by
 								/// the safe logical/pipe/sequencing operators: `&&`, `||`, `;`, `|`.
 								///
 								/// Returns `Some(Vec<command_words>)` if every command is a plain word‑only
 								/// command and the parse tree does not contain disallowed constructs
 								/// (parentheses, redirections, substitutions, control flow, etc.). Otherwise
 								/// returns `None`.
 								pub fn try_parse_word_only_commands_sequence(tree: &Tree, src: &str) -> Option<Vec<Vec<String>>> {
 								    if tree.root_node().has_error() {
 								        return None;
 								    }
 								    // List of allowed (named) node kinds for a "word only commands sequence".
 								    // If we encounter a named node that is not in this list we reject.
 								    const ALLOWED_KINDS: &[&str] = &[
 								        // top level containers
 								        "program",
 								        "list",
 								        "pipeline",
 								        // commands & words
 								        "command",
 								        "command_name",
 								        "word",
 								        "string",
 								        "string_content",
 								        "raw_string",
 								        "number",
 								    ];
 								    // Allow only safe punctuation / operator tokens; anything else causes reject.
 								    const ALLOWED_PUNCT_TOKENS: &[&str] = &["&&", "||", ";", "|", "\"", "'"];
 								    let root = tree.root_node();
 								    let mut cursor = root.walk();
 								    let mut stack = vec![root];
 								    let mut command_nodes = Vec::new();
 								    while let Some(node) = stack.pop() {
 								        let kind = node.kind();
 								        if node.is_named() {
 								            if !ALLOWED_KINDS.contains(&kind) {
 								                return None;
 								            }
 								            if kind == "command" {
 								                command_nodes.push(node);
 								            }
 								        } else {
 								            // Reject any punctuation / operator tokens that are not explicitly allowed.
 								            if kind.chars().any(|c| "&;|".contains(c)) && !ALLOWED_PUNCT_TOKENS.contains(&kind) {
 								                return None;
 								            }
 								            if !(ALLOWED_PUNCT_TOKENS.contains(&kind) || kind.trim().is_empty()) {
 								                // If it's a quote token or operator it's allowed above; we also allow whitespace tokens.
 								                // Any other punctuation like parentheses, braces, redirects, backticks, etc are rejected.
 								                return None;
 								            }
 								        }
 								        for child in node.children(&mut cursor) {
 								            stack.push(child);
 								        }
 								    }
 								    let mut commands = Vec::new();
 								    for node in command_nodes {
 								        if let Some(words) = parse_plain_command_from_node(node, src) {
 								            commands.push(words);
 								        } else {
 								            return None;
 								        }
 								    }
 								    Some(commands)
 								}
 								fn parse_plain_command_from_node(cmd: tree_sitter::Node, src: &str) -> Option<Vec<String>> {
 								    if cmd.kind() != "command" {
 								        return None;
 								    }
 								    let mut words = Vec::new();
 								    let mut cursor = cmd.walk();
 								    for child in cmd.named_children(&mut cursor) {
 								        match child.kind() {
 								            "command_name" => {
 								                let word_node = child.named_child(0)?;
 								                if word_node.kind() != "word" {
 								                    return None;
 								                }
 								                words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned());
 								            }
 								            "word" | "number" => {
 								                words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned());
 								            }
 								            "string" => {
 								                if child.child_count() == 3
 								                    && child.child(0)?.kind() == "\""
 								                    && child.child(1)?.kind() == "string_content"
 								                    && child.child(2)?.kind() == "\""
 								                {
 								                    words.push(child.child(1)?.utf8_text(src.as_bytes()).ok()?.to_owned());
 								                } else {
 								                    return None;
 								                }
 								            }
 								            "raw_string" => {
 								                let raw_string = child.utf8_text(src.as_bytes()).ok()?;
 								                let stripped = raw_string
 								                    .strip_prefix('\'')
 								                    .and_then(|s| s.strip_suffix('\''));
 								                if let Some(s) = stripped {
 								                    words.push(s.to_owned());
 								                } else {
 								                    return None;
 								                }
 								            }
 								            _ => return None,
 								        }
 								    }
 								    Some(words)
 								}
 								#[cfg(test)]
 								mod tests {
 								    #![allow(clippy::unwrap_used)]
 								    use super::*;
 								    fn parse_seq(src: &str) -> Option<Vec<Vec<String>>> {
 								        let tree = try_parse_bash(src)?;
 								        try_parse_word_only_commands_sequence(&tree, src)
 								    }
 								    #[test]
 								    fn accepts_single_simple_command() {
 								        let cmds = parse_seq("ls -1").unwrap();
 								        assert_eq!(cmds, vec![vec!["ls".to_string(), "-1".to_string()]]);
 								    }
 								    #[test]
 								    fn accepts_multiple_commands_with_allowed_operators() {
 								        let src = "ls && pwd; echo 'hi there' | wc -l";
 								        let cmds = parse_seq(src).unwrap();
 								        let expected: Vec<Vec<String>> = vec![
 								            vec!["wc".to_string(), "-l".to_string()],
 								            vec!["echo".to_string(), "hi there".to_string()],
 								            vec!["pwd".to_string()],
 								            vec!["ls".to_string()],
 								        ];
 								        assert_eq!(cmds, expected);
 								    }
 								    #[test]
 								    fn extracts_double_and_single_quoted_strings() {
 								        let cmds = parse_seq("echo \"hello world\"").unwrap();
 								        assert_eq!(
 								            cmds,
 								            vec![vec!["echo".to_string(), "hello world".to_string()]]
 								        );
 								        let cmds2 = parse_seq("echo 'hi there'").unwrap();
 								        assert_eq!(
 								            cmds2,
 								            vec![vec!["echo".to_string(), "hi there".to_string()]]
 								        );
 								    }
 								    #[test]
 								    fn accepts_numbers_as_words() {
 								        let cmds = parse_seq("echo 123 456").unwrap();
 								        assert_eq!(
 								            cmds,
 								            vec![vec![
 								                "echo".to_string(),
 								                "123".to_string(),
 								                "456".to_string()
 								            ]]
 								        );
 								    }
 								    #[test]
 								    fn rejects_parentheses_and_subshells() {
 								        assert!(parse_seq("(ls)").is_none());
 								        assert!(parse_seq("ls || (pwd && echo hi)").is_none());
 								    }
 								    #[test]
 								    fn rejects_redirections_and_unsupported_operators() {
 								        assert!(parse_seq("ls > out.txt").is_none());
 								        assert!(parse_seq("echo hi & echo bye").is_none());
 								    }
 								    #[test]
 								    fn rejects_command_and_process_substitutions_and_expansions() {
 								        assert!(parse_seq("echo $(pwd)").is_none());
 								        assert!(parse_seq("echo `pwd`").is_none());
 								        assert!(parse_seq("echo $HOME").is_none());
 								        assert!(parse_seq("echo \"hi $USER\"").is_none());
 								    }
 								    #[test]
 								    fn rejects_variable_assignment_prefix() {
 								        assert!(parse_seq("FOO=bar ls").is_none());
 								    }
 								    #[test]
 								    fn rejects_trailing_operator_parse_error() {
 								        assert!(parse_seq("ls &&").is_none());
 								    }
 								}