Without proper `zsh -lc` parsing, we lose some things like proper command parsing, turn diff tracking, safe command checks, and other things we expect from raw or `bash -lc` commands.
246 lines
8.0 KiB
Rust
246 lines
8.0 KiB
Rust
use tree_sitter::Node;
|
||
use tree_sitter::Parser;
|
||
use tree_sitter::Tree;
|
||
use tree_sitter_bash::LANGUAGE as BASH;
|
||
|
||
/// Parse the provided bash source using tree-sitter-bash, returning a Tree on
|
||
/// success or None if parsing failed.
|
||
pub fn try_parse_shell(shell_lc_arg: &str) -> Option<Tree> {
|
||
let lang = BASH.into();
|
||
let mut parser = Parser::new();
|
||
#[expect(clippy::expect_used)]
|
||
parser.set_language(&lang).expect("load bash grammar");
|
||
let old_tree: Option<&Tree> = None;
|
||
parser.parse(shell_lc_arg, old_tree)
|
||
}
|
||
|
||
/// Parse a script which may contain multiple simple commands joined only by
|
||
/// the safe logical/pipe/sequencing operators: `&&`, `||`, `;`, `|`.
|
||
///
|
||
/// Returns `Some(Vec<command_words>)` if every command is a plain word‑only
|
||
/// command and the parse tree does not contain disallowed constructs
|
||
/// (parentheses, redirections, substitutions, control flow, etc.). Otherwise
|
||
/// returns `None`.
|
||
pub fn try_parse_word_only_commands_sequence(tree: &Tree, src: &str) -> Option<Vec<Vec<String>>> {
|
||
if tree.root_node().has_error() {
|
||
return None;
|
||
}
|
||
|
||
// List of allowed (named) node kinds for a "word only commands sequence".
|
||
// If we encounter a named node that is not in this list we reject.
|
||
const ALLOWED_KINDS: &[&str] = &[
|
||
// top level containers
|
||
"program",
|
||
"list",
|
||
"pipeline",
|
||
// commands & words
|
||
"command",
|
||
"command_name",
|
||
"word",
|
||
"string",
|
||
"string_content",
|
||
"raw_string",
|
||
"number",
|
||
];
|
||
// Allow only safe punctuation / operator tokens; anything else causes reject.
|
||
const ALLOWED_PUNCT_TOKENS: &[&str] = &["&&", "||", ";", "|", "\"", "'"];
|
||
|
||
let root = tree.root_node();
|
||
let mut cursor = root.walk();
|
||
let mut stack = vec![root];
|
||
let mut command_nodes = Vec::new();
|
||
while let Some(node) = stack.pop() {
|
||
let kind = node.kind();
|
||
if node.is_named() {
|
||
if !ALLOWED_KINDS.contains(&kind) {
|
||
return None;
|
||
}
|
||
if kind == "command" {
|
||
command_nodes.push(node);
|
||
}
|
||
} else {
|
||
// Reject any punctuation / operator tokens that are not explicitly allowed.
|
||
if kind.chars().any(|c| "&;|".contains(c)) && !ALLOWED_PUNCT_TOKENS.contains(&kind) {
|
||
return None;
|
||
}
|
||
if !(ALLOWED_PUNCT_TOKENS.contains(&kind) || kind.trim().is_empty()) {
|
||
// If it's a quote token or operator it's allowed above; we also allow whitespace tokens.
|
||
// Any other punctuation like parentheses, braces, redirects, backticks, etc are rejected.
|
||
return None;
|
||
}
|
||
}
|
||
for child in node.children(&mut cursor) {
|
||
stack.push(child);
|
||
}
|
||
}
|
||
|
||
// Walk uses a stack (LIFO), so re-sort by position to restore source order.
|
||
command_nodes.sort_by_key(Node::start_byte);
|
||
|
||
let mut commands = Vec::new();
|
||
for node in command_nodes {
|
||
if let Some(words) = parse_plain_command_from_node(node, src) {
|
||
commands.push(words);
|
||
} else {
|
||
return None;
|
||
}
|
||
}
|
||
Some(commands)
|
||
}
|
||
|
||
/// Returns the sequence of plain commands within a `bash -lc "..."` or
|
||
/// `zsh -lc "..."` invocation when the script only contains word-only commands
|
||
/// joined by safe operators.
|
||
pub fn parse_shell_lc_plain_commands(command: &[String]) -> Option<Vec<Vec<String>>> {
|
||
let [shell, flag, script] = command else {
|
||
return None;
|
||
};
|
||
|
||
if flag != "-lc" || !(shell == "bash" || shell == "zsh") {
|
||
return None;
|
||
}
|
||
|
||
let tree = try_parse_shell(script)?;
|
||
try_parse_word_only_commands_sequence(&tree, script)
|
||
}
|
||
|
||
fn parse_plain_command_from_node(cmd: tree_sitter::Node, src: &str) -> Option<Vec<String>> {
|
||
if cmd.kind() != "command" {
|
||
return None;
|
||
}
|
||
let mut words = Vec::new();
|
||
let mut cursor = cmd.walk();
|
||
for child in cmd.named_children(&mut cursor) {
|
||
match child.kind() {
|
||
"command_name" => {
|
||
let word_node = child.named_child(0)?;
|
||
if word_node.kind() != "word" {
|
||
return None;
|
||
}
|
||
words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned());
|
||
}
|
||
"word" | "number" => {
|
||
words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned());
|
||
}
|
||
"string" => {
|
||
if child.child_count() == 3
|
||
&& child.child(0)?.kind() == "\""
|
||
&& child.child(1)?.kind() == "string_content"
|
||
&& child.child(2)?.kind() == "\""
|
||
{
|
||
words.push(child.child(1)?.utf8_text(src.as_bytes()).ok()?.to_owned());
|
||
} else {
|
||
return None;
|
||
}
|
||
}
|
||
"raw_string" => {
|
||
let raw_string = child.utf8_text(src.as_bytes()).ok()?;
|
||
let stripped = raw_string
|
||
.strip_prefix('\'')
|
||
.and_then(|s| s.strip_suffix('\''));
|
||
if let Some(s) = stripped {
|
||
words.push(s.to_owned());
|
||
} else {
|
||
return None;
|
||
}
|
||
}
|
||
_ => return None,
|
||
}
|
||
}
|
||
Some(words)
|
||
}
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::*;
|
||
|
||
fn parse_seq(src: &str) -> Option<Vec<Vec<String>>> {
|
||
let tree = try_parse_shell(src)?;
|
||
try_parse_word_only_commands_sequence(&tree, src)
|
||
}
|
||
|
||
#[test]
|
||
fn accepts_single_simple_command() {
|
||
let cmds = parse_seq("ls -1").unwrap();
|
||
assert_eq!(cmds, vec![vec!["ls".to_string(), "-1".to_string()]]);
|
||
}
|
||
|
||
#[test]
|
||
fn accepts_multiple_commands_with_allowed_operators() {
|
||
let src = "ls && pwd; echo 'hi there' | wc -l";
|
||
let cmds = parse_seq(src).unwrap();
|
||
let expected: Vec<Vec<String>> = vec![
|
||
vec!["ls".to_string()],
|
||
vec!["pwd".to_string()],
|
||
vec!["echo".to_string(), "hi there".to_string()],
|
||
vec!["wc".to_string(), "-l".to_string()],
|
||
];
|
||
assert_eq!(cmds, expected);
|
||
}
|
||
|
||
#[test]
|
||
fn extracts_double_and_single_quoted_strings() {
|
||
let cmds = parse_seq("echo \"hello world\"").unwrap();
|
||
assert_eq!(
|
||
cmds,
|
||
vec![vec!["echo".to_string(), "hello world".to_string()]]
|
||
);
|
||
|
||
let cmds2 = parse_seq("echo 'hi there'").unwrap();
|
||
assert_eq!(
|
||
cmds2,
|
||
vec![vec!["echo".to_string(), "hi there".to_string()]]
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn accepts_numbers_as_words() {
|
||
let cmds = parse_seq("echo 123 456").unwrap();
|
||
assert_eq!(
|
||
cmds,
|
||
vec![vec![
|
||
"echo".to_string(),
|
||
"123".to_string(),
|
||
"456".to_string()
|
||
]]
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn rejects_parentheses_and_subshells() {
|
||
assert!(parse_seq("(ls)").is_none());
|
||
assert!(parse_seq("ls || (pwd && echo hi)").is_none());
|
||
}
|
||
|
||
#[test]
|
||
fn rejects_redirections_and_unsupported_operators() {
|
||
assert!(parse_seq("ls > out.txt").is_none());
|
||
assert!(parse_seq("echo hi & echo bye").is_none());
|
||
}
|
||
|
||
#[test]
|
||
fn rejects_command_and_process_substitutions_and_expansions() {
|
||
assert!(parse_seq("echo $(pwd)").is_none());
|
||
assert!(parse_seq("echo `pwd`").is_none());
|
||
assert!(parse_seq("echo $HOME").is_none());
|
||
assert!(parse_seq("echo \"hi $USER\"").is_none());
|
||
}
|
||
|
||
#[test]
|
||
fn rejects_variable_assignment_prefix() {
|
||
assert!(parse_seq("FOO=bar ls").is_none());
|
||
}
|
||
|
||
#[test]
|
||
fn rejects_trailing_operator_parse_error() {
|
||
assert!(parse_seq("ls &&").is_none());
|
||
}
|
||
|
||
#[test]
|
||
fn parse_zsh_lc_plain_commands() {
|
||
let command = vec!["zsh".to_string(), "-lc".to_string(), "ls".to_string()];
|
||
let parsed = parse_shell_lc_plain_commands(&command).unwrap();
|
||
assert_eq!(parsed, vec![vec!["ls".to_string()]]);
|
||
}
|
||
}
|