From 296996d74e345b1b05d8c3451a06ace21c5ada96 Mon Sep 17 00:00:00 2001
From: Michael Bolin
Date: Wed, 25 Jun 2025 13:29:03 -0700
Subject: [PATCH] feat: standalone file search CLI (#1386)

Standalone fuzzy filename search library that should be helpful in
addressing https://github.com/openai/codex/issues/1261.
---
 codex-rs/Cargo.lock              |  52 ++++++
 codex-rs/Cargo.toml              |   1 +
 codex-rs/file-search/Cargo.toml  |  20 +++
 codex-rs/file-search/README.md   |   5 +
 codex-rs/file-search/src/cli.rs  |  38 +++++
 codex-rs/file-search/src/lib.rs  | 284 +++++++++++++++++++++++++++++++
 codex-rs/file-search/src/main.rs |  50 ++++++
 codex-rs/justfile                |   4 +
 8 files changed, 454 insertions(+)
 create mode 100644 codex-rs/file-search/Cargo.toml
 create mode 100644 codex-rs/file-search/README.md
 create mode 100644 codex-rs/file-search/src/cli.rs
 create mode 100644 codex-rs/file-search/src/lib.rs
 create mode 100644 codex-rs/file-search/src/main.rs

diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
index bb533be1..e034a993 100644
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -691,6 +691,18 @@ dependencies = [
  "tempfile",
 ]
 
+[[package]]
+name = "codex-file-search"
+version = "0.0.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "ignore",
+ "nucleo-matcher",
+ "serde_json",
+ "tokio",
+]
+
 [[package]]
 name = "codex-linux-sandbox"
 version = "0.0.0"
@@ -1601,6 +1613,19 @@ version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
 
+[[package]]
+name = "globset"
+version = "0.4.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5"
+dependencies = [
+ "aho-corasick",
+ "bstr",
+ "log",
+ "regex-automata 0.4.9",
+ "regex-syntax 0.8.5",
+]
+
 [[package]]
 name = "h2"
 version = "0.4.9"
@@ -1985,6 +2010,22 @@ dependencies = [
  "icu_properties",
 ]
 
+[[package]]
+name = "ignore"
+version = "0.4.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b"
+dependencies = [
+ "crossbeam-deque",
+ "globset",
+ "log",
+ "memchr",
+ "regex-automata 0.4.9",
+ "same-file",
+ "walkdir",
+ "winapi-util",
+]
+
 [[package]]
 name = "image"
 version = "0.25.6"
@@ -2577,6 +2618,16 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "nucleo-matcher"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf33f538733d1a5a3494b836ba913207f14d9d4a1d3cd67030c5061bdd2cac85"
+dependencies = [
+ "memchr",
+ "unicode-segmentation",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.6"
@@ -4362,6 +4413,7 @@ dependencies = [
  "bytes",
  "libc",
  "mio",
+ "parking_lot",
  "pin-project-lite",
  "signal-hook-registry",
  "socket2",
diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml
index 6991a622..f93cbbaa 100644
--- a/codex-rs/Cargo.toml
+++ b/codex-rs/Cargo.toml
@@ -8,6 +8,7 @@ members = [
     "core",
     "exec",
     "execpolicy",
+    "file-search",
     "linux-sandbox",
     "login",
     "mcp-client",
diff --git a/codex-rs/file-search/Cargo.toml b/codex-rs/file-search/Cargo.toml
new file mode 100644
index 00000000..1850d5ac
--- /dev/null
+++ b/codex-rs/file-search/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "codex-file-search"
+version = { workspace = true }
+edition = "2024"
+
+[[bin]]
+name = "codex-file-search"
+path = "src/main.rs"
+
+[lib]
+name = "codex_file_search"
+path = "src/lib.rs"
+
+[dependencies]
+anyhow = "1"
+clap = { version = "4", features = ["derive"] }
+ignore = "0.4.23" +nucleo-matcher = "0.3.1" +serde_json = "1.0.110" +tokio = { version = "1", features = ["full"] } diff --git a/codex-rs/file-search/README.md b/codex-rs/file-search/README.md new file mode 100644 index 00000000..c47d494a --- /dev/null +++ b/codex-rs/file-search/README.md @@ -0,0 +1,5 @@ +# codex_file_search + +Fast fuzzy file search tool for Codex. + +Uses under the hood (which is what `ripgrep` uses) to traverse a directory (while honoring `.gitignore`, etc.) to produce the list of files to search and then uses to fuzzy-match the user supplied `PATTERN` against the corpus. diff --git a/codex-rs/file-search/src/cli.rs b/codex-rs/file-search/src/cli.rs new file mode 100644 index 00000000..27afcbc1 --- /dev/null +++ b/codex-rs/file-search/src/cli.rs @@ -0,0 +1,38 @@ +use std::num::NonZero; +use std::path::PathBuf; + +use clap::ArgAction; +use clap::Parser; + +/// Fuzzy matches filenames under a directory. +#[derive(Parser)] +#[command(version)] +pub struct Cli { + /// Whether to output results in JSON format. + #[clap(long, default_value = "false")] + pub json: bool, + + /// Maximum number of results to return. + #[clap(long, short = 'l', default_value = "64")] + pub limit: NonZero, + + /// Directory to search. + #[clap(long, short = 'C')] + pub cwd: Option, + + // While it is common to default to the number of logical CPUs when creating + // a thread pool, empirically, the I/O of the filetree traversal offers + // limited parallelism and is the bottleneck, so using a smaller number of + // threads is more efficient. (Empirically, using more than 2 threads doesn't seem to provide much benefit.) + // + /// Number of worker threads to use. + #[clap(long, default_value = "2")] + pub threads: NonZero, + + /// Exclude patterns + #[arg(short, long, action = ArgAction::Append)] + pub exclude: Vec, + + /// Search pattern. 
+    pub pattern: Option<String>,
+}
diff --git a/codex-rs/file-search/src/lib.rs b/codex-rs/file-search/src/lib.rs
new file mode 100644
index 00000000..87541816
--- /dev/null
+++ b/codex-rs/file-search/src/lib.rs
@@ -0,0 +1,284 @@
+use ignore::WalkBuilder;
+use ignore::overrides::OverrideBuilder;
+use nucleo_matcher::Matcher;
+use nucleo_matcher::Utf32Str;
+use nucleo_matcher::pattern::AtomKind;
+use nucleo_matcher::pattern::CaseMatching;
+use nucleo_matcher::pattern::Normalization;
+use nucleo_matcher::pattern::Pattern;
+use std::cell::UnsafeCell;
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
+use std::num::NonZero;
+use std::path::Path;
+use std::path::PathBuf;
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering;
+use tokio::process::Command;
+
+mod cli;
+
+pub use cli::Cli;
+
+pub struct FileSearchResults {
+    pub matches: Vec<(u32, String)>,
+    pub total_match_count: usize,
+}
+
+pub trait Reporter {
+    fn report_match(&self, file: &str, score: u32);
+    fn warn_matches_truncated(&self, total_match_count: usize, shown_match_count: usize);
+    fn warn_no_search_pattern(&self, search_directory: &Path);
+}
+
+pub async fn run_main<T: Reporter>(
+    Cli {
+        pattern,
+        limit,
+        cwd,
+        json: _,
+        exclude,
+        threads,
+    }: Cli,
+    reporter: T,
+) -> anyhow::Result<()> {
+    let search_directory = match cwd {
+        Some(dir) => dir,
+        None => std::env::current_dir()?,
+    };
+    let pattern_text = match pattern {
+        Some(pattern) => pattern,
+        None => {
+            reporter.warn_no_search_pattern(&search_directory);
+            #[cfg(unix)]
+            Command::new("ls")
+                .arg("-al")
+                .current_dir(search_directory)
+                .stdout(std::process::Stdio::inherit())
+                .stderr(std::process::Stdio::inherit())
+                .status()
+                .await?;
+            #[cfg(windows)]
+            {
+                Command::new("cmd")
+                    .arg("/c")
+                    .arg("dir")
+                    .arg(search_directory)
+                    .stdout(std::process::Stdio::inherit())
+                    .stderr(std::process::Stdio::inherit())
+                    .status()
+                    .await?;
+            }
+            return Ok(());
+        }
+    };
+
+    let FileSearchResults {
+        total_match_count,
+        matches,
+    } = run(&pattern_text, limit, search_directory, exclude, threads).await?;
+    let match_count = matches.len();
+    let matches_truncated = total_match_count > match_count;
+
+    for (score, file) in matches {
+        reporter.report_match(&file, score);
+    }
+    if matches_truncated {
+        reporter.warn_matches_truncated(total_match_count, match_count);
+    }
+
+    Ok(())
+}
+
+pub async fn run(
+    pattern_text: &str,
+    limit: NonZero<usize>,
+    search_directory: PathBuf,
+    exclude: Vec<String>,
+    threads: NonZero<usize>,
+) -> anyhow::Result<FileSearchResults> {
+    let pattern = create_pattern(pattern_text);
+    // Create one BestMatchesList per worker thread so that each worker can
+    // operate independently. The results across threads will be merged when
+    // the traversal is complete.
+    let WorkerCount {
+        num_walk_builder_threads,
+        num_best_matches_lists,
+    } = create_worker_count(threads);
+    let best_matchers_per_worker: Vec<UnsafeCell<BestMatchesList>> = (0..num_best_matches_lists)
+        .map(|_| {
+            UnsafeCell::new(BestMatchesList::new(
+                limit.get(),
+                pattern.clone(),
+                Matcher::new(nucleo_matcher::Config::DEFAULT),
+            ))
+        })
+        .collect();
+
+    // Use the same tree-walker library that ripgrep uses. We use it directly so
+    // that we can leverage the parallelism it provides.
+    let mut walk_builder = WalkBuilder::new(&search_directory);
+    walk_builder.threads(num_walk_builder_threads);
+    if !exclude.is_empty() {
+        let mut override_builder = OverrideBuilder::new(&search_directory);
+        for exclude in exclude {
+            // The `!` prefix is used to indicate an exclude pattern.
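+            // For example, `--exclude "*.min.js"` is added as the override glob
+            // `!*.min.js`, which the walker treats as "skip matching paths"
+            // (a glob without `!` would instead whitelist matching paths).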
+            let exclude_pattern = format!("!{}", exclude);
+            override_builder.add(&exclude_pattern)?;
+        }
+        let override_matcher = override_builder.build()?;
+        walk_builder.overrides(override_matcher);
+    }
+    let walker = walk_builder.build_parallel();
+
+    // Each worker created by `WalkParallel::run()` will have its own
+    // `BestMatchesList` to update.
+    let index_counter = AtomicUsize::new(0);
+    walker.run(|| {
+        let search_directory = search_directory.clone();
+        let index = index_counter.fetch_add(1, Ordering::Relaxed);
+        let best_list_ptr = best_matchers_per_worker[index].get();
+        let best_list = unsafe { &mut *best_list_ptr };
+        Box::new(move |entry| {
+            if let Some(path) = get_file_path(&entry, &search_directory) {
+                best_list.insert(path);
+            }
+            ignore::WalkState::Continue
+        })
+    });
+
+    fn get_file_path<'a>(
+        entry_result: &'a Result<ignore::DirEntry, ignore::Error>,
+        search_directory: &std::path::Path,
+    ) -> Option<&'a str> {
+        let entry = match entry_result {
+            Ok(e) => e,
+            Err(_) => return None,
+        };
+        if entry.file_type().is_some_and(|ft| ft.is_dir()) {
+            return None;
+        }
+        let path = entry.path();
+        match path.strip_prefix(search_directory) {
+            Ok(rel_path) => rel_path.to_str(),
+            Err(_) => None,
+        }
+    }
+
+    // Merge results across best_matchers_per_worker.
+    let mut global_heap: BinaryHeap<Reverse<(u32, String)>> = BinaryHeap::new();
+    let mut total_match_count = 0;
+    for best_list_cell in best_matchers_per_worker.iter() {
+        let best_list = unsafe { &*best_list_cell.get() };
+        total_match_count += best_list.num_matches;
+        for &Reverse((score, ref line)) in best_list.binary_heap.iter() {
+            if global_heap.len() < limit.get() {
+                global_heap.push(Reverse((score, line.clone())));
+            } else if let Some(min_element) = global_heap.peek() {
+                if score > min_element.0.0 {
+                    global_heap.pop();
+                    global_heap.push(Reverse((score, line.clone())));
+                }
+            }
+        }
+    }
+
+    let mut matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect();
+    matches.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
+
+    Ok(FileSearchResults {
+        matches,
+        total_match_count,
+    })
+}
+
+/// Maintains the `max_count` best matches for a given pattern.
+struct BestMatchesList {
+    max_count: usize,
+    num_matches: usize,
+    pattern: Pattern,
+    matcher: Matcher,
+    binary_heap: BinaryHeap<Reverse<(u32, String)>>,
+
+    /// Internal buffer for converting strings to UTF-32.
+    utf32buf: Vec<char>,
+}
+
+impl BestMatchesList {
+    fn new(max_count: usize, pattern: Pattern, matcher: Matcher) -> Self {
+        Self {
+            max_count,
+            num_matches: 0,
+            pattern,
+            matcher,
+            binary_heap: BinaryHeap::new(),
+            utf32buf: Vec::<char>::new(),
+        }
+    }
+
+    fn insert(&mut self, line: &str) {
+        let haystack: Utf32Str<'_> = Utf32Str::new(line, &mut self.utf32buf);
+        if let Some(score) = self.pattern.score(haystack, &mut self.matcher) {
+            // In the tests below, we verify that score() returns None for a
+            // non-match, so we can categorically increment the count here.
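+            // `num_matches` counts every match seen, while the heap below only
+            // retains the `max_count` best. `Reverse` turns the max-heap into a
+            // min-heap, so `peek()` yields the lowest retained score, i.e. the
+            // entry to evict when a better match arrives.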
+            self.num_matches += 1;
+
+            if self.binary_heap.len() < self.max_count {
+                self.binary_heap.push(Reverse((score, line.to_string())));
+            } else if let Some(min_element) = self.binary_heap.peek() {
+                if score > min_element.0.0 {
+                    self.binary_heap.pop();
+                    self.binary_heap.push(Reverse((score, line.to_string())));
+                }
+            }
+        }
+    }
+}
+
+struct WorkerCount {
+    num_walk_builder_threads: usize,
+    num_best_matches_lists: usize,
+}
+
+fn create_worker_count(num_workers: NonZero<usize>) -> WorkerCount {
+    // It appears that the number of times the function passed to
+    // `WalkParallel::run()` is called is: the number of threads specified to
+    // the builder PLUS ONE.
+    //
+    // In `WalkParallel::visit()`, the builder function gets called once here:
+    // https://github.com/BurntSushi/ripgrep/blob/79cbe89deb1151e703f4d91b19af9cdcc128b765/crates/ignore/src/walk.rs#L1233
+    //
+    // And then once for every worker here:
+    // https://github.com/BurntSushi/ripgrep/blob/79cbe89deb1151e703f4d91b19af9cdcc128b765/crates/ignore/src/walk.rs#L1288
+    let num_walk_builder_threads = num_workers.get();
+    let num_best_matches_lists = num_walk_builder_threads + 1;
+
+    WorkerCount {
+        num_walk_builder_threads,
+        num_best_matches_lists,
+    }
+}
+
+fn create_pattern(pattern: &str) -> Pattern {
+    Pattern::new(
+        pattern,
+        CaseMatching::Smart,
+        Normalization::Smart,
+        AtomKind::Fuzzy,
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn verify_score_is_none_for_non_match() {
+        let mut utf32buf = Vec::<char>::new();
+        let line = "hello";
+        let mut matcher = Matcher::new(nucleo_matcher::Config::DEFAULT);
+        let haystack: Utf32Str<'_> = Utf32Str::new(line, &mut utf32buf);
+        let pattern = create_pattern("zzz");
+        let score = pattern.score(haystack, &mut matcher);
+        assert_eq!(score, None);
+    }
+}
diff --git a/codex-rs/file-search/src/main.rs b/codex-rs/file-search/src/main.rs
new file mode 100644
index 00000000..c25122c1
--- /dev/null
+++ b/codex-rs/file-search/src/main.rs
@@ -0,0 +1,50 @@
+use std::path::Path;
+
+use clap::Parser;
+use codex_file_search::Cli;
+use codex_file_search::Reporter;
+use codex_file_search::run_main;
+use serde_json::json;
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let cli = Cli::parse();
+    let reporter = StdioReporter {
+        write_output_as_json: cli.json,
+    };
+    run_main(cli, reporter).await?;
+    Ok(())
+}
+
+struct StdioReporter {
+    write_output_as_json: bool,
+}
+
+impl Reporter for StdioReporter {
+    fn report_match(&self, file: &str, score: u32) {
+        if self.write_output_as_json {
+            let value = json!({ "file": file, "score": score });
+            println!("{}", serde_json::to_string(&value).unwrap());
+        } else {
+            println!("{file}");
+        }
+    }
+
+    fn warn_matches_truncated(&self, total_match_count: usize, shown_match_count: usize) {
+        if self.write_output_as_json {
+            let value = json!({"matches_truncated": true});
+            println!("{}", serde_json::to_string(&value).unwrap());
+        } else {
+            eprintln!(
+                "Warning: showing {shown_match_count} out of {total_match_count} results. Provide a more specific pattern or increase the --limit.",
+            );
+        }
+    }
+
+    fn warn_no_search_pattern(&self, search_directory: &Path) {
+        eprintln!(
+            "No search pattern specified. Showing the contents of the current directory ({}):",
+            search_directory.to_string_lossy()
+        );
+    }
+}
diff --git a/codex-rs/justfile b/codex-rs/justfile
index c09465a4..83a390ec 100644
--- a/codex-rs/justfile
+++ b/codex-rs/justfile
@@ -16,6 +16,10 @@ exec *args:
 tui *args:
     cargo run --bin codex -- tui "$@"
 
+# Run the CLI version of the file-search crate.
+file-search *args:
+    cargo run --bin codex-file-search -- "$@"
+
 # format code
 fmt:
     cargo fmt -- --config imports_granularity=Item
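
For reference, here is a minimal sketch of calling the crate as a library rather
than through the CLI, which is how it is expected to help with
https://github.com/openai/codex/issues/1261. The crate name, `run`, and
`FileSearchResults` come from lib.rs above; the pattern, limit, thread count, and
search directory are illustrative values, and the `anyhow`/`tokio` setup simply
mirrors main.rs:

    use std::num::NonZero;
    use std::path::PathBuf;

    use codex_file_search::FileSearchResults;

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        // Keep at most 16 results and use 2 walker threads (the CLI default).
        let limit = NonZero::new(16).expect("non-zero");
        let threads = NonZero::new(2).expect("non-zero");
        let FileSearchResults {
            matches,
            total_match_count,
        } = codex_file_search::run("cargo toml", limit, PathBuf::from("."), Vec::new(), threads)
            .await?;
        // `matches` is sorted best-first as (score, path relative to the search directory).
        for (score, path) in matches {
            println!("{score:>4} {path}");
        }
        println!("({total_match_count} total matches)");
        Ok(())
    }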