From e2efe8da9c6ebbbe142e5736e130c2df761be61f Mon Sep 17 00:00:00 2001 From: Michael Bolin Date: Sat, 28 Jun 2025 14:39:29 -0700 Subject: [PATCH] feat: introduce --compute-indices flag to codex-file-search (#1419) This is a small quality-of-life feature, the addition of `--compute-indices` to the CLI, which, if enabled, will compute and set the `indices` field for each `FileMatch` returned by `run()`. Note we only bother to compute `indices` once we have the top N results because there could be a lot of intermediate "top N" results during the search that are ultimately discarded. When set, the indices are included in the JSON output when `--json` is specified and the matching indices are displayed in bold when `--json` is not specified. --- codex-rs/Cargo.lock | 1 + codex-rs/file-search/Cargo.toml | 1 + codex-rs/file-search/src/cli.rs | 4 ++ codex-rs/file-search/src/lib.rs | 67 +++++++++++++++++++++++++++++--- codex-rs/file-search/src/main.rs | 36 +++++++++++++++-- codex-rs/tui/src/file_search.rs | 4 +- 6 files changed, 102 insertions(+), 11 deletions(-) diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index bfc78b65..035f37e5 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -699,6 +699,7 @@ dependencies = [ "clap", "ignore", "nucleo-matcher", + "serde", "serde_json", "tokio", ] diff --git a/codex-rs/file-search/Cargo.toml b/codex-rs/file-search/Cargo.toml index 1850d5ac..bb5b80b2 100644 --- a/codex-rs/file-search/Cargo.toml +++ b/codex-rs/file-search/Cargo.toml @@ -16,5 +16,6 @@ anyhow = "1" clap = { version = "4", features = ["derive"] } ignore = "0.4.23" nucleo-matcher = "0.3.1" +serde = { version = "1", features = ["derive"] } serde_json = "1.0.110" tokio = { version = "1", features = ["full"] } diff --git a/codex-rs/file-search/src/cli.rs b/codex-rs/file-search/src/cli.rs index 27afcbc1..e3394f92 100644 --- a/codex-rs/file-search/src/cli.rs +++ b/codex-rs/file-search/src/cli.rs @@ -20,6 +20,10 @@ pub struct Cli { #[clap(long, short = 'C')] 
pub cwd: Option, + /// Include the indices of the matched characters of each file path in the output. + #[arg(long, default_value = "false")] + pub compute_indices: bool, + // While it is common to default to the number of logical CPUs when creating // a thread pool, empirically, the I/O of the filetree traversal offers // limited parallelism and is the bottleneck, so using a smaller number of diff --git a/codex-rs/file-search/src/lib.rs b/codex-rs/file-search/src/lib.rs index 8f7bce3e..2365c176 100644 --- a/codex-rs/file-search/src/lib.rs +++ b/codex-rs/file-search/src/lib.rs @@ -6,6 +6,7 @@ use nucleo_matcher::pattern::AtomKind; use nucleo_matcher::pattern::CaseMatching; use nucleo_matcher::pattern::Normalization; use nucleo_matcher::pattern::Pattern; +use serde::Serialize; use std::cell::UnsafeCell; use std::cmp::Reverse; use std::collections::BinaryHeap; @@ -21,13 +22,31 @@ mod cli; pub use cli::Cli; +/// A single match result returned from the search. +/// +/// * `score` – Relevance score returned by `nucleo_matcher`. +/// * `path` – Path to the matched file (relative to the search directory). +/// * `indices` – Optional list of character indices that matched the query. +/// These are only filled when the caller of [`run`] sets +/// `compute_indices` to `true`. The indices vector follows the +/// guidance from `nucleo_matcher::Pattern::indices`: they are +/// unique and sorted in ascending order so that callers can use +/// them directly for highlighting. 
+#[derive(Debug, Clone, Serialize)] +pub struct FileMatch { + pub score: u32, + pub path: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub indices: Option>, // Sorted & deduplicated when present +} + pub struct FileSearchResults { - pub matches: Vec<(u32, String)>, + pub matches: Vec, pub total_match_count: usize, } pub trait Reporter { - fn report_match(&self, file: &str, score: u32); + fn report_match(&self, file_match: &FileMatch); fn warn_matches_truncated(&self, total_match_count: usize, shown_match_count: usize); fn warn_no_search_pattern(&self, search_directory: &Path); } @@ -37,6 +56,7 @@ pub async fn run_main( pattern, limit, cwd, + compute_indices, json: _, exclude, threads, @@ -84,12 +104,13 @@ pub async fn run_main( exclude, threads, cancel_flag, + compute_indices, )?; let match_count = matches.len(); let matches_truncated = total_match_count > match_count; - for (score, file) in matches { - reporter.report_match(&file, score); + for file_match in matches { + reporter.report_match(&file_match); } if matches_truncated { reporter.warn_matches_truncated(total_match_count, match_count); @@ -107,6 +128,7 @@ pub fn run( exclude: Vec, threads: NonZero, cancel_flag: Arc, + compute_indices: bool, ) -> anyhow::Result { let pattern = create_pattern(pattern_text); // Create one BestMatchesList per worker thread so that each worker can @@ -215,8 +237,41 @@ pub fn run( } } - let mut matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect(); - sort_matches(&mut matches); + let mut raw_matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect(); + sort_matches(&mut raw_matches); + + // Transform into `FileMatch`, optionally computing indices. 
+ let mut matcher = if compute_indices { + Some(Matcher::new(nucleo_matcher::Config::DEFAULT)) + } else { + None + }; + + let matches: Vec = raw_matches + .into_iter() + .map(|(score, path)| { + let indices = if compute_indices { + let mut buf = Vec::::new(); + let haystack: Utf32Str<'_> = Utf32Str::new(&path, &mut buf); + let mut idx_vec: Vec = Vec::new(); + if let Some(ref mut m) = matcher { + // Ignore the score returned from indices – we already have `score`. + pattern.indices(haystack, m, &mut idx_vec); + } + idx_vec.sort_unstable(); + idx_vec.dedup(); + Some(idx_vec) + } else { + None + }; + + FileMatch { + score, + path, + indices, + } + }) + .collect(); Ok(FileSearchResults { matches, diff --git a/codex-rs/file-search/src/main.rs b/codex-rs/file-search/src/main.rs index c25122c1..6635dc03 100644 --- a/codex-rs/file-search/src/main.rs +++ b/codex-rs/file-search/src/main.rs @@ -1,7 +1,9 @@ +use std::io::IsTerminal; use std::path::Path; use clap::Parser; use codex_file_search::Cli; +use codex_file_search::FileMatch; use codex_file_search::Reporter; use codex_file_search::run_main; use serde_json::json; @@ -11,6 +13,7 @@ async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); let reporter = StdioReporter { write_output_as_json: cli.json, + show_indices: cli.compute_indices && std::io::stdout().is_terminal(), }; run_main(cli, reporter).await?; Ok(()) @@ -18,15 +21,40 @@ async fn main() -> anyhow::Result<()> { struct StdioReporter { write_output_as_json: bool, + show_indices: bool, } impl Reporter for StdioReporter { - fn report_match(&self, file: &str, score: u32) { + fn report_match(&self, file_match: &FileMatch) { if self.write_output_as_json { - let value = json!({ "file": file, "score": score }); - println!("{}", serde_json::to_string(&value).unwrap()); + println!("{}", serde_json::to_string(&file_match).unwrap()); + } else if self.show_indices { + let indices = file_match + .indices + .as_ref() + .expect("--compute-indices was specified"); + // 
`indices` is guaranteed to be sorted in ascending order. Instead + // of calling `contains` for every character (which would be O(N^2) + // in the worst-case), walk through the `indices` vector once while + // iterating over the characters. + let mut indices_iter = indices.iter().peekable(); + + for (i, c) in file_match.path.chars().enumerate() { + match indices_iter.peek() { + Some(next) if **next == i as u32 => { + // ANSI escape code for bold: \x1b[1m ... \x1b[0m + print!("\x1b[1m{}\x1b[0m", c); + // advance the iterator since we've consumed this index + indices_iter.next(); + } + _ => { + print!("{}", c); + } + } + } + println!(); } else { - println!("{file}"); + println!("{}", file_match.path); } } diff --git a/codex-rs/tui/src/file_search.rs b/codex-rs/tui/src/file_search.rs index 7a76a1f0..77eee35b 100644 --- a/codex-rs/tui/src/file_search.rs +++ b/codex-rs/tui/src/file_search.rs @@ -165,6 +165,7 @@ impl FileSearchManager { cancellation_token: Arc, search_state: Arc>, ) { + let compute_indices = false; std::thread::spawn(move || { let matches = file_search::run( &query, @@ -173,11 +174,12 @@ impl FileSearchManager { Vec::new(), NUM_FILE_SEARCH_THREADS, cancellation_token.clone(), + compute_indices, ) .map(|res| { res.matches .into_iter() - .map(|(_, p)| p) + .map(|m| m.path) .collect::>() }) .unwrap_or_default();