feat: introduce --compute-indices flag to codex-file-search (#1419)

This is a small quality-of-life feature: it adds `--compute-indices` to
the CLI, which, when enabled, computes and sets the `indices` field on
each `FileMatch` returned by `run()`. Note that we only compute
`indices` once we have the final top N results, because the search can
produce many intermediate "top N" candidates that are ultimately
discarded.

When the flag is set, the indices are included in the JSON output if
`--json` is specified; otherwise, the matching characters are displayed
in bold.
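
For reference, here is a minimal sketch of the per-match JSON shape this produces. It uses a local copy of the `FileMatch` struct added below; the score, path, and index values are made up purely for illustration.

    use serde::Serialize;

    // Local mirror of the `FileMatch` struct introduced in this commit,
    // reproduced here only to show the serialized shape.
    #[derive(Debug, Clone, Serialize)]
    struct FileMatch {
        score: u32,
        path: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        indices: Option<Vec<u32>>,
    }

    fn main() {
        let with_indices = FileMatch {
            score: 120,
            path: "src/lib.rs".to_string(),
            indices: Some(vec![4, 5, 6]),
        };
        let without_indices = FileMatch {
            score: 120,
            path: "src/lib.rs".to_string(),
            indices: None,
        };

        // With --compute-indices:
        //   {"score":120,"path":"src/lib.rs","indices":[4,5,6]}
        println!("{}", serde_json::to_string(&with_indices).unwrap());

        // Without the flag, `skip_serializing_if` drops the field entirely:
        //   {"score":120,"path":"src/lib.rs"}
        println!("{}", serde_json::to_string(&without_indices).unwrap());
    }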
Author:    Michael Bolin
Date:      2025-06-28 14:39:29 -07:00
Committer: GitHub
Parent:    5a0f236ca4
Commit:    e2efe8da9c

6 changed files with 102 additions and 11 deletions

codex-rs/Cargo.lock (generated)

@@ -699,6 +699,7 @@ dependencies = [
  "clap",
  "ignore",
  "nucleo-matcher",
+ "serde",
  "serde_json",
  "tokio",
 ]


@@ -16,5 +16,6 @@ anyhow = "1"
 clap = { version = "4", features = ["derive"] }
 ignore = "0.4.23"
 nucleo-matcher = "0.3.1"
+serde = { version = "1", features = ["derive"] }
 serde_json = "1.0.110"
 tokio = { version = "1", features = ["full"] }


@@ -20,6 +20,10 @@ pub struct Cli {
     #[clap(long, short = 'C')]
     pub cwd: Option<PathBuf>,
 
+    /// Include matching file indices in the output.
+    #[arg(long, default_value = "false")]
+    pub compute_indices: bool,
+
     // While it is common to default to the number of logical CPUs when creating
     // a thread pool, empirically, the I/O of the filetree traversal offers
     // limited parallelism and is the bottleneck, so using a smaller number of
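
A standalone sketch of how the new flag surfaces through clap's derive API; only the `compute_indices` field is taken from the diff, the rest of the real `Cli` struct is omitted here.

    use clap::Parser;

    // Trimmed-down stand-in for the real `Cli` struct: only the new flag is shown.
    #[derive(Parser, Debug)]
    struct Cli {
        /// Include matching file indices in the output.
        #[arg(long, default_value = "false")]
        compute_indices: bool,
    }

    fn main() {
        // `my-tool --compute-indices` sets the field to true; omitting the
        // flag leaves it at the default of false.
        let cli = Cli::parse();
        println!("compute_indices = {}", cli.compute_indices);
    }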


@@ -6,6 +6,7 @@ use nucleo_matcher::pattern::AtomKind;
 use nucleo_matcher::pattern::CaseMatching;
 use nucleo_matcher::pattern::Normalization;
 use nucleo_matcher::pattern::Pattern;
+use serde::Serialize;
 use std::cell::UnsafeCell;
 use std::cmp::Reverse;
 use std::collections::BinaryHeap;
@@ -21,13 +22,31 @@ mod cli;
 pub use cli::Cli;
 
+/// A single match result returned from the search.
+///
+/// * `score`   Relevance score returned by `nucleo_matcher`.
+/// * `path`    Path to the matched file (relative to the search directory).
+/// * `indices` Optional list of character indices that matched the query.
+///             These are only filled when the caller of [`run`] sets
+///             `compute_indices` to `true`. The indices vector follows the
+///             guidance from `nucleo_matcher::Pattern::indices`: they are
+///             unique and sorted in ascending order so that callers can use
+///             them directly for highlighting.
+#[derive(Debug, Clone, Serialize)]
+pub struct FileMatch {
+    pub score: u32,
+    pub path: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub indices: Option<Vec<u32>>, // Sorted & deduplicated when present.
+}
+
 pub struct FileSearchResults {
-    pub matches: Vec<(u32, String)>,
+    pub matches: Vec<FileMatch>,
     pub total_match_count: usize,
 }
 
 pub trait Reporter {
-    fn report_match(&self, file: &str, score: u32);
+    fn report_match(&self, file_match: &FileMatch);
     fn warn_matches_truncated(&self, total_match_count: usize, shown_match_count: usize);
     fn warn_no_search_pattern(&self, search_directory: &Path);
 }
@@ -37,6 +56,7 @@ pub async fn run_main<T: Reporter>(
         pattern,
         limit,
         cwd,
+        compute_indices,
         json: _,
         exclude,
         threads,
@@ -84,12 +104,13 @@ pub async fn run_main<T: Reporter>(
         exclude,
         threads,
         cancel_flag,
+        compute_indices,
     )?;
     let match_count = matches.len();
     let matches_truncated = total_match_count > match_count;
-    for (score, file) in matches {
-        reporter.report_match(&file, score);
+    for file_match in matches {
+        reporter.report_match(&file_match);
     }
     if matches_truncated {
         reporter.warn_matches_truncated(total_match_count, match_count);
@@ -107,6 +128,7 @@ pub fn run(
     exclude: Vec<String>,
     threads: NonZero<usize>,
     cancel_flag: Arc<AtomicBool>,
+    compute_indices: bool,
 ) -> anyhow::Result<FileSearchResults> {
     let pattern = create_pattern(pattern_text);
     // Create one BestMatchesList per worker thread so that each worker can
@@ -215,8 +237,41 @@ pub fn run(
         }
     }
 
-    let mut matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect();
-    sort_matches(&mut matches);
+    let mut raw_matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect();
+    sort_matches(&mut raw_matches);
+
+    // Transform into `FileMatch`, optionally computing indices.
+    let mut matcher = if compute_indices {
+        Some(Matcher::new(nucleo_matcher::Config::DEFAULT))
+    } else {
+        None
+    };
+
+    let matches: Vec<FileMatch> = raw_matches
+        .into_iter()
+        .map(|(score, path)| {
+            let indices = if compute_indices {
+                let mut buf = Vec::<char>::new();
+                let haystack: Utf32Str<'_> = Utf32Str::new(&path, &mut buf);
+                let mut idx_vec: Vec<u32> = Vec::new();
+                if let Some(ref mut m) = matcher {
+                    // Ignore the score returned by `indices`; we already have `score`.
+                    pattern.indices(haystack, m, &mut idx_vec);
+                }
+                idx_vec.sort_unstable();
+                idx_vec.dedup();
+                Some(idx_vec)
+            } else {
+                None
+            };
+            FileMatch {
+                score,
+                path,
+                indices,
+            }
+        })
+        .collect();
+
     Ok(FileSearchResults {
         matches,
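
As a standalone illustration of the `nucleo_matcher` calls used in the hunk above. This is a sketch only: the query string, the example path, and the pattern options (`CaseMatching::Smart`, `Normalization::Smart`, `AtomKind::Fuzzy`) are assumptions, not taken from this crate's `create_pattern`.

    use nucleo_matcher::pattern::{AtomKind, CaseMatching, Normalization, Pattern};
    use nucleo_matcher::{Config, Matcher, Utf32Str};

    fn main() {
        let mut matcher = Matcher::new(Config::DEFAULT);
        // Pattern options here are illustrative; the crate builds its pattern
        // in `create_pattern`, which is not part of this diff.
        let pattern = Pattern::new("lib", CaseMatching::Smart, Normalization::Smart, AtomKind::Fuzzy);

        let path = "src/lib.rs";
        let mut buf = Vec::<char>::new();
        let haystack: Utf32Str<'_> = Utf32Str::new(path, &mut buf);

        // `Pattern::indices` appends the matched character positions; sorting
        // and deduplicating lets callers consume them in a single forward pass.
        let mut indices: Vec<u32> = Vec::new();
        if pattern.indices(haystack, &mut matcher, &mut indices).is_some() {
            indices.sort_unstable();
            indices.dedup();
            println!("{path} -> matched indices: {indices:?}");
        } else {
            println!("{path} -> no match");
        }
    }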


@@ -1,7 +1,9 @@
+use std::io::IsTerminal;
 use std::path::Path;
 
 use clap::Parser;
 use codex_file_search::Cli;
+use codex_file_search::FileMatch;
 use codex_file_search::Reporter;
 use codex_file_search::run_main;
 use serde_json::json;
@@ -11,6 +13,7 @@ async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
     let reporter = StdioReporter {
         write_output_as_json: cli.json,
+        show_indices: cli.compute_indices && std::io::stdout().is_terminal(),
     };
     run_main(cli, reporter).await?;
     Ok(())
@@ -18,15 +21,40 @@ async fn main() -> anyhow::Result<()> {
 struct StdioReporter {
     write_output_as_json: bool,
+    show_indices: bool,
 }
 
 impl Reporter for StdioReporter {
-    fn report_match(&self, file: &str, score: u32) {
+    fn report_match(&self, file_match: &FileMatch) {
         if self.write_output_as_json {
-            let value = json!({ "file": file, "score": score });
-            println!("{}", serde_json::to_string(&value).unwrap());
+            println!("{}", serde_json::to_string(&file_match).unwrap());
+        } else if self.show_indices {
+            let indices = file_match
+                .indices
+                .as_ref()
+                .expect("--compute-indices was specified");
+            // `indices` is guaranteed to be sorted in ascending order. Instead
+            // of calling `contains` for every character (which would be O(N^2)
+            // in the worst case), walk through the `indices` vector once while
+            // iterating over the characters.
+            let mut indices_iter = indices.iter().peekable();
+            for (i, c) in file_match.path.chars().enumerate() {
+                match indices_iter.peek() {
+                    Some(next) if **next == i as u32 => {
+                        // ANSI escape code for bold: \x1b[1m ... \x1b[0m
+                        print!("\x1b[1m{}\x1b[0m", c);
+                        // Advance the iterator since we've consumed this index.
+                        indices_iter.next();
+                    }
+                    _ => {
+                        print!("{}", c);
+                    }
+                }
+            }
+            println!();
         } else {
-            println!("{file}");
+            println!("{}", file_match.path);
         }
     }
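
The highlighting walk above is the interesting part of this hunk. Here is the same technique as a small, self-contained function; `bold_matches` is a hypothetical helper written for illustration, not part of the commit.

    /// Wraps each matched character of `path` in ANSI bold escapes. `indices`
    /// must be sorted ascending and deduplicated, which lets us advance one
    /// peekable iterator in lockstep with the characters instead of calling
    /// `contains` per character.
    fn bold_matches(path: &str, indices: &[u32]) -> String {
        let mut out = String::new();
        let mut it = indices.iter().peekable();
        for (i, c) in path.chars().enumerate() {
            match it.peek() {
                Some(&&next) if next == i as u32 => {
                    out.push_str("\x1b[1m"); // bold on
                    out.push(c);
                    out.push_str("\x1b[0m"); // reset
                    it.next();
                }
                _ => out.push(c),
            }
        }
        out
    }

    fn main() {
        // "lib" matches "src/lib.rs" at character positions 4, 5, and 6.
        println!("{}", bold_matches("src/lib.rs", &[4, 5, 6]));
    }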


@@ -165,6 +165,7 @@ impl FileSearchManager {
         cancellation_token: Arc<AtomicBool>,
         search_state: Arc<Mutex<SearchState>>,
     ) {
+        let compute_indices = false;
         std::thread::spawn(move || {
             let matches = file_search::run(
                 &query,
@@ -173,11 +174,12 @@
                 Vec::new(),
                 NUM_FILE_SEARCH_THREADS,
                 cancellation_token.clone(),
+                compute_indices,
             )
             .map(|res| {
                 res.matches
                     .into_iter()
-                    .map(|(_, p)| p)
+                    .map(|m| m.path)
                     .collect::<Vec<String>>()
             })
             .unwrap_or_default();