feat: introduce --compute-indices flag to codex-file-search (#1419)
This is a small quality-of-life feature, the addition of `--compute-indices` to the CLI, which, if enabled, will compute and set the `indices` field for each `FileMatch` returned by `run()`. Note that we only compute `indices` once we have the top N results, because there could be many intermediate "top N" results during the search that are ultimately discarded. When set, the indices are included in the JSON output when `--json` is specified, and the characters at the matching indices are displayed in bold when `--json` is not specified.
This commit is contained in:
1
codex-rs/Cargo.lock
generated
1
codex-rs/Cargo.lock
generated
@@ -699,6 +699,7 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"ignore",
|
"ignore",
|
||||||
"nucleo-matcher",
|
"nucleo-matcher",
|
||||||
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -16,5 +16,6 @@ anyhow = "1"
|
|||||||
clap = { version = "4", features = ["derive"] }
|
clap = { version = "4", features = ["derive"] }
|
||||||
ignore = "0.4.23"
|
ignore = "0.4.23"
|
||||||
nucleo-matcher = "0.3.1"
|
nucleo-matcher = "0.3.1"
|
||||||
|
serde = { version = "1", features = ["derive"] }
|
||||||
serde_json = "1.0.110"
|
serde_json = "1.0.110"
|
||||||
tokio = { version = "1", features = ["full"] }
|
tokio = { version = "1", features = ["full"] }
|
||||||
|
|||||||
@@ -20,6 +20,10 @@ pub struct Cli {
|
|||||||
#[clap(long, short = 'C')]
|
#[clap(long, short = 'C')]
|
||||||
pub cwd: Option<PathBuf>,
|
pub cwd: Option<PathBuf>,
|
||||||
|
|
||||||
|
/// Include matching file indices in the output.
|
||||||
|
#[arg(long, default_value = "false")]
|
||||||
|
pub compute_indices: bool,
|
||||||
|
|
||||||
// While it is common to default to the number of logical CPUs when creating
|
// While it is common to default to the number of logical CPUs when creating
|
||||||
// a thread pool, empirically, the I/O of the filetree traversal offers
|
// a thread pool, empirically, the I/O of the filetree traversal offers
|
||||||
// limited parallelism and is the bottleneck, so using a smaller number of
|
// limited parallelism and is the bottleneck, so using a smaller number of
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ use nucleo_matcher::pattern::AtomKind;
|
|||||||
use nucleo_matcher::pattern::CaseMatching;
|
use nucleo_matcher::pattern::CaseMatching;
|
||||||
use nucleo_matcher::pattern::Normalization;
|
use nucleo_matcher::pattern::Normalization;
|
||||||
use nucleo_matcher::pattern::Pattern;
|
use nucleo_matcher::pattern::Pattern;
|
||||||
|
use serde::Serialize;
|
||||||
use std::cell::UnsafeCell;
|
use std::cell::UnsafeCell;
|
||||||
use std::cmp::Reverse;
|
use std::cmp::Reverse;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
@@ -21,13 +22,31 @@ mod cli;
|
|||||||
|
|
||||||
pub use cli::Cli;
|
pub use cli::Cli;
|
||||||
|
|
||||||
|
/// A single match result returned from the search.
|
||||||
|
///
|
||||||
|
/// * `score` – Relevance score returned by `nucleo_matcher`.
|
||||||
|
/// * `path` – Path to the matched file (relative to the search directory).
|
||||||
|
/// * `indices` – Optional list of character indices that matched the query.
|
||||||
|
/// These are only filled when the caller of [`run`] sets
|
||||||
|
/// `compute_indices` to `true`. The indices vector follows the
|
||||||
|
/// guidance from `nucleo_matcher::Pattern::indices`: they are
|
||||||
|
/// unique and sorted in ascending order so that callers can use
|
||||||
|
/// them directly for highlighting.
|
||||||
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
pub struct FileMatch {
|
||||||
|
pub score: u32,
|
||||||
|
pub path: String,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub indices: Option<Vec<u32>>, // Sorted & deduplicated when present
|
||||||
|
}
|
||||||
|
|
||||||
pub struct FileSearchResults {
|
pub struct FileSearchResults {
|
||||||
pub matches: Vec<(u32, String)>,
|
pub matches: Vec<FileMatch>,
|
||||||
pub total_match_count: usize,
|
pub total_match_count: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait Reporter {
|
pub trait Reporter {
|
||||||
fn report_match(&self, file: &str, score: u32);
|
fn report_match(&self, file_match: &FileMatch);
|
||||||
fn warn_matches_truncated(&self, total_match_count: usize, shown_match_count: usize);
|
fn warn_matches_truncated(&self, total_match_count: usize, shown_match_count: usize);
|
||||||
fn warn_no_search_pattern(&self, search_directory: &Path);
|
fn warn_no_search_pattern(&self, search_directory: &Path);
|
||||||
}
|
}
|
||||||
@@ -37,6 +56,7 @@ pub async fn run_main<T: Reporter>(
|
|||||||
pattern,
|
pattern,
|
||||||
limit,
|
limit,
|
||||||
cwd,
|
cwd,
|
||||||
|
compute_indices,
|
||||||
json: _,
|
json: _,
|
||||||
exclude,
|
exclude,
|
||||||
threads,
|
threads,
|
||||||
@@ -84,12 +104,13 @@ pub async fn run_main<T: Reporter>(
|
|||||||
exclude,
|
exclude,
|
||||||
threads,
|
threads,
|
||||||
cancel_flag,
|
cancel_flag,
|
||||||
|
compute_indices,
|
||||||
)?;
|
)?;
|
||||||
let match_count = matches.len();
|
let match_count = matches.len();
|
||||||
let matches_truncated = total_match_count > match_count;
|
let matches_truncated = total_match_count > match_count;
|
||||||
|
|
||||||
for (score, file) in matches {
|
for file_match in matches {
|
||||||
reporter.report_match(&file, score);
|
reporter.report_match(&file_match);
|
||||||
}
|
}
|
||||||
if matches_truncated {
|
if matches_truncated {
|
||||||
reporter.warn_matches_truncated(total_match_count, match_count);
|
reporter.warn_matches_truncated(total_match_count, match_count);
|
||||||
@@ -107,6 +128,7 @@ pub fn run(
|
|||||||
exclude: Vec<String>,
|
exclude: Vec<String>,
|
||||||
threads: NonZero<usize>,
|
threads: NonZero<usize>,
|
||||||
cancel_flag: Arc<AtomicBool>,
|
cancel_flag: Arc<AtomicBool>,
|
||||||
|
compute_indices: bool,
|
||||||
) -> anyhow::Result<FileSearchResults> {
|
) -> anyhow::Result<FileSearchResults> {
|
||||||
let pattern = create_pattern(pattern_text);
|
let pattern = create_pattern(pattern_text);
|
||||||
// Create one BestMatchesList per worker thread so that each worker can
|
// Create one BestMatchesList per worker thread so that each worker can
|
||||||
@@ -215,8 +237,41 @@ pub fn run(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect();
|
let mut raw_matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect();
|
||||||
sort_matches(&mut matches);
|
sort_matches(&mut raw_matches);
|
||||||
|
|
||||||
|
// Transform into `FileMatch`, optionally computing indices.
|
||||||
|
let mut matcher = if compute_indices {
|
||||||
|
Some(Matcher::new(nucleo_matcher::Config::DEFAULT))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let matches: Vec<FileMatch> = raw_matches
|
||||||
|
.into_iter()
|
||||||
|
.map(|(score, path)| {
|
||||||
|
let indices = if compute_indices {
|
||||||
|
let mut buf = Vec::<char>::new();
|
||||||
|
let haystack: Utf32Str<'_> = Utf32Str::new(&path, &mut buf);
|
||||||
|
let mut idx_vec: Vec<u32> = Vec::new();
|
||||||
|
if let Some(ref mut m) = matcher {
|
||||||
|
// Ignore the score returned from indices – we already have `score`.
|
||||||
|
pattern.indices(haystack, m, &mut idx_vec);
|
||||||
|
}
|
||||||
|
idx_vec.sort_unstable();
|
||||||
|
idx_vec.dedup();
|
||||||
|
Some(idx_vec)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
FileMatch {
|
||||||
|
score,
|
||||||
|
path,
|
||||||
|
indices,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
Ok(FileSearchResults {
|
Ok(FileSearchResults {
|
||||||
matches,
|
matches,
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
|
use std::io::IsTerminal;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use codex_file_search::Cli;
|
use codex_file_search::Cli;
|
||||||
|
use codex_file_search::FileMatch;
|
||||||
use codex_file_search::Reporter;
|
use codex_file_search::Reporter;
|
||||||
use codex_file_search::run_main;
|
use codex_file_search::run_main;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
@@ -11,6 +13,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
let reporter = StdioReporter {
|
let reporter = StdioReporter {
|
||||||
write_output_as_json: cli.json,
|
write_output_as_json: cli.json,
|
||||||
|
show_indices: cli.compute_indices && std::io::stdout().is_terminal(),
|
||||||
};
|
};
|
||||||
run_main(cli, reporter).await?;
|
run_main(cli, reporter).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -18,15 +21,40 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
|
|
||||||
struct StdioReporter {
|
struct StdioReporter {
|
||||||
write_output_as_json: bool,
|
write_output_as_json: bool,
|
||||||
|
show_indices: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Reporter for StdioReporter {
|
impl Reporter for StdioReporter {
|
||||||
fn report_match(&self, file: &str, score: u32) {
|
fn report_match(&self, file_match: &FileMatch) {
|
||||||
if self.write_output_as_json {
|
if self.write_output_as_json {
|
||||||
let value = json!({ "file": file, "score": score });
|
println!("{}", serde_json::to_string(&file_match).unwrap());
|
||||||
println!("{}", serde_json::to_string(&value).unwrap());
|
} else if self.show_indices {
|
||||||
|
let indices = file_match
|
||||||
|
.indices
|
||||||
|
.as_ref()
|
||||||
|
.expect("--compute-indices was specified");
|
||||||
|
// `indices` is guaranteed to be sorted in ascending order. Instead
|
||||||
|
// of calling `contains` for every character (which would be O(N^2)
|
||||||
|
// in the worst-case), walk through the `indices` vector once while
|
||||||
|
// iterating over the characters.
|
||||||
|
let mut indices_iter = indices.iter().peekable();
|
||||||
|
|
||||||
|
for (i, c) in file_match.path.chars().enumerate() {
|
||||||
|
match indices_iter.peek() {
|
||||||
|
Some(next) if **next == i as u32 => {
|
||||||
|
// ANSI escape code for bold: \x1b[1m ... \x1b[0m
|
||||||
|
print!("\x1b[1m{}\x1b[0m", c);
|
||||||
|
// advance the iterator since we've consumed this index
|
||||||
|
indices_iter.next();
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
print!("{}", c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
println!();
|
||||||
} else {
|
} else {
|
||||||
println!("{file}");
|
println!("{}", file_match.path);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -165,6 +165,7 @@ impl FileSearchManager {
|
|||||||
cancellation_token: Arc<AtomicBool>,
|
cancellation_token: Arc<AtomicBool>,
|
||||||
search_state: Arc<Mutex<SearchState>>,
|
search_state: Arc<Mutex<SearchState>>,
|
||||||
) {
|
) {
|
||||||
|
let compute_indices = false;
|
||||||
std::thread::spawn(move || {
|
std::thread::spawn(move || {
|
||||||
let matches = file_search::run(
|
let matches = file_search::run(
|
||||||
&query,
|
&query,
|
||||||
@@ -173,11 +174,12 @@ impl FileSearchManager {
|
|||||||
Vec::new(),
|
Vec::new(),
|
||||||
NUM_FILE_SEARCH_THREADS,
|
NUM_FILE_SEARCH_THREADS,
|
||||||
cancellation_token.clone(),
|
cancellation_token.clone(),
|
||||||
|
compute_indices,
|
||||||
)
|
)
|
||||||
.map(|res| {
|
.map(|res| {
|
||||||
res.matches
|
res.matches
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(_, p)| p)
|
.map(|m| m.path)
|
||||||
.collect::<Vec<String>>()
|
.collect::<Vec<String>>()
|
||||||
})
|
})
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
|
|||||||
Reference in New Issue
Block a user