From e2efe8da9c6ebbbe142e5736e130c2df761be61f Mon Sep 17 00:00:00 2001
From: Michael Bolin <mbolin@openai.com>
Date: Sat, 28 Jun 2025 14:39:29 -0700
Subject: [PATCH] feat: introduce --compute-indices flag to codex-file-search
 (#1419)

This is a small quality-of-life feature: it adds a `--compute-indices`
flag to the CLI which, when enabled, computes and sets the `indices`
field of each `FileMatch` returned by `run()`. Note that `indices` are
only computed once the final top N results are known, because the search
can produce many intermediate "top N" candidates that are ultimately
discarded.

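When the flag is set, the indices are included in the JSON output if
`--json` is specified; otherwise, if stdout is a terminal, the matched
characters of each path are displayed in bold. Note that with `--json`
each match is now the serialized `FileMatch`, so the path is emitted
under the `path` key rather than `file`.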
---
 codex-rs/Cargo.lock              |  1 +
 codex-rs/file-search/Cargo.toml  |  1 +
 codex-rs/file-search/src/cli.rs  |  4 ++
 codex-rs/file-search/src/lib.rs  | 67 +++++++++++++++++++++++++++++---
 codex-rs/file-search/src/main.rs | 36 +++++++++++++++--
 codex-rs/tui/src/file_search.rs  |  4 +-
 6 files changed, 102 insertions(+), 11 deletions(-)
diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
index bfc78b65..035f37e5 100644
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -699,6 +699,7 @@ dependencies = [
  "clap",
  "ignore",
  "nucleo-matcher",
+ "serde",
  "serde_json",
  "tokio",
 ]
diff --git a/codex-rs/file-search/Cargo.toml b/codex-rs/file-search/Cargo.toml
index 1850d5ac..bb5b80b2 100644
--- a/codex-rs/file-search/Cargo.toml
+++ b/codex-rs/file-search/Cargo.toml
@@ -16,5 +16,6 @@ anyhow = "1"
 clap = { version = "4", features = ["derive"] }
 ignore = "0.4.23"
 nucleo-matcher = "0.3.1"
+serde = { version = "1", features = ["derive"] }
 serde_json = "1.0.110"
 tokio = { version = "1", features = ["full"] }
diff --git a/codex-rs/file-search/src/cli.rs b/codex-rs/file-search/src/cli.rs
index 27afcbc1..e3394f92 100644
--- a/codex-rs/file-search/src/cli.rs
+++ b/codex-rs/file-search/src/cli.rs
@@ -20,6 +20,10 @@ pub struct Cli {
     #[clap(long, short = 'C')]
     pub cwd: Option<PathBuf>,
 
+    /// Compute the indices of the matched characters in each path and include them in the output.
+    #[arg(long, default_value = "false")]
+    pub compute_indices: bool,
+
     // While it is common to default to the number of logical CPUs when creating
     // a thread pool, empirically, the I/O of the filetree traversal offers
     // limited parallelism and is the bottleneck, so using a smaller number of
diff --git a/codex-rs/file-search/src/lib.rs b/codex-rs/file-search/src/lib.rs
index 8f7bce3e..2365c176 100644
--- a/codex-rs/file-search/src/lib.rs
+++ b/codex-rs/file-search/src/lib.rs
@@ -6,6 +6,7 @@ use nucleo_matcher::pattern::AtomKind;
 use nucleo_matcher::pattern::CaseMatching;
 use nucleo_matcher::pattern::Normalization;
 use nucleo_matcher::pattern::Pattern;
+use serde::Serialize;
 use std::cell::UnsafeCell;
 use std::cmp::Reverse;
 use std::collections::BinaryHeap;
@@ -21,13 +22,31 @@ mod cli;
 
 pub use cli::Cli;
 
+/// A single match result returned from the search.
+///
+/// * `score` – Relevance score returned by `nucleo_matcher`.
+/// * `path`  – Path to the matched file (relative to the search directory).
+/// * `indices` – Optional list of indices of the characters in `path` that
+///   matched the query. This is only populated when the caller of [`run`]
+///   sets `compute_indices` to `true`. Following the guidance for
+///   `nucleo_matcher::pattern::Pattern::indices`, the vector is sorted in
+///   ascending order and deduplicated so that callers can use it directly
+///   for highlighting.
+#[derive(Debug, Clone, Serialize)]
+pub struct FileMatch {
+    pub score: u32,
+    pub path: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub indices: Option<Vec<u32>>, // Sorted & deduplicated when present
+}
+
 pub struct FileSearchResults {
-    pub matches: Vec<(u32, String)>,
+    pub matches: Vec<FileMatch>,
     pub total_match_count: usize,
 }
 
 pub trait Reporter {
-    fn report_match(&self, file: &str, score: u32);
+    fn report_match(&self, file_match: &FileMatch);
     fn warn_matches_truncated(&self, total_match_count: usize, shown_match_count: usize);
     fn warn_no_search_pattern(&self, search_directory: &Path);
 }
@@ -37,6 +56,7 @@ pub async fn run_main<T: Reporter>(
         pattern,
         limit,
         cwd,
+        compute_indices,
         json: _,
         exclude,
         threads,
@@ -84,12 +104,13 @@ pub async fn run_main<T: Reporter>(
         exclude,
         threads,
         cancel_flag,
+        compute_indices,
     )?;
     let match_count = matches.len();
     let matches_truncated = total_match_count > match_count;
 
-    for (score, file) in matches {
-        reporter.report_match(&file, score);
+    for file_match in matches {
+        reporter.report_match(&file_match);
     }
     if matches_truncated {
         reporter.warn_matches_truncated(total_match_count, match_count);
@@ -107,6 +128,7 @@ pub fn run(
     exclude: Vec<String>,
     threads: NonZero<usize>,
     cancel_flag: Arc<AtomicBool>,
+    compute_indices: bool,
 ) -> anyhow::Result<FileSearchResults> {
     let pattern = create_pattern(pattern_text);
     // Create one BestMatchesList per worker thread so that each worker can
@@ -215,8 +237,41 @@ pub fn run(
         }
     }
 
-    let mut matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect();
-    sort_matches(&mut matches);
+    let mut raw_matches: Vec<(u32, String)> = global_heap.into_iter().map(|r| r.0).collect();
+    sort_matches(&mut raw_matches);
+
+    // Transform into `FileMatch`, optionally computing indices.
+    let mut matcher = if compute_indices {
+        Some(Matcher::new(nucleo_matcher::Config::DEFAULT))
+    } else {
+        None
+    };
+
+    let matches: Vec<FileMatch> = raw_matches
+        .into_iter()
+        .map(|(score, path)| {
+            let indices = if compute_indices {
+                let mut buf = Vec::<char>::new();
+                let haystack: Utf32Str<'_> = Utf32Str::new(&path, &mut buf);
+                let mut idx_vec: Vec<u32> = Vec::new();
+                if let Some(ref mut m) = matcher {
+                    // Ignore the score returned from indices – we already have `score`.
+                    pattern.indices(haystack, m, &mut idx_vec);
+                }
+                idx_vec.sort_unstable();
+                idx_vec.dedup();
+                Some(idx_vec)
+            } else {
+                None
+            };
+
+            FileMatch {
+                score,
+                path,
+                indices,
+            }
+        })
+        .collect();
 
     Ok(FileSearchResults {
         matches,
diff --git a/codex-rs/file-search/src/main.rs b/codex-rs/file-search/src/main.rs
index c25122c1..6635dc03 100644
--- a/codex-rs/file-search/src/main.rs
+++ b/codex-rs/file-search/src/main.rs
@@ -1,7 +1,9 @@
+use std::io::IsTerminal;
 use std::path::Path;
 
 use clap::Parser;
 use codex_file_search::Cli;
+use codex_file_search::FileMatch;
 use codex_file_search::Reporter;
 use codex_file_search::run_main;
 use serde_json::json;
@@ -11,6 +13,7 @@ async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
     let reporter = StdioReporter {
         write_output_as_json: cli.json,
+        show_indices: cli.compute_indices && std::io::stdout().is_terminal(),
     };
     run_main(cli, reporter).await?;
     Ok(())
@@ -18,15 +21,40 @@ async fn main() -> anyhow::Result<()> {
 
 struct StdioReporter {
     write_output_as_json: bool,
+    show_indices: bool,
 }
 
 impl Reporter for StdioReporter {
-    fn report_match(&self, file: &str, score: u32) {
+    fn report_match(&self, file_match: &FileMatch) {
         if self.write_output_as_json {
-            let value = json!({ "file": file, "score": score });
-            println!("{}", serde_json::to_string(&value).unwrap());
+            println!("{}", serde_json::to_string(&file_match).unwrap());
+        } else if self.show_indices {
+            let indices = file_match
+                .indices
+                .as_ref()
+                .expect("--compute-indices was specified");
+            // `indices` is guaranteed to be sorted in ascending order. Instead
+            // of calling `contains` for every character (which would be O(N^2)
+            // in the worst-case), walk through the `indices` vector once while
+            // iterating over the characters.
+            let mut indices_iter = indices.iter().peekable();
+
+            for (i, c) in file_match.path.chars().enumerate() {
+                match indices_iter.peek() {
+                    Some(next) if **next == i as u32 => {
+                        // ANSI escape code for bold: \x1b[1m ... \x1b[0m
+                        print!("\x1b[1m{}\x1b[0m", c);
+                        // advance the iterator since we've consumed this index
+                        indices_iter.next();
+                    }
+                    _ => {
+                        print!("{}", c);
+                    }
+                }
+            }
+            println!();
         } else {
-            println!("{file}");
+            println!("{}", file_match.path);
         }
     }
 
diff --git a/codex-rs/tui/src/file_search.rs b/codex-rs/tui/src/file_search.rs
index 7a76a1f0..77eee35b 100644
--- a/codex-rs/tui/src/file_search.rs
+++ b/codex-rs/tui/src/file_search.rs
@@ -165,6 +165,7 @@ impl FileSearchManager {
         cancellation_token: Arc<AtomicBool>,
         search_state: Arc<Mutex<SearchState>>,
     ) {
+        let compute_indices = false;
         std::thread::spawn(move || {
             let matches = file_search::run(
                 &query,
@@ -173,11 +174,12 @@ impl FileSearchManager {
                 Vec::new(),
                 NUM_FILE_SEARCH_THREADS,
                 cancellation_token.clone(),
+                compute_indices,
             )
             .map(|res| {
                 res.matches
                     .into_iter()
-                    .map(|(_, p)| p)
+                    .map(|m| m.path)
                     .collect::<Vec<String>>()
             })
             .unwrap_or_default();