Files
llmx/codex-rs/common/src/fuzzy_match.rs
easong-openai 2098b40369 Scrollable slash commands (#1830)
Scrollable slash commands. Part 1 of the multi PR.
2025-08-06 21:23:09 -07:00

178 lines
6.1 KiB
Rust

/// Simple case-insensitive subsequence matcher used for fuzzy filtering.
///
/// Returns `None` when `needle` is not a case-insensitive subsequence of
/// `haystack`. Otherwise returns the indices (character positions, not byte
/// offsets) of the matched characters in the ORIGINAL `haystack` string, in
/// ascending order without duplicates, and a score where smaller is better.
/// An empty `needle` always matches, with no indices and a score of
/// `i32::MAX`.
///
/// Matching is greedy: every needle character is paired with the first
/// not-yet-consumed occurrence in the haystack.
///
/// Score: the number of extra (unmatched) lowercased characters inside the
/// span from the first to the last hit, minus 100 when the match starts at the
/// first character of the haystack, so prefix matches are strongly preferred.
///
/// Unicode correctness: both strings are lowercased one character at a time
/// and Greek final sigma (ς) is folded to σ. `str::to_lowercase` is
/// deliberately avoided because it is context sensitive (a word-final 'Σ'
/// becomes 'ς'), which would make the needle disagree with the per-character
/// lowering of the haystack. A mapping from every lowercased haystack
/// character back to its original character index is kept, so the returned
/// indices can be safely used with `str::chars().enumerate()` consumers for
/// highlighting even when lowercasing expands a character (e.g., İ → i̇).
/// No full case folding is performed: "ß" does not match "ss".
pub fn fuzzy_match(haystack: &str, needle: &str) -> Option<(Vec<usize>, i32)> {
    const FINAL_SIGMA: char = '\u{03C2}'; // ς
    const SIGMA: char = '\u{03C3}'; // σ

    /// Lowercases one character (possibly into several) and folds ς to σ.
    fn fold(ch: char) -> impl Iterator<Item = char> {
        ch.to_lowercase()
            .map(|lc| if lc == FINAL_SIGMA { SIGMA } else { lc })
    }

    if needle.is_empty() {
        return Some((Vec::new(), i32::MAX));
    }

    // Folded haystack; each entry remembers the ORIGINAL character index it
    // was produced from (one original char may yield several entries).
    let folded_haystack: Vec<(usize, char)> = haystack
        .chars()
        .enumerate()
        .flat_map(|(orig_idx, ch)| fold(ch).map(move |lc| (orig_idx, lc)))
        .collect();

    let mut matched_orig: Vec<usize> = Vec::new();
    let mut needle_len = 0usize;
    let mut last_pos = 0usize;
    // `find` consumes the iterator up to and including the hit, so the next
    // needle character resumes scanning right after the previous match.
    let mut unscanned = folded_haystack.iter().enumerate();
    for nc in needle.chars().flat_map(fold) {
        let (pos, &(orig_idx, _)) = unscanned.find(|&(_, &(_, hc))| hc == nc)?;
        needle_len += 1;
        last_pos = pos;
        // Hits arrive in non-decreasing original order, so skipping a repeat
        // of the previous index keeps the result sorted and deduplicated.
        if matched_orig.last() != Some(&orig_idx) {
            matched_orig.push(orig_idx);
        }
    }

    // The needle is non-empty and every folded needle char was found, so
    // there is always at least one hit here.
    let first_orig = *matched_orig.first()?;
    // The span starts at the first folded position belonging to the first
    // matched original character, even if the hit was a later char of its
    // lowercase expansion.
    let first_pos = folded_haystack
        .iter()
        .position(|&(oi, _)| oi == first_orig)
        .unwrap_or(0);

    // Extra span between first and last hit beyond the needle length (>= 0).
    let gap = (last_pos - first_pos + 1).saturating_sub(needle_len);
    // Clamp before narrowing so an enormous haystack cannot wrap the score.
    let mut score = gap.min(i32::MAX as usize) as i32;
    // Strongly reward prefix matches.
    if first_pos == 0 {
        score -= 100;
    }
    Some((matched_orig, score))
}
/// Convenience wrapper to get only the indices for a fuzzy match.
///
/// Returns `None` when [`fuzzy_match`] reports no match; otherwise the matched
/// character positions in `haystack` with the score discarded.
/// `fuzzy_match` already returns its indices sorted and deduplicated, so they
/// are passed through unchanged instead of being sorted a second time.
pub fn fuzzy_indices(haystack: &str, needle: &str) -> Option<Vec<usize>> {
    fuzzy_match(haystack, needle).map(|(indices, _score)| indices)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII subsequence: indices point at the matched characters.
    #[test]
    fn ascii_basic_indices() {
        let Some((idx, score)) = fuzzy_match("hello", "hl") else {
            panic!("expected a match");
        };
        assert_eq!(idx, vec![0, 2]);
        // Hits at 0 ('h') and 2 ('l') leave one gap char; prefix bonus gives 1 - 100.
        assert_eq!(score, -99);
    }

    /// 'İ' lowercases to two chars, yet indices must refer to the original string.
    #[test]
    fn unicode_dotted_i_istanbul_highlighting() {
        let Some((idx, score)) = fuzzy_match("İstanbul", "is") else {
            panic!("expected a match");
        };
        assert_eq!(idx, vec![0, 1]);
        // Lowered hits sit at 0 and 2 (one gap char); prefix bonus gives 1 - 100.
        assert_eq!(score, -99);
    }

    /// Lowercasing is not full case folding: 'ß' is not treated as "ss".
    #[test]
    fn unicode_german_sharp_s_casefold() {
        assert!(fuzzy_match("straße", "strasse").is_none());
    }

    /// A tight match must score better (lower) than a spread-out one.
    #[test]
    fn prefer_contiguous_match_over_spread() {
        let Some((_idx_a, score_a)) = fuzzy_match("abc", "abc") else {
            panic!("expected a match");
        };
        let Some((_idx_b, score_b)) = fuzzy_match("a-b-c", "abc") else {
            panic!("expected a match");
        };
        // No gaps, prefix bonus: 0 - 100.
        assert_eq!(score_a, -100);
        // Three needle chars spread over five: 2 gaps, prefix bonus: 2 - 100.
        assert_eq!(score_b, -98);
        assert!(score_a < score_b);
    }

    /// The -100 bonus is granted only when the match starts at the first char.
    #[test]
    fn start_of_string_bonus_applies() {
        let Some((_idx_a, score_a)) = fuzzy_match("file_name", "file") else {
            panic!("expected a match");
        };
        let Some((_idx_b, score_b)) = fuzzy_match("my_file_name", "file") else {
            panic!("expected a match");
        };
        // Contiguous at the very start: 0 gaps with bonus.
        assert_eq!(score_a, -100);
        // Contiguous but not a prefix: 0 gaps, no bonus.
        assert_eq!(score_b, 0);
        assert!(score_a < score_b);
    }

    /// An empty needle matches anything, with the worst possible score.
    #[test]
    fn empty_needle_matches_with_max_score_and_no_indices() {
        let Some((idx, score)) = fuzzy_match("anything", "") else {
            panic!("empty needle should match");
        };
        assert!(idx.is_empty());
        assert_eq!(score, i32::MAX);
    }

    /// Case differences on either side do not prevent a match.
    #[test]
    fn case_insensitive_matching_basic() {
        let Some((idx, score)) = fuzzy_match("FooBar", "foO") else {
            panic!("expected a match");
        };
        assert_eq!(idx, vec![0, 1, 2]);
        // Case-insensitive contiguous prefix: 0 gaps with bonus.
        assert_eq!(score, -100);
    }

    /// Two needle chars hitting the expansion of one haystack char yield one index.
    #[test]
    fn indices_are_deduped_for_multichar_lowercase_expansion() {
        // "i" followed by COMBINING DOT ABOVE, i.e. what 'İ' lowercases to.
        let needle = "\u{0069}\u{0307}";
        let Some((idx, score)) = fuzzy_match("İ", needle) else {
            panic!("expected a match");
        };
        assert_eq!(idx, vec![0]);
        // Both lowered chars of 'İ' are hit back to back at the start: 0 gaps with bonus.
        assert_eq!(score, -100);
    }
}